R - Remove rows in a data set based on multiple criteria
My data set contains animal ID, date, year, month, and day. I need to remove animal IDs that have fewer than 40 locations (in this case, 40 rows in R) in a given year. In other words, if animal ID = 1 has only 20 locations in 2001, then that individual should be removed from the data set. I then need to calculate how many months' worth of data there are in the remaining set of records. In other words, I need to have >= 40 locations per animal ID per year, spanning at least 6 months. Example: animal ID 2 had > 40 rows of data in 2001, so it met the first criterion mentioned above, but those rows of data in 2001 span only 3 months; therefore, that individual needs to be removed from the data set. I can't seem to figure out a quick way in R to subset the data set to address the 2 aforementioned questions.
Here is the initial code I've started working on:
# Asker's first attempt, which the question itself says is not correct.
# As written it counts rows per animal_id only (year is not part of the
# grouping), and as.character() turns the counts into strings, so `>= 40`
# compares text rather than numbers.
# NOTE(review): the argument is spelled `fun=` here, but base R's ave()
# names it `FUN`; this was probably lowercased during extraction -- confirm.
newdata<-data[as.character(ave(data$animal_id, data$animal_id, fun=length)) >= 40, ]
But I know this isn't correct.
dput(dataset) structure(list(animal_id = c(1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l), date = structure(c(1l, 2l, 39l, 46l, 43l, 53l, 55l, 57l, 62l, 72l, 77l, 77l, 78l, 79l, 80l, 81l, 81l, 81l, 82l, 83l, 84l, 84l, 84l, 85l, 86l, 87l, 87l, 88l, 92l, 102l, 102l, 103l, 104l, 104l, 104l, 104l, 104l, 104l, 104l, 104l, 104l, 105l, 89l, 89l, 90l, 90l, 90l, 91l, 93l, 93l, 94l, 95l, 96l, 96l, 97l, 97l, 98l, 98l, 98l, 98l, 98l, 98l, 98l, 98l, 99l, 100l, 117l, 118l, 120l, 106l, 108l, 109l, 111l, 115l, 116l, 3l, 3l, 8l, 13l, 15l, 16l, 17l, 18l, 19l, 4l, 45l, 47l, 51l, 48l, 52l, 52l, 61l, 63l, 63l, 64l, 54l, 56l, 58l, 58l, 59l, 60l, 60l, 60l, 71l, 73l, 74l, 75l, 76l, 76l, 65l, 66l, 66l, 67l, 68l, 69l, 70l, 40l, 41l, 42l, 44l, 45l, 47l, 49l, 49l, 49l, 49l, 49l, 49l, 49l, 49l, 50l, 50l, 51l, 89l, 90l, 91l, 93l, 94l, 94l, 94l, 94l, 94l, 94l, 94l, 96l, 97l, 99l, 100l, 100l, 101l, 117l, 118l, 118l, 119l, 120l, 121l, 106l, 107l, 107l, 108l, 109l, 110l, 111l, 112l, 113l, 114l, 114l, 115l, 115l, 116l, 3l, 3l, 8l, 13l, 17l, 18l, 18l, 19l, 4l, 5l, 5l, 6l, 7l, 9l, 9l, 10l, 11l, 12l, 14l, 14l, 26l, 27l, 28l, 29l, 30l, 20l, 20l, 21l, 21l, 22l, 23l, 24l, 25l, 34l, 35l, 37l, 38l, 31l, 32l, 33l, 36l), .label 
= c("1/23/2001", "1/30/2001", "10/1/2002", "10/10/2002", "10/14/2002", "10/17/2002", "10/18/2002", "10/2/2002", "10/21/2002", "10/23/2002", "10/25/2002", "10/28/2002", "10/3/2002", "10/30/2002", "10/4/2002", "10/6/2002", "10/7/2002", "10/8/2002", "10/9/2002", "11/12/2002", "11/13/2002", "11/15/2002", "11/21/2002", "11/25/2002", "11/27/2002", "11/4/2002", "11/5/2002", "11/6/2002", "11/7/2002", "11/8/2002", "12/11/2002", "12/13/2002", "12/17/2002", "12/2/2002", "12/3/2002", "12/30/2002", "12/6/2002", "12/9/2002", "2/21/2001", "3/11/2002", "3/13/2002", "3/22/2002", "3/23/2001", "3/23/2002", "3/25/2002", "3/8/2001", "4/1/2002", "4/10/2002", "4/2/2002", "4/5/2002", "4/7/2002", "5/1/2002", "5/13/2001", "5/14/2002", "5/15/2001", "5/15/2002", "5/17/2001", "5/20/2002", "5/28/2002", "5/29/2002", "5/3/2002", "5/30/2001", "5/8/2002", "5/9/2002", "6/10/2002", "6/12/2002", "6/13/2002", "6/17/2002", "6/19/2002", "6/20/2002", "6/3/2002", "6/4/2001", "6/4/2002", "6/5/2002", "6/6/2002", "6/7/2002", "7/11/2002", "7/12/2002", "7/15/2002", "7/16/2002", "7/17/2002", "7/18/2002", "7/24/2002", "7/25/2002", "7/27/2002", "7/29/2002", "7/31/2002", "8/1/2002", "8/12/2002", "8/14/2002", "8/19/2002", "8/2/2002", "8/20/2002", "8/21/2002", "8/22/2002", "8/23/2002", "8/26/2002", "8/27/2002", "8/28/2002", "8/29/2002", "8/30/2002", "8/5/2002", "8/7/2002", "8/8/2002", "8/9/2002", "9/10/2002", "9/11/2002", "9/13/2002", "9/16/2002", "9/17/2002", "9/18/2002", "9/19/2002", "9/20/2002", "9/23/2002", "9/25/2002", "9/26/2002", "9/3/2002", "9/4/2002", "9/5/2002", "9/6/2002", "9/9/2002"), class = "factor"), year = c(2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 
2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l), month = c(1l, 1l, 2l, 3l, 3l, 5l, 5l, 5l, 5l, 6l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 3l, 4l, 4l, 4l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 3l, 3l, 3l, 3l, 3l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l), day = c(23l, 
30l, 21l, 8l, 23l, 13l, 15l, 17l, 30l, 4l, 11l, 11l, 12l, 15l, 16l, 17l, 17l, 17l, 18l, 24l, 25l, 25l, 25l, 27l, 29l, 31l, 31l, 1l, 2l, 5l, 5l, 7l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 9l, 12l, 12l, 14l, 14l, 14l, 19l, 20l, 20l, 21l, 22l, 23l, 23l, 26l, 26l, 27l, 27l, 27l, 27l, 27l, 27l, 27l, 27l, 28l, 29l, 3l, 4l, 6l, 10l, 13l, 16l, 18l, 25l, 26l, 1l, 1l, 2l, 3l, 4l, 6l, 7l, 8l, 9l, 10l, 25l, 1l, 7l, 10l, 1l, 1l, 3l, 8l, 8l, 9l, 14l, 15l, 20l, 20l, 28l, 29l, 29l, 29l, 3l, 4l, 5l, 6l, 7l, 7l, 10l, 12l, 12l, 13l, 17l, 19l, 20l, 11l, 13l, 22l, 23l, 25l, 1l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 5l, 5l, 7l, 12l, 14l, 19l, 20l, 21l, 21l, 21l, 21l, 21l, 21l, 21l, 23l, 26l, 28l, 29l, 29l, 30l, 3l, 4l, 4l, 5l, 6l, 9l, 10l, 11l, 11l, 13l, 16l, 17l, 18l, 19l, 20l, 23l, 23l, 25l, 25l, 26l, 1l, 1l, 2l, 3l, 7l, 8l, 8l, 9l, 10l, 14l, 14l, 17l, 18l, 21l, 21l, 23l, 25l, 28l, 30l, 30l, 4l, 5l, 6l, 7l, 8l, 12l, 12l, 13l, 13l, 15l, 21l, 25l, 27l, 2l, 3l, 6l, 9l, 11l, 13l, 17l, 30l)), .names = c("animal_id", "date", "year", "month", "day"), class = "data.frame", row.names = c(na, -211l))
You can do this with the dplyr package, assuming the name of your dataset is new_data (the name the code below uses).
Here is how to run this.
Updated - I admit I was careless before and had made a big mistake. The following new code should let you achieve the intended outcome, though I am sure it can still be improved.
library(dplyr)

# Step 1: keep only the animal/year combinations that have at least 40
# location records (one row = one location).
animal_data_by_n <- new_data %>%
  group_by(animal_id, year) %>%
  filter(n() >= 40) %>%
  ungroup()

# Step 2: for each remaining animal/year combination, count how many
# distinct months its records cover.
animal_data_by_n_month <- animal_data_by_n %>%
  group_by(animal_id, year) %>%
  summarise(n_month = n_distinct(month)) %>%
  ungroup()

# Step 3: attach the month count to every record. all.x = TRUE keeps every
# row of animal_data_by_n even if no matching summary row exists.
new_output <- merge(
  animal_data_by_n,
  animal_data_by_n_month,
  by = c("animal_id", "year"),
  all.x = TRUE
)

# Step 4: keep only records whose animal/year combination spans at least
# 6 distinct months. The helper column n_month stays in the result.
final_subset <- subset(new_output, n_month >= 6)
You may remove the n_month column from the final data frame afterwards.
Comments
Post a Comment