r - Remove rows in data set based on multiple criteria -


My data set contains animal ID, date, year, month, and day. I need to remove animal IDs that have fewer than 40 locations (in this case, 40 rows in R) in a given year. In other words, if animal ID = 1 has only 20 locations in 2001, that individual should be removed from the data set. I then need to calculate how many months' worth of data there are in the remaining set of records. In other words, I need at least 40 locations per animal ID per year, spanning at least 6 months. Example: animal ID 2 had more than 40 rows of data in 2001, so it met the first criterion mentioned above, but those 40 rows of data in 2001 span only 3 months; therefore, this individual needs to be removed from the data set. I can't seem to figure out a quick way in R to subset the data set to address the two aforementioned questions.

Here is the initial code I've started working on:

# Asker's first attempt: keep the rows of `data` whose animal_id group size,
# as computed by ave(), is at least 40.
# NOTE(review): the grouping uses animal_id only, so the count is per animal
# across all years rather than per animal per year.
# NOTE(review): as.character() turns the counts into strings before the
# `>= 40` comparison, so the comparison is not numeric.
# NOTE(review): ave() names its function argument `FUN`, and R argument names
# are case-sensitive; the lowercase `fun=` here is likely a transcription
# artifact -- confirm against the original post.
newdata<-data[as.character(ave(data$animal_id, data$animal_id, fun=length)) >= 40, ] 

But I know this isn't correct.

sample data set

dput(dataset) structure(list(animal_id = c(1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l,  1l, 1l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l,  4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l,  4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l,  4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l,  4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 5l, 5l, 5l,  5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l,  5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l,  6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l), date = structure(c(1l,  2l, 39l, 46l, 43l, 53l, 55l, 57l, 62l, 72l, 77l, 77l, 78l, 79l,  80l, 81l, 81l, 81l, 82l, 83l, 84l, 84l, 84l, 85l, 86l, 87l, 87l,  88l, 92l, 102l, 102l, 103l, 104l, 104l, 104l, 104l, 104l, 104l,  104l, 104l, 104l, 105l, 89l, 89l, 90l, 90l, 90l, 91l, 93l, 93l,  94l, 95l, 96l, 96l, 97l, 97l, 98l, 98l, 98l, 98l, 98l, 98l, 98l,  98l, 99l, 100l, 117l, 118l, 120l, 106l, 108l, 109l, 111l, 115l,  116l, 3l, 3l, 8l, 13l, 15l, 16l, 17l, 18l, 19l, 4l, 45l, 47l,  51l, 48l, 52l, 52l, 61l, 63l, 63l, 64l, 54l, 56l, 58l, 58l, 59l,  60l, 60l, 60l, 71l, 73l, 74l, 75l, 76l, 76l, 65l, 66l, 66l, 67l,  68l, 69l, 70l, 40l, 41l, 42l, 44l, 45l, 47l, 49l, 49l, 49l, 49l,  49l, 49l, 49l, 49l, 50l, 50l, 51l, 89l, 90l, 91l, 93l, 94l, 94l,  94l, 94l, 94l, 94l, 94l, 96l, 97l, 99l, 100l, 100l, 101l, 117l,  118l, 118l, 119l, 120l, 121l, 106l, 107l, 107l, 108l, 109l, 110l,  111l, 112l, 113l, 114l, 114l, 115l, 115l, 116l, 3l, 3l, 8l, 13l,  17l, 18l, 18l, 19l, 4l, 5l, 5l, 6l, 7l, 9l, 9l, 10l, 11l, 12l,  14l, 14l, 26l, 27l, 28l, 29l, 30l, 20l, 20l, 21l, 21l, 22l, 23l,  24l, 25l, 34l, 35l, 37l, 
38l, 31l, 32l, 33l, 36l), .label = c("1/23/2001",  "1/30/2001", "10/1/2002", "10/10/2002", "10/14/2002", "10/17/2002",  "10/18/2002", "10/2/2002", "10/21/2002", "10/23/2002", "10/25/2002",  "10/28/2002", "10/3/2002", "10/30/2002", "10/4/2002", "10/6/2002",  "10/7/2002", "10/8/2002", "10/9/2002", "11/12/2002", "11/13/2002",  "11/15/2002", "11/21/2002", "11/25/2002", "11/27/2002", "11/4/2002",  "11/5/2002", "11/6/2002", "11/7/2002", "11/8/2002", "12/11/2002",  "12/13/2002", "12/17/2002", "12/2/2002", "12/3/2002", "12/30/2002",  "12/6/2002", "12/9/2002", "2/21/2001", "3/11/2002", "3/13/2002",  "3/22/2002", "3/23/2001", "3/23/2002", "3/25/2002", "3/8/2001",  "4/1/2002", "4/10/2002", "4/2/2002", "4/5/2002", "4/7/2002",  "5/1/2002", "5/13/2001", "5/14/2002", "5/15/2001", "5/15/2002",  "5/17/2001", "5/20/2002", "5/28/2002", "5/29/2002", "5/3/2002",  "5/30/2001", "5/8/2002", "5/9/2002", "6/10/2002", "6/12/2002",  "6/13/2002", "6/17/2002", "6/19/2002", "6/20/2002", "6/3/2002",  "6/4/2001", "6/4/2002", "6/5/2002", "6/6/2002", "6/7/2002", "7/11/2002",  "7/12/2002", "7/15/2002", "7/16/2002", "7/17/2002", "7/18/2002",  "7/24/2002", "7/25/2002", "7/27/2002", "7/29/2002", "7/31/2002",  "8/1/2002", "8/12/2002", "8/14/2002", "8/19/2002", "8/2/2002",  "8/20/2002", "8/21/2002", "8/22/2002", "8/23/2002", "8/26/2002",  "8/27/2002", "8/28/2002", "8/29/2002", "8/30/2002", "8/5/2002",  "8/7/2002", "8/8/2002", "8/9/2002", "9/10/2002", "9/11/2002",  "9/13/2002", "9/16/2002", "9/17/2002", "9/18/2002", "9/19/2002",  "9/20/2002", "9/23/2002", "9/25/2002", "9/26/2002", "9/3/2002",  "9/4/2002", "9/5/2002", "9/6/2002", "9/9/2002"), class = "factor"),      year = c(2001l, 2001l, 2001l, 2001l, 2001l, 2001l, 2001l,      2001l, 2001l, 2001l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 
2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l, 2002l,      2002l, 2002l, 2002l, 2002l, 2002l, 2002l), month = c(1l,      1l, 2l, 3l, 3l, 5l, 5l, 5l, 5l, 6l, 7l, 7l, 7l, 7l, 7l, 7l,      7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 8l, 8l, 8l, 8l,      8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l,      8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l,      8l, 8l, 8l, 8l, 8l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 10l,      10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 3l, 4l, 4l,      4l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l,      6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 3l, 3l,      3l, 3l, 3l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l,      8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l,      8l, 8l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l,      9l, 9l, 
9l, 9l, 9l, 9l, 9l, 10l, 10l, 10l, 10l, 10l, 10l,      10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l,      10l, 10l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l,      11l, 11l, 11l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l), day = c(23l,      30l, 21l, 8l, 23l, 13l, 15l, 17l, 30l, 4l, 11l, 11l, 12l,      15l, 16l, 17l, 17l, 17l, 18l, 24l, 25l, 25l, 25l, 27l, 29l,      31l, 31l, 1l, 2l, 5l, 5l, 7l, 8l, 8l, 8l, 8l, 8l, 8l, 8l,      8l, 8l, 9l, 12l, 12l, 14l, 14l, 14l, 19l, 20l, 20l, 21l,      22l, 23l, 23l, 26l, 26l, 27l, 27l, 27l, 27l, 27l, 27l, 27l,      27l, 28l, 29l, 3l, 4l, 6l, 10l, 13l, 16l, 18l, 25l, 26l,      1l, 1l, 2l, 3l, 4l, 6l, 7l, 8l, 9l, 10l, 25l, 1l, 7l, 10l,      1l, 1l, 3l, 8l, 8l, 9l, 14l, 15l, 20l, 20l, 28l, 29l, 29l,      29l, 3l, 4l, 5l, 6l, 7l, 7l, 10l, 12l, 12l, 13l, 17l, 19l,      20l, 11l, 13l, 22l, 23l, 25l, 1l, 2l, 2l, 2l, 2l, 2l, 2l,      2l, 2l, 5l, 5l, 7l, 12l, 14l, 19l, 20l, 21l, 21l, 21l, 21l,      21l, 21l, 21l, 23l, 26l, 28l, 29l, 29l, 30l, 3l, 4l, 4l,      5l, 6l, 9l, 10l, 11l, 11l, 13l, 16l, 17l, 18l, 19l, 20l,      23l, 23l, 25l, 25l, 26l, 1l, 1l, 2l, 3l, 7l, 8l, 8l, 9l,      10l, 14l, 14l, 17l, 18l, 21l, 21l, 23l, 25l, 28l, 30l, 30l,      4l, 5l, 6l, 7l, 8l, 12l, 12l, 13l, 13l, 15l, 21l, 25l, 27l,      2l, 3l, 6l, 9l, 11l, 13l, 17l, 30l)), .names = c("animal_id",  "date", "year", "month", "day"), class = "data.frame", row.names = c(na,  -211l)) 

You can do this with the dplyr package. Assuming the name of your dataset is animal_data, here is how to run it.

Updated - I admit I was careless before and had made a big mistake. The following new code will let you achieve the intended outcome, though I am sure it can still be improved.

library(dplyr)

# Step 1: keep only the animal/year combinations that have at least 40
# location records (one row per location).
animal_data_by_n <- animal_data %>%
  group_by(animal_id, year) %>%
  filter(n() >= 40) %>%
  ungroup()

# Step 2: for each remaining animal/year combination, count how many distinct
# calendar months its records cover.
animal_data_by_n_month <- animal_data_by_n %>%
  group_by(animal_id, year) %>%
  summarise(n_month = n_distinct(month)) %>%
  ungroup()

# Step 3: attach the month count to every record of its animal/year.
new_output <- merge(
  animal_data_by_n,
  animal_data_by_n_month,
  by = c("animal_id", "year"),
  all.x = TRUE
)

# Step 4: keep only animal/year combinations whose records span 6+ months.
# The helper column n_month is still present in the result.
final_subset <- subset(new_output, n_month >= 6)

You may remove the n_month column later from the final data frame.


Comments

Popular posts from this blog

php - Permission denied. Laravel linux server -

google bigquery - Delta between query execution time and Java query call to finish -

python - Pandas two dataframes multiplication? -