r - Calculating mean difference between combinations of dates -
I want to calculate the pairwise average and median number of days between multiple date variables.
My raw data frame `df` might look like the following:
id  invitation  account_date  first_order  second_order  third_order
1   1/1/2016    1/7/2016      1/20/2016    1/22/2016     NA
2   1/1/2016    1/8/2016      1/22/2016    1/23/2016     1/25/2016
3   1/1/2016    1/5/2016      1/20/2016    2/1/2016      NA
4   1/1/2016    1/2/2016      1/18/2016    2/4/2016      2/6/2016

Given that the data is formatted as dates, it's pretty easy to manually calculate the average and median difference for combinations of dates by first calculating the pairwise differences, e.g.:
id  inv_to_act  act_to_first  act_to_sec  act_to_third
1   6           13            2           NA
2   7           14            1           2
3   4           15            12          NA
4   1           16            17          2

and then using base R: mean(df$act_to_first, na.rm = TRUE).
But I'd like to compute these calculations on several datasets or subsets of the same dataset, so it's not scalable to repeat each step over and over again. Plus, I'm pretty sure there must be a melt or plyr solution that I haven't figured out.
You can compute the date differences between each pair of dates by looping through the pairs and using difftime:
# Every pair of date columns (the first column of df is skipped), one pair
# per column of `combos`: row 1 is the earlier event, row 2 the later one.
combos <- combn(tail(names(df), -1), 2)

# For each pair, the per-row gap in days between the later and earlier date.
diffs <- apply(combos, 2, function(pair) {
  difftime(df[[pair[2]]], df[[pair[1]]], units = "days")
})
colnames(diffs) <- paste0(combos[1, ], "_to_", combos[2, ])
diffs
#      invitation_to_account_date invitation_to_first_order invitation_to_second_order invitation_to_third_order account_date_to_first_order
# [1,]                          6                        19                         21                        NA                          13
# [2,]                          7                        21                         22                        24                          14
# [3,]                          4                        19                         31                        NA                          15
# [4,]                          1                        17                         34                        36                          16
#      account_date_to_second_order account_date_to_third_order first_order_to_second_order first_order_to_third_order second_order_to_third_order
# [1,]                           15                          NA                           2                         NA                          NA
# [2,]                           15                          17                           1                          3                           2
# [3,]                           27                          NA                          12                         NA                          NA
# [4,]                           33                          35                          17                         19                           2

# After this step, you should be able to compute the average of each column:
# Average each pairwise-difference column, ignoring rows where either date is
# missing (e.g. customers without a third order).
colMeans(diffs, na.rm = TRUE)
#   invitation_to_account_date    invitation_to_first_order   invitation_to_second_order    invitation_to_third_order  account_date_to_first_order
#                          4.5                         19.0                         27.0                         30.0                         14.5
# account_date_to_second_order  account_date_to_third_order  first_order_to_second_order   first_order_to_third_order  second_order_to_third_order
#                         22.5                         26.0                          8.0                         11.0                          2.0

# Once you have these steps, you can put them in a function and apply that
# function to your input df:
#' Mean number of days between every pair of date columns
#'
#' The first column of `df` is skipped; every remaining column is paired with
#' every later column, and the mean gap in days is computed for each pair.
#'
#' @param df A data frame with at least three columns, where every column
#'   after the first holds values accepted by `difftime()` (e.g. Date,
#'   POSIXct or POSIXlt).
#' @return A named numeric vector with one element per column pair, named
#'   `<earlier>_to_<later>`. Missing dates are ignored; a pair with no
#'   complete rows yields `NaN`.
meandateranges <- function(df) {
  date_cols <- tail(names(df), -1)
  if (length(date_cols) < 2L) {
    stop("`df` needs at least two date columns after the first column.",
         call. = FALSE)
  }

  combos <- combn(date_cols, 2)

  # Compute one mean per pair directly rather than building a matrix first:
  # apply() collapses to a plain vector for one-row input, which would break
  # the column naming.
  means <- vapply(
    seq_len(ncol(combos)),
    function(j) {
      gap <- difftime(df[[combos[2, j]]], df[[combos[1, j]]], units = "days")
      mean(as.numeric(gap), na.rm = TRUE)
    },
    numeric(1)
  )
  names(means) <- paste0(combos[1, ], "_to_", combos[2, ])
  means
}

# You can run this function on an input data frame with meandateranges(df),
# or on a list of them with lapply(df.list, meandateranges).
data:
# Sample data: one row per customer, an id column followed by five POSIXlt
# date columns. third_order is missing for customers 1 and 3.
df <- structure(
  list(
    id = 1:4,
    invitation = structure(
      list(
        sec = c(0, 0, 0, 0),
        min = c(0L, 0L, 0L, 0L),
        hour = c(0L, 0L, 0L, 0L),
        mday = c(1L, 1L, 1L, 1L),
        mon = c(0L, 0L, 0L, 0L),
        year = c(116L, 116L, 116L, 116L),
        wday = c(5L, 5L, 5L, 5L),
        yday = c(0L, 0L, 0L, 0L),
        isdst = c(0L, 0L, 0L, 0L),
        zone = c("EST", "EST", "EST", "EST"),
        gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
      ),
      class = c("POSIXlt", "POSIXt")
    ),
    account_date = structure(
      list(
        sec = c(0, 0, 0, 0),
        min = c(0L, 0L, 0L, 0L),
        hour = c(0L, 0L, 0L, 0L),
        mday = c(7L, 8L, 5L, 2L),
        mon = c(0L, 0L, 0L, 0L),
        year = c(116L, 116L, 116L, 116L),
        wday = c(4L, 5L, 2L, 6L),
        yday = c(6L, 7L, 4L, 1L),
        isdst = c(0L, 0L, 0L, 0L),
        zone = c("EST", "EST", "EST", "EST"),
        gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
      ),
      class = c("POSIXlt", "POSIXt")
    ),
    first_order = structure(
      list(
        sec = c(0, 0, 0, 0),
        min = c(0L, 0L, 0L, 0L),
        hour = c(0L, 0L, 0L, 0L),
        mday = c(20L, 22L, 20L, 18L),
        mon = c(0L, 0L, 0L, 0L),
        year = c(116L, 116L, 116L, 116L),
        wday = c(3L, 5L, 3L, 1L),
        yday = c(19L, 21L, 19L, 17L),
        isdst = c(0L, 0L, 0L, 0L),
        zone = c("EST", "EST", "EST", "EST"),
        gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
      ),
      class = c("POSIXlt", "POSIXt")
    ),
    second_order = structure(
      list(
        sec = c(0, 0, 0, 0),
        min = c(0L, 0L, 0L, 0L),
        hour = c(0L, 0L, 0L, 0L),
        mday = c(22L, 23L, 1L, 4L),
        mon = c(0L, 0L, 1L, 1L),
        year = c(116L, 116L, 116L, 116L),
        wday = c(5L, 6L, 1L, 4L),
        yday = c(21L, 22L, 31L, 34L),
        isdst = c(0L, 0L, 0L, 0L),
        zone = c("EST", "EST", "EST", "EST"),
        gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
      ),
      class = c("POSIXlt", "POSIXt")
    ),
    third_order = structure(
      list(
        sec = c(NA, 0, NA, 0),
        min = c(NA, 0L, NA, 0L),
        hour = c(NA, 0L, NA, 0L),
        mday = c(NA, 25L, NA, 6L),
        mon = c(NA, 0L, NA, 1L),
        year = c(NA, 116L, NA, 116L),
        wday = c(NA, 1L, NA, 6L),
        yday = c(NA, 24L, NA, 36L),
        isdst = c(-1L, 0L, -1L, 0L),
        zone = c("", "EST", "", "EST"),
        gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
      ),
      class = c("POSIXlt", "POSIXt")
    )
  ),
  row.names = c(NA, -4L),
  class = "data.frame"
)
Comments
Post a Comment