r - Calculating mean difference between combinations of dates -


I would like to calculate the pairwise average and median number of days between multiple date variables.

My raw data `df` might look like the following:

id     invitation    account_date   first_order    second_order    third_order
1      1/1/2016      1/7/2016       1/20/2016      1/22/2016       NA
2      1/1/2016      1/8/2016       1/22/2016      1/23/2016       1/25/2016
3      1/1/2016      1/5/2016       1/20/2016      2/1/2016        NA
4      1/1/2016      1/2/2016       1/18/2016      2/4/2016        2/6/2016

Given that the data are formatted as dates, it's pretty easy to manually calculate the average and median difference for combinations of dates by first calculating the pairwise differences, e.g.:

id     inv_to_act    act_to_first    act_to_sec    act_to_third
1      6             13              2             NA
2      7             14              1             2
3      4             15              12            NA
4      1             16              17            2

and then using base R: `mean(df$act_to_first, na.rm = TRUE)`.

But I'd like to compute these calculations on several datasets, or on subsets of the same dataset, and it's not scalable to do each step over and over again. Plus, I'm pretty sure there must be a melt or plyr solution that I haven't figured out.

You can compute the date differences between each pair of dates by looping through the pairs and using `difftime`:

# Every unordered pair of date columns; the first column (`id`) is skipped.
# `combos` is a 2 x k character matrix with one pair of column names per column.
combos <- combn(tail(names(df), -1), 2)

# For each pair, the elapsed time in days from the earlier-listed column to
# the later-listed one. apply() binds the results into a numeric matrix with
# one row per row of `df` and one column per pair.
diffs <- apply(combos, 2, function(x) {
  difftime(df[[x[2]]], df[[x[1]]], units = "days")
})
colnames(diffs) <- paste0(combos[1, ], "_to_", combos[2, ])
diffs
#      invitation_to_account_date invitation_to_first_order invitation_to_second_order invitation_to_third_order account_date_to_first_order
# [1,]                          6                        19                         21                        NA                          13
# [2,]                          7                        21                         22                        24                          14
# [3,]                          4                        19                         31                        NA                          15
# [4,]                          1                        17                         34                        36                          16
#      account_date_to_second_order account_date_to_third_order first_order_to_second_order first_order_to_third_order second_order_to_third_order
# [1,]                           15                          NA                           2                         NA                          NA
# [2,]                           15                          17                           1                          3                           2
# [3,]                           27                          NA                          12                         NA                          NA
# [4,]                           33                          35                          17                         19                           2

After this step, you should be able to compute the average of each column:

# Mean of each pairwise-difference column, ignoring missing values.
colMeans(diffs, na.rm = TRUE)
#   invitation_to_account_date    invitation_to_first_order   invitation_to_second_order    invitation_to_third_order  account_date_to_first_order
#                          4.5                         19.0                         27.0                         30.0                         14.5
# account_date_to_second_order  account_date_to_third_order  first_order_to_second_order   first_order_to_third_order  second_order_to_third_order
#                         22.5                         26.0                          8.0                         11.0                          2.0

Once you have these steps, you can put them in a function and apply that function to an input df:

# Mean number of days between every pair of date columns in a data frame.
#
# Arguments:
#   df  A data frame whose first column is skipped (an identifier in the
#       example data) and whose remaining columns (at least two) are
#       date/date-time values accepted by difftime().
#
# Returns:
#   A named numeric vector with one element per pair of date columns, named
#   "<first>_to_<second>", holding the mean difference in days. Missing values
#   are ignored; a pair with no complete observations yields NaN.
meandateranges <- function(df) {
  date_cols <- names(df)[-1L]
  stopifnot(length(date_cols) >= 2L)

  # 2 x k character matrix: one pair of column names per column.
  combos <- combn(date_cols, 2)
  n_rows <- nrow(df)

  day_diffs <- vapply(
    seq_len(ncol(combos)),
    function(j) {
      as.numeric(
        difftime(df[[combos[2, j]]], df[[combos[1, j]]], units = "days")
      )
    },
    numeric(n_rows)
  )

  # vapply() returns a plain vector when `df` has a single row, so force the
  # rows-by-pairs matrix shape before naming the columns.
  diffs <- matrix(day_diffs, nrow = n_rows, ncol = ncol(combos))
  colnames(diffs) <- paste0(combos[1, ], "_to_", combos[2, ])

  colMeans(diffs, na.rm = TRUE)
}

You can run the function on an input data frame with `meandateranges(df)`, or on a list of them with `lapply(df.list, meandateranges)`.

data:

# Example data in dput() form: an integer `id` column followed by five
# POSIXlt date-time columns. `third_order` is missing (NA) in rows 1 and 3.
df <- structure(list(
  id = 1:4,
  invitation = structure(list(
    sec = c(0, 0, 0, 0),
    min = c(0L, 0L, 0L, 0L),
    hour = c(0L, 0L, 0L, 0L),
    mday = c(1L, 1L, 1L, 1L),
    mon = c(0L, 0L, 0L, 0L),
    year = c(116L, 116L, 116L, 116L),
    wday = c(5L, 5L, 5L, 5L),
    yday = c(0L, 0L, 0L, 0L),
    isdst = c(0L, 0L, 0L, 0L),
    zone = c("EST", "EST", "EST", "EST"),
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
  ), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday",
                "isdst", "zone", "gmtoff"),
  class = c("POSIXlt", "POSIXt")),
  account_date = structure(list(
    sec = c(0, 0, 0, 0),
    min = c(0L, 0L, 0L, 0L),
    hour = c(0L, 0L, 0L, 0L),
    mday = c(7L, 8L, 5L, 2L),
    mon = c(0L, 0L, 0L, 0L),
    year = c(116L, 116L, 116L, 116L),
    wday = c(4L, 5L, 2L, 6L),
    yday = c(6L, 7L, 4L, 1L),
    isdst = c(0L, 0L, 0L, 0L),
    zone = c("EST", "EST", "EST", "EST"),
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
  ), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday",
                "isdst", "zone", "gmtoff"),
  class = c("POSIXlt", "POSIXt")),
  first_order = structure(list(
    sec = c(0, 0, 0, 0),
    min = c(0L, 0L, 0L, 0L),
    hour = c(0L, 0L, 0L, 0L),
    mday = c(20L, 22L, 20L, 18L),
    mon = c(0L, 0L, 0L, 0L),
    year = c(116L, 116L, 116L, 116L),
    wday = c(3L, 5L, 3L, 1L),
    yday = c(19L, 21L, 19L, 17L),
    isdst = c(0L, 0L, 0L, 0L),
    zone = c("EST", "EST", "EST", "EST"),
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
  ), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday",
                "isdst", "zone", "gmtoff"),
  class = c("POSIXlt", "POSIXt")),
  second_order = structure(list(
    sec = c(0, 0, 0, 0),
    min = c(0L, 0L, 0L, 0L),
    hour = c(0L, 0L, 0L, 0L),
    mday = c(22L, 23L, 1L, 4L),
    mon = c(0L, 0L, 1L, 1L),
    year = c(116L, 116L, 116L, 116L),
    wday = c(5L, 6L, 1L, 4L),
    yday = c(21L, 22L, 31L, 34L),
    isdst = c(0L, 0L, 0L, 0L),
    zone = c("EST", "EST", "EST", "EST"),
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
  ), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday",
                "isdst", "zone", "gmtoff"),
  class = c("POSIXlt", "POSIXt")),
  third_order = structure(list(
    sec = c(NA, 0, NA, 0),
    min = c(NA, 0L, NA, 0L),
    hour = c(NA, 0L, NA, 0L),
    mday = c(NA, 25L, NA, 6L),
    mon = c(NA, 0L, NA, 1L),
    year = c(NA, 116L, NA, 116L),
    wday = c(NA, 1L, NA, 6L),
    yday = c(NA, 24L, NA, 36L),
    isdst = c(-1L, 0L, -1L, 0L),
    zone = c("", "EST", "", "EST"),
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_)
  ), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday",
                "isdst", "zone", "gmtoff"),
  class = c("POSIXlt", "POSIXt"))
), .Names = c("id", "invitation", "account_date", "first_order",
              "second_order", "third_order"),
row.names = c(NA, -4L), class = "data.frame")

Comments

Popular posts from this blog

routing - AngularJS State management ->load multiple states in one page -

python - GRASS parser() error -

json - Gson().fromJson(jsonResult, Myobject.class) return values in 0's -