Rolling mean lag function

Question 1

I'm working in base R here, but would like to convert these functions to something more efficient in R (e.g. speed up processing with dplyr). This takes a while when processing many variables.

Sample Data

dat <- structure(list(year = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 
7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 
16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 
24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 
32, 32, 33, 33), fips = c(1001, 1003, 1001, 1003, 1001, 1003, 
1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 
1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 
1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 
1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 
1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 1003, 1001, 
1003, 1001, 1003, 1001, 1003), x = c(125.045095764706, 142.392000772532, 
93.784066, 114.046112317597, 63.7282256470588, 82.9741328755365, 
80.1740505882354, 71.3666624463519, 59.9823712941177, 58.3210325321888, 
71.398721882353, 78.0599068669528, 100.269594705882, 100.605611201717, 
103.085137647059, 67.8735998283261, 80.7074510588235, 58.1754221459227, 
68.051650117647, 43.5071235622318, 119.816953647059, 99.0901919742489, 
52.6859196470588, 41.9522472961373, 32.3911284705882, 30.8885944206009, 
72.7453448235295, 75.4619375107297, 18.3169755294118, 25.7082367381974, 
100.474256941176, 81.1751539055795, 47.0679831764706, 53.7178891416309, 
42.8417696470588, 61.0489666523605, 105.094850823529, 136.818042832618, 
66.7111776470588, 91.2838116309014, 108.546239411765, 137.704349785408, 
35.8870510588235, 44.0777507725322, 63.7891365882353, 78.2038918025751, 
24.4378204705882, 34.8266493133048, 38.4591315294118, 35.6041833476395, 
48.21366, 63.417606223176, 108.736805647059, 113.645038755365, 
117.392536235294, 100.660605751073, 63.2259095294118, 70.5639424034334, 
48.6272797647059, 78.3992572532189, 144.360976352941, 154.45886472103, 
127.108438588235, 130.812303390558, 63.4038565882353, 76.3053522317597
)), .Names = c("year", "fips", "x"), row.names = c(200161L, 200162L, 
202663L, 202664L, 205165L, 205166L, 207667L, 207668L, 210169L, 
210170L, 212671L, 212672L, 215173L, 215174L, 217675L, 217676L, 
220177L, 220178L, 222679L, 222680L, 225181L, 225182L, 227683L, 
227684L, 230185L, 230186L, 232687L, 232688L, 235189L, 235190L, 
237691L, 237692L, 240193L, 240194L, 242695L, 242696L, 245197L, 
245198L, 247699L, 247700L, 250201L, 250202L, 252703L, 252704L, 
255205L, 255206L, 257707L, 257708L, 260209L, 260210L, 262711L, 
262712L, 265213L, 265214L, 267715L, 267716L, 270217L, 270218L, 
272719L, 272720L, 275221L, 275222L, 277723L, 277724L, 280225L, 
280226L), class = "data.frame")

Functions

rollMean = function(vec, len){
 n = length(vec)
 n2 = n - len + 1
 for( i in 1:n2 ) {
 if (i==1) x = sum(vec[1:len])/len
 else x = c(x, sum(vec[i:(len+(i-1))])/len)
 }
 x
}
allFipsRM = function(varName, len){
 y = c()
 for( i in 1:nfip){
 z = dat[dat$fips==fips.index[i], varName]
 x = rollMean(z, len)
 lenx = length(x)
 x = cbind( rep(fips.index[i], length(x)), x, 0:(lenx-1) )
 if(i == 1) y = x
 else y = rbind(y, x)
 }
 y = data.frame(y)
 colnames(y) = c("fips",paste("rm",len,sep=""),"year")
 y
}

Run and merge

fips.index = unique(dat$fips)
nfip = length(fips.index)
rm1 = allFipsRM("x",1)
rm2 = allFipsRM("x",2)
outdat = merge( dat[, c("fips","x","year")], rm1, by=c("fips","year") )
outdat = merge( outdat, rm2, by=c("fips","year") )

Output

> head(outdat)
 fips year x rm1 rm2
1 1001 1 125.04510 93.78407 78.75615
2 1001 10 68.05165 119.81695 86.25144
3 1001 11 119.81695 52.68592 42.53852
4 1001 12 52.68592 32.39113 52.56824
5 1001 13 32.39113 72.74534 45.53116
6 1001 14 72.74534 18.31698 59.39562

Question 2

It seems that your allFipsRM function takes as input a data variable name and rolling mean length and outputs information about the rolling mean with the indicated length, computed for each fips value separately.

I see two key issues with the code as currently written:

In two places you are growing objects element by element. Please see Circle 2 of The R Inferno for why this is an inefficient way to grow objects.
A vectorized rolling mean function should be much more efficient than one you code on your own with a loop in R.

The following code fixes these two issues by combining the data for all fips codes in a single call to rbind and by using a vectorized rolling mean function from the RcppRoll package. I did a few other cleanup tasks: passing the data to the function to make it more flexible and using paste0 instead of paste with sep="".

library(RcppRoll)
allFipsRM2 = function(dat, varName, len){
 y <- do.call(rbind, lapply(split(dat, dat$fips), function(x) {
 data.frame(fips=x$fips[1], rm=roll_mean(x[,varName], len), year=seq_len(nrow(x)-len+1)-1)
 }))
 colnames(y)[2] <- paste0("rm",len)
 y
}

We can confirm that this given the same merged results as the original code:

rm1b = allFipsRM2(dat, "x",1)
rm2b = allFipsRM2(dat, "x",2)
outdat2 = merge( dat[, c("fips","x","year")], rm1b, by=c("fips","year") )
outdat2 = merge( outdat2, rm2b, by=c("fips","year") )
all.equal(outdat, outdat2)
# [1] TRUE

To see the performance impact, let's run on a modestly larger version of dat, with 66k rows:

dat <- dat[rep(seq_len(nrow(dat)), 1000),]
system.time(allFipsRM("x", 2))
# user system elapsed 
# 5.964 2.511 8.549 
system.time(allFipsRM2(dat, "x", 2))
# user system elapsed 
# 0.109 0.007 0.117

We see a speedup approaching 100x for this test data.

You could also update your function to take a vector of lengths in a pretty straightforward way using sapply:

library(RcppRoll)
allFipsRM3 = function(dat, varName, len){
 do.call(rbind, lapply(split(dat, dat$fips), function(x) {
 all.rm <- as.data.frame(sapply(len, function(l) c(roll_mean(x[,varName], l), rep(NA, l-1))))
 colnames(all.rm) <- paste0("rm", len)
 cbind(data.frame(fips=x$fips[1]), all.rm, data.frame(year=seq_len(nrow(x))-1))
 }))
}
outdat3 <- allFipsRM3(dat, "x", c(1, 2))

Question 3

Thanks! I couldn't get RcppRoll installed on linux, but I used the zoo package and rollmean instead. What if I wanted to provide a vector of different len, such as len = c(1, 2, 3, 4, 5)?

Question 4

@Amstell You could certainly use rollmean from zoo (it will be faster than your original implementation), but it will still be quite a bit slower than what you can get from compiled code like the code in RcppRoll; see stackoverflow.com/q/30090336/3093387 for details.

Question 5

Thanks for the reference and comment. Can I ask, what if I wanted to provide a vector of different len, such as len = c(1, 2, 3, 4, 5)? I know this isn't the original question, but would appreciate guidance on getting multiple rollmean lenths.

Question 6

@Amstell the code I added at the end handles that case.

Question 7

I may be taking advantage of your kindness, but how would I adjust this function to account for multiple varName?

josliber josliber 1,2219 silver badges17 bronze badges · Accepted Answer · 2018-02-26 20:51:22Z

It seems that your allFipsRM function takes as input a data variable name and rolling mean length and outputs information about the rolling mean with the indicated length, computed for each fips value separately.

I see two key issues with the code as currently written:

In two places you are growing objects element by element. Please see Circle 2 of The R Inferno for why this is an inefficient way to grow objects.
A vectorized rolling mean function should be much more efficient than one you code on your own with a loop in R.

The following code fixes these two issues by combining the data for all fips codes in a single call to rbind and by using a vectorized rolling mean function from the RcppRoll package. I did a few other cleanup tasks: passing the data to the function to make it more flexible and using paste0 instead of paste with sep="".

library(RcppRoll)
allFipsRM2 = function(dat, varName, len){
 y <- do.call(rbind, lapply(split(dat, dat$fips), function(x) {
 data.frame(fips=x$fips[1], rm=roll_mean(x[,varName], len), year=seq_len(nrow(x)-len+1)-1)
 }))
 colnames(y)[2] <- paste0("rm",len)
 y
}

We can confirm that this given the same merged results as the original code:

rm1b = allFipsRM2(dat, "x",1)
rm2b = allFipsRM2(dat, "x",2)
outdat2 = merge( dat[, c("fips","x","year")], rm1b, by=c("fips","year") )
outdat2 = merge( outdat2, rm2b, by=c("fips","year") )
all.equal(outdat, outdat2)
# [1] TRUE

To see the performance impact, let's run on a modestly larger version of dat, with 66k rows:

dat <- dat[rep(seq_len(nrow(dat)), 1000),]
system.time(allFipsRM("x", 2))
# user system elapsed 
# 5.964 2.511 8.549 
system.time(allFipsRM2(dat, "x", 2))
# user system elapsed 
# 0.109 0.007 0.117

We see a speedup approaching 100x for this test data.

You could also update your function to take a vector of lengths in a pretty straightforward way using sapply:

library(RcppRoll)
allFipsRM3 = function(dat, varName, len){
 do.call(rbind, lapply(split(dat, dat$fips), function(x) {
 all.rm <- as.data.frame(sapply(len, function(l) c(roll_mean(x[,varName], l), rep(NA, l-1))))
 colnames(all.rm) <- paste0("rm", len)
 cbind(data.frame(fips=x$fips[1]), all.rm, data.frame(year=seq_len(nrow(x))-1))
 }))
}
outdat3 <- allFipsRM3(dat, "x", c(1, 2))

Thanks! I couldn't get RcppRoll installed on linux, but I used the zoo package and rollmean instead. What if I wanted to provide a vector of different len, such as len = c(1, 2, 3, 4, 5)?
@Amstell You could certainly use rollmean from zoo (it will be faster than your original implementation), but it will still be quite a bit slower than what you can get from compiled code like the code in RcppRoll; see stackoverflow.com/q/30090336/3093387 for details.
Thanks for the reference and comment. Can I ask, what if I wanted to provide a vector of different len, such as len = c(1, 2, 3, 4, 5)? I know this isn't the original question, but would appreciate guidance on getting multiple rollmean lenths.
I may be taking advantage of your kindness, but how would I adjust this function to account for multiple varName?

Stack Exchange Network

Rolling mean lag function

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Rolling mean lag function

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions