Computing rowMeans for every combination of columns

Question 1

The data:

 pre1 <-structure(list(A = c(0.0276, 0.0165, 0.0113, 0.0229, 0.0113, 
0.0151, 0.015, 0.0122, 0.0113, 0.0113, 0.0113, 0.0113), B = c(0.0884, 
0.0135, 0.0001, 0.0523, 0, 0.0069, 0.0069, 0.0007, 0, 0, 0, 0
), C = c(0.04, 0.0155, 0.0065, 0.0291, 0.0065, 0.0128, 0.0127, 
0.0078, 0.0065, 0.0065, 0.0065, 0.0065), D = c(0.0897, 0.014, 
0.0001, 0.0546, 0, 0.0073, 0.0073, 0.0007, 0, 0, 0, 0), E = c(0.0911, 
0.0129, 0, 0.0537, 0, 0.0065, 0.0065, 0.0006, 0, 0, 0, 0), F = c(0.0891, 
0.0134, 0, 0.0529, 0, 0.0069, 0.007, 0.0006, 0, 0, 0, 0), G = c(0.0921, 
0.0118, 0, 0.0536, 0, 0.0035, 0.004, 0.0001, 0, 0, 0, 0), H = c(0.0906, 
0.0168, 0, 0.0631, 0, 0.0024, 0.0032, 0.0001, 0, 0, 0, 0), I = c(0.0922, 
0.0156, 0, 0.0625, 0, 0.0024, 0.0031, 0, 0, 0, 0, 0), J = c(0.1115, 
0.0052, 0.0006, 0.0458, 0.0006, 0.005, 0.005, 0.0007, 0.0006, 
0.0006, 0.0006, 0.0006), K = c(0.0892, 0.0128, 0, 0.0514, 0, 
0.0073, 0.0072, 0.0006, 0, 0, 0, 0), L = c(0.0895, 0.009, 0.0002, 
0.0515, 0.0002, 0.0055, 0.0052, 0.0008, 0.0002, 0.0002, 0.0002, 
0.0002), M = c(0.0887, 0.0135, 0.0001, 0.0525, 0, 0.0068, 0.0069, 
0.0007, 0, 0, 0, 0), N = c(0.0892, 0.0128, 0, 0.0514, 0, 0.0073, 
0.0072, 0.0006, 0, 0, 0, 0), O = c(0.087, 0.0133, 0.0001, 0.0511, 
0.0001, 0.0072, 0.0072, 0.0007, 0.0001, 0.0001, 0.0001, 0.0001
), P = c(0.0875, 0.011, 0, 0.0492, 0, 0.002, 0.0027, 0.0002, 
0, 0, 0, 0), Q = c(0.0893, 0.0126, 0, 0.0518, 0, 0.0063, 0.0063, 
0.0004, 0, 0, 0, 0), R = c(0.0763, 0.0142, 0.0006, 0.0494, 0.0003, 
0.0018, 0.0027, 0.0007, 0.0002, 0.0002, 0.0003, 0.0002), S = c(0.0176, 
0.0173, 0.0172, 0.0175, 0.0172, 0.0173, 0.0172, 0.0172, 0.0172, 
0.0172, 0.0172, 0.0172)), .Names = c("A", "B", "C", "D", "E", 
"F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", 
"S"), row.names = c(NA, -12L), class = "data.frame")
obs<- c(0.0958964144767174, 0.00112749085522926, 0, 0.0538084597701149, 
0, 0, 0, 0, 0, 0, 0, 0)

The list of column combinations:

library(foreach)
xcomb <- foreach(z=1:ncol(pre1), .combine=c) %do% { 
 combn(names(pre1), z, simplify=FALSE) }

The loop:

CAUTION

The following code takes 2 minutes to run on my PC (in fact, pre1 has 23 columns and takes more than 37 minutes to run):

 library(plyr)
 res <- ldply(xcomb, function(l) {
 pred <- rowMeans(pre1[l])
 rmse <- sqrt(mean((obs-pred)^2))
 if(rmse<0.00345){
 me <- mean(obs-pred)
 smape <- sum(abs(pred-obs))/sum(pred+obs)
 data.frame(combin=paste(names(pre1[l]),collapse="-"),me = me, rmse = rmse, smape = smape)
 }
})

Example output of the reproducible:

 combin me rmse smape
1 J-L -0.0015764696 0.003398144 0.09119202
2 J-P -0.0011556362 0.003281430 0.08393609
3 E-J-L -0.0016195251 0.003424861 0.08216952
4 E-J-P -0.0013389696 0.003357466 0.07727013
5 F-J-P -0.0013000807 0.003448260 0.07759392
6 G-H-J -0.0018223029 0.003387601 0.06759025
7 G-I-J -0.0018111918 0.003316197 0.06720583
8 G-J-L -0.0014473029 0.003126429 0.07643285
9 G-J-P -0.0011667474 0.003091874 0.07144020
10 G-J-Q -0.0015584140 0.003436998 0.07965237
11 G-J-R -0.0010084140 0.003394027 0.08194241
12 H-J-L -0.0017556362 0.003236715 0.06739709
13 H-J-P -0.0014750807 0.003194667 0.06236702
14 I-J-L -0.0017445251 0.003162696 0.06825156
15 I-J-P -0.0014639696 0.003108024 0.06322841
16 I-J-R -0.0013056362 0.003443850 0.07335332
17 J-L-P -0.0011000807 0.003086438 0.07352729
18 J-L-Q -0.0014917474 0.003427897 0.08172932
19 J-L-R -0.0009417474 0.003443418 0.08960742
20 J-P-Q -0.0012111918 0.003384422 0.07680148
21 B-G-J-L -0.0014598029 0.003434496 0.07700107
22 B-G-J-P -0.0012493862 0.003418938 0.07643082
23 B-J-L-P -0.0011993862 0.003432305 0.08211289
24 D-G-J-L -0.0015618862 0.003446124 0.07491208
25 D-G-J-P -0.0013514696 0.003416486 0.07432744
26 D-J-L-P -0.0013014696 0.003417190 0.07998359
27 E-G-J-L -0.0015118862 0.003349091 0.07335527
28 E-G-J-P -0.0013014696 0.003321645 0.07178900
29 E-I-J-L -0.0017348029 0.003414617 0.06724261
30 E-I-J-P -0.0015243862 0.003377062 0.06548492
31 E-J-L-P -0.0012514696 0.003320299 0.07745106
32 F-G-J-L -0.0014827196 0.003419382 0.07576092
33 F-G-J-P -0.0012723029 0.003398932 0.07518129
34 F-I-J-P -0.0014952196 0.003443578 0.06884192
35 F-J-L-P -0.0012223029 0.003408147 0.08085604
36 G-H-J-L -0.0016139696 0.003264481 0.06225090
37 G-H-J-P -0.0014035529 0.003252500 0.06138754
38 G-I-J-L -0.0016056362 0.003192294 0.06289348
39 G-I-J-P -0.0013952196 0.003173631 0.05952244
40 G-J-K-L -0.0014535529 0.003434714 0.07694082
41 G-J-K-P -0.0012431362 0.003414584 0.07636996
42 G-J-L-M -0.0014681362 0.003426825 0.07650715
43 G-J-L-N -0.0014535529 0.003434714 0.07694082
44 G-J-L-P -0.0011223029 0.003097579 0.07148546
45 G-J-L-Q -0.0014160529 0.003354734 0.07485276
46 G-J-L-R -0.0010035529 0.003420185 0.08480000
47 G-J-M-P -0.0012577196 0.003410148 0.07593315
48 G-J-N-P -0.0012431362 0.003414584 0.07636996
49 G-J-P-Q -0.0012056362 0.003340910 0.07426441
50 H-J-L-P -0.0013535529 0.003183342 0.06700815
51 H-J-P-Q -0.0014368862 0.003449975 0.06977695
52 I-J-K-P -0.0014660529 0.003434489 0.07001368
53 I-J-L-P -0.0013452196 0.003103169 0.06514130
54 I-J-L-Q -0.0016389696 0.003392922 0.06855015
55 I-J-L-R -0.0012264696 0.003444943 0.07831468
56 I-J-N-P -0.0014660529 0.003434489 0.07001368
57 I-J-P-Q -0.0014285529 0.003368587 0.06791682
58 J-K-L-P -0.0011931362 0.003432281 0.08205326
59 J-L-M-P -0.0012077196 0.003421419 0.08161247
60 J-L-N-P -0.0011931362 0.003432281 0.08205326
61 J-L-P-Q -0.0011556362 0.003353159 0.07995181
62 B-G-J-L-P -0.0011973029 0.003415942 0.07979367
63 B-I-J-L-P -0.0013756362 0.003437632 0.07466828
64 D-G-J-L-P -0.0012789696 0.003409087 0.07809595
65 E-G-I-J-L -0.0016256362 0.003415821 0.06699637
66 E-G-I-J-P -0.0014573029 0.003398820 0.06648042
67 E-G-J-L-P -0.0012389696 0.003324390 0.07606610
68 E-H-J-L-P -0.0014239696 0.003427462 0.07246184
69 E-I-J-L-P -0.0014173029 0.003357870 0.07097378
70 F-G-J-L-P -0.0012156362 0.003397033 0.07878956
71 F-I-J-L-P -0.0013939696 0.003423666 0.07367445
72 G-H-J-L-P -0.0013206362 0.003251859 0.06770566
73 G-I-J-K-P -0.0014106362 0.003447292 0.07011274
74 G-I-J-L-P -0.0013139696 0.003181862 0.06621059
75 G-I-J-L-Q -0.0015489696 0.003404413 0.06893746
76 G-I-J-N-P -0.0014106362 0.003447292 0.07011274
77 G-I-J-P-Q -0.0013806362 0.003395895 0.06843233
78 G-J-K-L-P -0.0011923029 0.003408375 0.07974553
79 G-J-L-M-P -0.0012039696 0.003407449 0.07939387
80 G-J-L-N-P -0.0011923029 0.003408375 0.07974553
81 G-J-L-P-Q -0.0011623029 0.003347930 0.07806216
82 H-I-J-L-P -0.0014989696 0.003391859 0.06651120
83 H-J-L-P-Q -0.0013473029 0.003434350 0.07443353
84 I-J-K-L-P -0.0013706362 0.003419123 0.07461949
85 I-J-L-M-P -0.0013823029 0.003431058 0.07427245
86 I-J-L-N-P -0.0013706362 0.003419123 0.07461949
87 I-J-L-P-Q -0.0013406362 0.003363572 0.07294166
88 E-G-H-J-L-P -0.0013848029 0.003448137 0.07213946
89 E-G-I-J-L-P -0.0013792474 0.003387181 0.07089756
90 F-G-I-J-L-P -0.0013598029 0.003443722 0.07315096
91 G-H-I-J-L-P -0.0014473029 0.003420080 0.06695391
92 G-I-J-K-L-P -0.0013403585 0.003437713 0.07393901
93 G-I-J-L-N-P -0.0013403585 0.003437713 0.07393901
94 G-I-J-L-P-Q -0.0013153585 0.003392820 0.07253884

Question 2

If your data set is numeric, using a matrix instead of data.frame should drastically increase performance. Also, rowMeans needs to convert to a matrix first anyway. Also, don't use plyr if performance is on stake. Finally, if you insist on a data.frame, Reduce will be much faster as it doesn't convert to matrix. Also. Can you add your desired output so we won't need to run your code?

Question 3

I edited to include the desired output.

Question 4

So you want to run this over ~500K column combinations?

Question 5

..and maybe more :-)

Question 6

So your desired output are just selected column? Cause it should start with A, B, etc., no? Either way, a quick speedup should be converting pre1 <- as.matrix(pre1), replacing the data.frame(.. part with cbind(.. (you should always try avoiding classes conversions/methods dispatching while optimizing code). And use sapply or lapply or a for loop instead of ldply (which probably also converts to a data.frame). A bit more complicated fix would be converting to a long format and trying to run this using data.table. Though your desired output is a bit confuses me.

Question 7

This can be handled using matrix multiplication. Under the hood, matrix multiplication contains a for loop just like your code does, but it is a lot faster since it is all implemented in pre-compiled code.

So first compute a matrix of 0 and 1 where each row corresponds to a combination and each column corresponds to one of your 19 variables:

weightMatrix <- function(pre1) {
 nvars <- ncol(pre1)
 varnames <- colnames(pre1)
 wposs <- replicate(nvars, 0:1, simplify = FALSE)
 nposs <- Map(c, "", paste0("-",varnames))
 weights <- data.matrix(do.call(expand.grid, wposs))
 cnames <- do.call(paste0, do.call(expand.grid, nposs))
 cnames <- sub("^-", "", cnames)
 dimnames(weights) <- list(cnames, varnames)
 weights <- tail(weights, -1)
 return(weights)
}

Then your code translates to:

wmat <- weightMatrix(pre1)
pred <- wmat %*% t(data.matrix(pre1)) / rowSums(wmat)
rmse <- sqrt(colMeans((obs - t(pred)) ^ 2))
me <- colMeans(obs - t(pred))
smape <- colSums(abs(t(pred) - obs)) / colSums(t(pred) + obs)
out <- data.frame(rmse, me, smape)
subset(out, rmse < 0.00345)

This code takes about 5~6 seconds on my machine with 19 variables. With 23 variables, it will have 16 times more combinations so it should still run in under 2 minutes. With many more variables, you will likely run out of memory trying to compute the weight matrix: you will have to adapt the code so it finds a good balance between memory usage and computation times.

Question 8

Yes, your idea is very clever, but has the disadvantage that the pred table must be created in it's full dimensions before it gets filtered, which consumes a lot of memory.

Question 9

For reference though, using object.size, your xcomb alone takes 305Mb, while my wmat, pred and out are only 75, 85, and 45Mb respectively.

Question 10

However, when pre1 has 23 columns, your pred cannot be created on my pc (8 GB Ram).

flodel flodel 3,5551 gold badge16 silver badges15 bronze badges · Accepted Answer · 2016-03-14 23:56:03Z

This can be handled using matrix multiplication. Under the hood, matrix multiplication contains a for loop just like your code does, but it is a lot faster since it is all implemented in pre-compiled code.

So first compute a matrix of 0 and 1 where each row corresponds to a combination and each column corresponds to one of your 19 variables:

weightMatrix <- function(pre1) {
 nvars <- ncol(pre1)
 varnames <- colnames(pre1)
 wposs <- replicate(nvars, 0:1, simplify = FALSE)
 nposs <- Map(c, "", paste0("-",varnames))
 weights <- data.matrix(do.call(expand.grid, wposs))
 cnames <- do.call(paste0, do.call(expand.grid, nposs))
 cnames <- sub("^-", "", cnames)
 dimnames(weights) <- list(cnames, varnames)
 weights <- tail(weights, -1)
 return(weights)
}

Then your code translates to:

wmat <- weightMatrix(pre1)
pred <- wmat %*% t(data.matrix(pre1)) / rowSums(wmat)
rmse <- sqrt(colMeans((obs - t(pred)) ^ 2))
me <- colMeans(obs - t(pred))
smape <- colSums(abs(t(pred) - obs)) / colSums(t(pred) + obs)
out <- data.frame(rmse, me, smape)
subset(out, rmse < 0.00345)

This code takes about 5~6 seconds on my machine with 19 variables. With 23 variables, it will have 16 times more combinations so it should still run in under 2 minutes. With many more variables, you will likely run out of memory trying to compute the weight matrix: you will have to adapt the code so it finds a good balance between memory usage and computation times.

Yes, your idea is very clever, but has the disadvantage that the pred table must be created in it's full dimensions before it gets filtered, which consumes a lot of memory.
For reference though, using object.size, your xcomb alone takes 305Mb, while my wmat, pred and out are only 75, 85, and 45Mb respectively.
However, when pre1 has 23 columns, your pred cannot be created on my pc (8 GB Ram).

Stack Exchange Network

Computing rowMeans for every combination of columns

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Computing rowMeans for every combination of columns

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions