I have a function that checks for the presence of logical sequences in a dataframe
fu <- function(dat , rule , res.only=T){
debug.vec <- rep("no",nrow(dat)) # control of rule triggers
rule.id <- 1 # rule number in vector
for(i in 1:nrow(dat)){
# check if the rule "rule[rule.id]" has worked on this "i" index in dat[i,]
current_rule <- with(data = dat[i,] , expr = eval(parse(text = rule[rule.id])) )
if(current_rule){ # if the rule is triggered
debug.vec[i] <- rule[rule.id]
if( rule.id==length(rule) ) break # stop if there are no more rules
rule.id <- rule.id+1 # go to the next rule
}}
if(!res.only) return( cbind(dat,debug.vec) )
return( sum(debug.vec!="no")==length(rule) )
}
for example i have some data
set.seed(123)
dat <- as.data.frame(matrix(data = sample(10,30,replace = T),ncol = 3))
colnames(dat) <- paste0("x" ,1:ncol(dat))
..
dat
x1 x2 x3
1 3 5 9
2 3 3 3
3 10 9 4
4 2 9 1
5 6 9 7
6 5 3 5
7 4 8 10
8 6 10 7
9 9 7 9
10 10 10 9
there is also a vector with rules
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
the function checks if there is such a logical sequence in the dataframe and gives a logical answer
> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE
or you can change the flag res.only = F and see where the sequence was in the debug.vec column
> fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 3 5 9 no
2 3 3 3 no
3 10 9 4 x1>5 & x2>2
4 2 9 1 no
5 6 9 7 no
6 5 3 5 x1>x2
7 4 8 10 x3!=4
8 6 10 7 no
9 9 7 9 no
10 10 10 9 no
I need the fastest possible version of this function, perhaps using the Rccp package or something like that..
UPD=======================
the Waldi function is not working identically to my function, something is wrong
UPD_2_====================================
# Is this correct?
Yes, this is correct if the rule[k] is triggered then the search for rule[k+1] starts with a new row of dat
enter image description here forgive me for not being precise enough in my question, this is my fault
my function returned FALSE because the last rule "x3!=4" did not work, it should be
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L),
x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)),
class = "data.frame", row.names = c(NA, -10L))
dat
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
my_fu(dat = dat, rule = rule, res.only = F)
only two rules worked
> my_fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
it should be
> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
2 Answers 2
Update
As per your update, I wrote a new fu function, i.e., TIC_fu()
TIC_fu <- function(dat, rule, res.only = TRUE) {
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
idx <- na.omit(
Reduce(
function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}, m,
init = 0, accumulate = TRUE
)
)[-1]
if (!res.only) {
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
length(idx) >= length(rule)
}
and you will see
> TIC_fu(dat, rule, FALSE)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
> TIC_fu(dat,rule)
[1] FALSE
For benchmarking
> microbenchmark(
+ TIC_fu(dat, rule, FALSE),
+ fu(dat, rule, FALSE),
+ unit = "relative"
+ )
Unit: relative
expr min lq mean median uq max
TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
neval
100
100
Previous Answer
Here are some options similar to what @Waldi has done, but the only difference is among parse, str2lang and str2expression
microbenchmark::microbenchmark(
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)
and you will see
Unit: microseconds
expr
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) == length(rule)))
any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
min lq mean median uq max neval
94.0 98.6 131.431 107.35 121.90 632.7 100
37.5 39.2 48.887 44.05 48.50 174.1 100
36.8 39.6 51.627 46.20 48.45 241.4 100
12.7 15.8 19.786 17.00 19.75 97.9 100
5 Comments
dat (many rows), and outperforms mine. If you want higher speed, I guess you should try Cpp, rather than any R code.A possible simple base R way:
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE TRUE
[2,] FALSE FALSE TRUE
[3,] TRUE TRUE FALSE
[4,] FALSE FALSE TRUE
[5,] TRUE FALSE TRUE
[6,] FALSE TRUE TRUE
[7,] FALSE FALSE TRUE
[8,] TRUE FALSE TRUE
[9,] TRUE TRUE TRUE
[10,] TRUE FALSE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
Performance :
microbenchmark::microbenchmark(any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule)),
fu(dat = dat, rule = rule, res.only = T))
Unit: microseconds
expr min lq mean median
any(with(dat, sapply(rule, function(rule) eval(parse(text = rule))))) 93.201 97.7010 127.817 104.9010
fu(dat = dat, rule = rule, res.only = T) 465.902 499.7015 611.827 523.2505
uq max neval
124.8010 834.201 100
643.2015 2018.500 100
Other test:
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), x3 = c(4L,
9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)), class = "data.frame", row.names = c(NA,
-10L))
dat
x1 x2 x3
1 2 2 4
2 5 1 9
3 1 6 8
4 3 10 7
5 9 8 10
6 2 10 1
7 6 10 2
8 3 4 8
9 3 6 3
10 9 4 10
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE FALSE
[2,] FALSE TRUE TRUE
[3,] FALSE FALSE TRUE
[4,] FALSE FALSE TRUE
[5,] TRUE TRUE TRUE
[6,] FALSE FALSE TRUE
[7,] TRUE FALSE TRUE
[8,] FALSE FALSE TRUE
[9,] FALSE FALSE TRUE
[10,] TRUE TRUE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
fu(dat)
fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
# Is this correct?
3 Comments
str2lang or str2expression instead, which may give your more options :)
rule[1]? e.g. -library(data.table); setDT(dat)[ eval(parse(text = rule[1] ) )]rcppandc++labels off here. This is likely a question fordata.tableor maybecollapse. And SO is not a 'ask for someone to write code for me' service ...