Match all logic rules with a dataframe (need super fast function)

Question 1

I have a function that checks for the presence of logical sequences in a dataframe

fu <- function(dat , rule , res.only=T){
debug.vec <- rep("no",nrow(dat)) # control of rule triggers
rule.id <- 1 # rule number in vector
for(i in 1:nrow(dat)){
 # check if the rule "rule[rule.id]" has worked on this "i" index in dat[i,]
 current_rule <- with(data = dat[i,] , expr = eval(parse(text = rule[rule.id])) )
 if(current_rule){ # if the rule is triggered
 debug.vec[i] <- rule[rule.id]
 if( rule.id==length(rule) ) break # stop if there are no more rules
 rule.id <- rule.id+1 # go to the next rule
 }} 
if(!res.only) return( cbind(dat,debug.vec) ) 
return( sum(debug.vec!="no")==length(rule) )
}

for example i have some data

set.seed(123)
dat <- as.data.frame(matrix(data = sample(10,30,replace = T),ncol = 3))
colnames(dat) <- paste0("x" ,1:ncol(dat))

..

there is also a vector with rules

rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )

the function checks if there is such a logical sequence in the dataframe and gives a logical answer

> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE

or you can change the flag res.only = F and see where the sequence was in the debug.vec column

> fu(dat = dat, rule = rule, res.only = F)
 x1 x2 x3 debug.vec
1 3 5 9 no
2 3 3 3 no
3 10 9 4 x1>5 & x2>2
4 2 9 1 no
5 6 9 7 no
6 5 3 5 x1>x2
7 4 8 10 x3!=4
8 6 10 7 no
9 9 7 9 no
10 10 10 9 no

I need the fastest possible version of this function, perhaps using the Rccp package or something like that..

UPD=======================

the Waldi function is not working identically to my function, something is wrong

UPD_2_====================================

# Is this correct?

Yes, this is correct if the rule[k] is triggered then the search for rule[k+1] starts with a new row of dat

enter image description here forgive me for not being precise enough in my question, this is my fault

my function returned FALSE because the last rule "x3!=4" did not work, it should be

dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L), 
 x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), 
 x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)),
 class = "data.frame", row.names = c(NA, -10L))
dat
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
my_fu(dat = dat, rule = rule, res.only = F)

only two rules worked

> my_fu(dat = dat, rule = rule, res.only = F)
 x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2

it should be

> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE

Question 2

In your exmaple output, why doesn't row 5 satisfy rule[1] ? e.g. - library(data.table); setDT(dat)[ eval(parse(text = rule[1] ) )]

Question 3

Hello! Because 'rule[1]' has already worked on index 3 and from this moment we are looking for rules 'rule[2]' and so on .. The answer at index 5, the rule 'rule[1]' did not work because the algorithm by that time was looking for the rule 'rule[2]'

Question 4

I am taking the rcpp and c++ labels off here. This is likely a question for data.table or maybe collapse . And SO is not a 'ask for someone to write code for me' service ...

Question 5

Update

As per your update, I wrote a new fu function, i.e., TIC_fu()

TIC_fu <- function(dat, rule, res.only = TRUE) {
 m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
 idx <- na.omit(
 Reduce(
 function(x, y) {
 k <- which(y)
 ifelse(all(k <= x), NA, min(k[k > x]))
 }, m,
 init = 0, accumulate = TRUE
 )
 )[-1]
 if (!res.only) {
 fidx <- head(idx, length(rule))
 debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
 return(cbind(dat, debug.vec))
 }
 length(idx) >= length(rule)
}

and you will see

> TIC_fu(dat, rule, FALSE)
 x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
> TIC_fu(dat,rule)
[1] FALSE

For benchmarking

> microbenchmark(
+ TIC_fu(dat, rule, FALSE),
+ fu(dat, rule, FALSE),
+ unit = "relative"
+ )
Unit: relative
 expr min lq mean median uq max
 TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
 fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
 neval
 100
 100

Previous Answer

Here are some options similar to what @Waldi has done, but the only difference is among parse, str2lang and str2expression

microbenchmark::microbenchmark(
 any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
 any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)

and you will see

Unit: microseconds
 expr
 any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) == length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) == length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) == length(rule)))
 any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
 min lq mean median uq max neval
 94.0 98.6 131.431 107.35 121.90 632.7 100
 37.5 39.2 48.887 44.05 48.50 174.1 100
 36.8 39.6 51.627 46.20 48.45 241.4 100
 12.7 15.8 19.786 17.00 19.75 97.9 100

Question 6

@mr.T See my udpate

Question 7

@mr.T If you want the speed. you may have to resort to Cpp. I am sorry I am not familiar with Rcpp, so this is what I can do so far.

Question 8

@mr.T I found your R code is sufficiently efficient even with large dat (many rows), and outperforms mine. If you want higher speed, I guess you should try Cpp, rather than any R code.

Question 9

Hi Thomas, please check my question if you have time stackoverflow.com/questions/68867191/…

Question 10

@mr.T You can see my answer to your question there.

Question 11

A possible simple base R way:

with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
 x1>5 & x2>2 x1>x2 x3!=4
 [1,] FALSE FALSE TRUE
 [2,] FALSE FALSE TRUE
 [3,] TRUE TRUE FALSE
 [4,] FALSE FALSE TRUE
 [5,] TRUE FALSE TRUE
 [6,] FALSE TRUE TRUE
 [7,] FALSE FALSE TRUE
 [8,] TRUE FALSE TRUE
 [9,] TRUE TRUE TRUE
[10,] TRUE FALSE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE

Performance :

microbenchmark::microbenchmark(any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule)),
 fu(dat = dat, rule = rule, res.only = T))
Unit: microseconds
 expr min lq mean median
 any(with(dat, sapply(rule, function(rule) eval(parse(text = rule))))) 93.201 97.7010 127.817 104.9010
 fu(dat = dat, rule = rule, res.only = T) 465.902 499.7015 611.827 523.2505
 uq max neval
 124.8010 834.201 100
 643.2015 2018.500 100

Other test:

dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L), 
 x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), x3 = c(4L, 
 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)), class = "data.frame", row.names = c(NA, 
-10L))
dat
 x1 x2 x3
1 2 2 4
2 5 1 9
3 1 6 8
4 3 10 7
5 9 8 10
6 2 10 1
7 6 10 2
8 3 4 8
9 3 6 3
10 9 4 10
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
 x1>5 & x2>2 x1>x2 x3!=4
 [1,] FALSE FALSE FALSE
 [2,] FALSE TRUE TRUE
 [3,] FALSE FALSE TRUE
 [4,] FALSE FALSE TRUE
 [5,] TRUE TRUE TRUE
 [6,] FALSE FALSE TRUE
 [7,] TRUE FALSE TRUE
 [8,] FALSE FALSE TRUE
 [9,] FALSE FALSE TRUE
[10,] TRUE TRUE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
fu(dat)
fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
# Is this correct?

Question 12

Good answer! Upvoted! You can try str2lang or str2expression instead, which may give your more options :)

Question 13

See my edit with new data set : the function I propose seems to fullfil the rules? Please note that I edited the Waldi_fu, I had a copy/paste error in the last bit

Question 14

HiI I answered your question "# Is this correct?" in my new update

ThomasIsCoding 106k9 gold badges38 silver badges110 bronze badges · Accepted Answer · 2021-08-04 18:48:01Z

Update

As per your update, I wrote a new fu function, i.e., TIC_fu()

TIC_fu <- function(dat, rule, res.only = TRUE) {
 m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
 idx <- na.omit(
 Reduce(
 function(x, y) {
 k <- which(y)
 ifelse(all(k <= x), NA, min(k[k > x]))
 }, m,
 init = 0, accumulate = TRUE
 )
 )[-1]
 if (!res.only) {
 fidx <- head(idx, length(rule))
 debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
 return(cbind(dat, debug.vec))
 }
 length(idx) >= length(rule)
}

and you will see

> TIC_fu(dat, rule, FALSE)
 x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
> TIC_fu(dat,rule)
[1] FALSE

For benchmarking

> microbenchmark(
+ TIC_fu(dat, rule, FALSE),
+ fu(dat, rule, FALSE),
+ unit = "relative"
+ )
Unit: relative
 expr min lq mean median uq max
 TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
 fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
 neval
 100
 100

Previous Answer

Here are some options similar to what @Waldi has done, but the only difference is among parse, str2lang and str2expression

microbenchmark::microbenchmark(
 any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
 any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)

and you will see

Unit: microseconds
 expr
 any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) == length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) == length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) == length(rule)))
 any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
 min lq mean median uq max neval
 94.0 98.6 131.431 107.35 121.90 632.7 100
 37.5 39.2 48.887 44.05 48.50 174.1 100
 36.8 39.6 51.627 46.20 48.45 241.4 100
 12.7 15.8 19.786 17.00 19.75 97.9 100

@mr.T If you want the speed. you may have to resort to Cpp. I am sorry I am not familiar with Rcpp, so this is what I can do so far.
@mr.T I found your R code is sufficiently efficient even with large dat (many rows), and outperforms mine. If you want higher speed, I guess you should try Cpp, rather than any R code.
Hi Thomas, please check my question if you have time stackoverflow.com/questions/68867191/…

CollectivesTM on Stack Overflow

Match all logic rules with a dataframe (need super fast function)

2 Answers 2

Update

Previous Answer

5 Comments

3 Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

CollectivesTM on Stack Overflow

2 Answers 2

Update

Previous Answer

5 Comments

3 Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related