7
\$\begingroup\$

I am trying to merge overlapping/intersecting sets given as a list of string vectors below in R (my actual data set has thousands of such sets). The overlap/intersection is based on the SDxyz: string and not the string after the ":".

d <- list(
 c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), 
 c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
 c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
 c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
 c("SD455:TOMATO", "SD39:MATURE"),
 c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
 c("SD32:TRUMPET", "SD1:FIELD"))

The final desired out put is as follows.

out <- list (
 c("SD1:LUSH", "SD1:FIELD", "SD23:SWITCH", "SD32:TRUMPET", "SD44:CANCEL", "SD384:FR563", "SD567:TREK") ,
 c("SD12:PAINTING", "SD42:CRAYON", "SD183:WIRE", "SD340:DUST", "SD345:FOX", "SD345:HOLE", "SD315:MONEY31", "SD387:SPRING", "SD387:ROLL"),
 c("SD455:TOMATO", "SD39:MATURE"))

This is the code I could come up with using data.table package.

### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD39:MATURE"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), c("SD32:TRUMPET", "SD1:FIELD"))
d2 <- lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, paste0, collapse=", ")
d2 <- lapply(d2, paste0, collapse=", ")
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)
require(data.table)
Bloc <- data.table( d , key = "sd" )
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
Bloc <- data.table( Bloc , key = "ID" )
### Loop to merge the vectors having ids intersecting between them
Bloc <- as.data.frame(Bloc)
M <- nrow(Bloc)
#create blankd data.frame
G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
G[,1:3] <- as.character(G[,1:3])
#G <- data.frame(sdw=character(), sd=character(), ID= character())
colnames(G) <- c("sdw", "sd", "ID")
N <- M
mch <- as.data.frame(Bloc)
#Loop to sequentially fill data.frame
for (i in 1:M) {
 # test if ID already in previous groups
 if(Bloc[i,"ID"] %in% G$ID == FALSE) { 
 # convert element to vector to check for intersect
 tm <- strsplit(x=Bloc[i, "sd"], split=", ")
 mch$t <- numeric(length=M)
 }
 for (j in 1:N){
 #if intersect exists apply code as 1 mch$t column
 ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
 dd <- intersect (tm[[1]], ff)
 if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
 }
 submch <- subset(mch, t == 1 )
 ID <- submch$ID
 Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
 Group1 <- unique(gsub(" ","", Group1))
 sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
 Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
 Group2 <- unique(gsub(" ","", Group2))
 sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
 G1 <- cbind(sdw, sd, ID)
 G1 <- unique(G1)
 G <- rbind(G, G1)
 mch$t <- NULL
}
G <- unique(G)
G2 <- data.table(G, key="ID")
G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "), 
 sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
G2 <- data.table( G2, key=c("sd", "sdw"))
G2 <- unique(G2)
### Get the output as data.table
Bloc <- G2[-1,]
Bloc$ID <- NULL
### Repeat the above loop until no more intersects are left
repeat
{
 N1 <- nrow(Bloc)
 Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
 Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
 Bloc <- data.table( Bloc , key = "ID" )
 Bloc <- as.data.frame(Bloc)
 M <- nrow(Bloc)
 #create blankd data.frame
 G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
 G[,1:3] <- as.character(G[,1:3])
 #G <- data.frame(sdw=character(), sd=character(), ID= character())
 colnames(G) <- c("sdw", "sd", "ID")
 N <- M
 mch <- as.data.frame(Bloc)
 #Loop to sequentially fill data.frame
 for (i in 1:M) {
 # test if ID already in previous groups
 if(Bloc[i,"ID"] %in% G$ID == FALSE) { 
 # convert element to vector to check for intersect
 tm <- strsplit(x=Bloc[i, "sd"], split=", ")
 mch$t <- numeric(length=M)
 }
 for (j in 1:N){
 #check if intersect exists and code accordingly
 ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
 dd <- intersect (tm[[1]], ff)
 if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
 }
 submch <- subset(mch, t == 1 )
 ID <- submch$ID
 Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
 Group1 <- unique(gsub(" ","", Group1))
 sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
 Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
 Group2 <- unique(gsub(" ","", Group2))
 sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
 G1 <- cbind(sdw, sd, ID)
 G1 <- unique(G1)
 G <- rbind(G, G1)
 mch$t <- NULL
 }
 G <- unique(G)
 G2 <- data.table(G, key="ID")
 G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "), 
 sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
 G2 <- data.table( G2, key=c("sd", "sdw"))
 G2 <- unique(G2)
 Bloc <- G2[-1,]
 Bloc$ID <- NULL
 N2 <- nrow(Bloc) 
 if (N1 == N2)
 break
}
### Output
Bloc$sdw

I know it is ugly. Is there any way to vectorize and speed up the code. Right now it works, but is painfully slow for large number of sets.

asked Jun 27, 2014 at 10:42
\$\endgroup\$
5
  • \$\begingroup\$ I really don't get what you're trying to do here. The out vector looks like a union of all the vectors without duplicates, but instead of a simple vector of N string elements, it's a vector of 3 strings with comma separated values. Why should there be 3 elements in the output? \$\endgroup\$ Commented Jun 27, 2014 at 17:15
  • \$\begingroup\$ @janos I want to get the union of all the intersecting/overlapping vectors based on the SDxyz strings. The above code searches if there is any overlap between vectors. If there is an overlap, the respective strings are merged. \$\endgroup\$ Commented Jun 27, 2014 at 18:03
  • \$\begingroup\$ @janos For greater clarity I have changed the desired output out to a list. d[[1]], d[[2]] and d[[7]] have overlaps among them, so they are merged to out[[1]]. Similarly d[[3]] ,d[[4]] and d[[6]] have been merged to out[[2]]. d[[5]] doesn't have any overlaps with other sets, so it remains as such in out[[3]]. \$\endgroup\$ Commented Jun 28, 2014 at 5:44
  • 1
    \$\begingroup\$ Isn't SD315:MONEY31 missing at the out[[2]]? \$\endgroup\$ Commented Jun 28, 2014 at 14:05
  • \$\begingroup\$ @djhurio That is right. Good catch. I have corrected it now. \$\endgroup\$ Commented Jun 29, 2014 at 9:46

1 Answer 1

1
\$\begingroup\$

Using the data.table syntax, you can shorten and speed up your code a lot. The following code should get the same result as you got (except for the order).

### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list(c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), 
 c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), 
 c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), 
 c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), 
 c("SD455:TOMATO", "SD39:MATURE"), 
 c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), 
 c("SD32:TRUMPET", "SD1:FIELD"))
# using substr instead of 2 loops (only works if all elements really have a :)
d2 <- lapply(d, function(x) substr(x, 1, regexpr(":", x)-1))
# using sapply 
# using stringsAsFactors = FALSE to avoid later conversion
d <- as.data.frame(sapply(d, paste0, collapse=", "), stringsAsFactors=FALSE)
d2 <- as.data.frame(sapply(d2, paste0, collapse=", "), stringsAsFactors=FALSE)
# binding
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
#
require(data.table)
Bloc <- data.table(d , key = "sd")
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
## use := to avoid copying
Bloc[, ID := gsub("^\\s+|\\s+$", "", Bloc$ID)]
# use setkey
setkey(Bloc, "ID")
# grouping
Bloc[, group:=.GRP, by = sdw]
# while there are 'wrong' groups
while (any(Bloc[, group>min(group), by = ID][, V1], 
 Bloc[, group>min(group), by = sdw][, V1])){
 # adjust grouping
 Bloc[, group:=min(group), by = ID]
 Bloc[, group:=min(group), by = sdw]
}
# 
res <- Bloc[, unique(unlist(strsplit(unique(sdw), ", "))), by=group]
# splitting to get the list comparable to out
split(res$V1, res$group)
answered Feb 25, 2015 at 15:37
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.