I am trying to merge overlapping/intersecting sets given as a list of string vectors below in R (my actual data set has thousands of such sets). The overlap/intersection is based on the SDxyz:
string and not the string after the ":".
d <- list(
c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"),
c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
c("SD455:TOMATO", "SD39:MATURE"),
c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
c("SD32:TRUMPET", "SD1:FIELD"))
The final desired out put is as follows.
out <- list (
c("SD1:LUSH", "SD1:FIELD", "SD23:SWITCH", "SD32:TRUMPET", "SD44:CANCEL", "SD384:FR563", "SD567:TREK") ,
c("SD12:PAINTING", "SD42:CRAYON", "SD183:WIRE", "SD340:DUST", "SD345:FOX", "SD345:HOLE", "SD315:MONEY31", "SD387:SPRING", "SD387:ROLL"),
c("SD455:TOMATO", "SD39:MATURE"))
This is the code I could come up with using data.table
package.
### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD39:MATURE"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), c("SD32:TRUMPET", "SD1:FIELD"))
d2 <- lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, paste0, collapse=", ")
d2 <- lapply(d2, paste0, collapse=", ")
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)
require(data.table)
Bloc <- data.table( d , key = "sd" )
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
Bloc <- data.table( Bloc , key = "ID" )
### Loop to merge the vectors having ids intersecting between them
Bloc <- as.data.frame(Bloc)
M <- nrow(Bloc)
#create blankd data.frame
G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
G[,1:3] <- as.character(G[,1:3])
#G <- data.frame(sdw=character(), sd=character(), ID= character())
colnames(G) <- c("sdw", "sd", "ID")
N <- M
mch <- as.data.frame(Bloc)
#Loop to sequentially fill data.frame
for (i in 1:M) {
# test if ID already in previous groups
if(Bloc[i,"ID"] %in% G$ID == FALSE) {
# convert element to vector to check for intersect
tm <- strsplit(x=Bloc[i, "sd"], split=", ")
mch$t <- numeric(length=M)
}
for (j in 1:N){
#if intersect exists apply code as 1 mch$t column
ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
dd <- intersect (tm[[1]], ff)
if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
}
submch <- subset(mch, t == 1 )
ID <- submch$ID
Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
Group1 <- unique(gsub(" ","", Group1))
sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
Group2 <- unique(gsub(" ","", Group2))
sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
G1 <- cbind(sdw, sd, ID)
G1 <- unique(G1)
G <- rbind(G, G1)
mch$t <- NULL
}
G <- unique(G)
G2 <- data.table(G, key="ID")
G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "),
sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
G2 <- data.table( G2, key=c("sd", "sdw"))
G2 <- unique(G2)
### Get the output as data.table
Bloc <- G2[-1,]
Bloc$ID <- NULL
### Repeat the above loop until no more intersects are left
repeat
{
N1 <- nrow(Bloc)
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
Bloc <- data.table( Bloc , key = "ID" )
Bloc <- as.data.frame(Bloc)
M <- nrow(Bloc)
#create blankd data.frame
G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
G[,1:3] <- as.character(G[,1:3])
#G <- data.frame(sdw=character(), sd=character(), ID= character())
colnames(G) <- c("sdw", "sd", "ID")
N <- M
mch <- as.data.frame(Bloc)
#Loop to sequentially fill data.frame
for (i in 1:M) {
# test if ID already in previous groups
if(Bloc[i,"ID"] %in% G$ID == FALSE) {
# convert element to vector to check for intersect
tm <- strsplit(x=Bloc[i, "sd"], split=", ")
mch$t <- numeric(length=M)
}
for (j in 1:N){
#check if intersect exists and code accordingly
ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
dd <- intersect (tm[[1]], ff)
if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
}
submch <- subset(mch, t == 1 )
ID <- submch$ID
Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
Group1 <- unique(gsub(" ","", Group1))
sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
Group2 <- unique(gsub(" ","", Group2))
sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
G1 <- cbind(sdw, sd, ID)
G1 <- unique(G1)
G <- rbind(G, G1)
mch$t <- NULL
}
G <- unique(G)
G2 <- data.table(G, key="ID")
G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "),
sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
G2 <- data.table( G2, key=c("sd", "sdw"))
G2 <- unique(G2)
Bloc <- G2[-1,]
Bloc$ID <- NULL
N2 <- nrow(Bloc)
if (N1 == N2)
break
}
### Output
Bloc$sdw
I know it is ugly. Is there any way to vectorize and speed up the code. Right now it works, but is painfully slow for large number of sets.
1 Answer 1
Using the data.table
syntax, you can shorten and speed up your code a lot. The following code should get the same result as you got (except for the order).
### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list(c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"),
c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
c("SD455:TOMATO", "SD39:MATURE"),
c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
c("SD32:TRUMPET", "SD1:FIELD"))
# using substr instead of 2 loops (only works if all elements really have a :)
d2 <- lapply(d, function(x) substr(x, 1, regexpr(":", x)-1))
# using sapply
# using stringsAsFactors = FALSE to avoid later conversion
d <- as.data.frame(sapply(d, paste0, collapse=", "), stringsAsFactors=FALSE)
d2 <- as.data.frame(sapply(d2, paste0, collapse=", "), stringsAsFactors=FALSE)
# binding
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
#
require(data.table)
Bloc <- data.table(d , key = "sd")
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
## use := to avoid copying
Bloc[, ID := gsub("^\\s+|\\s+$", "", Bloc$ID)]
# use setkey
setkey(Bloc, "ID")
# grouping
Bloc[, group:=.GRP, by = sdw]
# while there are 'wrong' groups
while (any(Bloc[, group>min(group), by = ID][, V1],
Bloc[, group>min(group), by = sdw][, V1])){
# adjust grouping
Bloc[, group:=min(group), by = ID]
Bloc[, group:=min(group), by = sdw]
}
#
res <- Bloc[, unique(unlist(strsplit(unique(sdw), ", "))), by=group]
# splitting to get the list comparable to out
split(res$V1, res$group)
Explore related questions
See similar questions with these tags.
out
vector looks like a union of all the vectors without duplicates, but instead of a simple vector of N string elements, it's a vector of 3 strings with comma separated values. Why should there be 3 elements in the output? \$\endgroup\$out
to a list.d[[1]]
,d[[2]]
andd[[7]]
have overlaps among them, so they are merged toout[[1]]
. Similarlyd[[3]]
,d[[4]]
andd[[6]]
have been merged toout[[2]]
.d[[5]]
doesn't have any overlaps with other sets, so it remains as such inout[[3]]
. \$\endgroup\$SD315:MONEY31
missing at theout[[2]]
? \$\endgroup\$