Merging overlapping/intersecting string vectors

Question 1

I am trying to merge overlapping/intersecting sets given as a list of string vectors below in R (my actual data set has thousands of such sets). The overlap/intersection is based on the SDxyz: string and not the string after the ":".

d <- list(
 c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), 
 c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
 c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
 c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
 c("SD455:TOMATO", "SD39:MATURE"),
 c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
 c("SD32:TRUMPET", "SD1:FIELD"))

The final desired out put is as follows.

out <- list (
 c("SD1:LUSH", "SD1:FIELD", "SD23:SWITCH", "SD32:TRUMPET", "SD44:CANCEL", "SD384:FR563", "SD567:TREK") ,
 c("SD12:PAINTING", "SD42:CRAYON", "SD183:WIRE", "SD340:DUST", "SD345:FOX", "SD345:HOLE", "SD315:MONEY31", "SD387:SPRING", "SD387:ROLL"),
 c("SD455:TOMATO", "SD39:MATURE"))

This is the code I could come up with using data.table package.

### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD39:MATURE"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), c("SD32:TRUMPET", "SD1:FIELD"))
d2 <- lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, paste0, collapse=", ")
d2 <- lapply(d2, paste0, collapse=", ")
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)
require(data.table)
Bloc <- data.table( d , key = "sd" )
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
Bloc <- data.table( Bloc , key = "ID" )
### Loop to merge the vectors having ids intersecting between them
Bloc <- as.data.frame(Bloc)
M <- nrow(Bloc)
#create blankd data.frame
G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
G[,1:3] <- as.character(G[,1:3])
#G <- data.frame(sdw=character(), sd=character(), ID= character())
colnames(G) <- c("sdw", "sd", "ID")
N <- M
mch <- as.data.frame(Bloc)
#Loop to sequentially fill data.frame
for (i in 1:M) {
 # test if ID already in previous groups
 if(Bloc[i,"ID"] %in% G$ID == FALSE) { 
 # convert element to vector to check for intersect
 tm <- strsplit(x=Bloc[i, "sd"], split=", ")
 mch$t <- numeric(length=M)
 }
 for (j in 1:N){
 #if intersect exists apply code as 1 mch$t column
 ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
 dd <- intersect (tm[[1]], ff)
 if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
 }
 submch <- subset(mch, t == 1 )
 ID <- submch$ID
 Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
 Group1 <- unique(gsub(" ","", Group1))
 sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
 Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
 Group2 <- unique(gsub(" ","", Group2))
 sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
 G1 <- cbind(sdw, sd, ID)
 G1 <- unique(G1)
 G <- rbind(G, G1)
 mch$t <- NULL
}
G <- unique(G)
G2 <- data.table(G, key="ID")
G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "), 
 sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
G2 <- data.table( G2, key=c("sd", "sdw"))
G2 <- unique(G2)
### Get the output as data.table
Bloc <- G2[-1,]
Bloc$ID <- NULL
### Repeat the above loop until no more intersects are left
repeat
{
 N1 <- nrow(Bloc)
 Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
 Bloc$ID <- gsub("^\\s+|\\s+$", "", Bloc$ID)
 Bloc <- data.table( Bloc , key = "ID" )
 Bloc <- as.data.frame(Bloc)
 M <- nrow(Bloc)
 #create blankd data.frame
 G <- data.frame(matrix(ncol=3), stringsAsFactors=FALSE)
 G[,1:3] <- as.character(G[,1:3])
 #G <- data.frame(sdw=character(), sd=character(), ID= character())
 colnames(G) <- c("sdw", "sd", "ID")
 N <- M
 mch <- as.data.frame(Bloc)
 #Loop to sequentially fill data.frame
 for (i in 1:M) {
 # test if ID already in previous groups
 if(Bloc[i,"ID"] %in% G$ID == FALSE) { 
 # convert element to vector to check for intersect
 tm <- strsplit(x=Bloc[i, "sd"], split=", ")
 mch$t <- numeric(length=M)
 }
 for (j in 1:N){
 #check if intersect exists and code accordingly
 ff <- strsplit(x=mch[j, "sd"], split=", ")[[1]]
 dd <- intersect (tm[[1]], ff)
 if (identical(dd, character(0))== FALSE) mch[j,"t"] = 1
 }
 submch <- subset(mch, t == 1 )
 ID <- submch$ID
 Group1 <- sort((unlist(strsplit(paste0(submch$sdw, collapse=","), ","))))
 Group1 <- unique(gsub(" ","", Group1))
 sdw <- rep(paste0(Group1, collapse=", "), nrow(submch))
 Group2 <- sort((unlist(strsplit(paste0(submch$sd, collapse=","), ","))))
 Group2 <- unique(gsub(" ","", Group2))
 sd <- rep(paste0(Group2, collapse=", "), nrow(submch))
 G1 <- cbind(sdw, sd, ID)
 G1 <- unique(G1)
 G <- rbind(G, G1)
 mch$t <- NULL
 }
 G <- unique(G)
 G2 <- data.table(G, key="ID")
 G2 <- G2[, list(sdw = paste0(sort(unique(unlist(strsplit(sdw, split=", ")))), collapse=", "), 
 sd = paste0(sort(unique(unlist(strsplit(sd, split=", ")))), collapse=", ")) , by = "ID"]
 G2 <- data.table( G2, key=c("sd", "sdw"))
 G2 <- unique(G2)
 Bloc <- G2[-1,]
 Bloc$ID <- NULL
 N2 <- nrow(Bloc) 
 if (N1 == N2)
 break
}
### Output
Bloc$sdw

I know it is ugly. Is there any way to vectorize and speed up the code. Right now it works, but is painfully slow for large number of sets.

Question 2

I really don't get what you're trying to do here. The out vector looks like a union of all the vectors without duplicates, but instead of a simple vector of N string elements, it's a vector of 3 strings with comma separated values. Why should there be 3 elements in the output?

Question 3

@janos I want to get the union of all the intersecting/overlapping vectors based on the SDxyz strings. The above code searches if there is any overlap between vectors. If there is an overlap, the respective strings are merged.

Question 4

@janos For greater clarity I have changed the desired output out to a list. d[[1]], d[[2]] and d[[7]] have overlaps among them, so they are merged to out[[1]]. Similarly d[[3]] ,d[[4]] and d[[6]] have been merged to out[[2]]. d[[5]] doesn't have any overlaps with other sets, so it remains as such in out[[3]].

Question 5

Isn't SD315:MONEY31 missing at the out[[2]]?

Question 6

@djhurio That is right. Good catch. I have corrected it now.

Question 7

Using the data.table syntax, you can shorten and speed up your code a lot. The following code should get the same result as you got (except for the order).

### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list(c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), 
 c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), 
 c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), 
 c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), 
 c("SD455:TOMATO", "SD39:MATURE"), 
 c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), 
 c("SD32:TRUMPET", "SD1:FIELD"))
# using substr instead of 2 loops (only works if all elements really have a :)
d2 <- lapply(d, function(x) substr(x, 1, regexpr(":", x)-1))
# using sapply 
# using stringsAsFactors = FALSE to avoid later conversion
d <- as.data.frame(sapply(d, paste0, collapse=", "), stringsAsFactors=FALSE)
d2 <- as.data.frame(sapply(d2, paste0, collapse=", "), stringsAsFactors=FALSE)
# binding
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
#
require(data.table)
Bloc <- data.table(d , key = "sd")
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
## use := to avoid copying
Bloc[, ID := gsub("^\\s+|\\s+$", "", Bloc$ID)]
# use setkey
setkey(Bloc, "ID")
# grouping
Bloc[, group:=.GRP, by = sdw]
# while there are 'wrong' groups
while (any(Bloc[, group>min(group), by = ID][, V1], 
 Bloc[, group>min(group), by = sdw][, V1])){
 # adjust grouping
 Bloc[, group:=min(group), by = ID]
 Bloc[, group:=min(group), by = sdw]
}
# 
res <- Bloc[, unique(unlist(strsplit(unique(sdw), ", "))), by=group]
# splitting to get the list comparable to out
split(res$V1, res$group)

shadow shadowshadow 2264 silver badges8 bronze badges · Accepted Answer · 2015-02-25 15:37:20Z

Using the data.table syntax, you can shorten and speed up your code a lot. The following code should get the same result as you got (except for the order).

### Create a data.table Bloc with one column with original groups and the other with the separated ids
d <- list(c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), 
 c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), 
 c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), 
 c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), 
 c("SD455:TOMATO", "SD39:MATURE"), 
 c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), 
 c("SD32:TRUMPET", "SD1:FIELD"))
# using substr instead of 2 loops (only works if all elements really have a :)
d2 <- lapply(d, function(x) substr(x, 1, regexpr(":", x)-1))
# using sapply 
# using stringsAsFactors = FALSE to avoid later conversion
d <- as.data.frame(sapply(d, paste0, collapse=", "), stringsAsFactors=FALSE)
d2 <- as.data.frame(sapply(d2, paste0, collapse=", "), stringsAsFactors=FALSE)
# binding
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
#
require(data.table)
Bloc <- data.table(d , key = "sd")
### Fetch all the ids along with the corresponding data in Bloc
Bloc <- Bloc[ , list( ID = unlist( strsplit( sd , "," ) ) ) , by = list(sdw, sd) ]
## use := to avoid copying
Bloc[, ID := gsub("^\\s+|\\s+$", "", Bloc$ID)]
# use setkey
setkey(Bloc, "ID")
# grouping
Bloc[, group:=.GRP, by = sdw]
# while there are 'wrong' groups
while (any(Bloc[, group>min(group), by = ID][, V1], 
 Bloc[, group>min(group), by = sdw][, V1])){
 # adjust grouping
 Bloc[, group:=min(group), by = ID]
 Bloc[, group:=min(group), by = sdw]
}
# 
res <- Bloc[, unique(unlist(strsplit(unique(sdw), ", "))), by=group]
# splitting to get the list comparable to out
split(res$V1, res$group)

Stack Exchange Network

Merging overlapping/intersecting string vectors

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Merging overlapping/intersecting string vectors

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions