I have a matrix of 36,000 gene measurements for 60 samples. To find the genes most correlated with the first gene in the matrix, I wrote the following program, which gives the correct result. However, its processing time is extremely slow.
# --- Question code (kept verbatim; the answers below benchmark against it) ---
# Read the gene x sample matrix: rows = genes, columns = samples.
dat <- as.matrix(read.table("/path/gene_matrix.txt", header = TRUE, fill = TRUE))
# Result accumulator. It starts empty and is grown one row per iteration,
# which copies the whole data.frame every time (the main bottleneck).
corr_list <- data.frame(top=numeric(), correlated=numeric(), cor=numeric(), p.value=numeric())
for (i in 2:nrow(dat)) {
# Pearson correlation (default) of gene 1 against gene i.
r <- cor.test(dat[1,], dat[i,])
# NOTE(review): c() mixes character row names with numeric statistics, so
# the whole row -- and hence every column -- is coerced to character.
corr_list[i-1,] <- c(rownames(dat)[1], rownames(dat)[i], r$estimate, r$p.value)
}
# NOTE(review): because `cor` is a character column here, this sorts
# lexicographically, not numerically -- a latent correctness issue.
corr_list <- corr_list[order(corr_list$cor, decreasing = TRUE), ]
write.table(corr_list, "/path/most_related.txt", quote = FALSE, row.names = FALSE, sep="\t")
Could anyone suggest an efficient method for the above problem?
2 Answers
If you had profiled your code, you would have seen that unusually long time is taken by line:
corr_list[i - 1,] <- c(rNames[1], rNames[i], r$estimate, r$p.value)
It is probably because you are combining values of multiple types into one vector. So this should be much faster:
#' Correlate gene 1 against every other gene (loop over cor.test, but with
#' homogeneous numeric intermediates so no per-row type coercion occurs).
#'
#' @param dat numeric matrix, genes in rows (with rownames), samples in columns.
#' @return a data.table with columns top, correlated, cor, p.value.
f3 <- function(dat) {
  # Fail loudly if data.table is missing; require() would only warn and
  # return FALSE, leading to a confusing error further down.
  if (!requireNamespace("data.table", quietly = TRUE)) {
    stop("Package 'data.table' is required.", call. = FALSE)
  }
  mainVar <- dat[1, ]        # expression profile of the reference gene
  rNames <- rownames(dat)
  # One cor.test per remaining gene; each result is a length-2 numeric
  # vector (estimate, p-value), assembled into a matrix in one step.
  rez <- lapply(2:nrow(dat), function(i) {
    r <- cor.test(x = mainVar, y = dat[i, ])
    c(r$estimate, r$p.value)
  })
  fin <- data.table::data.table(rNames[1], rNames[-1])
  fin <- cbind(fin, data.table::data.table(do.call(rbind, rez)))
  data.table::setnames(fin, c("top", "correlated", "cor", "p.value"))
  # Bug fix: the original returned the raw list `rez`, discarding the
  # labelled table `fin` built above.
  fin
}
# Simulated expression matrix: N genes (rows) x I samples (columns) of
# independent standard-normal values, rows labelled G1..GN.
I <- 60
N <- 1000
dat <- MASS::mvrnorm(N, mu = rep(0, I), Sigma = diag(I))
rownames(dat) <- paste0("G", seq_len(N))
# Comparison
# Time each implementation (f1/f2 are defined in the other answer,
# f3 above) on the simulated data.
res1 <- microbenchmark(
f1(dat),
f2(dat),
f3(dat),
times = 10
)
print(res1, unit = "s")
# Recorded output. NOTE(review): neval shows 20 although times = 10 above;
# the numbers were presumably produced by an earlier run with times = 20.
# Unit: seconds
# expr min lq mean median uq max neval cld
# f1(dat) 0.4333447 0.4609346 0.4880110 0.4863479 0.5050751 0.5782526 20 c
# f2(dat) 0.3585121 0.3729532 0.4021378 0.3979831 0.4269367 0.4899676 20 b
# f3(dat) 0.1483598 0.1610634 0.1782496 0.1787396 0.1934256 0.2273650 20 a
# Visualise the timing distributions (requires ggplot2).
autoplot(res1)
Update
If we calculate the correlation with the base function and compute the p-value ourselves, we can get rid of the for loop and do all of this much faster. The only problem that may arise is if your data contains missing values:
#' Two-sided p-value for a Pearson correlation r from n paired observations,
#' via the t transform  t = r * sqrt(n - 2) / sqrt(1 - r^2)  with n - 2
#' degrees of freedom. Vectorised over r.
#'
#' @param r numeric vector of correlation coefficients in [-1, 1].
#' @param n scalar number of paired observations (must be > 2).
#' @return numeric vector of two-sided p-values, same length as r.
pcor <- function(r, n){
  t <- r * sqrt(n - 2) / sqrt(1 - r^2)
  # Use lower.tail = FALSE rather than 1 - pt(...): the subtraction loses
  # all precision once pt(...) rounds to 1, so strong correlations would
  # report p = 0 instead of their true tiny p-value.
  p <- 2 * pt(abs(t), (n - 2), lower.tail = FALSE)
  # Defensive clamp kept from the original (p cannot exceed 1 here).
  p[p > 1] <- 1
  p
}
#' Fully vectorised version: a single cor() call for all genes at once,
#' with p-values computed analytically by pcor(). Assumes `dat` contains
#' no missing values (cor() would return NA otherwise).
#'
#' @param dat numeric matrix, genes in rows (with rownames), samples in columns.
#' @return a data.table with columns top, correlated, cor, p.value.
f4 <- function(dat){
  # Fail loudly if data.table is missing; require() would only warn and
  # return FALSE, leading to a confusing error further down.
  if (!requireNamespace("data.table", quietly = TRUE)) {
    stop("Package 'data.table' is required.", call. = FALSE)
  }
  rNames <- rownames(dat)
  d2 <- t(dat)                    # samples in rows, genes in columns
  cors <- cor(d2[, 1], d2[, -1])  # 1 x (N-1) matrix of correlations
  cors <- t(cors)                 # column vector: one row per gene
  cp <- pcor(cors, ncol(dat))     # two-sided p-values, n = number of samples
  cp <- cbind(cors, cp)
  fin <- data.table::data.table(rNames[1], rNames[-1])
  fin <- cbind(fin, cp)
  data.table::setnames(fin, c("top", "correlated", "cor", "p.value"))
  fin
}
# Regenerate the simulated data at the size used for the final benchmark:
# N = 1000 genes (rows) x I = 60 samples (columns).
I <- 60
N <- 1000
dat <- MASS::mvrnorm(N, mu = rep(0, I), diag(I))
rownames(dat) <- paste0("G", 1:N)
# Time all four implementations, 10 runs each.
res1 <- microbenchmark(
f1(dat),
f2(dat),
f3(dat),
f4(dat),
times = 10
)
print(res1, unit = "s")
# Recorded output: the vectorised f4 is roughly 50x faster than f3 and
# more than 100x faster than the original f1.
# Unit: seconds
# expr min lq mean median uq max neval cld
# f1(dat) 0.408167816 0.410614574 0.428363371 0.429430744 0.442042920 0.456887399 10 d
# f2(dat) 0.334446197 0.345779175 0.366422216 0.362465218 0.378226388 0.426838502 10 c
# f3(dat) 0.139088268 0.145095289 0.156048277 0.153321298 0.162591518 0.183544069 10 b
# f4(dat) 0.002363351 0.002428473 0.002615854 0.002483812 0.002797716 0.003117556 10 a
-
\$\begingroup\$ excellent. I was too blind to spot that
c(rNames[1], rNames[i], r$estimate, r$p.value)
issue. \$\endgroup\$ – hplieninger, Commented Aug 25, 2017 at 14:44
You should always initialize your data structures first, because that's safer and faster (e.g., this blog post).
# corr_list <- data.frame(top = numeric(), correlated = numeric())
corr_list <- data.frame(top = numeric(length = nrow(dat)-1),
correlated = NA)
The comparison of both versions indicates that it may reduce the time for you by a factor larger than 2 (see the "initializing data performance" plot below).
# Packages: timing and plotting of the comparison.
library(microbenchmark)
library(ggplot2)

# Data: N genes (rows) x I samples (columns) of independent
# standard-normal noise, rows labelled G1..GN.
I <- 60
N <- 100
dat <- MASS::mvrnorm(N, mu = rep(0, I), Sigma = diag(I))
rownames(dat) <- paste0("G", seq_len(N))
# OP's version, kept behaviorally identical as the slow baseline:
# the data.frame starts empty and is grown one row per iteration, and
# mixing character names with numeric statistics in c() coerces every
# row (and so every column) to character.
f1 <- function(dat) {
  out <- data.frame(top = numeric(), correlated = numeric(),
                    cor = numeric(), p.value = numeric())
  n <- nrow(dat)
  for (k in 2:n) {
    # Pearson correlation of the reference gene (row 1) vs gene k.
    ct <- cor.test(dat[1, ], dat[k, ])
    out[k - 1, ] <- c(rownames(dat)[1], rownames(dat)[k],
                      ct$estimate, ct$p.value)
  }
  out
}
# Version with a preallocated result table: the data.frame is created at
# its final number of rows up front, so no reallocation happens inside
# the loop. Output is identical to f1's.
f2 <- function(dat) {
  n <- nrow(dat)
  out <- data.frame(top = numeric(length = n - 1),
                    correlated = NA, cor = NA, p.value = NA)
  for (k in 2:n) {
    # Pearson correlation of the reference gene (row 1) vs gene k.
    ct <- cor.test(x = dat[1, ], y = dat[k, ])
    out[k - 1, ] <- c(rownames(dat)[1], rownames(dat)[k],
                      ct$estimate, ct$p.value)
  }
  out
}
# Comparison
# Time the original (f1) against the preallocated version (f2),
# 20 runs each, then plot the timing distributions with ggplot2.
res1 <- microbenchmark(
f1(dat),
f2(dat),
times = 20
)
print(res1, unit = "s")
autoplot(res1)
Explore related questions
See similar questions with these tags.