I am running a simulation study where I am using two-part hurdle modeling on different effect sizes. I tried to stream-line my code for the best optimization including foreach loops and parallelization. I attempted to post here prematurely a few weeks ago and am now posting my updated code.
I keep running into an error where the hessian matrix is singular that stopped the loops. I continued the loops and logged the errors (any advice on this would be great). I recognize this is a statistical problem, but often others know more than I.
Any feedback on optimization would be ideal as I am running 1000 resamples and 1000 bootstraps. For 50 iterations and 50 bootstraps, it took 7 minutes. For 500 and 500, it took around 3.5 hours.
install.packages("boot")
install.packages("doParallel")
install.packages("doRNG")
library(pscl)
library(boot)
library(doParallel)
library(doRNG)
# if TRUE, keeps a processor free to be used by the OS.
kFlag.free.processor <- TRUE
#Store the path coefficients as a list and a matrix (for different uses)
kParameters <- (function(){
##path coefficient vectors
sizes <- c(50, 100, 200, 300, 500, 1000)
seeds.small <- c(51, 53, 55, 57, 58, 59)
seeds.medium <- c(61, 63, 65, 67, 68, 69)
seeds.large <- c(81, 73, 75, 77, 78, 79)
#cbind call will coerce an integer to a vector filled with that value
c <- .25
i <- 1
##end path coefficient vectors
effect.small <- cbind(a = .18, b = .16, c, i, n = sizes, seed = seeds.small)
effect.medium <- cbind(a = .31, b = .35, c, i, n = sizes, seed = seeds.medium)
effect.large <- cbind(a = .52, b = .49, c, i, n = sizes, seed = seeds.large)
list(list = list(small = effect.small, medium = effect.medium, large = effect.large),
matrix = rbind(effect.small, effect.medium, effect.large))#return
})() #IIFE
# RNG MODULE FOR TWO_PART HURDLE MODEL
gen.hurdle = function(n, a, b1, b2, c1, c2, i0, i1, i2){
x = round(rnorm(n),3)
e = rnorm(n)
m = round(i0 + a*x + e, 3)
lambda = exp(i1 + b1*m + c1*x) # PUT REGRESSION TERMS FOR THE CONTINUUM PART HERE; KEEP exp()
ystar = qpois(runif(n, dpois(0, lambda), 1), lambda) # Zero-TRUNCATED POISSON DIST.; THE CONTINUUM PART
z = i2 + b2*m + c2*x # PUT REGRESSION TERMS FOR THE BINARY PART HERE
z_p = exp(z) / (1+exp(z)) # p(1) = 1-p(0)
tstar = rbinom(n, 1, z_p) # BINOMIAL DIST. ; THE BINARY PART
y= ystar*tstar # TWO-PART COUNT OUTCOME
return(cbind(x,m,y,z,z_p,tstar))
}
##################################################################################################
# MODEL ##########################################################################################
##################################################################################################
#model
iterations = 500
r = 500
# capture runtime
system.time({
## Setup Parallelization
message("Setting up Parallelization...")
# create clusters for each available processor
cl <- makeCluster(detectCores() - 1*kFlag.free.processor, outfile="")
message(paste0("Created ", detectCores() - 1*kFlag.free.processor, " workers..."))
# register parallelization backend
registerDoParallel(cl)
# pass functions and variables from the "Global Environment" to the "master R process" (being run on each processor)
clusterExport(cl, c("gen.hurdle", "hurdle"))
message(paste0("Parallelization ready. Reserving ", 1*kFlag.free.processor,
" processor for the OS..."))
## end Setup Parallelization
results.all <- foreach(parameters.index = 1:nrow(kParameters$matrix), .combine = rbind) %do%{
message("Beginning simulation on new set of parameters...")
n <- kParameters$matrix[[parameters.index, 'n']]
a <- kParameters$matrix[[parameters.index, 'a']]
b <- kParameters$matrix[[parameters.index, 'b']]
c <- kParameters$matrix[[parameters.index, 'c']]
i <- kParameters$matrix[[parameters.index, 'i']]
seed <- kParameters$matrix[[parameters.index, 'seed']]
registerDoRNG(seed) # set.seed() for parallel code
errors <- 0
no.zeros <- 0
results <- foreach(iiii=icount(iterations), .combine = rbind) %do%{
message(paste0("Iteration ", iiii, " of ", iterations))
data = data.frame(gen.hurdle(n, a, b, b, c, c, i, i, i))
data0 = data.frame(gen.hurdle(n, a, 0, 0, c, c, i, i, i))
p_0 =1-mean(data$z_p)
p_0_hat =1-mean(data$tstar)
p_0_b0 =1-mean(data0$z_p)
p_0_hat_b0 =1-mean(data0$tstar)
#power
fit1 = lm(m ~ x, data=data)
fit2 = hurdle(formula = y ~ m + x, data=data, dist = "poisson", zero.dist = "binomial")
a_hat = summary(fit1)$coef[2,1]
b1_hat = summary(fit2)[[1]]$count[2,1]
b2_hat = summary(fit2)[[1]]$zero[2,1]
ab1_hat = prod(a_hat,b1_hat)
ab2_hat = prod(a_hat,b2_hat)
#type I error
fit3 = lm(m ~ x, data=data0)
fit4 = hurdle(formula = y ~ m + x, data=data0, dist = "poisson", zero.dist = "binomial")
a_hat_b0 = summary(fit3)$coef[2,1]
b1_hat_b0 = summary(fit4)[[1]]$count[2,1]
b2_hat_b0 = summary(fit4)[[1]]$zero[2,1]
ab1_hat_b0 = prod(a_hat_b0,b1_hat_b0)
ab2_hat_b0 = prod(a_hat_b0,b2_hat_b0)
message(paste0("Bootstrapping..."))
# bootstrap
boot <- foreach(jjjj = icount(r), .combine = rbind, .errorhandling = "remove", .packages = c("pscl")) %dopar%{
#power
boot.data = data[sample(nrow(data), replace = TRUE), ]
has.zero <- prod(boot.data$y) > 0
if(!has.zero) {
no.zeros <- no.zeros + 1
boot.data$y[1] = 0
warning(paste0("Iteration #",iiii, " Bootstrap #",jjjj, " had no zeros!"), immediate. = TRUE, call. = FALSE)
}
boot.fit1 = lm(m ~ x, data=boot.data)
boot.fit2 = hurdle(formula = y ~ m + x, data=boot.data, dist = "poisson", zero.dist = "binomial")
boot.a = summary(boot.fit1)$coef[2,1]
boot.b1 = summary(boot.fit2)[[1]]$count[2,1]
boot.b2 = summary(boot.fit2)[[1]]$zero[2,1]
boot.ab1 = prod(boot.a,boot.b1)
boot.ab2 = prod(boot.a,boot.b2)
#Type I error
boot.data0 = data0[sample(nrow(data0), replace = TRUE), ]
boot.data0$y[1] = if(prod(boot.data0$y) > 0) 0 else boot.data0$y[1]
boot.fit3 = lm(m ~ x, data=boot.data0)
boot.fit4 = hurdle(formula = y ~ m + x, data=boot.data0, dist = "poisson", zero.dist = "binomial")
boot.a_b0 = summary(boot.fit3)$coef[2,1]
boot.b1_b0 = summary(boot.fit4)[[1]]$count[2,1]
boot.b2_b0 = summary(boot.fit4)[[1]]$zero[2,1]
boot.ab1_b0 = prod(boot.a_b0,boot.b1_b0)
boot.ab2_b0 = prod(boot.a_b0,boot.b2_b0)
cbind(ab1_hat, ab2_hat, boot.ab1, boot.ab2,
ab1_hat_b0, ab2_hat_b0, boot.ab1_b0, boot.ab2_b0) #return
} # end bootstrap
if(nrow(boot)!=r){
warning(paste0("Iteration #",iiii," threw ",r-nrow(boot)," error(s)"), immediate. = TRUE, call. = FALSE)
errors <- errors + r-nrow(boot)
}
z0.1 = qnorm((sum(boot[,3] > boot[,1])+sum(boot[,3]==boot[,1])/2)/nrow(boot))
z0.2 = qnorm((sum(boot[,4] > boot[,2])+sum(boot[,4]==boot[,2])/2)/nrow(boot))
z0.1_b0 = qnorm((sum(boot[,7] > boot[,5])+sum(boot[,7]==boot[,5])/2)/nrow(boot))
z0.2_b0 = qnorm((sum(boot[,8] > boot[,6])+sum(boot[,8]==boot[,6])/2)/nrow(boot))
alpha=0.05 # 95% limits
z=qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits
p1 = pnorm(z-2*z0.1) # bias-correct & convert to proportions
p2 = pnorm(z-2*z0.2)
p1_b0 = pnorm(z-2*z0.1_b0)
p2_b0 = pnorm(z-2*z0.2_b0)
ci1 = quantile(boot[,3],p=p1) # Bias-corrected percentile lims
ci2 = quantile(boot[,4],p=p2)
ci1_b0 = quantile(boot[,7],p=p1_b0)
ci2_b0 = quantile(boot[,8],p=p2_b0)
sig.ab1 = if(prod(ci1) > 0) 1 else 0
sig.ab2 = if(prod(ci2) > 0) 1 else 0
sig.ab1_b0 = if(prod(ci1_b0) > 0) 1 else 0
sig.ab2_b0 = if(prod(ci2_b0) > 0) 1 else 0
#results
cbind(sig.ab1, sig.ab2, sig.ab1_b0, sig.ab2_b0)
} # end iterations
mean.results <- t(apply(results, 2, mean))
colnames(mean.results) <- c("power of ab1", "power of ab2",
"type I error of ab1", "type I error of ab2")
cbind(t(kParameters$matrix[parameters.index, ]), mean.results, errors, no.zeros)
} # end parameters loop
# release cores back to the OS
stopCluster(cl)
View(results.all)
}) # end System.time
Please tear me apart as I love to learn and am relying on your wisdom. Also, realize I am fairly new to R/Programming. Thank you in advance.
1 Answer 1
A couple of style / minor performance improvement comments. I believe it will take someone with statistical knowledge of the model you are using to help you more:
message()
andwarning()
will concatenate strings if provided with multiple arguments so no need forpaste0()
here. Note that excessive verbosity might slow down your program because too much resources are used to print messages.when working with scalars, it is better/faster to use
*
rather thanprod
.
library(microbenchmark)
microbenchmark({prod(10, 11)}, {10*11}, times = 10000)
#> Unit: nanoseconds
#> expr min lq mean median uq max neval
#> { prod(10, 11) } 394 413 461.8892 430 456 7903 10000
#> { 10 * 11 } 204 221 242.8107 228 241 4206 10000
- as far as I can tell, you can drop the following lines because you're not using them:
p_0 =1-mean(data$z_p)
p_0_hat =1-mean(data$tstar)
p_0_b0 =1-mean(data0$z_p)
p_0_hat_b0 =1-mean(data0$tstar)
-
\$\begingroup\$ Thank you for taking the time to review this. I had no idea that
message()
andwarning()
functioned like that. Also, a very good point about less is more for printing. I meant to comment those lines out. I do not need them. Another great catch. \$\endgroup\$J Bacon– J Bacon2019年06月12日 13:25:38 +00:00Commented Jun 12, 2019 at 13:25
Explore related questions
See similar questions with these tags.