I want to write code that performs backward stepwise selection using cross-validation as the criterion.
I only started learning R a month ago, and I had almost zero programming experience prior to that, so I would appreciate any comments on the code.
(I understand that there are issues with the backward stepwise selection process itself.)
# Setup: clear the workspace, load packages, and build the example data set.
# NOTE(review): rm(list = ls()) in scripts is generally discouraged -- it only
# clears the global environment, not loaded packages or options.
rm(list = ls())
library(forecast)  # provides CV()
library(fpp)       # provides the credit data set
set.seed(1)
# Two synthetic predictors; the runif(20) vector is recycled 25 times across
# the 500 indices (500 is a multiple of 20, so no recycling warning).
x1 <- c(1:500) * runif(20, min = 0, max = 200)
x2 <- c(1:500) * runif(20, min = 0, max = 200)
# caret is not needed here: the credit data set ships with the fpp package.
data(credit)
score <- credit$score
# Log-transform the skewed predictors; the +1 guards against log(0).
log.savings  <- log(credit$savings + 1)
log.income   <- log(credit$income + 1)
log.address  <- log(credit$time.address + 1)
log.employed <- log(credit$time.employed + 1)
##################################
# Input: full model containing every candidate predictor for backward
# elimination.
full.model <- score ~ log.savings + log.income + log.address +
  log.employed + x1 + x2
# 1. Start with the full model
counter <- 1L  # elimination-step number used in the progress output
full <- lm(full.model)
scopevars <- attr(full$terms, "predvars")  # model variables (response first);
                                           # currently unused below
n <- length(full$residuals)  # sample size; currently unused below
current_best <- full  # the best model found so far -- initially the full model
# Backward stepwise elimination using cross-validation (forecast::CV) as the
# criterion: at each step, drop the single predictor whose removal yields the
# lowest CV; stop as soon as no removal improves on the current model's CV.
while (TRUE) {
  best_cv <- CV(current_best)[1]        # CV of the current best model
  temp <- summary(current_best)
  p <- nrow(temp$coefficients)          # number of coefficients (incl. intercept)
  if (p < 2) break                      # intercept-only model: nothing left to
                                        # drop (also avoids 2:p running backwards)
  rnames <- rownames(temp$coefficients) # terms in the current model
  compare <- matrix(NA, p, 1)           # CV after dropping each term;
  rownames(compare) <- rnames           # row 1 (intercept) stays NA on purpose,
                                        # which.min() skips NA entries
  # Drop one predictor at a time (skipping the intercept at row 1) and record
  # the CV of each reduced model.
  for (i in 2:p) {
    drop_var <- rnames[i]  # variable to be deleted
    # update() rebuilds the formula safely; pasting deparse(f[3]) back together
    # can silently truncate long right-hand sides.
    new_fit <- update(current_best, as.formula(paste(". ~ . -", drop_var)))
    compare[i] <- CV(new_fit)[1]
  }
  remove_var <- rownames(compare)[which.min(compare)]  # best variable to drop
  update_cv <- compare[which.min(compare)]             # CV after dropping it
  if (update_cv > best_cv) break  # stop: no removal improves the CV
  cat("--- Dropping", counter, remove_var, update_cv, "\n")
  current_best <- update(current_best,
                         as.formula(paste(". ~ . -", remove_var)))
  counter <- counter + 1L
}
print(current_best)
CV(current_best)
1 Answer
Your code is good. I could not find important corrections to be made — only some small cosmetic improvements, nothing serious.
For example, you do not need the `caret` package loaded for this task, and you do not need `next` at the end of the `while` loop.
It is also good practice to keep your code within a width of 80 characters; it improves readability a lot. See my code attached.
# Setup: clear the workspace, load packages, and build the example data set.
# NOTE(review): rm(list = ls()) in scripts is generally discouraged -- it only
# clears the global environment, not loaded packages or options. An explicit
# gc() call is unnecessary: R triggers garbage collection automatically.
rm(list = ls())
library(forecast)  # provides CV()
library(fpp)       # provides the credit data set
set.seed(1)
x1 <- c(1:500) * runif(20, min = 0, max = 200)
x2 <- c(1:500) * runif(20, min = 0, max = 200)
data(credit)
score <- credit$score
# Log-transform the skewed predictors; the +1 guards against log(0).
log.savings  <- log(credit$savings + 1)
log.income   <- log(credit$income + 1)
log.address  <- log(credit$time.address + 1)
log.employed <- log(credit$time.employed + 1)
##################################
# Input: full model containing every candidate predictor for backward
# elimination.
full.model <- score ~ log.savings + log.income + log.address +
log.employed + x1 +x2
# 1. Start with the full model
counter <- 1L
full <- lm(full.model)
scopevars <- attr(full$terms, "predvars") # model variables (response first); unused below
n <- length(full$residuals) # sample size; unused below
current_best <- full # the best model found so far --
# initially that is the full model
# Backward stepwise elimination using cross-validation (forecast::CV) as the
# criterion: at each step drop the predictor whose removal gives the lowest
# CV; stop as soon as no removal improves on the current model's CV.
while (TRUE) {
  best_cv <- CV(current_best)[1]        # CV of the current best model
  temp <- summary(current_best)
  p <- nrow(temp$coefficients)          # number of coefficients (incl. intercept)
  if (p < 2) break                      # intercept-only model: nothing left to
                                        # drop (also avoids 2:p running backwards)
  rnames <- rownames(temp$coefficients) # terms in the current model
  compare <- matrix(NA, p, 1)           # CV after dropping each term;
  rownames(compare) <- rnames           # row 1 (intercept) stays NA on purpose,
                                        # which.min() skips NA entries
  # Drop one predictor at a time (skipping the intercept at row 1) and record
  # the CV of each reduced model.
  for (i in 2:p) {
    drop_var <- rnames[i]  # variable to be deleted
    # update() rebuilds the formula safely; pasting deparse(f[3]) back
    # together can silently truncate long right-hand sides.
    new_fit <- update(current_best, as.formula(paste(". ~ . -", drop_var)))
    compare[i] <- CV(new_fit)[1]
  }
  remove_var <- rownames(compare)[which.min(compare)] # best variable to drop
  update_cv <- compare[which.min(compare)]            # CV after dropping it
  if (update_cv > best_cv) break # stop: no removal improves the CV
  cat("--- Dropping", counter, remove_var, update_cv, "\n")
  current_best <- update(current_best,
                         as.formula(paste(". ~ . -", remove_var)))
  counter <- counter + 1L
}
print(current_best)
CV(current_best)