I want to write code that performs backward stepwise selection using cross-validation as the criterion.
I only started learning R a month ago, and I had almost zero programming experience prior to that, so I would appreciate any comments on the code.
(I understand that there are issues with the backward stepwise selection process itself.)
# Setup: clear the workspace, load packages, and build the example data set.
# NOTE(review): rm(list = ls()) in scripts is generally discouraged -- it only
# clears the global environment, not loaded packages or options.
rm(list = ls())
library(forecast)  # provides CV()
library(fpp)       # provides the credit data set
set.seed(1)
# Two synthetic predictors; the runif(20) vector is recycled 25 times across
# the 500 indices (500 is a multiple of 20, so no recycling warning).
x1 <- c(1:500) * runif(20, min = 0, max = 200)
x2 <- c(1:500) * runif(20, min = 0, max = 200)
# caret is not needed here: the credit data set ships with the fpp package.
data(credit)
score <- credit$score
# Log-transform the skewed predictors; the +1 guards against log(0).
log.savings  <- log(credit$savings + 1)
log.income   <- log(credit$income + 1)
log.address  <- log(credit$time.address + 1)
log.employed <- log(credit$time.employed + 1)
##################################
# Input: full model containing every candidate predictor for backward
# elimination.
full.model <- score ~ log.savings + log.income + log.address +
  log.employed + x1 + x2
# 1. Start with the full model
counter <- 1L  # elimination-step number used in the progress output
full <- lm(full.model)
scopevars <- attr(full$terms, "predvars")  # model variables (response first);
                                           # currently unused below
n <- length(full$residuals)  # sample size; currently unused below
current_best <- full  # the best model found so far -- initially the full model
# Backward stepwise elimination using cross-validation (forecast::CV) as the
# criterion: at each step, drop the single predictor whose removal yields the
# lowest CV; stop as soon as no removal improves on the current model's CV.
while (TRUE) {
  best_cv <- CV(current_best)[1]        # CV of the current best model
  temp <- summary(current_best)
  p <- nrow(temp$coefficients)          # number of coefficients (incl. intercept)
  if (p < 2) break                      # intercept-only model: nothing left to
                                        # drop (also avoids 2:p running backwards)
  rnames <- rownames(temp$coefficients) # terms in the current model
  compare <- matrix(NA, p, 1)           # CV after dropping each term;
  rownames(compare) <- rnames           # row 1 (intercept) stays NA on purpose,
                                        # which.min() skips NA entries
  # Drop one predictor at a time (skipping the intercept at row 1) and record
  # the CV of each reduced model.
  for (i in 2:p) {
    drop_var <- rnames[i]  # variable to be deleted
    # update() rebuilds the formula safely; pasting deparse(f[3]) back together
    # can silently truncate long right-hand sides.
    new_fit <- update(current_best, as.formula(paste(". ~ . -", drop_var)))
    compare[i] <- CV(new_fit)[1]
  }
  remove_var <- rownames(compare)[which.min(compare)]  # best variable to drop
  update_cv <- compare[which.min(compare)]             # CV after dropping it
  if (update_cv > best_cv) break  # stop: no removal improves the CV
  cat("--- Dropping", counter, remove_var, update_cv, "\n")
  current_best <- update(current_best,
                         as.formula(paste(". ~ . -", remove_var)))
  counter <- counter + 1L
}
print(current_best)
CV(current_best)
1 Answer
Your code is good. I could not find important corrections to be made — only some small cosmetic improvements, nothing serious.
For example, you do not need the `caret` package loaded for this task, and you do not need `next` at the end of the `while` loop.
It is also good practice to keep your code within a width of 80 characters; it improves readability a lot. See my code attached.
# Setup: clear the workspace, load packages, and build the example data set.
# NOTE(review): rm(list = ls()) in scripts is generally discouraged -- it only
# clears the global environment, not loaded packages or options. An explicit
# gc() call is unnecessary: R triggers garbage collection automatically.
rm(list = ls())
library(forecast)  # provides CV()
library(fpp)       # provides the credit data set
set.seed(1)
x1 <- c(1:500) * runif(20, min = 0, max = 200)
x2 <- c(1:500) * runif(20, min = 0, max = 200)
data(credit)
score <- credit$score
# Log-transform the skewed predictors; the +1 guards against log(0).
log.savings  <- log(credit$savings + 1)
log.income   <- log(credit$income + 1)
log.address  <- log(credit$time.address + 1)
log.employed <- log(credit$time.employed + 1)
##################################
# Input: full model containing every candidate predictor for backward
# elimination.
full.model <- score ~ log.savings + log.income + log.address +
log.employed + x1 +x2
# 1. Start with the full model
counter <- 1L
full <- lm(full.model)
scopevars <- attr(full$terms, "predvars") # model variables (response first); unused below
n <- length(full$residuals) # sample size; unused below
current_best <- full # the best model found so far --
# initially that is the full model
# Backward stepwise elimination using cross-validation (forecast::CV) as the
# criterion: at each step drop the predictor whose removal gives the lowest
# CV; stop as soon as no removal improves on the current model's CV.
while (TRUE) {
  best_cv <- CV(current_best)[1]        # CV of the current best model
  temp <- summary(current_best)
  p <- nrow(temp$coefficients)          # number of coefficients (incl. intercept)
  if (p < 2) break                      # intercept-only model: nothing left to
                                        # drop (also avoids 2:p running backwards)
  rnames <- rownames(temp$coefficients) # terms in the current model
  compare <- matrix(NA, p, 1)           # CV after dropping each term;
  rownames(compare) <- rnames           # row 1 (intercept) stays NA on purpose,
                                        # which.min() skips NA entries
  # Drop one predictor at a time (skipping the intercept at row 1) and record
  # the CV of each reduced model.
  for (i in 2:p) {
    drop_var <- rnames[i]  # variable to be deleted
    # update() rebuilds the formula safely; pasting deparse(f[3]) back
    # together can silently truncate long right-hand sides.
    new_fit <- update(current_best, as.formula(paste(". ~ . -", drop_var)))
    compare[i] <- CV(new_fit)[1]
  }
  remove_var <- rownames(compare)[which.min(compare)] # best variable to drop
  update_cv <- compare[which.min(compare)]            # CV after dropping it
  if (update_cv > best_cv) break # stop: no removal improves the CV
  cat("--- Dropping", counter, remove_var, update_cv, "\n")
  current_best <- update(current_best,
                         as.formula(paste(". ~ . -", remove_var)))
  counter <- counter + 1L
}
print(current_best)
CV(current_best)