Wednesday, March 13, 2013

Regression in R: best subset, stepwise, ridge, lasso, and PCR

# The following code is for the book The Elements of Statistical Learning, chapter 3
# The prostate cancer data set is described on page 3 of the book

rm(list=ls())
library(ElemStatLearn)
data(prostate)
head(prostate)
train = subset(prostate, train==TRUE)[,1:9]   # training rows only; drop the train indicator column
pairs(train, main="prostate cancer data",pch = 21)
cor(train[,-9])
# generate a LaTeX version of the correlation table
library(xtable)
xtable( cor(train[,-9]), caption="Table 3.1", digits=3 )
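# xtable() only builds the table object; printing it emits the LaTeX source, and
# print.xtable can write it straight to a file (the file name below is just an example).
print(xtable(cor(train[,-9]), caption="Table 3.1", digits=3), file="table3_1.tex")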

lm.model = lm(lpsa ~ ., data=train )
summary(lm.model)
# graphical summary
layout(matrix(c(1,2,3,4),2,2))
plot(lm.model)
dev.off()
# parameter estimation by hand
names(train)
Y = as.numeric(train[,9])
# design matrix
X = cbind(rep(1,nrow(train)),as.matrix(train[,-9]))
# equation 3.6
beta.hat = solve(t(X)%*%X)%*%t(X)%*%Y
beta.hat
Y.hat = X%*%beta.hat
Y.hat
H = X%*%solve(t(X)%*%X)%*%t(X)
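# Two quick sanity checks on the hat matrix: H maps Y onto the fitted values,
# and its trace equals the number of fitted parameters (p + 1).
max(abs(H%*%Y - Y.hat))   # should be numerically ~0
sum(diag(H))              # should equal ncol(X), i.e. p + 1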
p = ncol(X) - 1
N = nrow(train)
sigma2.hat = sum((Y - Y.hat)^2)/(N - p -1)
covarbeta.hat = solve(t(X)%*%X)*sigma2.hat
# equation 3.12
stdbeta.hat = sqrt(diag(covarbeta.hat))
z = beta.hat/stdbeta.hat
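# As a cross-check, the hand-computed estimates, standard errors and Z scores
# should reproduce the coefficient table from summary(), since lm() solves the
# same least-squares problem.
coef.table = summary(lm.model)$coefficients
cbind(beta.hat, coef.table[,"Estimate"])
cbind(stdbeta.hat, coef.table[,"Std. Error"])
cbind(z, coef.table[,"t value"])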
# best-subset selection
library(leaps)
model.bestsub = leaps(train[,-9], train[,9], method="r2")
plot(model.bestsub$size, model.bestsub$r2, xlab="# predictors+1", ylab="r2")
# highlight the best R^2 attainable at each subset size (cf. ESL Figure 3.5)
best.r2 = tapply(model.bestsub$r2, model.bestsub$size, max)
lines(as.numeric(names(best.r2)), best.r2, col="red")
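# A convenience sketch with leaps::regsubsets, which returns the best model of
# each size directly (an alternative interface to the leaps() call above).
model.regsub = regsubsets(lpsa ~ ., data=train, nvmax=8)
regsub.sum = summary(model.regsub)
regsub.sum$which    # predictors in the best model of each size
regsub.sum$rsq      # R^2 of the best model of each size
coef(model.regsub, which.min(regsub.sum$bic))   # coefficients of the BIC-best subset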
# Forward- and Backward-Stepwise Selection
fit.null = lm(lpsa~1,data=train)
# forward stepwise selection with the BIC penalty (k = log n)
fit.f1 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              k=log(nrow(train)))
summary(fit.f1)
# forward stepwise selection with the AIC penalty (k = 2)
fit.f2 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              k=2)
summary(fit.f2)
# full least-squares model as the starting point for backward elimination
fit.full = lm(lpsa~.,data=train)
# BIC
fit.r1 = step(fit.full,direction="backward",data=train, trace=0, k=log(nrow(train)))
# AIC
fit.r2 = step(fit.full,direction="backward",data=train, trace=1 ,k=2)
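# The BIC run (k = log n) typically retains fewer predictors than the AIC run
# (k = 2); comparing the two selected models makes the difference concrete.
formula(fit.r1)
formula(fit.r2)
summary(fit.r1)$coefficients
summary(fit.r2)$coefficients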

# standardize all columns (the 8 predictors and lpsa) to mean 0 and unit variance
train.s = scale(train)
# fit ridge regression
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])
# a very large penalty; the ridge coefficients below are shrunk heavily toward zero
lam = 1000*nrow(train.s)
# least square
betals = solve(t(X)%*%X)%*%t(X)%*%Y
# ridge regression
betar = solve(t(X)%*%X + diag(rep(lam, ncol(X))))%*%t(X)%*%Y
cbind(betals, betar)
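# The ridge solution can equivalently be computed through the SVD of X, and the
# effective degrees of freedom df(lambda) = sum_j d_j^2/(d_j^2 + lambda)
# (ESL eq. 3.50) quantifies how strongly this penalty shrinks the fit;
# a small verification sketch.
X.svd = svd(X)
d = X.svd$d
betar.svd = X.svd$v %*% diag(d/(d^2 + lam)) %*% t(X.svd$u) %*% Y
max(abs(betar - betar.svd))     # should be numerically ~0
sum(d^2/(d^2 + lam))            # effective degrees of freedom at this lambda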
# ridge regression coefficient paths
lambdas = exp(seq(log(0.01), log(ncol(X)), l=100))
betar.path = matrix(0, length(lambdas), ncol(X))
for (i in 1:length(lambdas)){
  betar.path[i,] = solve(t(X)%*%X + diag(rep(lambdas[i], ncol(X))))%*%t(X)%*%Y
}
plot(c(1,length(lambdas)), range(betar.path), type='n', ylab='Coefficients', xlab='Lambda Index')
# draw each coefficient path; the index is reversed so the penalty decreases from left to right
for (j in 1:8){
  lines(betar.path[length(lambdas):1,j],col=j)
}
legend("topleft",legend=names(train)[1:8],col=1:8,lty=rep(1,8),cex=0.8)
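# ESL (Figure 3.8) plots the ridge profiles against the effective degrees of
# freedom rather than a raw lambda index; the same paths on that scale, as a
# variation on the plot above.
d = svd(X)$d
df.path = sapply(lambdas, function(l) sum(d^2/(d^2 + l)))
matplot(df.path, betar.path, type='l', lty=1, col=1:8,
        xlab='df(lambda)', ylab='Coefficients')
legend("topleft", legend=names(train)[1:8], col=1:8, lty=1, cex=0.8)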

# lasso
library(glmnet)
betals = solve(t(X)%*%X)%*%t(X)%*%Y
lam = 0.1
lasso.fit = glmnet(x=X,y=Y,family="gaussian",lambda=lam,alpha=1)
# comparison between least square and lasso
cbind(betals,as.matrix(lasso.fit$beta))
lasso.fit2 = glmnet(x=X,y=Y,family="gaussian",alpha=1)
plot(lasso.fit2,col=1:ncol(X))
legend(0,0.5,legend=names(train)[1:8],col=1:8,lty=rep(1,8),cex=0.8)
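# glmnet can also pick the penalty by cross-validation; a short sketch using
# cv.glmnet (10-fold by default; results depend on the random fold split, hence the seed).
set.seed(1)
cv.lasso = cv.glmnet(x=X, y=Y, family="gaussian", alpha=1)
plot(cv.lasso)
cv.lasso$lambda.min               # penalty minimizing the CV error
coef(cv.lasso, s="lambda.1se")    # sparser model within one SE of the minimum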
# Principal Components Regression
x.svd = svd(X)
# scatterplots of the sample principal components (PC scores), colored by the rank of lpsa
par(mar=c(1,1,1,1))
layout(matrix(1:64,8,8))
mycols = rainbow(length(Y))
orY = order(Y)
for(i in 1:8){
  for(j in 1:8){
    plot(x.svd$u[,i],x.svd$u[,j],type="p",pch=16,col=mycols[orY])
  }
}
# proportion of variance explained: the PC variances are proportional to the squared singular values
var = x.svd$d^2/sum(x.svd$d^2)
var.cum = cumsum(var)
par(mfrow=c(1,2))
par(mar=c(5,4,4,2))
barplot(var,ylab="Amount of Var Explained",xlab="PCs")
barplot(var.cum,ylab="Cumulative Var Explained",xlab="PCs")
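# Cross-check: prcomp() reports the same proportions of variance, since X is
# already centered and the PC variances are d^2/(n-1).
summary(prcomp(X, center=FALSE))$importance["Proportion of Variance",]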
# PC direction weights
par(mfrow=c(3,3))
par(mar=c(5,4,3,2))
for(i in 1:8){
  barplot(x.svd$v[,i],names.arg=names(train)[1:8])
}
# PC regression: coefficients on the component scores Z = UD, theta_m = <z_m,y>/<z_m,z_m> = u_m'y/d_m
beta.pcr = diag(1/x.svd$d)%*%t(x.svd$u)%*%Y
Z = X%*%x.svd$v
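# The PCR fit can be mapped back to coefficients on the standardized predictors,
# beta_pcr(m) = V[,1:m] %*% theta[1:m]; using all 8 components reproduces the
# least-squares solution, which is a useful sanity check.
beta.pcr.orig = x.svd$v %*% beta.pcr
max(abs(beta.pcr.orig - betals))   # should be numerically ~0 with all components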
#training error
err.pcr = 0
for(i in 1:8){
  err.pcr[i] = sum(as.numeric(Y - Z[,1:i,drop=FALSE]%*%beta.pcr[1:i,1])^2)/length(Y)
}
err.pcr
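# Training error always decreases as components are added; to choose the number
# of components by cross-validation instead, the pls package provides pcr()
# (a sketch assuming the pls package is installed).
library(pls)
train.sdf = as.data.frame(train.s)
set.seed(1)
pcr.fit = pcr(lpsa ~ ., data=train.sdf, validation="CV")
validationplot(pcr.fit, val.type="RMSEP")
summary(pcr.fit)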
