Monday, March 25, 2013

Regression IV: Principal Components Regression

## Load the ElemStatLearn prostate data and build standardized
## train/test splits for principal components regression.
## (Dropped `rm(list = ls())`: wiping the workspace inside a script
## is an anti-pattern; run the script in a fresh session instead.)
library(ElemStatLearn)
data(prostate)
# First 9 columns: 8 predictors plus the response (lpsa); the 10th
# column is the train/test indicator used by subset() below.
data <- prostate[, 1:9]
train <- subset(prostate, train == TRUE)[, 1:9]
test <- subset(prostate, train != TRUE)[, 1:9]
# scale() subtracts each column mean and divides by the
# std dev (computed with df = N - 1)
train.s <- scale(train)
# Response and predictor matrix for principal components regression
Y <- as.numeric(train.s[, 9])
X <- as.matrix(train.s[, -9])

# SVD of the standardized predictor matrix: X = U D V'
x.svd <- svd(X)
# 8x8 grid of pairwise scatterplots of the sample principal
# components (columns of U), with points colored via the response.
# NOTE(review): mycols[orY] indexes colors by order(Y), not rank(Y) --
# verify that this maps colors to response magnitude as intended.
par(mar = c(1, 1, 1, 1))
layout(matrix(1:64, 8, 8))
mycols <- rainbow(length(Y))
orY <- order(Y)
for (pc.x in seq_len(8)) {
  for (pc.y in seq_len(8)) {
    plot(x.svd$u[, pc.x], x.svd$u[, pc.y],
         type = "p", pch = 16, col = mycols[orY])
  }
}

# amount of variance explained
# Proportion of variance explained by each principal component.
# BUG FIX: the variance along PC i is proportional to the SQUARED
# singular value d_i^2, not d_i itself (the original divided raw
# singular values). Also vectorized: no need to grow vectors in a loop.
var <- x.svd$d^2 / sum(x.svd$d^2)  # NB: masks stats::var; name kept for compatibility
var.cum <- cumsum(var)
par(mfrow = c(1, 2))
par(mar = c(5, 4, 4, 2))
barplot(var, ylab = "Amount of Var Explained", xlab = "PCs")
barplot(var.cum, ylab = "Cumulative Var Explained", xlab = "PCs")

# Loadings (columns of V): the weight each original predictor
# carries in every principal-component direction.
par(mfrow = c(3, 3))
par(mar = c(5, 4, 3, 2))
for (pc in seq_len(8)) {
  barplot(x.svd$v[, pc], names.arg = names(train)[1:8])
}

# Principal components regression.
# With X = U D V', the least-squares coefficients of Y on the PC
# scores Z = X V = U D are beta = D^{-1} U' Y.
beta.pcr <- diag(1 / x.svd$d) %*% t(x.svd$u) %*% Y
Z <- X %*% x.svd$v
# Training MSE when keeping the first m components, m = 1..ncol(Z).
# (Generalized from the hard-coded 8; err.pcr is preallocated
# instead of grown inside the loop.)
n.pc <- ncol(Z)
err.pcr <- numeric(n.pc)
for (m in seq_len(n.pc)) {
  fitted.m <- Z[, 1:m, drop = FALSE] %*% beta.pcr[1:m, 1]
  err.pcr[m] <- sum(as.numeric(Y - fitted.m)^2) / length(Y)
}
err.pcr

No comments:

Post a Comment