Saturday, March 23, 2013

Regression I: Basics and Subset Selection

# The following code works through The Elements of Statistical Learning, Chapter 3
# The prostate data set is described on page 3
rm(list=ls())
library(ElemStatLearn)
data(prostate)
head(prostate)
data = prostate[,1:9]
# split into training and test sets, keeping only the nine analysis columns
train = subset(prostate, train == TRUE)[,1:9]
test = subset(prostate, train == FALSE)[,1:9]
pairs(data, main="prostate cancer data",pch = 21)

cor(train[,-9])
# # generate a table in the Latex form
# library(xtable)
# xtable( cor(train[,-9]), caption="Table 3.1", digits=3 )
lm.model = lm(lpsa ~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45
              , data=train )
summary(lm.model)
# graphical summary: the four standard lm diagnostic plots in one window
layout(matrix(c(1,2,3,4),2,2))
plot(lm.model)
dev.off() # close the device, which also resets the layout
y.pred = predict(lm.model, newdata = test)
y.test = test$lpsa
pred.mse = mean((y.pred - y.test)^2)
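# a rough sanity check: the "base" error rate from predicting every test
# response with the mean of the training response
base.mse = mean((mean(train$lpsa) - y.test)^2)
c(pred.mse, base.mse)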
# reduced model
lm.model_reduce = lm(lpsa ~
                       lcavol
                       + lweight
#                        + age
                       + lbph
                       + svi
#                        + lcp
#                        + gleason
#                        + pgg45
                       , data=train )
summary(lm.model_reduce)
anova(lm.model_reduce,lm.model)
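# test error of the reduced model, for comparison with pred.mse above
y.pred_reduce = predict(lm.model_reduce, newdata = test)
mean((y.pred_reduce - test$lpsa)^2)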

# parameter estimation by hand
names(train)
Y = as.numeric(train[,9])
# design matrix
X = cbind(rep(1,nrow(train)),as.matrix(train[,-9]))
# equation 3.6: coefficients estimation
beta.hat = solve(t(X)%*%X)%*%t(X)%*%Y
# equation 3.7: prediction
Y.hat = X%*%beta.hat
# Hat matrix
H = X%*%solve(t(X)%*%X)%*%t(X)
# equation 3.8: variance estimation
p = ncol(X) - 1 # number of predictors, excluding the intercept column
N = nrow(train)
sigma2.hat = sum((Y - Y.hat)^2)/(N - p -1)
# equation 3.10: covariance matrix of beta.hat, with sigma^2 replaced by its estimate
covarbeta.hat = solve(t(X)%*%X)*sigma2.hat
# equation 3.12: Z-score (actually t-distribution with df N-p-1)
stdbeta.hat = sqrt(diag(covarbeta.hat))
z = beta.hat/stdbeta.hat
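# sanity check: the hand-computed estimates and z-scores should agree with summary(lm.model)
cbind(beta.hat, coef(lm.model))
cbind(z, coef(summary(lm.model))[, "t value"])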
# beta and y.hat based on QR decomposition
X.QR = qr(X)
qr.X(X.QR) # reconstruct X from its QR factors (a quick check)
Q = qr.Q(X.QR)
R = qr.R(X.QR)
beta.qr = solve(R)%*%t(Q)%*%Y
y.qr = Q%*%t(Q)%*%Y
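# both routes should reproduce the least-squares fit computed above
all.equal(as.vector(beta.qr), as.vector(beta.hat))
all.equal(as.vector(y.qr), as.vector(Y.hat))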
# subset selection
# 1. Best-Subset Selection
# for logistic regression, use the bestglm package
# need library leaps:
library(leaps)
# a version of Figure 3.5 on page 58 (plotted here with R^2 rather than RSS)
model.bestsub = leaps(train[,-9], train[,9], method="r2")
# method can be "Cp", "adjr2", or "r2"
plot(model.bestsub$size, model.bestsub$r2, xlab="# predictors + 1", ylab="R^2")
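# one way to pick a single subset: rerun with Mallows' Cp and take its minimum
model.cp = leaps(train[,-9], train[,9], method="Cp")
best = which.min(model.cp$Cp)
colnames(train[,-9])[model.cp$which[best,]]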

#
# 2. Stepwise Selection
# Forward- and Backward-Stepwise Selection
# (for the test argument of add1()/drop1(): "Chisq" or "F" for lm;
#  "Rao", "LRT", "Chisq", or "F" for glm)
# Forward Stepwise
fit.null = lm(lpsa~1,data=train)
fit.f1 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              trace = FALSE, # whether to print the stepwise trace
              k=log(nrow(train)) # BIC penalty; use k=2 for AIC
              )
summary(fit.f1)
fit.f2 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              trace = TRUE,
              k=2) # AIC penalty
summary(fit.f2)
# Backward Stepwise
fit.full = lm(lpsa~.,data=train)
# BIC
fit.r1 = step(fit.full,direction="backward",data=train, trace=0, k=log(nrow(train)))
# AIC
fit.r2 = step(fit.full, direction="backward", data=train, trace=1, k=2)
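# compare the predictors retained under the BIC and AIC penalties
formula(fit.r1)
formula(fit.r2)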

#
# 3. Forward Stagewise Regression
library(lars)
Y = as.matrix(train[,9])
X = as.matrix(train[,-9])
cv.lars(X, Y, type="forward.stagewise", K=10, mode="step", trace=TRUE)
model.fsr = lars(X, Y, type="forward.stagewise", normalize=FALSE)
plot(model.fsr, xvar = "step")
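# coefficients along the stagewise path; s = 0.5 (half-way, in fraction mode) is just for illustration
coef(model.fsr, s = 0.5, mode = "fraction")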
