Sunday, March 31, 2013

Spline Regression

Chapter 5
 

rm(list=ls())
library(ElemStatLearn)
library(splines)
# Basic Examples
set.seed(0)
x = seq(0, 4*pi, length.out=50)
y = cos(x) + 0.3*rnorm(length(x))
par(mfrow=c(2,3))
# linear, no interior knots
plot(x,y,type="p", main = "Deg = 1, Df = 1")
m1 = lm(y ~ bs(x,degree=1,df=1))
lines(x, fitted(m1))
# linear, 1 interior knot
plot(x,y,type="p", main = "Deg = 1, Df = 2")
m2 = lm(y ~ bs(x,degree=1,df=2))
lines(x, fitted(m2))
# linear, 2 interior knots
plot(x, y,type="p", main = "Deg = 1, Df = 3")
m3 = lm(y ~ bs(x,degree=1,df=3))
lines(x, fitted(m3))
# quadratic, no interior knots
plot(x, y,type="p", main = "Deg = 2, Df = 2")
m4 = lm( y ~ bs(x,degree=2,df=2))
lines(x, fitted(m4))
# quadratic, 1 interior knot
plot(x, y,type="p", main = "Deg = 2, Df = 3")
m5 = lm(y ~ bs(x,degree=2,df=3))
lines(x, fitted(m5))
# quadratic, 2 interior knots
plot(x, y,type="p", main = "Deg = 2, Df = 4")
m6 = lm(y ~ bs(x,degree=2,df=4))
lines(x, fitted(m6))
par(mfrow=c(1,1))
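In bs(), the number of interior knots is df minus degree (when no intercept column is requested), so the six panels above use 0, 1, and 2 interior knots for each degree. A quick check of where those knots land (my own addition; bs() places them at quantiles of x):

attr(bs(x, degree = 1, df = 2), "knots")   # 1 interior knot, at the median of x
attr(bs(x, degree = 2, df = 4), "knots")   # 2 interior knots, at the terciles of x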

##########################################################################
##########################################################################
################   South African Heart Disease   #########################
##########################################################################
##########################################################################

form="chd~ns(sbp,df=4)+ns(tobacco,df=4)+ns(ldl,df=4)+famhist+ns(obesity,df=4)+ns(alcohol,df=4)+ns(age,df=4)"

form = formula(form)
sbp.ns = ns(SAheart$sbp,df=4)
attr(sbp.ns,"knots")
summary(SAheart$sbp)
# Natural Cubic Splines
model.ncs = glm(form, data=SAheart, family=binomial("logit"))
summary(model.ncs)
names(model.ncs)
# model.ncs$coefficients
# # model.ncs$effects
# model.ncs$xlevels
# model.ncs$method
# # model.ncs$data
# model.ncs$terms # knots
# # model.ncs$model
# recreate Table 5.1
drop1( model.ncs, scope=form, test="Chisq" )
# create the model in Table 5.1
library(MASS)
stepAIC(model.ncs, scope=form, k=2)
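Because a natural cubic spline basis together with the intercept contains the linear function, the spline model above nests a plain linear logistic regression, so the two can be compared with a likelihood-ratio test. A quick check I added (not part of Table 5.1):

model.lin = glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age,
                data = SAheart, family = binomial("logit"))
anova(model.lin, model.ncs, test = "Chisq")   # joint test of all nonlinear terms
AIC(model.lin, model.ncs)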

##########################################################################
##########################################################################
#######################   Phoneme Recognition   ##########################
############  Example: Filtering and Feature Extraction  #################
##########################################################################
##########################################################################
rm(list=ls())
data(phoneme)
set.seed(0)
aa_indx = which(phoneme$g == "aa")
ao_indx = which(phoneme$g == "ao")
AA_data = phoneme[aa_indx[1:15], 1:256]
AO_data = phoneme[ao_indx[1:15], 1:256]
min_l = min(c(min(AA_data), min(AO_data)))
max_l = max(c(max(AA_data), max(AO_data)))
# Figure 5.5 on page 149
ii=1
plot( as.double(AA_data[ii,]),
      ylim=c(min_l,max_l), type="l", col="red",
      xlab="Frequency", ylab="Log-periodogram" )
for( ii in 2:dim(AA_data)[1] ){
  lines( as.double(AA_data[ii,]), col="red" )
}
for( ii in 1:dim(AO_data)[1] ){
  lines( as.double(AO_data[ii,]), col="blue" )
}

rm(list=ls())
data(phoneme)
aa_indx = which(phoneme$g == "aa")
ao_indx = which(phoneme$g == "ao")
###########################################################
AA_data = phoneme[aa_indx,]
train_inds = grep("^train", AA_data$speaker)
test_inds = grep("^test", AA_data$speaker)
AA_data_train = AA_data[train_inds, 1:256]
AA_data_test = AA_data[test_inds, 1:256]
#####################################
###### call AA class 1  #############
#####################################
AA_data_train$Y = rep(1, dim(AA_data_train)[1])
AA_data_test$Y = rep(1, dim(AA_data_test)[1])
###########################################################
AO_data = phoneme[ao_indx,]
train_inds = grep("^train", AO_data$speaker)
test_inds = grep("^test", AO_data$speaker)
AO_data_train = AO_data[train_inds, 1:256]
AO_data_test = AO_data[test_inds, 1:256]
#####################################
###### call AO class 0  #############
#####################################
AO_data_train$Y = rep(0, dim(AO_data_train)[1])
AO_data_test$Y = rep(0, dim(AO_data_test)[1])
####################################################
##################  Combine Data  ##################
####################################################
Data_train = rbind(AA_data_train, AO_data_train)
Data_test = rbind(AA_data_test, AO_data_test)

#########################################################################
#############   Logistic Regression Without Intercept  ##################
#########################################################################
form = paste("Y ~ -1 + ", paste(colnames(Data_train)[1:256], collapse="+"))
m = glm(form, family=binomial("logit"), data=Data_train)
mc = as.double(coefficients(m))
m.smooth = smooth.spline(mc,df=15)
plot(mc, ylim=c(-0.8,+0.6),type="l",
     xlab="Frequency",ylab="Logistic Regression Coefficients")
lines(m.smooth, col="blue" )

abline(h=0)
Y_hat_train = predict(m, Data_train[,1:256], type="response")
predicted_class_label_train = as.double(Y_hat_train > 0.5)
Y_hat_test  = predict(m, Data_test[,1:256], type="response")
predicted_class_label_test = as.double(Y_hat_test > 0.5)
err_rate_train1 = mean(Data_train[,257] != predicted_class_label_train)
err_rate_test1  = mean(Data_test[,257] != predicted_class_label_test)
print(sprintf('Logistic Regression (Raw): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train1, err_rate_test1))
beta = matrix(m.smooth$y,ncol=1)
logit_train = as.matrix(Data_train[,1:256])%*%beta
Y_smooth_train = exp(logit_train)/(1+exp(logit_train))
predicted_class_smooth_train = as.double(Y_smooth_train > 0.5)
logit_test = as.matrix(Data_test[,1:256])%*%beta
Y_smooth_test = exp(logit_test)/(1+exp(logit_test))
predicted_class_smooth_test = as.double(Y_smooth_test > 0.5)
err_rate_train2 = mean(Data_train[,257] != predicted_class_smooth_train)
err_rate_test2  = mean(Data_test[,257] != predicted_class_smooth_test)
print(sprintf('Logistic Regression (Raw): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train1, err_rate_test1))
print(sprintf('Logistic Regression (Regularized): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train2, err_rate_test2))
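The book's filtering idea (the "Filtering and Feature Extraction" heading above) restricts the coefficient curve itself to a natural cubic spline basis instead of smoothing the raw coefficients afterwards. A rough sketch of that idea (the choice of df = 12 and the use of an intercept here are mine, not from the original code):

H = ns(1:256, df = 12)                              # spline basis over the 256 frequencies
XH_train = as.matrix(Data_train[, 1:256]) %*% H     # filtered inputs X* = X H
m.filtered = glm(Data_train$Y ~ XH_train, family = binomial("logit"))
beta.filtered = H %*% coef(m.filtered)[-1]          # implied smooth coefficient curve
lines(beta.filtered, col = "red")
logit.f = cbind(1, as.matrix(Data_test[, 1:256]) %*% H) %*% coef(m.filtered)
mean(Data_test$Y != as.double(1/(1 + exp(-logit.f)) > 0.5))   # test error of the filtered fit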
#######################################################################
#############  BMD Example  ############################################
#######################################################################
males = bone$gender == "male"
females = bone$gender == "female"
# df is monotone in lambda for smoothing splines
boneMaleSmooth = smooth.spline( bone[bone$gender == "male","age"],
                                bone[bone$gender == "male","spnbmd"], df=12)
boneFemaleSmooth = smooth.spline( bone[bone$gender == "female","age"],
                                  bone[bone$gender == "female","spnbmd"], df=12)
plot(boneMaleSmooth, ylim=c(-0.05,0.20), col="blue", type="l", xlab="Age", ylab="spnbmd")
points(bone[males,c(2,4)], col="blue", pch=20)
lines(boneFemaleSmooth, ylim=c(-0.05,0.20), col="red")
points(bone[females,c(2,4)], col="red", pch=20)
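To see the remark above that df is monotone in lambda for smoothing splines, one can print the lambda that smooth.spline selects as df increases; the values should decrease (a quick check I added):

sapply(c(4, 8, 12, 16), function(d)
  smooth.spline(bone$age[males], bone$spnbmd[males], df = d)$lambda)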

Tuesday, March 26, 2013

Simple LDA and QDA in R

rm(list=ls())
library(MASS)
library(ElemStatLearn)
data(mixture.example)
data = mixture.example
rm(mixture.example)
x = data$x
y = data$y
summary(factor(y))
rm(data)
plot(x,col = 2*y + 1, xlab = "X1", ylab = "X2")
# LDA
model.lda = lda(x, y)
summary(model.lda)
model.lda
model.lda$prior
model.lda$means
model.lda$scaling
abline(model.lda$scaling)
points(model.lda$means, pch = 20)
# QDA
model.qda = qda(x, y)
model.qda
scaling.matrix1 = model.qda$scaling[,,1]
scaling.matrix2 = model.qda$scaling[,,2]
names(model.qda)
model2.qda = qda(x, y, CV = TRUE)
model2.qda
names(model2.qda)
model2.qda$class
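A quick numerical comparison of the two fits (my own addition): training error for LDA and QDA, plus the leave-one-out error returned by the CV = TRUE fit.

mean(predict(model.lda, x)$class != y)   # LDA training error
mean(predict(model.qda, x)$class != y)   # QDA training error
mean(model2.qda$class != y)              # QDA leave-one-out CV error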

Monday, March 25, 2013

Logistic and LASSO Regression

# The following code is for the book The Elements of Statistical Learning, chapter 4
# Example: South African Heart Disease (Page: 122)
# load data
rm(list=ls())
library(ElemStatLearn)
data(SAheart)
data = SAheart[,c(1:3,5,7:10)]
# to convert factor variables into dummy variables
temp = matrix(0,nrow(data),1)
for (i in c(1:nrow(data))){
  if (data[i,4] == 'Present'){temp[i] = 1}
  else {temp[i] = 0}
}
temp = as.data.frame(temp)
names(temp) = "famhist"
data.new = cbind(data[,1:3], temp, data[,5:8])
# in order to apply glmnet function
# change dataframe to matrix
Y = as.matrix(data.new[,8])
X = as.matrix(data.new[,-8])
logit = glm(chd ~ ., family=binomial("logit"),data=data.new,na.action=na.exclude)
# table 4.2 on page 122
summary(logit)
# Figure 4.12
pairs(data.new, main="South African heart disease data",pch = 21)

# reduced model: stepwise (same as the linear regression)
logit.reduced = glm(chd ~ tobacco + ldl + famhist + age,
                    family=binomial("logit"),data=data.new,na.action=na.exclude)
# table 4.3 on page 124
summary(logit.reduced)
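A likelihood-ratio test of the dropped terms, comparing the reduced and full fits (a quick check I added):

anova(logit.reduced, logit, test = "Chisq")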
library(glmnet)
# glmnet does not accept a factor variable in X, so the dummy-coded matrix is used
# This is the only package I know of that fits the lasso for a binomial response
# works (this fit is used for the plot below):
lasso = glmnet(scale(X), Y, family = "binomial", alpha = 1, standardize = FALSE, intercept=TRUE)
# does not work (does not reproduce the book's figure):
# lasso = glmnet(X, Y, family = "binomial", alpha = 1, standardize = TRUE, intercept=TRUE)
# FIGURE 4.13 on page 126
plot(lasso, xvar = "norm", label=TRUE)
abline(h=0)
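To pick a specific lambda instead of just plotting the path, cross-validation can be added; a minimal sketch, assuming the same scaled-X convention that works above:

set.seed(1)
lasso.cv = cv.glmnet(scale(X), Y, family = "binomial", alpha = 1,
                     standardize = FALSE, intercept = TRUE)
plot(lasso.cv)
coef(lasso.cv, s = "lambda.1se")   # coefficients at the 1-SE-rule lambda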

Regression IV: Principal Components Regression

rm(list=ls())
library(ElemStatLearn)
data(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
# the scale function in R subtracts the mean
# and divides by the std dev (computed with df = N-1)
train.s = scale(train)
# fit ridge regression manually
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])

x.svd = svd(X)
#scatterplots of samples PCs
par(mar=c(1,1,1,1))
layout(matrix(1:64,8,8))
mycols = rainbow(length(Y))
orY = order(Y)
for(i in 1:8){
  for(j in 1:8){
    plot(x.svd$u[,i],x.svd$u[,j],type="p",pch=16,col=mycols[orY])
  }
}

# amount of variance explained (uses the squared singular values)
var = 0
var.cum = 0
for(i in 1:8){
  var[i] = x.svd$d[i]^2/sum(x.svd$d^2)
  var.cum[i] = sum(var)
}
par(mfrow=c(1,2))
par(mar=c(5,4,4,2))
barplot(var,ylab="Amount of Var Explained",xlab="PCs")
barplot(var.cum,ylab="Cumulative Var Explained",xlab="PCs")

# PC direction weights
par(mfrow=c(3,3))
par(mar=c(5,4,3,2))
for(i in 1:8){
  barplot(x.svd$v[,i],names.arg=names(train)[1:8])
}

#PC regression
beta.pcr = diag(1/x.svd$d)%*%t(x.svd$u)%*%Y
Z = X%*%x.svd$v
#training error
err.pcr = 0
for(i in 1:8){
  err.pcr[i] = sum(as.numeric(Y - Z[,1:i,drop=FALSE]%*%beta.pcr[1:i,1])^2)/length(Y)
}
err.pcr
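For completeness (not in the original post), the test error for each number of components can be computed by applying the training centering/scaling to the test set and projecting onto the same PC directions:

center = attr(train.s, "scaled:center")
scl = attr(train.s, "scaled:scale")
test.s = scale(test, center = center, scale = scl)
Z.test = as.matrix(test.s[, -9]) %*% x.svd$v
err.pcr.test = sapply(1:8, function(k)
  mean((test.s[, 9] - Z.test[, 1:k, drop = FALSE] %*% beta.pcr[1:k, 1])^2))
err.pcr.test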

Regression III: LASSO

rm(list=ls())
library(ElemStatLearn)
data(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
# the scale function in R subtracts the mean
# and divides by the std dev (computed with df = N-1)
train.s = scale(train)
center = attributes(train.s)$'scaled:center'
scale = attributes(train.s)$'scaled:scale'
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])
# scale testing data based on center and scale of training data
test.s = t((t(test) - center)/scale)
Y1 = as.numeric(test.s[,9])
X1 = as.matrix(test.s[,-9])
# unscaled data
Y0 = as.numeric(train[,9])
X0 = as.matrix(train[,-9])
Y01 = as.numeric(test[,9])
X01 = as.matrix(test[,-9])
#################################################################################
########### fit LASSO regression by glmnet package  #############################
#################################################################################
library(glmnet)
# Way 1
lasso.glm = glmnet(X,Y,
                   family = "gaussian",
                   alpha = 1,
                   standardize = FALSE,
                   intercept = FALSE,
                   standardize.response = FALSE
)
# Way 2: does NOT work (unscaled data with internal standardization) -- kept for reference only
# lasso.glm = glmnet(X0, Y0,
#                    family = "gaussian",
#                    alpha = 1,
#                    standardize = TRUE,
#                    intercept = TRUE,
#                    standardize.response = TRUE
# )
# Way 3: does NOT work
# lasso.glm = glmnet(X0, Y0,
#                    family = "gaussian",
#                    alpha = 1
# )
# Based on my experiments, the data have to be scaled manually first,
# so Way 1's fit is the one used below
names(lasso.glm)
plot(lasso.glm, xvar = "norm", label=TRUE)
abline(h=0)

lasso.glm$a0
lasso.glm$beta
lasso.glm$lambda
y = predict(lasso.glm, X, type="link")
y1 = predict(lasso.glm, X1, type="link")

lasso.mse = matrix(0, length(lasso.glm$lambda), 1)
lasso1.mse = matrix(0, length(lasso.glm$lambda), 1)
for (i in 1:length(lasso.glm$lambda)){
  lasso.mse[i] = mean((Y - y[,i])^2)
  lasso1.mse[i] = mean((Y1 - y1[,i])^2)
}
############################################################################
###################   Figure 3.10  on Page 70  #############################
############################################################################
plot(lasso.glm$lambda, lasso.mse)
lines(lasso.glm$lambda, lasso1.mse)
# cross-validation
lasso.cv = cv.glmnet(X,Y,
                     family = "gaussian",
                     alpha = 1,
                     nfolds = 10,
                     standardize = FALSE,
                     intercept = FALSE,
                     standardize.response = FALSE,
                     type.measure = "mse"
)
names(lasso.cv)
plot(lasso.cv)
plot(lasso.cv, -1)
lasso.cv$lambda.1se
lasso.cv$cvm
lasso.cv$cvsd
lasso.cv$glmnet.fit
y.cv = predict(lasso.cv, X, s="lambda.1se")
###########################################################################
############# fit LASSO regression by lars package  ###############################
###########################################################################
library(lars)
# cross-validation is available in this package
# scaled version
model.ridge = lars(X,Y,type="lasso",
                   trace=TRUE,
                   normalize=FALSE,
                   intercept=FALSE
)
# unscaled version
model.ridge = lars(X0,Y0,type="lasso",
                   trace=TRUE,
                   normalize=TRUE,
                   intercept=TRUE
)
summary(model.ridge)
model.ridge
plot(model.ridge,
     xvar="arc.length",
     breaks=TRUE,
     plottype="coefficients"
)
predict(model.ridge,
        X01,   # test predictors on the original scale (use X1 with the scaled fit above)
        type="fit",
        mode="step")
coef(model.ridge, s=5.6,mode="step")
# always scale X and Y manually before applying functions
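Since cross-validation is available in lars, here is a minimal sketch (my own) of using cv.lars on the scaled data to pick the shrinkage fraction and then scoring the scaled test set:

set.seed(1)
cv.out = cv.lars(X, Y, K = 10, type = "lasso", mode = "fraction",
                 normalize = FALSE, intercept = FALSE)
s.best = cv.out$index[which.min(cv.out$cv)]
model.lasso = lars(X, Y, type = "lasso", normalize = FALSE, intercept = FALSE)
pred.test = predict(model.lasso, X1, s = s.best, mode = "fraction")$fit
mean((Y1 - pred.test)^2)   # test MSE at the CV-chosen fraction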

Sunday, March 24, 2013

Regression II: Ridge Regression

rm(list=ls())
library(ElemStatLearn)
data(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
# the scale function in R subtracts the mean
# and divides by the std dev (computed with df = N-1)
train.s = scale(train)
center = attributes(train.s)$'scaled:center'
scale = attributes(train.s)$'scaled:scale'
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])
# scale testing data based on center and scale of training data
test.s = t((t(test) - center)/scale)
Y1 = as.numeric(test.s[,9])
X1 = as.matrix(test.s[,-9])
# unscaled data
Y0 = as.numeric(train[,9])
X0 = as.matrix(train[,-9])
Y01 = as.numeric(test[,9])
X01 = as.matrix(test[,-9])
##########################################################################
############### fit ridge regression manually  ###################################
##########################################################################
# ridge regression coefficient paths
lambdas = exp(seq(log(0.01), log(10000), l=100))
betar.path = matrix(0, length(lambdas), ncol(X))
for (i in 1:length(lambdas)){
  betar.path[i,] = solve(t(X)%*%X + diag(rep(lambdas[i], ncol(X))))%*%t(X)%*%Y
}
X.d = svd(X)$d
df = matrix(0, length(lambdas), 1)
for (i in 1:length(lambdas)){
  df[i] = sum(X.d^2/(X.d^2 + lambdas[i]))
}
png(file = "fig1.png", width = 600, height = 600)
plot(range(df)*1.1, range(betar.path)*1.1, type='n', ylab='Coefficients', xlab='Degrees of Freedom')
for (j in c(1:8)){
  lines(df, betar.path[,j],col=j)
  if( names(train)[j]=="svi" ){
    text( 1.075*df[1], 1.08*betar.path[1,j], names(train)[j] )
  }else if( names(train)[j]=="lweight" ){
    text( 1.075*df[1], betar.path[1,j], names(train)[j] )
  }else if( names(train)[j]=="pgg45" ){
    text( 1.075*df[1], 0.9*betar.path[1,j], names(train)[j] )
  }else{
    text( 1.075*df[1], betar.path[1,j], names(train)[j] )
  }
}
abline(h=0)
dev.off()

# Remark: Recreate Figure 3.8 in the book

################################################################################
################## fit ridge regression by glmnet package  #############################
###############################################################################
library(glmnet)
lambdas = exp(seq(log(0.01), log(10000), l=100))
# Way 1
ridge.glm = glmnet(X,Y,
                   family = "gaussian",
                   lambda = 2*lambdas,
                   alpha = 0,
                   standardize = FALSE,
                   intercept = TRUE
)
# Way 2
ridge.glm = glmnet(X,Y,family = "gaussian", alpha = 0)
# Way 3
ridge.glm = glmnet(X0,Y0,
                   family = "gaussian",
                   lambda = 2*lambdas,
                   alpha = 0,
                   standardize = TRUE,
                   intercept = TRUE
                   )
names(ridge.glm)
print(ridge.glm)
# plot(ridge.glm, xvar = "lambda", label = TRUE)
X.d = svd(X)$d
df = matrix(0, length(ridge.glm$lambda), 1)
for (i in 1:length(ridge.glm$lambda)){
  df[i] = sum(X.d^2/(X.d^2 + ridge.glm$lambda[i]))
}
plot(range(df)*1.1, range(ridge.glm$beta)*1.1, type='n', ylab='Coefficients', xlab='Degrees of Freedom')
for (j in c(1:8)){
  lines(df, ridge.glm$beta[j,])
}
# Remark: I cannot recreate Figure 3.8 in the book
# cross-validation and prediction is available
########################################################################
######### fit ridge regression by MASS package  ###############################
########################################################################
library(MASS)
train.s = as.data.frame(train.s)
train = as.data.frame(train)
# model.ridge = lm.ridge(lpsa ~ .,
#                        train.s,
#                        lambda = exp(seq(log(0.01), log(10000), l=100))
# )
model.ridge = lm.ridge(lpsa ~ .,
                       train,
                       x = TRUE,
                       y = TRUE,
                       lambda = exp(seq(log(0.01), log(10000), l=100))
)
model.ridge$coef
model.ridge$lambda
X.d = svd(X)$d
df = matrix(0, length(model.ridge$lambda), 1)
for (i in 1:length(model.ridge$lambda)){
  df[i] = sum(X.d^2/(X.d^2 + model.ridge$lambda[i]))
}
plot(range(df)*1.1, range(model.ridge$coef)*1.1, type='n', ylab='Coefficients', xlab='Degrees of Freedom')
for (j in c(1:8)){
  lines(df, model.ridge$coef[j,])
}
summary(model.ridge)
names(model.ridge)
model.ridge
model.ridge$Inter
model.ridge$ym
model.ridge$xm
model.ridge$scales
model.ridge$GCV
model.ridge$coef
plot(model.ridge)
select(model.ridge)
# No need to scale X and Y manually
# Able to recreate Figure 3.8
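select() reports the GCV-minimizing lambda; to actually pull it out and look at the coefficients at that value (a small sketch I added, not in the original post):

lam.gcv = model.ridge$lambda[which.min(model.ridge$GCV)]
lam.gcv
coef(lm.ridge(lpsa ~ ., train, lambda = lam.gcv))   # fit at the GCV-chosen lambda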
############################################################################
############### fit ridge regression by ridge package  ##############################
############################################################################
library(ridge)
train.s = as.data.frame(train.s)
train = as.data.frame(train)
# scaled version
# model.ridge = linearRidge(lpsa ~ .,
#                           train.s,
#                           lambda = exp(seq(log(0.01), log(10000), l=100))
# )
# unscaled version
model.ridge = linearRidge(lpsa ~ .,
                          train,
                          scaling = "scale",
                          lambda = exp(seq(log(0.01), log(10000), l=100))
)
names(model.ridge)
X.d = svd(X)$d
df = matrix(0, length(model.ridge$lambda), 1)
for (i in 1:length(model.ridge$lambda)){
  df[i] = sum(X.d^2/(X.d^2 + model.ridge$lambda[i]))
}
plot(range(df)*1.1, range(model.ridge$coef)*1.1, type='n', ylab='Coefficients', xlab='Degrees of Freedom')
for (j in c(1:8)){
  lines(df, model.ridge$coef[j,])
}
# recreate Figure 3.8
######################################################################
################### Conclusion  ########################################
######################################################################
# I don't know how to make glmnet reproduce Figure 3.8 for ridge regression

Saturday, March 23, 2013

Regression I: Basics and Subset Selection

# The following code is for the book The Elements of Statistical Learning, chapter 3
# Data was described on page 3
rm(list=ls())
library(ElemStatLearn)
data(prostate)
head(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
pairs(data, main="prostate cancer data",pch = 21)

cor(train[,-9])
# # generate a table in the Latex form
# library(xtable)
# xtable( cor(train[,-9]), caption="Table 3.1", digits=3 )
lm.model = lm(lpsa ~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45
              , data=train )
summary(lm.model)
# graphical summary
layout(matrix(c(1,2,3,4),2,2))
plot(lm.model)
dev.off()
y.pred = predict(lm.model, newdata = test)
y.test = test$lpsa
pred.mse = mean((y.pred - y.test)^2)
# reduced model
lm.model_reduce = lm(lpsa ~
                       lcavol
                       + lweight
#                        + age
                       + lbph
                       + svi
#                        + lcp
#                        + gleason
#                        + pgg45
                       , data=train )
summary(lm.model_reduce)
anova(lm.model_reduce,lm.model)

# parameter estimation by hand
names(train)
Y = as.numeric(train[,9])
# design matrix
X = cbind(rep(1,nrow(train)),as.matrix(train[,-9]))
# equation 3.6: coefficients estimation
beta.hat = solve(t(X)%*%X)%*%t(X)%*%Y
# equation 3.7: prediction
Y.hat = X%*%beta.hat
# Hat matrix
H = X%*%solve(t(X)%*%X)%*%t(X)
# equation 3.8: variance estimation
p = ncol(X) - 1
N = nrow(train)
sigma2.hat = sum((Y - Y.hat)^2)/(N - p -1)
# kind of equation 3.10: covariance matrix of beta
covarbeta.hat = solve(t(X)%*%X)*sigma2.hat
# equation 3.12: Z-score (actually t-distribution with df N-p-1)
stdbeta.hat = sqrt(diag(covarbeta.hat))
z = beta.hat/stdbeta.hat
# beta and y.hat based on QR decomposition
X.QR = qr(X)
qr.X(X.QR)
Q = qr.Q(X.QR)
R = qr.R(X.QR)
beta.qr = solve(R)%*%t(Q)%*%Y
y.qr = Q%*%t(Q)%*%Y
# subset selection
# 1. Best-Subset Selection
# for logistic regression, use bestglm package
# need library leaps:
library(leaps)
# recreate Figure 3.5 on page 58
model.bestsub = leaps(train[,-9], train[,9], method="r2")
# method: Cp, adjr2, or r2
plot(model.bestsub$size, model.bestsub$r2, xlab="# predictors+1", ylab="r2")
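leaps() only reports one criterion at a time; the regsubsets() interface in the same package also returns BIC per subset size, which makes it easy to settle on a single model (a sketch I added, not in the original post):

model.regsub = regsubsets(lpsa ~ ., data = train, nvmax = 8)
reg.summary = summary(model.regsub)
which.min(reg.summary$bic)                        # best subset size by BIC
coef(model.regsub, which.min(reg.summary$bic))    # its coefficients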

#
# 2. Stepwise Selection
# Forward- and Backward-Stepwise Selection
# test could be Chisq or F for lm
# Rao, LRT, Chisq, F for glm
# Forward Stepwise
fit.null = lm(lpsa~1,data=train)
fit.f1 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              trace = FALSE, # print out stepwise summary or not
              k=log(nrow(train)) # BIC, or any number here
              )
summary(fit.f1)
fit.f2 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              trace = TRUE,
              k=2)
summary(fit.f2)
# Backward Stepwise
fit.full = lm(lpsa~.,data=train)
# BIC
fit.r1 = step(fit.full,direction="backward",data=train, trace=0, k=log(nrow(train)))
# AIC
fit.r2 = step(fit.full,direction="backward",data=train, trace=1 ,k=2)

#
# 3. Forward Stagewise Regression
library(lars)
Y = as.matrix(train[,9])
X = as.matrix(train[,-9])
cv.lars(X,Y,type="for",K=10,mode="step",trace=T)
model.fsr = lars(X,Y,type="for",normalize=F)
plot(model.fsr, xvar = "step")

Thursday, March 14, 2013

My SQL cheatsheet

1.
IF OBJECT_ID('dbo.TABLE') IS NOT NULL
 BEGIN
  DROP TABLE dbo.TABLE
 END;
--------------------------------------------------
2.
ROW_NUMBER() OVER(PARTITION BY MONTHID
      ORDER BY DATEID
     ) AS TranOrder
-------------------------------------------------
3.
;WITH A AS (
SELECT .....
FROM ........ WITH (NOLOCK)
WHERE .........
)
,B AS (
SELECT ....
 FROM ....
WHERE .....
)
SELECT A.*, B.*
FROM ......
-----------------------------------------------
4.
IF OBJECT_ID('dbo.Object_choosen') IS NOT NULL
 BEGIN
  DROP TABLE dbo.Object_choosen
  END;
GO

SELECT TOP 50000 ID
 INTO dbo.Object_choosen
 FROM dbo.Object_left
;
.........................................
DELETE FROM dbo.Object_left
WHERE ID IN (SELECT * FROM dbo.Object_choosen)

-----------------------------------------------------

5. choose ID in A not in B
SELECT TOP 10 A.ID
FROM A
LEFT JOIN B WITH (NOLOCK)
ON A.ID = B.ID
WHERE B.ID IS NULL
------------------------------------------------------
6. change table
ALTER TABLE A
ADD ID int
ALTER TABLE A
DROP COLUMN ID
UPDATE A
SET A.NUM = B.NUM
from DATA1 A
LEFT JOIN DATA2 B
 ON A.ID = B.ID
UPDATE DATA
SET LENGTH = DATEDIFF(DAY, DT_EARLY, DT_LATE)
-------------------------------------------------------------------------
7. bucket

DECLARE @CNT FLOAT
SET @CNT = (SELECT COUNT(*) FROM #V)
     
SELECT BUCKET
 ,SCORE_AVG = AVG(SCORE)
 ,Y_RATE = AVG(CONVERT(FLOAT, Y))
 ,NUM = COUNT(*)
FROM (SELECT BUCKET = CEILING(ROW_NUMBER() OVER (ORDER BY SCORE)/@CNT*20)/20
   ,*
   FROM #V
   ) TEMP
GROUP BY BUCKET
ORDER BY BUCKET
---------------------------------------------------------------------------------------------------

8. percentile
/****************  Main Category  *******************************/
WITH SourceData AS (
    --Table of sets over which we want to produce multiple quartiles.
     SELECT obj_ID, DtID, FLAG_1, ID
    FROM ..........
)
/*****************  Percentile  ******************************/
,  Percentile AS (
SELECT  ID
  ,MAX(P1Val) P1
  ,MAX(P5Val) P5
  ,MAX(P10Val) P10
   ,MAX(P25Val) P25
  ,MAX(P50Val) P50
  ,MAX(P75Val) P75
  ,MAX(P80Val) P80
  ,MAX(P85Val) P85
  ,MAX(P90Val) P90
  ,MAX(P95Val) P95
  ,MAX(P99Val) P99
FROM (
    --Expose the detail values for only the records at the index values
    --generated by the summary subquery. All other values are left as NULL.
    SELECT detail.ID,
     CASE WHEN RowNum = P1Idx THEN DtID ELSE NULL END P1Val,
 CASE WHEN RowNum = P5Idx THEN DtID ELSE NULL END P5Val,
  CASE WHEN RowNum = P10Idx THEN DtID ELSE NULL END P10Val,
 CASE WHEN RowNum = P25Idx THEN DtID ELSE NULL END P25Val,
 CASE WHEN RowNum = P50Idx THEN DtID ELSE NULL END P50Val,
 CASE WHEN RowNum = P75Idx THEN DtID ELSE NULL END P75Val,
  CASE WHEN RowNum = P80Idx THEN DtID ELSE NULL END P80Val,
 CASE WHEN RowNum = P85Idx THEN DtID ELSE NULL END P85Val,
 CASE WHEN RowNum = P90Idx THEN DtID ELSE NULL END P90Val,
 CASE WHEN RowNum = P95Idx THEN DtID ELSE NULL END P95Val,
  CASE WHEN RowNum = P99Idx THEN DtID ELSE NULL END P99Val
    FROM
        --Calculate a row number sorted by [OpsDtID] for each group.
        (SELECT *, ROW_NUMBER() OVER (PARTITION BY ID
          ORDER BY DtID
           ) RowNum
        FROM SourceData) AS detail
    INNER JOIN (
        --Summarize to find index numbers and fractions we need to use to locate
        --the values at the quartile points.
        SELECT ID,
  --Calculate percentiles based on Nearest Rank
 FLOOR(ROUND(COUNT(*)*1/100.0 + 1/2.0, 0)) P1Idx,
 FLOOR(ROUND(COUNT(*)*5/100.0 + 1/2.0, 0)) P5Idx,
 FLOOR(ROUND(COUNT(*)*10/100.0 + 1/2.0, 0)) P10Idx,
 FLOOR(ROUND(COUNT(*)*25/100.0 + 1/2.0, 0)) P25Idx,
  FLOOR(ROUND(COUNT(*)*50/100.0 + 1/2.0, 0)) P50Idx,
 FLOOR(ROUND(COUNT(*)*75/100.0 + 1/2.0, 0)) P75Idx,
 FLOOR(ROUND(COUNT(*)*80/100.0 + 1/2.0, 0)) P80Idx,
 FLOOR(ROUND(COUNT(*)*85/100.0 + 1/2.0, 0)) P85Idx,
 FLOOR(ROUND(COUNT(*)*90/100.0 + 1/2.0, 0)) P90Idx,
  FLOOR(ROUND(COUNT(*)*95/100.0 + 1/2.0, 0)) P95Idx,
 FLOOR(ROUND(COUNT(*)*99/100.0 + 1/2.0, 0)) P99Idx
        FROM SourceData
        GROUP BY ID
        HAVING COUNT(*) > 1
   
    ) AS summary
    ON detail.ID  = summary.ID
) AS combined
GROUP BY ID
)

/****************  Basic Statistic Summary  ***********************/
, Summary AS (
SELECT ID
  ,ClosingRatio = (CASE
         WHEN COUNT(DISTINCT obj_ID) = 0 THEN NULL
          ELSE CAST(SUM(FLAG_1) AS DECIMAL(15,5))*100/COUNT(DISTINCT obj_ID)
     END
     )
  ,MIN(DtID) MIN
  ,MAX(DtID) MAX
  ,AVG(DtID) AVG
  ,CAST(COUNT(DISTINCT obj_ID)*100/
   (SELECT CAST(COUNT(DISTINCT obj_ID) AS DECIMAL(15,5))
     FROM SourceData
    )AS DECIMAL(15,5)
    ) AS PCT
 ,COUNT(DISTINCT obj_ID) AS Count
 FROM SourceData
 GROUP BY ID
)
/**************  Results  ******************************************/

SELECT B.ID,B.ClosingRatio
  ,PCT
  ,P1
  ,P5
  ,P10
  ,P25
  ,P50
  ,P75
  ,P80
  ,P85
  ,P90
  ,P95
  ,P99
  ,MIN
  ,AVG
  ,MAX
  ,Count
FROM Percentile AS A
 FULL JOIN Summary AS B
ON A.ID  = B.ID
ORDER BY B.ID
---------------------------------------------------------------------------------

CART: rpart in R

setwd("C:/TreeModel")
library(rpart)
library(pROC)
##################################################################################
###########    Regression Tree   #################################################
##################################################################################
data1 = read.table("C:/TreeModel/Pollute.txt", header=T)
# attach(data1)
# detach(data1)
summary(data1)
names(data1)
#plot to check nonlinear relationships
pairs(data1, pch=16, cex=1)
#build regression model
tree.data1 = rpart(Pollution ~ .,
                   method = "anova",
                   data = data1,
                   control=rpart.control(minsplit=2, cp=0.01),
                   na.action = na.exclude
                   )
print(tree.data1)
# plot(tree.data1)
# text(tree.data1)
#plot easier to read
plot(tree.data1, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree.data1)
summary(tree.data1)
tree.data1$terms
names(tree.data1)
#diagnostics (check errors)
plot(predict(tree.data1), residuals(tree.data1), xlab="Fitted", ylab="Residuals")
qqnorm(residuals(tree.data1))
#predict
x0 = apply(data1[,-1], 2, median)
x0
predict(tree.data1, data.frame(t(x0)))
#pruning tree
#initially make a big tree, choose cp very small
tree.data1 = rpart(Pollution ~ .,
                   method = "anova",
                   data = data1,
                   control=rpart.control(minsplit=2, cp=0.0001),
                   na.action = na.exclude
)
#select the best split based on the cross-validation result
printcp(tree.data1)
plotcp(tree.data1)
#choose cp as 0.05739633
data.prune = prune.rpart(tree.data1, 0.05739633)
plot(data.prune, compress=T, uniform=T, branch=0.4, margin=0.1)
text(data.prune)
print(data.prune)
predict(data.prune, data.frame(t(x0)))
par(mfrow=c(1,2))
rsq.rpart(data.prune)
par(mfrow=c(1,1))
summary(data.prune)
# ROC curve
data.predict = predict(data.prune)
roc(data1$Pollution,data.predict,plot=T)
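Instead of reading cp off printcp() by eye, the cp with the smallest cross-validated error can be taken from the cptable directly (a sketch I added; because xerror depends on the random CV folds, the chosen value will generally differ from the 0.05739633 hard-coded above):

cp.best = tree.data1$cptable[which.min(tree.data1$cptable[, "xerror"]), "CP"]
data.prune2 = prune(tree.data1, cp = cp.best)
printcp(data.prune2)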
###############################################################################
############ Classification Tree  #############################################
###############################################################################
# based on information (entropy)
data2 = read.table("C:/TreeModel/epilobium.txt", header=T)
tree.data2 = rpart(species ~ .,
                   method = "class",
                   data = data2,
                   control=rpart.control(minsplit=2, cp=0.01),
                   parms=list(split='information'),
                   na.action = na.exclude
)
plot(tree.data2, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree.data2)
print(tree.data2)
# based on Gini index
tree2.data2 = rpart(species ~ .,
                   method = "class",
                   data = data2,
                   control=rpart.control(minsplit=2, cp=0.01),
                   parms=list(split='gini'),
                   na.action = na.exclude
)
plot(tree2.data2, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree2.data2)
print(tree2.data2)
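A quick in-sample comparison of the entropy-grown and Gini-grown trees via their confusion tables (my own addition):

table(predict(tree.data2, data2, type = "class"), data2$species)    # information split
table(predict(tree2.data2, data2, type = "class"), data2$species)   # gini split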

########################################################
###################     DATA      ##########################
########################################################
Pollution Temp Industry Population Wind Rain Wet.days
24 61.5 368 497 9.1 48.34 115
30 55.6 291 593 8.3 43.11 123
56 55.9 775 622 9.5 35.89 105
28 51 137 176 8.7 15.17 89
14 68.4 136 529 8.8 54.47 116
46 47.6 44 116 8.8 33.36 135
9 66.2 641 844 10.9 35.94 78
35 49.9 1064 1513 10.1 30.96 129
26 57.8 197 299 7.6 42.59 115
61 50.4 347 520 9.4 36.22 147
29 57.3 434 757 9.3 38.98 111
28 52.3 361 746 9.7 38.74 121
14 51.5 181 347 10.9 30.18 98
18 59.4 275 448 7.9 46 119
17 51.9 454 515 9 12.95 86
23 54 462 453 7.1 39.04 132
47 55 625 905 9.6 41.31 111
13 61 91 132 8.2 48.52 100
31 55.2 35 71 6.6 40.75 148
12 56.7 453 716 8.7 20.66 67
10 70.3 213 582 6 7.05 36
110 50.6 3344 3369 10.4 34.44 122
56 49.1 412 158 9 43.37 127
10 68.9 721 1233 10.8 48.19 103
69 54.6 1692 1950 9.6 39.93 115
8 56.6 125 277 12.7 30.58 82
36 54 80 80 9 40.25 114
16 45.7 569 717 11.8 29.07 123
29 51.1 379 531 9.4 38.79 164
29 43.5 669 744 10.6 25.94 137
65 49.7 1007 751 10.9 34.99 155
9 68.3 204 361 8.4 56.77 113
10 75.5 207 335 9 59.8 128
26 51.5 266 540 8.6 37.01 134
31 59.3 96 308 10.6 44.68 116
10 61.6 337 624 9.2 49.1 105
11 47.1 391 463 12.4 36.11 166
14 54.5 381 507 10 37 99
17 49 104 201 11.2 30.85 103
11 56.8 46 244 8.9 7.77 58
94 50 343 179 10.6 42.75 125

My R cheatsheet

1.

library(RODBC)
Channel = odbcDriverConnect("driver={SQL Server};
                            server=server_name;
                            database=database_name")
data = sqlQuery(Channel,
                paste("
SQL query
                       ")
)
sqlSave(Channel, data, tablename="dbo.save_table_from_R")
####################################################################
2.

setwd("C:/Temp")
for(index in 1:10){
  str.index = toString(index)
  title.index = paste("Simulation", str.index)
  png(file = paste(str.index,".png", sep = ""),
      width = 1000, height = 1000,
      bg = "white")
  # par(mfrow=c(2,2))
  # figure 1 to 4
   dev.off()
  cat(index, data1, data2, ...., "\n",
      file="summary.txt", append=TRUE)
}
########################################################################
3.

logit = glm(y ~, family=binomial("logit"),data=,na.action=na.exclude)
y.hat = predict(logit,type = c("response"))
########################################################################
4.

library(pROC)
roc(Y,Y.hat,plot=T)
cut.yhat = cut(y.hat
              ,breaks = quantile(y.hat, probs = seq(0, 1, 1/50))
              ,include.lowest = T
              )
obs = xtabs(cbind(1 - y, y, 1) ~ cut.yhat)
expect = xtabs(cbind(1 - y.hat, y.hat) ~ cut.yhat)
result = xtabs(cbind(y, y.hat,1) ~ cut.yhat)
#############################################################################
5.

splitdf <- function(dataframe, seed=NULL) {
  if (!is.null(seed)) set.seed(seed)
  index <- 1:nrow(dataframe)
  trainindex <- sample(index, trunc(length(index)*0.7))
  trainset <- dataframe[trainindex, ]
  testset <- dataframe[-trainindex, ]
  list(trainset=trainset,testset=testset)
}
data.all = splitdf(data, seed=100)
data.train = data.all$trainset
data.test = data.all$testset

nnet in R

# install.packages("nnet")
# install.packages("faraway")
library(faraway)
library(nnet)
rm(list=ls())
data(ozone)
names(ozone)
nnmdl = nnet(O3 ~ temp + ibh + ibt, ozone, size=2, linout=TRUE)
summary(nnmdl)
x.test = expand.grid(temp=1, ibh=0, ibt=0)
predict(nnmdl, new=x.test)
?scale
sx = scale(ozone)
##############################
bestrss = 10000
for (i in 1:1000){
  nnmdl = nnet(O3 ~ temp + ibh + ibt, sx, size=2, linout=TRUE, trace=FALSE)
  cat(nnmdl$value, "\n")
  if (nnmdl$value < bestrss){
    bestnn = nnmdl
    bestrss = nnmdl$value
  }
}
bestnn$value
summary(bestnn)
R.sqr = 1 - bestnn$value/sum((sx[,1] - mean(sx[,1]))^2)
ozmeans = attributes(sx)$'scaled:center'
ozscales = attributes(sx)$'scaled:scale'
xx = expand.grid(temp=seq(-3,3,0.1), ibh=0, ibt=0)
plot(xx$temp*ozscales['temp'] + ozmeans['temp'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='Temp', ylab='O3')
xx = expand.grid(temp=0, ibh=seq(-3,3,0.1), ibt=0)
plot(xx$ibh*ozscales['ibh'] + ozmeans['ibh'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='IBH', ylab='O3')
xx = expand.grid(temp=0, ibh=0, ibt=seq(-3,3,0.1))
plot(xx$ibt*ozscales['ibt'] + ozmeans['ibt'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='IBT', ylab='O3')

##############################
bestrss = 10000
for (i in 1:1000){
  nnmdl = nnet(O3 ~ temp + ibh + ibt, sx, size=2, linout=TRUE, decay=0.001, trace=FALSE)
  cat(nnmdl$value, "\n")
  if (nnmdl$value < bestrss){
    bestnn = nnmdl
    bestrss = nnmdl$value
  }
}
bestnn$value
summary(bestnn)
R.sqr = 1 - bestnn$value/sum((sx[,1] - mean(sx[,1]))^2)
ozmeans = attributes(sx)$'scaled:center'
ozscales = attributes(sx)$'scaled:scale'
xx = expand.grid(temp=seq(-3,3,0.1), ibh=0, ibt=0)
plot(xx$temp*ozscales['temp'] + ozmeans['temp'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='Temp', ylab='O3')
xx = expand.grid(temp=0, ibh=seq(-3,3,0.1), ibt=0)
plot(xx$ibh*ozscales['ibh'] + ozmeans['ibh'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='IBH', ylab='O3')
xx = expand.grid(temp=0, ibh=0, ibt=seq(-3,3,0.1))
plot(xx$ibt*ozscales['ibt'] + ozmeans['ibt'],
     predict(bestnn, new=xx)*ozscales['O3'] + ozmeans['O3'],
     xlab='IBT', ylab='O3')
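The two loops above keep only the best of 1000 random starts; to make that reproducible, and to see how much the converged RSS varies across starts, something like the following helps (my own addition, using 100 restarts to keep it quick):

set.seed(123)
rss = replicate(100, nnet(O3 ~ temp + ibh + ibt, sx, size = 2, linout = TRUE,
                          decay = 0.001, trace = FALSE)$value)
summary(rss)   # spread of the converged RSS across random starts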

Naive Bayes in R

library(e1071)
rm(list=ls())
################################################################################
############# Naive Bayes for continuous variables #############################
################################################################################
# Data
data(iris)
names(iris)
pairs(iris[1:4],main="Iris Data (red=setosa,yellow=versicolor,blue=virginica)",
      pch=21,
      bg=c("red","yellow","blue")[unclass(iris$Species)])
summary(iris)
#just show first 6 records
head(iris)
classifier = naiveBayes(iris[,1:4], iris[,5])
summary(classifier)
names(classifier)
classifier$apriori
classifier$tables  # 1st column: mean, 2nd column: standard deviation
classifier$levels
classifier$call
plot(function(x) dnorm(x, 1.462, 0.1736640),
     0, 8,
     col="red",
     ylab="",
     main="Petal length distribution for the 3 different species")
curve(dnorm(x, 4.260, 0.4699110), add=TRUE, col="blue")
curve(dnorm(x, 5.552, 0.5518947 ), add=TRUE, col = "black")
#split data into train and test
train.ind = sample(1:nrow(iris), ceiling(nrow(iris)*2/3), replace=FALSE)
classifier.train = naiveBayes(iris[train.ind,1:4], iris[train.ind,5])
#give the classified results
nb.pred = predict(classifier.train, newdata=iris[-train.ind,])
#just show first 6 predictions
head(nb.pred)
table(nb.pred, iris[-train.ind,]$Species)

#give the raw score
nb.pred = predict(classifier.train, newdata=iris[-train.ind,], type="raw")
#just show first 6 predictions
head(nb.pred)

################################################################################
############# Naive Bayes for discrete variables ###############################
################################################################################
# HairEyeColor
# mosaicplot(HairEyeColor)
# margin.table(HairEyeColor, 1)
# margin.table(HairEyeColor, 2)
# margin.table(HairEyeColor, 3)
# margin.table(HairEyeColor, c(1,3))
# install.packages("mlbench")
# library(mlbench)
data(HouseVotes84, package = "mlbench")
head(HouseVotes84)
summary(HouseVotes84)
train.ind = sample(1:nrow(HouseVotes84),
                   ceiling(nrow(HouseVotes84)*2/3),
                   replace=FALSE)
classifier.train = naiveBayes(Class ~ ., data=HouseVotes84[train.ind,], laplace = 0)
summary(classifier.train)
names(classifier.train)
classifier.train$apriori
classifier.train$tables  # 1st column + 2nd column = 1 (probabilities)
classifier.train$levels
classifier.train$call
mode(classifier.train$tables)
nb.pred = predict(classifier.train, newdata=HouseVotes84[-train.ind,])
head(nb.pred)
table(nb.pred, HouseVotes84[-train.ind,]$Class)
nb.pred = predict(classifier.train, newdata=HouseVotes84[-train.ind,], type = "raw")
head(nb.pred)
head(HouseVotes84[-train.ind,]$Class)
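With categorical predictors like these votes, a Laplace correction avoids zero conditional probabilities for category/class combinations that never occur in the training split; a minimal comparison sketch (my own addition):

classifier.lp = naiveBayes(Class ~ ., data = HouseVotes84[train.ind, ], laplace = 1)
nb.pred.lp = predict(classifier.lp, newdata = HouseVotes84[-train.ind, ])
nb.pred0 = predict(classifier.train, newdata = HouseVotes84[-train.ind, ])
mean(nb.pred0 == HouseVotes84[-train.ind, ]$Class)     # accuracy, laplace = 0
mean(nb.pred.lp == HouseVotes84[-train.ind, ]$Class)   # accuracy, laplace = 1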

Wednesday, March 13, 2013

Regression in R: best subset, stepwise, ridge, lasso, and PCR

# The following code is for the book The Elements of Statistical Learning, chapter 3
# Data was described on page 3

rm(list=ls())
library(ElemStatLearn)
data(prostate)
head(prostate)
train = subset(prostate, train==T)[,1:9]
pairs(train, main="prostate cancer data",pch = 21)
cor(train[,-9])
# generate a table in the Latex form
library(xtable)
xtable( cor(train[,-9]), caption="Table 3.1", digits=3 )

lm.model = lm(lpsa ~ ., data=train )
summary(lm.model)
# graphical summary
layout(matrix(c(1,2,3,4),2,2))
plot(lm.model)
dev.off()
# parameter estimation by hand
names(train)
Y = as.numeric(train[,9])
# design matrix
X = cbind(rep(1,nrow(train)),as.matrix(train[,-9]))
# equation 3.6
beta.hat = solve(t(X)%*%X)%*%t(X)%*%Y
beta.hat
Y.hat = X%*%beta.hat
Y.hat
H = X%*%solve(t(X)%*%X)%*%t(X)
p = ncol(X) - 1
N = nrow(train)
sigma2.hat = sum((Y - Y.hat)^2)/(N - p -1)
covarbeta.hat = solve(t(X)%*%X)*sigma2.hat
# equation 3.12
stdbeta.hat = sqrt(diag(covarbeta.hat))
z = beta.hat/stdbeta.hat
# best-subset selection
library(leaps)
model.bestsub = leaps(train[,-9], train[,9], method="r2")
plot(model.bestsub$size, model.bestsub$r2, xlab="# predictors+1", ylab="r2")
abline(0, 1)
# Forward- and Backward-Stepwise Selection
fit.null = lm(lpsa~1,data=train)
fit.f1 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              k=log(nrow(train)))
summary(fit.f1)
fit.f2 = step(fit.null,
              scope=lpsa~
                lcavol
              + lweight
              + age
              + lbph
              + svi
              + lcp
              + gleason
              + pgg45,
              direction="forward",
              data=train,
              k=2)
summary(fit.f2)
fit.full = lm(lpsa~.,data=train)
# BIC
fit.r1 = step(fit.full,direction="backward",data=train, trace=0, k=log(nrow(train)))
# AIC
fit.r2 = step(fit.full,direction="backward",data=train, trace=1 ,k=2)

train.s = scale(train)
# fit ridge regression
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])
lam = 1000*nrow(train.s)
# least square
betals = solve(t(X)%*%X)%*%t(X)%*%Y
# ridge regression
betar = solve(t(X)%*%X + diag(rep(lam, ncol(X))))%*%t(X)%*%Y
cbind(betals, betar)
# ridge regression coefficient paths
lambdas = exp(seq(log(0.01), log(ncol(X)), l=100))
betar.path = matrix(0, length(lambdas), ncol(X))
for (i in 1:length(lambdas)){
  betar.path[i,] = solve(t(X)%*%X + diag(rep(lambdas[i], ncol(X))))%*%t(X)%*%Y
}
plot(c(1,length(lambdas)), range(betar.path), type='n', ylab='Coefficients', xlab='Lambda Index')
for (j in c(1:8)){
  lines(betar.path[length(lambdas):1,j],col=j)
}
legend(0,0.2,legend=names(train)[1:8],col=1:8,lty=rep(1,8))

# lasso
library(glmnet)
betals = solve(t(X)%*%X)%*%t(X)%*%Y
lam = 0.1
lasso.fit = glmnet(x=X,y=Y,family="gaussian",lambda=lam,alpha=1)
# comparison between least square and lasso
cbind(betals,as.matrix(lasso.fit$beta))
lasso.fit2 = glmnet(x=X,y=Y,family="gaussian",alpha=1)
plot(lasso.fit2,col=1:ncol(X))
legend(0,0.5,legend=names(train)[1:8],col=1:8,lty=rep(1,8),cex=0.8)
# Principal Components Regression
x.svd = svd(X)
#scatterplots of samples PCs
par(mar=c(1,1,1,1))
layout(matrix(1:64,8,8))
mycols = rainbow(length(Y))
orY = order(Y)
for(i in 1:8){
  for(j in 1:8){
    plot(x.svd$u[,i],x.svd$u[,j],type="p",pch=16,col=mycols[orY])
  }
}
# amount of variance explained (uses the squared singular values)
var = 0
var.cum = 0
for(i in 1:8){
  var[i] = x.svd$d[i]^2/sum(x.svd$d^2)
  var.cum[i] = sum(var)
}
par(mfrow=c(1,2))
par(mar=c(5,4,4,2))
barplot(var,ylab="Amount of Var Explained",xlab="PCs")
barplot(var.cum,ylab="Cumulative Var Explained",xlab="PCs")
# PC direction weights
par(mfrow=c(3,3))
par(mar=c(5,4,3,2))
for(i in 1:8){
  barplot(x.svd$v[,i],names.arg=names(train)[1:8])
}
#PC regression
beta.pcr = diag(1/x.svd$d)%*%t(x.svd$u)%*%Y
Z = X%*%x.svd$v
#training error
err.pcr = 0
for(i in 1:8){
  err.pcr[i] = sum(as.numeric(Y - Z[,1:i,drop=FALSE]%*%beta.pcr[1:i,1])^2)/length(Y)
}
err.pcr

Tuesday, March 5, 2013

Support Vector Machines in MATLAB II

%non-linear 2-class SVM example
var = 1.25;
n = 100; mu1 = [0; 0]; cov1 = [1 .8; 1 .8];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*chol(cov1)*sqrt(var);
xs2 = repmat(mu1',n*100,1) + randn(n*100,2)*sqrt(var*3);
ind = sum((xs2.^2)')' > 5;
x2 = xs2(ind,:);
x2 = x2(1:(n/2),:);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];
clf
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off



% linear kernel
C = 1;
svms = svmtrain(x,Y,'kernel_function','linear','showplot','true','method','QP','boxconstraint',C);


% radial kernel
sig = .75;
C = 1;
svms = svmtrain(x,Y,'kernel_function','rbf','showplot','true','method','QP','rbf_sigma',sig,'boxconstraint',C);


% polynomial kernel
d = 2;
C = 1;
svms = svmtrain(x,Y,'kernel_function','polynomial','showplot','true','method','QP','polyorder',d,'boxconstraint',C);
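For readers working in R rather than MATLAB, a rough equivalent of the three kernels above using e1071 (my own sketch on simulated two-class data; e1071 parameterizes the radial kernel by gamma, roughly 1/(2*sigma^2), and cost plays the role of the box constraint C):

library(e1071)
set.seed(0)
x = rbind(matrix(rnorm(100), 50, 2), matrix(rnorm(100, mean = 3), 50, 2))
y = factor(rep(c(1, -1), each = 50))
svm.lin  = svm(x, y, kernel = "linear",     cost = 1)
svm.rbf  = svm(x, y, kernel = "radial",     cost = 1, gamma = 1/(2 * 0.75^2))
svm.poly = svm(x, y, kernel = "polynomial", cost = 1, degree = 2)
sapply(list(linear = svm.lin, radial = svm.rbf, polynomial = svm.poly),
       function(m) mean(predict(m, x) != y))   # training error for each kernel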

Monday, March 4, 2013

Support Vector Machines in MATLAB I

Handwritten Digit Recognition

Data: downloaded from http://www-stat.stanford.edu/~tibs/ElemStatLearn/

Description:
Normalized handwritten digits, automatically scanned from envelopes by the U.S. Postal Service. The original scanned digits are binary and of different sizes and orientations; the images  here have been deslanted and size normalized, resulting in 16 x 16 grayscale images (Le Cun et al., 1990). The data are in two gzipped files, and each line consists of the digit id (0-9) followed by the 256 grayscale values. There are 7291 training observations and 2007 test observations, distributed as follows:
           0     1    2    3    4    5    6    7    8    9  Total
Train   1194  1005  731  658  652  556  664  645  542  644   7291
Test     359   264  198  166  200  160  170  147  166  177   2007

or as proportions:

           0     1     2     3     4     5     6     7     8     9
Train   0.16  0.14  0.10  0.09  0.09  0.08  0.09  0.09  0.07  0.09
Test    0.18  0.13  0.10  0.08  0.10  0.08  0.08  0.07  0.08  0.09

Alternatively, the training data are available as separate files per digit (and hence without the digit identifier in each row). The test set is notoriously "difficult", and a 2.5% error rate is excellent. These data were kindly made available by the neural network group at AT&T research labs (thanks to Yann Le Cunn).

Code:

zip1 = 5; zip2 = 8;
trdat = dlmread('zip.train');
ind = trdat(:,1)==zip1 | trdat(:,1)==zip2;
X = trdat(ind,2:end);
Y = trdat(ind,1);
[n,p] = size(X);
tsdat = dlmread('zip.test');
ind = tsdat(:,1)==zip1 | tsdat(:,1)==zip2;
Xs = tsdat(ind,2:end);
Ys = tsdat(ind,1);
fitsvm = svmtrain(X,Y,'boxconstraint',1);
class = svmclassify(fitsvm,X);
trerr = sum(class~=Y)/length(Y);
class = svmclassify(fitsvm,Xs);
tserr = sum(class~=Ys)/length(Ys);
[trerr tserr]
ans =
         0    0.0491
colormap(gray)
for i=1:9
    subplot(3,3,i)
    imagesc(flipud(rot90(reshape(fitsvm.SupportVectors(i,:),16,16))))
    axis off
end


[classNB,errNB] = classify(Xs,X,Y,'diaglinear');
[classLDA,errLDA] = classify(Xs,X,Y,'linear');
[errNB errLDA tserr]
ans =
    0.0555    0.0027    0.0491
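The same 5-versus-8 task can be run in R with the zip.train / zip.test objects shipped in ElemStatLearn and an SVM from e1071 (a sketch, my own translation of the MATLAB code above; the first column of each matrix is the digit label):

library(ElemStatLearn)
library(e1071)
tr = zip.train[zip.train[, 1] %in% c(5, 8), ]
ts = zip.test[zip.test[, 1] %in% c(5, 8), ]
fit = svm(tr[, -1], factor(tr[, 1]), kernel = "linear", cost = 1)
trerr = mean(predict(fit, tr[, -1]) != factor(tr[, 1]))
tserr = mean(predict(fit, ts[, -1]) != factor(ts[, 1]))
c(trerr, tserr)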

Support Vector Classifier in MATLAB

% the following code is for the book The Elements of Statistical Learning, section 12.2
% choose right directories before and after this step
cvx_setup

% cvx can be downloaded @ http://cvxr.com/cvx/
%%%%%%%%%%%
%separable case
clf
var = 1.25;
n = 100; mu1 = [3; 8]; mu2 = [8; 3];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*sqrt(var);
x2 = repmat(mu2',n/2,1) + randn(n/2,2)*sqrt(var);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off
 


cvx_begin
cvx_precision('high')
variable beta0;
variable beta(2);
minimize( .5*square_pos(norm(beta)));
subject to
Y.*(x*beta + beta0) >= 1;
cvx_end

hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
xs = linspace(min(min(x)),max(max(x)),1000);
plot(xs,(-beta0 - xs*beta(1))/beta(2),'k')
M = 1/norm(beta);
plot(xs,(-beta0 - xs*beta(1)+1)/beta(2) ,'--k')
plot(xs,(-beta0 - xs*beta(1)-1)/beta(2),'--k')
hold off


%%%%%%%%%%%%%%
%non-separable case
clf
var = 2.5;
n = 100; mu1 = [3; 8]; mu2 = [8; 3];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*sqrt(var);
x2 = repmat(mu2',n/2,1) + randn(n/2,2)*sqrt(var);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off

gam = .1;
cvx_begin
cvx_precision('high')
variable beta0;
variable beta(2);
variable epsilon(n);
minimize( .5*square_pos(norm(beta)) + gam*sum(epsilon));
subject to
for i=1:n
    Y(i)*(x(i,:)*beta + beta0) >= 1 - epsilon(i);
end
epsilon >= 0;
cvx_end
clf
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
xs = linspace(min(min(x)),max(max(x)),1000);
plot(xs,(-beta0 - xs*beta(1))/beta(2),'k')
M = 1/norm(beta);
plot(xs,(-beta0 - xs*beta(1)+1)/beta(2) ,'--k')
plot(xs,(-beta0 - xs*beta(1)-1)/beta(2),'--k')
ind = epsilon>1e-6;
scatter(x(ind,1),x(ind,2),'+k','SizeData',150)
ind = abs(x*beta + beta0 + 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
ind = abs(x*beta + beta0 - 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
hold off
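For reference, the two cvx blocks above solve the support vector classifier problem of Section 12.2; the non-separable (soft-margin) version is

\min_{\beta_0,\,\beta,\,\epsilon}\ \tfrac{1}{2}\lVert\beta\rVert^2 + \gamma\sum_{i=1}^{n}\epsilon_i
\quad\text{subject to}\quad y_i(x_i^T\beta+\beta_0)\ \ge\ 1-\epsilon_i,\quad \epsilon_i\ \ge\ 0,

where gam in the code plays the role of the cost parameter \gamma; the separable case is the same problem with every \epsilon_i fixed at 0.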

Sunday, March 3, 2013

Hierarchical Clustering in R

rm(list=ls())
#install.packages("ElemStatLearn")
library(ElemStatLearn)
data(nci)
# DNA Expression Microarrays
# from the book "The Elements of Statistical Learning"
# Algorithm described in page 520 -- page 528
head(nci)

# complete linkage
com.hclust = hclust(dist(t(nci)),method="complete")
plot(com.hclust,cex=.7, xlab="")




# single linkage
sing.hclust = hclust(dist(t(nci)),method="single")
plot(sing.hclust,cex=.7, xlab="")



# average linkage
ave.hclust = hclust(dist(t(nci)),method="average")
plot(ave.hclust,cex=.7, xlab="")


# average linkage - maximum distance
ave.hclust = hclust(dist(t(nci),method="maximum"),method="average")
plot(ave.hclust,cex=.7, xlab="")
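To turn the dendrograms into actual cluster labels and compare two linkages, cutree can cut each tree at a fixed number of groups (a quick illustration I added):

table(complete = cutree(com.hclust, k = 4),
      average  = cutree(ave.hclust, k = 4))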

Saturday, March 2, 2013

K-Means in R

rm(list=ls())
# simulated data
n = 200
mu1 = c(3,8); mu2 = c(8,3); mu3 = c(7,7); mu4 = c(2,2);
var = 1.5;
x1 = t(matrix(mu1,2,n/4)) + matrix(rnorm(n)*2,n/4,2)
x2 = t(matrix(mu2,2,n/4)) + matrix(rnorm(n)*2,n/4,2)
x3 = t(matrix(mu3,2,n/4)) + matrix(rnorm(n)*.5,n/4,2)
x4 = t(matrix(mu4,2,n/4)) + matrix(rnorm(n)*5,n/4,2)
x = rbind(x1,x2,x3,x4)
Y = c(rep(1,n/4),rep(2,n/4),rep(3,n/4),rep(4,n/4))
y = factor(Y)
plot(x[,1],x[,2],col=as.numeric(y),pch=16)


# improved version of kmeans,
# showing improvement for each iteration step
mv.kmeans = function(x,k,seed=101,maxit=100,cens=NULL){
  set.seed(seed)
  n = nrow(x)
  if(is.null(cens)){
    cens = x[sample(1:n,k),]
  }
  plot(x[,1],x[,2],pch=16)
  points(cens[,1],cens[,2],col=1:k,pch=16,cex=2)
  thr = 1e-6; ind = 1; iter = 1;
  while( ind>thr & iter<maxit)
  {
    oldcen = cens
    km = kmeans(x,centers=cens,iter.max=1,nstart=1)
    plot(x[,1],x[,2],col=km$cluster,pch=16)
    cens = km$centers
    points(cens[,1],cens[,2],col=1:k,pch=16,cex=2)
    iter = iter + 1
    ind = sum(diag((oldcen-cens)%*%t(oldcen-cens)))
    # ind is the squared movement of the cluster centers; equivalently:
    # ind = sum((oldcen-cens)^2)
    print(ind)
  }
  return(list(centers=cens,cluster=km$cluster))
}

mv.kmeans(x,4,maxit=10)

Plots (not shown): the initial centers, then the cluster assignments after the 1st, 2nd, and 3rd iterations.
Improvement printed at each iteration:

[1] 68.20739
[1] 1.615522
[1] 0




#test 1
inds = which(x[,1]<1 & x[,2]<1)
startc = x[sample(inds,4),]
mv.kmeans(x,4,maxit=10,cens=startc)

[1] 191.4699
[1] 47.31325
[1] 0.1695448
[1] 0

#test 2
inds = which(x[,1]<1 & x[,2]<5)
startc = x[sample(inds,5),]
mv.kmeans(x,5,maxit=10,cens=startc)


#test 3
inds = which(x[,1]>5 & x[,2]<5)
startc = x[sample(inds,5),]
mv.kmeans(x,5,maxit=10,cens=startc)
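The three tests show how sensitive a single k-means run is to its starting centers; in practice the stock kmeans() mitigates this with multiple random restarts via nstart (a quick sketch I added):

km1  = kmeans(x, centers = 4, nstart = 1)
km25 = kmeans(x, centers = 4, nstart = 25)
c(single_start = km1$tot.withinss, best_of_25 = km25$tot.withinss)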