Monday, December 2, 2013

SAS Survival Analysis IV: Cox Model with Time-dependet Variable

libname asa_data 'C:\ASA_SAS';
* Time-dependent variable;
ods output parameterestimates = temp1;
proc phreg data = asa_data.uis;
model time*censor(0) = treat off_trt tot;
if time <= los then off_trt = 0;
else off_trt = 1;
tot = treat*off_trt;
run;
* Compared with the following code;
data temp2;
set asa_data.uis;
if time <= los then off_trt = 0;
else off_trt = 1;
tot = treat*off_trt;
run;
proc phreg data = temp2;
model time*censor(0) = treat off_trt tot;
run;

Wednesday, November 27, 2013

SAS Survival Analysis III: Cox model

libname asa_data 'C:\ASA_SAS';
ods trace on;
proc phreg data = asa_data.whas100;
model lenfol*fstat(0) = gender;
run;
ods trace off;
* Table 3.2;
ods output ParameterEstimates = temp1;
proc phreg data = asa_data.whas100;
model lenfol*fstat(0) = gender;
run;
libname asa_data 'C:\ASA_SAS';
ods trace on;
proc phreg data = asa_data.whas100;
model lenfol*fstat(0) = gender;
run;
ods trace off;
* Table 4.2;
ods output ParameterEstimates = temp1;
proc phreg data = asa_data.whas100;
model lenfol*fstat(0) = gender / ties = breslow;
run;
proc print data = temp1;run;
proc contents data = temp1;run;
data table4p2;
set temp1;
z = sqrt(ChiSq);
z2 = Estimate / StdErr;
lowerb = estimate - 1.96*stderr;
upperb = estimate + 1.96*stderr;
run;

proc print data = table4p2 label noobs;
var Parameter Estimate stderr z z2 Probchisq lowerb upperb;
format Estimate z z2 Probchisq lowerb upperb f8.3;
format stderr f8.4;
label Parameter = 'Variable'
   Estimate = 'Coeff.'
      stderr = 'Std.Err.'
   Probchisq = 'p>|z|'
   lowerb = 'Conf Lower'
   upperb = 'Conf Upper';
run;

* Table 4.4;
data temp2;
set asa_data.whas100;
age_2 = 0;
age_3 = 0;
age_4 = 0;
if 60 <= age <= 69 then age_2 = 1;
else if 70 <= age <= 79 then age_3 = 1;
else if 80 <= age then age_4 = 1;  
run;
* Table 4.5 & Table 4.7;
proc phreg data = temp2;
model lenfol*fstat(0) = age_2 age_3 age_4 / ties = breslow risklimits covb;
run;

data temp3;
set asa_data.whas100;
age_2 = 0;
age_3 = 0;
age_4 = 0;
if age < 60 then do;
age_2 = -1;
age_3 = -1;
age_4 = -1;
end;
if 60 <= age <= 69 then age_2 = 1;
if 60 <= age <= 69 then age_2 = 1;
else if 70 <= age <= 79 then age_3 = 1;
else if 80 <= age then age_4 = 1;  
run;
* Table 4.8;
proc phreg data = temp3;
model lenfol*fstat(0) = age_2 age_3 age_4 / ties = breslow risklimits covb;
run;

* Table 4.13;
* Crude Model;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = gender/ risklimits;
run;
* Adjusted Model;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = gender age/ risklimits;
run;
* Interaction Model;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = gender age gender*age/ risklimits; *no hazard ratio in this case;
run;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = gender age ga/ risklimits;
ga = gender*age;
output out = temp4 xbeta = xbeta;
run;
proc print data = temp4; run;
proc gplot data = temp4;
plot xbeta*age = gender;
run;
* Figure 4.5;
data temp1;
set asa_data.gbcs;
time = rectime/30;
run;
proc phreg data = temp1;
model time*censrec(0) = hormone;
output out = temp2 survival = survival;
run;
proc sort data = temp2;
by hormone time;
run;
proc gplot data = temp2;
plot survival*time = hormone;
run;
quit;

libname asa_data 'C:\ASA_SAS';
* Figure 5.2a;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = age hr diasbp gender chf;
id id;
output out = fig52a resmart = mart;
run;
proc sort data = asa_data.whas500;
by id;
run;
proc sort data = fig52a;
by id;
run;
data fig52a;
merge fig52a asa_data.whas500;
by id;
run;
proc loess data = fig52a;
model mart = bmi;
ods output outputstatistics = temp1;
run;
proc sort data = temp1;
by bmi;
run;
proc contents data = temp1; run;
goptions reset = all;
symbol1 v = dot i = none;
symbol2 v = none i = join;
proc gplot data = temp1;
plot DepVar*bmi=1 pred*bmi=2 / overlay;
*plot pred*bmi=2;
run;
quit;

* Stepwise ;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = age chf hr diasbp bmi gender mitype miord sysbp cvd afb
/ selection = forward slentry = 0.25 slstay = 0.8;
run;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = age chf hr diasbp bmi gender mitype miord sysbp cvd afb
/ selection = stepwise slentry = 0.25 slstay = 0.8 details;
run;
* Best Subset;
proc phreg data = asa_data.whas500;
model lenfol*fstat(0) = age chf hr diasbp bmi gender mitype miord sysbp cvd afb
/ selection = score best = 3 details;
run;

Monday, November 18, 2013

SAS Survival Analysis II: descriptive analysis

libname asa_data 'C:\ASA_SAS';
data toydata;
input subject time censor;
datalines;
1 6 1
2 44 1
3 21 0
4 14 1
5 62 1
;
run;
* Table 2.2, Figure 2.1;
proc lifetest data = toydata plots = s cs = none;
time time*censor(0);
run;
data whas100;
set asa_data.whas100;
year = lenfol/365.25;
run;
* Figure 2.2, Table 2.3;
proc lifetest data = whas100 plots = (s ls h) cs = none;
time year*fstat(0);
label year = 'Survival Time (Years)';
run;
ods trace on;
proc lifetest data = whas100 plots = (s ls h) cs = none;
time year*fstat(0);
label year = 'Survival Time (Years)';
run;
ods trace off;
ods select productlimitestimates;
proc lifetest data = whas100 plots = (s ls h) cs = none;
time year*fstat(0);
label year = 'Survival Time (Years)';
run;
* Table 2.4, Figure 2.4;
proc lifetest data = whas100 plots = s method = lt;
time year*fstat(0);
run;

proc lifetest data = whas100 plots = s conftype = loglog confband = hw
outs = conf_data;
time year*fstat(0);
run;
proc print;run;
* Figure 2.7;
goptions reset = all;
legend1 label=none value=(h=1.5 font=swiss 'Est. Surv. Pr.' 'Pointwise 95%' 'Hall & Wellner Band')
        position=(bottom center inside) mode=protect down=3 across=1;
axis1 label=(a=90 'Estimated Survival Probability') order=(0 to 1 by 0.1) minor=none;
axis2 label=('Survival Time (Years)') order=(0 to 8 by 1) minor=none;
symbol1 i=steplj color=black line=1;
symbol2 i=steplj color=black line=14;
symbol3 i=steplj color=black line=46;
symbol4 i=steplj color=black line=14;
symbol5 i=steplj color=black line=46;
proc gplot data = conf_data;
plot (survival sdf_ucl hw_ucl)*year / overlay vaxis=axis1 haxis=axis2 legend=legend1;
plot2 (sdf_lcl hw_lcl)*year /overlay vaxis=axis1 noaxis nolegend;
run;
quit;

* Figure 2.7;
proc sort data = whas100;
by year;
run;
proc lifetest data = whas100 outs = loglogdata conftype = loglog;
time year*fstat(0);
run;
proc contents; run;
proc print;run;
data loglogdata;
set loglogdata;
rename SDF_LCL = lcl_loglog SDF_UCL = ucl_loglog;
drop _CENSOR_;
run;
proc lifetest data = whas100 outs = logdata conftype = log;
time year*fstat(0);
run;
data logdata;
set logdata;
rename SDF_LCL = lcl_log SDF_UCL = ucl_log;
drop _CENSOR_;
run;
proc lifetest data = whas100 outs = logitdata conftype = logit;
time year*fstat(0);
run;
data logitdata;
set logitdata;
rename SDF_LCL = lcl_logit SDF_UCL = ucl_logit;
drop _CENSOR_;
run;
proc lifetest data = whas100 outs = asindata conftype = asinsqrt noprint;
time year*fstat(0);
run;
data asindata;
set asindata;
rename SDF_LCL = lcl_asin SDF_UCL = ucl_asin;
drop _CENSOR_;
run;
data all;
merge loglogdata logdata logitdata asindata;
by year;
run;
goptions reset = all;
legend1 label=none value=(h=1.5 font=swiss 'Est. Surv. Pr.' 'Log' 'Log-Log' 'Logit' 'Arcsine'
        'lower logit' 'upper logit' 'lower arcsine' 'upper arcsine')
        position=(bottom left inside) mode=protect across=2;
axis1 label=(a=90 'Estimated Survival Probability') order=(0 to 1 by .2) minor=none ;
axis2 label=('Survival Time (Years)') order=(0 to 8 by 1) minor=none;
symbol1 i=steplj line=1 color=black;
symbol2 i=steplj line=4 color=black;
symbol3 i=steplj line=14 color=black;
symbol4 i=steplj line=46 color=black;
symbol5 i=steplj line=41 color=black;
symbol6 i=steplj line=4 color=black;
symbol7 i=steplj line=14 color=black;
symbol8 i=steplj line=46 color=black;
symbol9 i=steplj line=41 color=black;
proc gplot data = all;
plot (survival lcl_log lcl_loglog lcl_logit lcl_asin)*year / overlay vaxis=axis1 haxis=axis2 legend=legend1;
plot2 (ucl_log ucl_loglog ucl_logit ucl_asin)*year /overlay vaxis=axis1 noaxis nolegend;
run;
quit;
* Table 2.5;
ods select quartiles;
proc lifetest data = whas100 conftype = log;
time year*fstat(0);
run;
* Table 2.7;
proc lifetest data = toydata;
time time*censor(0);
run;
data toy_temp;
set toydata;
if subject = 5 then censor = 0;
run;
proc lifetest data = toy_temp;
time time*censor(0);
run;
* Figure 2.9, Table 2.12;
proc lifetest data = whas100 plots = s cs = none;
time lenfol*fstat(0);
strata gender / test = (logrank wilcoxon peto tarone);
run;

SAS Survival Analysis I: data input

libname asa_data 'C:\ASA_SAS';
data asa_data.whas100;
 infile 'C:\Applied_Survival_Analysis_Data\whas100.dat';
 input id 1-5 +2 admitdate mmddyy10. +2 foldate mmddyy10. los lenfol fstat age gender bmi;
run;
/*
proc print data = asa_data.whas100;
 title 'WHAS100';
 format admitdate date9. foldate date11.;
run;
*/
data asa_data.actg320;
 infile 'C:\Applied_Survival_Analysis_Data\actg320.dat';
 input id time censor time_d tx txgrp strat2 sex raceth ivdrug hemophil
   karnof cd4 priorzdv age;
run;
data asa_data.actg320ncc;
 infile 'C:\Applied_Survival_Analysis_Data\actg320ncc.dat';
 input set case id time tx age cd4;
run;
data asa_data.bpd;
 infile 'C:\Applied_Survival_Analysis_Data\bpd.dat';
 input id surfact ondays censor;
run;
data asa_data.comprisk;
 infile 'C:\Applied_Survival_Analysis_Data\comprisk.dat';
 input id age gender bmi time ev_typ;
run;
data asa_data.cvdrisk;
 infile 'C:\Applied_Survival_Analysis_Data\cvdrisk.dat';
 input id age bmi gender time ev_typ;
run;
data asa_data.FRTCS;
 infile 'C:\Applied_Survival_Analysis_Data\FRTCS.dat';
 input id age sex date0 date9. sbp0 dbp0 antihyp0 date1 date9. sbp1
   dbp1 antihyp1 date2 date9. sbp2 dbp2 antihyp2 date_event date9. censor;
run;
/*
proc print data = asa_data.FRTCS;
 format date0 date9. date1 date9. date_event date9.;
run;
*/
data asa_data.gbcs;
 infile 'C:\Applied_Survival_Analysis_Data\gbcs.dat';
 input id diagdate date11. recdate date11. deathdate date11. age menopause
   hormone size grade nodes prog_recp estrg_recp rectime censrec
   survtime censdead;
run;
/*
proc print data = asa_data.gbcs (obs = 5);
 format diagdate date9. recdate date11. deathdate date11.;
run;
*/
data asa_data.GRACE1000;
 infile 'C:\Applied_Survival_Analysis_Data\GRACE1000.dat';
 input id days death revasc revascdays los age sysbp stchange;
run;
data asa_data.recur;
 infile 'C:\Applied_Survival_Analysis_Data\recur.dat';
 input id age treat time0 time1 censor event;
run;
data asa_data.uis;
 infile 'C:\Applied_Survival_Analysis_Data\uis.dat';
 input id age beck hercoc ivhx ndrugtx race treat site los time censor;
run;
data asa_data.whas500;
 infile 'C:\Applied_Survival_Analysis_Data\whas500.dat';
 input id age gender hr sysbp diasbp bmi cvd afb sho chf av3 miord mitype year
    admitdate mmddyy12. disdate mmddyy12. fdate mmddyy12. los dstat lenfol fstat;
run;
/*
proc print data = asa_data.whas500;
 format admitdate date9. disdate date9. fdate date9.;
run;
*/
data asa_data.whasncc;
 infile 'C:\Applied_Survival_Analysis_Data\whasncc.dat';
 input set case t lenfol fstat age sex bmi chf miord nr;
run;
libname asa_data 'C:\ASA_SAS';
proc format;
 value status 1 = 'Dead'
     0 = 'Alive';
 value gender 0 = 'Male'
     1 = 'Female';
run;
title '100 Subjects in the Worcester Heart Attack Study (WHAS100)';
proc print data = asa_data.whas100 label;
 id id;
 var admitdate foldate los lenfol fstat age gender bmi;
 format admitdate mmddyy8. foldate mmddyy8. fstat status. gender gender.;
 label id = 'ID'
    admitdate = 'Admission Date'
    foldate = 'Follow Up Date'
    los = 'Length of Stay'
    lenfol = 'Follow Up Time'
    fstat = 'Vital Status'
    age = 'Age at Admission'
    gender = 'Gender'
    bmi = 'BMI';
run;

Monday, September 30, 2013

Time Series II: AR, MA, ARMA, ARIMA

rm(list=ls())
# white noise
set.seed(1)
w <- rnorm(100)
plot(w, type = "l")
x <- seq(-3,3, length = 1000)
hist(rnorm(100), prob = T); points(x, dnorm(x), type = "l")
set.seed(2)
acf(rnorm(100))
# random walk
set.seed(2)
x <- w <- rnorm(1000)
for (t in 2:1000) x[t] <- x[t - 1] + w[t]
plot(x, type = "l")
acf(x)
acf(diff(x))
www <- "http://elena.aut.ac.nz/~pcowpert/ts/HP.txt"
HP.dat <- read.table(www, header = T) ; attach(HP.dat)
plot (as.ts(Price))
DP <- diff(Price) ; plot (as.ts(DP)) ; acf (DP)
mean(DP) + c(-2, 2) * sd(DP)/sqrt(length(DP))
# AR(1)
rho <- function(k, alpha) alpha^k
layout(1:2)
plot(0:10, rho(0:10, 0.7), type = "b")
plot(0:10, rho(0:10, -0.7), type = "b")
layout(1:1)
set.seed(1)
x <- w <- rnorm(100)
for (t in 2:100) x[t] <- 0.7 * x[t - 1] + w[t]
plot(x, type = "l")
acf(x)
pacf(x)
x.ar <- ar(x, method = "mle")
x.ar$order
x.ar$ar
x.ar$ar + c(-2, 2) * sqrt(x.ar$asy.var)
www = "http://elena.aut.ac.nz/~pcowpert/ts/pounds_nz.dat"
Z = read.table(www, header = T)
Z.ts = ts(Z, st = 1991, fr = 4)
Z.ar <- ar(Z.ts)
mean(Z.ts)
Z.ar$order
Z.ar$ar
Z.ar$ar + c(-2, 2) * sqrt(Z.ar$asy.var)
acf(Z.ar$res[-1])
www = "http://elena.aut.ac.nz/~pcowpert/ts/global.dat"
Global = scan(www)
Global.ts = ts(Global, st = c(1856, 1), end = c(2005, 12), fr = 12)
Global.ar <- ar(aggregate(Global.ts, FUN = mean), method = "mle")
mean(aggregate(Global.ts, FUN = mean))
Global.ar$order
Global.ar$ar
acf(Global.ar$res[-(1:Global.ar$order)], lag = 50)
# MA: correlogram and simulation
# correlogram
rho = function(k, beta){
  q = length(beta) - 1
  if (k > q) ACF = 0
  else{
    s1 = 0
    s2 = 0
    for (i in 1:(q - k + 1)){
      s1 = s1 + beta[i] * beta[i + k]
    }
    for (i in (1:(q + 1))){
      s2 = s2 + beta[i]^2
    }
    ACF = s1/s2
  }
  ACF 
}
 
beta <- c(1, 0.7, 0.5, 0.2) 
rho.k <- rep(1, 10)
for (k in 1:10) rho.k[k] <- rho(k, beta)
plot(0:10, c(1, rho.k), pch = 4, ylab = expression(rho[k])) 
abline(0, 0) 
 
# simulation of MA(3) 
set.seed(1) 
b = c(0.8, 0.6, 0.4) 
x = w = rnorm(1000)
for (t in c(4:1000)){
  for (j in c(1:3)){
    x[t] = x[t] + b[j] * w[t - j]
  }
}
plot(x, type = "l")
acf(x)
x.ma <- arima(x, order = c(0, 0, 3))
x.ma
x.ma <- arima(x, order = c(0, 0, 3), include.mean=FALSE)
x.ma
# fit MA model
www = "http://elena.aut.ac.nz/~pcowpert/ts/pounds_nz.dat"
x <- read.table(www, header = T)
x.ts <- ts(x, st = 1991, fr = 4)
x.ma <- arima(x.ts, order = c(0, 0, 1))
x.ma
# ARMA simulation
set.seed(1)
x = arima.sim(n = 10000, list(ar = -0.6, ma = 0.5))
coef(arima(x, order = c(1, 0, 1)))
AIC(arima(x, order = c(1, 0, 1)))
acf(resid(arima(x, order = c(1, 0, 1))))
www = "http://elena.aut.ac.nz/~pcowpert/ts/cbe.dat"
CBE = read.table(www, header = T)
Elec.ts = ts(CBE[,3], start = 1958, freq = 12)
Time = 1:length(Elec.ts)
Imth = cycle(Elec.ts)
Elec.lm = lm(log(Elec.ts) ~ Time + I(Time^2) + factor(Imth))
summary(Elec.lm)
acf(resid(Elec.lm))
# train ARMA model
best.order = c(0, 0, 0)
best.aic = Inf
for (i in c(0:2)){
  for (j in c(0:2)){
    fit.aic = AIC(arima(resid(Elec.lm), order = c(i, 0, j)))
    if (fit.aic < best.aic){
      best.order = c(i, 0, j)
      best.arma = arima(resid(Elec.lm), order = best.order)
      best.aic = fit.aic           
    }
  }
}
best.order
acf(resid(best.arma))
# prediction
new.time <- seq(length(Elec.ts), length = 36)
new.data <- data.frame(Time = new.time, Imth = rep(1:12, 3))
predict.lm <- predict(Elec.lm, new.data)
predict.arma <- predict(best.arma, n.ahead = 36)
elec.pred <- ts(exp(predict.lm + predict.arma$pred), start = 1991, freq = 12)
ts.plot(cbind(Elec.ts, elec.pred), lty = 1:2)
www = "http://elena.aut.ac.nz/~pcowpert/ts/wave.dat"
wave.dat <- read.table(www, header = T)
attach(wave.dat)
layout(1:3)
plot(as.ts(waveht), ylab = 'Wave height')
acf(waveht)
pacf (waveht)
wave.arma <- arima(waveht, order = c(4,0,4))
acf(wave.arma$res[-(1:4)])
pacf(wave.arma$res[-(1:4)])
hist(wave.arma$res[-(1:4)], xlab='height / mm', main='')
# train Non-seasonal ARIMA model
www = "http://elena.aut.ac.nz/~pcowpert/ts/cbe.dat"
CBE = read.table(www, header = T)
Elec.ts = ts(CBE[, 3], start = 1958, freq = 12)
layout(c(1, 1, 2, 3))
plot(Elec.ts)
plot(diff(Elec.ts))
plot(diff(log(Elec.ts)))
layout(c(1, 1))
# ARIMA(1, 1, 1) simulation and inference
set.seed(1)
x = w = rnorm(1000)
for (i in 3:1000){
  x[i] = 0.5 * x[i - 1] + x[i - 1] - 0.5 * x[i - 2] + w[i] + 0.3 * w[i - 1]
}
plot(x, type = "l")
arima(x = x, order  = c(1, 1, 1))
x = arima.sim(model = list(order = c(1, 1, 1), ar = 0.5, ma = 0.3), n = 1000)
arima(x = x, order  = c(1, 1, 1))
# IMA(1, 1)
Beer.ts = ts(CBE[, 2], start = 1958, freq = 12)
Beer.ima = arima(Beer.ts, order = c(0, 1, 1))
Beer.ima
acf(resid(Beer.ima))
# prediction
Beer.1991 = predict(Beer.ima, n.ahead = 12)
plot(Beer.1991$pred)
plot(Beer.1991$se)

Time Series I: Basics

rm(list=ls())
Fact = function(n) if (n == 1) 1 else n * Fact(n - 1)
Fact(5)
data(AirPassengers)
AP = AirPassengers
AP
class(AP)
start(AP)
end(AP)
frequency(AP)
summary(AP)
plot(AP, ylab = "Passengers (1000's)")
layout(1:2)
plot(aggregate(AP))
boxplot(AP ~ cycle(AP))
www = "http://elena.aut.ac.nz/~pcowpert/ts/Maine.dat"
Maine.month = read.table(www, header = TRUE)
attach(Maine.month)
class(Maine.month)
Maine.month.ts = ts(unemploy, start = c(1996, 1), freq = 12)
Maine.annual.ts = aggregate(Maine.month.ts)/12
layout(1:2)
plot(Maine.month.ts, ylab = "unemployed (%)")
plot(Maine.annual.ts, ylab = "unemployed (%)")
Maine.Feb = window(Maine.month.ts, start = c(1996,2), freq = TRUE)
plot(Maine.Feb)
Maine.Aug = window(Maine.month.ts, start = c(1996,8), freq = TRUE)
plot(Maine.Aug)
Feb.ratio = mean(Maine.Feb) / mean(Maine.month.ts)
Aug.ratio = mean(Maine.Aug) / mean(Maine.month.ts)
Feb.ratio
Aug.ratio
layout(1:1)
www = "http://elena.aut.ac.nz/~pcowpert/ts/USunemp.dat"
US.month = read.table(www, header = T)
attach(US.month)
US.month.ts = ts(USun, start=c(1996,1), end=c(2006,10), freq = 12)
plot(US.month.ts, ylab = "unemployed (%)")
# Multiple time series
www = "http://elena.aut.ac.nz/~pcowpert/ts/cbe.dat"
CBE = read.table(www, header = T)
CBE[1:4, ]
class(CBE)
Elec.ts = ts(CBE[, 3], start = 1958, freq = 12)
Beer.ts = ts(CBE[, 2], start = 1958, freq = 12)
Choc.ts = ts(CBE[, 1], start = 1958, freq = 12)
plot(cbind(Elec.ts, Beer.ts, Choc.ts))
length(AP)
length(Elec.ts)
AP.elec = ts.intersect(AP, Elec.ts)
class(AP.elec)
AP.elec
AP
Elec.ts
AP = AP.elec[,1]
Elec = AP.elec[,2]
layout(1:2)
plot(AP, main = "", ylab = "Air passengers / 1000's")
plot(Elec, main = "", ylab = "Electricity production / MkWh")
layout(1:1)
plot(as.vector(AP), as.vector(Elec),
     xlab = "Air passengers / 1000's",
     ylab = "Electricity production / MWh")
abline(reg = lm(Elec ~ AP))
cor(AP, Elec)
www = "http://elena.aut.ac.nz/~pcowpert/ts/pounds_nz.dat"
Z = read.table(www, header = T)
Z.ts = ts(Z, st = 1991, fr = 4)
plot(Z.ts, xlab = "time / years",
     ylab = "Quarterly exchange rate in $NZ / pound")
Z.92.96 = window(Z.ts, start = c(1992, 1), end = c(1996, 1))
Z.96.98 = window(Z.ts, start = c(1996, 1), end = c(1998, 1))
layout (1:2)
plot(Z.92.96, ylab = "Exchange rate in $NZ/pound",
     xlab = "Time (years)" )
plot(Z.96.98, ylab = "Exchange rate in $NZ/pound",
     xlab = "Time (years)" )

www = "http://elena.aut.ac.nz/~pcowpert/ts/global.dat"
Global = scan(www)
Global.ts = ts(Global, st = c(1856, 1), end = c(2005, 12), fr = 12)
Global.annual = aggregate(Global.ts, FUN = mean)
plot(Global.ts)
plot(Global.annual)
layout(1:1)
New.series = window(Global.ts, start=c(1970, 1), end=c(2005, 12))
New.time = time(New.series)
plot(New.series)
abline(reg=lm(New.series ~ New.time))

# decomposition in R
plot(decompose(Elec.ts))
Elec.decom = decompose(Elec.ts, type = "mult")
plot(Elec.decom)
Trend = Elec.decom$trend
Seasonal = Elec.decom$seasonal
ts.plot(cbind(Trend, Trend * Seasonal), lty = 1:2)

www = "http://elena.aut.ac.nz/~pcowpert/ts/Herald.dat"
Herald.dat = read.table(www, header = T)
attach (Herald.dat)
x = CO; y = Benzoa; n = length(x)
sum((x - mean(x))*(y - mean(y))) / (n - 1)
mean((x - mean(x)) * (y - mean(y)))
cov(x, y)
cov(x,y) / (sd(x)*sd(y))
cor(x,y)
www = "http://elena.aut.ac.nz/~pcowpert/ts/wave.dat"
wave.dat = read.table (www, header=T)
attach(wave.dat)
plot(ts(waveht))
plot(ts(waveht[1:60]))
# acf
acf(waveht)$acf[2]
acf(waveht)$acf[1]
# autocovariance
acf(waveht, type = c("covariance"))$acf[2]
# correlogram
acf(waveht)
acf(AirPassengers)
data(AirPassengers)
AP = AirPassengers
AP.decom = decompose(AP, "multiplicative")
plot(ts(AP.decom$random[7:138]))
acf(AP.decom$random[7:138])
sd(AP[7:138])
sd(AP[7:138] - AP.decom$trend[7:138])
sd(AP.decom$random[7:138])
www = "http://elena.aut.ac.nz/~pcowpert/ts/Fontdsdt.dat"
Fontdsdt.dat = read.table(www, header=T)
attach(Fontdsdt.dat)
plot(ts(adflow), ylab = 'adflow')
acf(adflow, xlab = 'lag (months)', main="")
Build.dat = read.table("http://elena.aut.ac.nz/~pcowpert/ts/ApprovActiv.dat", header=T)
attach(Build.dat)
App.ts = ts(Approvals, start = c(1996,1), freq=4)
Act.ts = ts(Activity, start = c(1996,1), freq=4)
ts.plot(App.ts, Act.ts, lty = c(1,3))

# correlogram and cross-correlogram
acf(ts.union(App.ts, Act.ts))
print(ts.union(App.ts, Act.ts))
print(acf(ts.union(App.ts, Act.ts)))

# Bass model
T79 = 1:10
Tdelt = (1:100) / 10
Sales = c(840,1470,2110,4000, 7590, 10950, 10530, 9470, 7790, 5890)
Cusales = cumsum(Sales)
Bass.nls = nls(Sales ~ M * ( ((P+Q)^2 / P) * exp(-(P+Q) * T79) ) /
                  (1+(Q/P)*exp(-(P+Q)*T79))^2, start = list(M=60630, P=0.03, Q=0.38))
summary(Bass.nls)
Bcoef = coef(Bass.nls)
m = Bcoef[1]
p = Bcoef[2]
q = Bcoef[3]
ngete = exp(-(p+q) * Tdelt)
Bpdf = m * ( (p+q)^2 / p ) * ngete / (1 + (q/p) * ngete)^2
plot(Tdelt, Bpdf, xlab = "Year from 1979",
     ylab = "Sales per year", type='l')
points(T79, Sales)
Bcdf = m * (1 - ngete)/(1 + (q/p)*ngete)
plot(Tdelt, Bcdf, xlab = "Year from 1979",
     ylab = "Cumulative sales", type='l')
points(T79, Cusales)

?HoltWinters
require(graphics)
## Seasonal Holt-Winters
(m <- HoltWinters(co2))
plot(m)
plot(fitted(m))
(m <- HoltWinters(AirPassengers, seasonal = "mult"))
plot(m)
## Non-Seasonal Holt-Winters
x <- uspop + rnorm(uspop, sd = 5)
m <- HoltWinters(x, gamma = FALSE)
plot(m)
## Exponential Smoothing
m2 <- HoltWinters(x, gamma = FALSE, beta = FALSE)
lines(fitted(m2)[,1], col = 3)

Sunday, September 29, 2013

Some thoughts of subquery efficiency

from http://blog.csdn.net/haiwer/article/details/2826881

1. Use LEFT JOIN instead of NOT IN, NOT EXISTS

Change
SELECT PUB_NAME
FROM PUBLISHERS
WHERE PUB_ID NOT IN
   (SELECT PUB_ID
   FROM TITLES
   WHERE TYPE = 'BUSINESS')

TO
SELECT A.PUB_NAME
FROM PUBLISHERS A LEFT JOIN TITLES B
ON        B.TYPE = 'BUSINESS' AND
          A.PUB_ID=B.PUB_ID
WHERE B.PUB_ID IS NULL

Change
SELECT TITLE
FROM TITLES
WHERE NOT EXISTS
   (SELECT TITLE_ID
   FROM SALES
   WHERE TITLE_ID = TITLES.TITLE_ID)

TO
SELECT TITLE
FROM TITLES LEFT JOIN SALES
ON SALES.TITLE_ID = TITLES.TITLE_ID
WHERE SALES.TITLE_ID IS NULL


2. Use INNER JOIN instead of IN, EXISTS if there are no duplicates in subquery

Change
SELECT PUB_NAME
FROM PUBLISHERS
WHERE PUB_ID IN
   (SELECT PUB_ID
   FROM TITLES
   WHERE TYPE = 'BUSINESS')

To
SELECT DISTINCT A.PUB_NAME
FROM PUBLISHERS A INNER JOIN TITLES B
ON        B.TYPE = 'BUSINESS' AND
          A.PUB_ID=B. PUB_ID


3. Use EXISTS instead of IN in subquery

Change
SELECT PUB_NAME
FROM PUBLISHERS
WHERE PUB_ID IN
   (SELECT PUB_ID
   FROM TITLES
   WHERE TYPE = 'BUSINESS')

To
SELECT PUB_NAME
FROM PUBLISHERS
WHERE EXISTS
   (SELECT 1
   FROM TITLES
   WHERE TYPE = 'BUSINESS' AND
   PUB_ID= PUBLISHERS.PUB_ID)

Sunday, September 8, 2013

Survival Analysis Part 4: The Proportional Hazards Assumption and Leverage (Based on the book by Hosmer, Lemeshow, May)

rm(list = ls())
setwd("C:\\ASA")
# import data
whas500 = read.table("C:\\ASA\\whas500.DAT", sep = "", header = F)
names(whas500) = c("id",
                   "age",
                   "gender",
                   "hr",
                   "sysbp",
                   "diasbp",
                   "bmi",
                   "cvd",
                   "afb",
                   "sho",
                   "chf",
                   "av3",
                   "miord",
                   "mitype",
                   "year",
                   "admitdate",
                   "disdate",
                   "fdate",
                   "los",
                   "dstat",
                   "lenfol",
                   "fstat")
whas500$bmifp1 = (whas500$bmi/10)^2
whas500$bmifp2 = (whas500$bmi/10)^3
library(survival)
model.coxph = coxph( Surv(lenfol, fstat)~ bmifp1
                      + bmifp2
                      + age
                      + hr
                      + diasbp
                      + gender
                      + chf
                      + age*gender,
                      data = whas500,
                      method = "breslow")
summary(model.coxph)
# Table 6.1
zph.1 = cox.zph(model.coxph, transform = "identity")
zph.1
zph.log = cox.zph(model.coxph, transform = "log")
zph.log
zph.km = cox.zph(model.coxph, transform = "km")
zph.km
zph.rank = cox.zph(model.coxph, transform = "rank")
zph.rank
# Figure 6.1
plot(zph.log[4])
abline(h=0, lty=3)
plot(zph.1[4])
abline(h=0, lty=3)
plot(zph.km[4])
abline(h=0, lty=3)
plot(zph.rank[4])
abline(h=0, lty=3)

# Leverage: Figure 6.4 and Figure 6.5
score = residuals(model.coxph, type="score")
plot(whas500$bmi, score[,1], ylab="Score Residuals", xlab="bmifp1")
plot(whas500$hr, score[,4], ylab="Score Residuals", xlab="heart rate")
plot(whas500$age, score[,3], ylab="Score Residuals", xlab="age")

Saturday, September 7, 2013

Survival Analysis Part 3: Quartile Design Variable and Fractional Polynomials (Based on the book by Hosmer, Lemeshow, May)


rm(list = ls())
setwd("C:\\ASA")
# import data
whas500 = read.table("C:\\ASA\\whas500.DAT", sep = "", header = F)
names(whas500) = c("id",
                   "age",
                   "gender",
                   "hr",
                   "sysbp",
                   "diasbp",
                   "bmi",
                   "cvd",
                   "afb",
                   "sho",
                   "chf",
                   "av3",
                   "miord",
                   "mitype",
                   "year",
                   "admitdate",
                   "disdate",
                   "fdate",
                   "los",
                   "dstat",
                   "lenfol",
                   "fstat")
library(survival)
model1.coxph = coxph( Surv(lenfol, fstat)~ age + hr + diasbp + bmi + gender + chf,
                      data = whas500, method = "breslow")
# Table 5.6
summary(model1.coxph)
summary(whas500$bmi)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 13.05   23.22   25.95   26.61   29.39   44.84
whas500$bmicat = cut(whas500$bmi, c(13, 23.22, 25.95, 29.39, 44.84))
summary(whas500$bmicat)
# (13,23.2] (23.2,25.9] (25.9,29.4] (29.4,44.8]
# 125         126         124         125
model2.coxph = coxph( Surv(lenfol, fstat)~ age + hr + diasbp + bmicat + gender + chf,
                      data = whas500, method = "breslow")
summary(model2.coxph)
bmi.coeff = data.frame(model2.coxph$coefficients)[4:6,]
(summary(whas500$bmi)[[1]] + summary(whas500$bmi)[[2]])/2
midpt = c((summary(whas500$bmi)[[1]] + summary(whas500$bmi)[[2]])/2,
          (summary(whas500$bmi)[[2]] + summary(whas500$bmi)[[3]])/2,
          (summary(whas500$bmi)[[3]] + summary(whas500$bmi)[[5]])/2,
          (summary(whas500$bmi)[[5]] + summary(whas500$bmi)[[6]])/2
          )
bmi.plot = cbind(midpt, rbind(0,data.frame(bmi.coeff)))
bmi.plot
# Figure 5.1
plot(bmi.plot[,1], bmi.plot[,2], ylab = "Log Hazard", xlab = "BMI", type = "b")
# Table 5.8
library(mfp)
model.null = coxph( Surv(lenfol, fstat)~ age + hr + diasbp + gender + chf,
                    data = whas500, method = "breslow")
-2*model.null$loglik
model.lin = coxph( Surv(lenfol, fstat)~ age + hr + diasbp + gender + chf + bmi,
                   data = whas500, method = "breslow")
-2*model.lin$loglik
# J = 1, df = 2
model.J1 = mfp( Surv(lenfol, fstat)~ age + hr + diasbp + gender + chf + fp(bmi, df = 2),
                  data = whas500, method = "breslow", family = cox)
-2*model.J1$loglik
model.J1$powers
# J = 2, df = 4
model.J2 = mfp( Surv(lenfol, fstat)~ age + hr + diasbp + gender + chf + fp(bmi, df = 4),
                data = whas500, method = "breslow", family = cox)
-2*model.J2$loglik
model.J2$powers
# Figure 5.2
whas500$resid = residuals(model.null, type="martingale", data=whas500)
plot(whas500$bmi, whas500$resid, xlab="BMI", ylab="Martingale Residuals")
lines(lowess(whas500$bmi, whas500$resid))

Thursday, September 5, 2013

Survival Analysis Part 2: Cox Model and Applications (Based on the book by Hosmer, Lemeshow, May)

rm(list = ls())
setwd("C:\\ASA")
# import data
actg320 = read.table("C:\\ASA\\actg320.DAT", sep = "", header = F)
names(actg320) = c("id",
                   "time",
                   "censor",
                   "time_d",
                   "censor_d",
                   "tx",
                   "txgrp",
                   "strat2",
                   "sex",
                   "raceth",
                   "ivdrug",
                   "hemophil",
                   "karnof",
                   "cd4",
                   "priorzdv",
                   "age")
library(survival)
tx.coxph = coxph( Surv(time, censor)~tx, data = actg320, method = "breslow")
# Table 3.1
summary(tx.coxph)
v2.coxph = coxph( Surv(time, censor)~tx + age + sex + cd4 + priorzdv,
                  data = actg320, method = "breslow")
# Table 3.2
summary(v2.coxph)
v3.coxph = coxph( Surv(time, censor)~tx + age + cd4,
                  data = actg320, method = "breslow")
# Table 3.3
summary(v3.coxph)
v4.coxph = coxph( Surv(time, censor)~tx + age + cd4,
                  data = actg320, method = "efron")
summary(v4.coxph)
# baseline cumulative hazard function
H = basehaz(v4.coxph, centered = TRUE)
plot(H[,2], H[,1], xlab = "Time", ylab = "cumulative hazard function", type = "l")
###################################################################
###################################################################
gbcs = read.table("C:\\ASA\\gbcs.DAT", sep = "", header = F)
names(gbcs) = c("id",
                "diagdate",
                "recdate",
                "deathdate",
                "age",
                "menopause",
                "hormone",
                "size",
                "grade",
                "nodes",
                "prog_recp",
                "estrg_recp",
                "rectime",
                "censrec",
                "survtime",
                "censdead")
crude = coxph( Surv(rectime, censrec)~hormone, data = gbcs, method = "breslow")
summary(crude)
adjusted = coxph( Surv(rectime, censrec)~hormone + size, data = gbcs, method = "breslow")
summary(adjusted)
interaction = coxph( Surv(rectime, censrec)~hormone + size + hormone*size,
                     data = gbcs, method = "breslow")
summary(interaction)
hormone.new = data.frame(hormone=c(0,1))
model = coxph( Surv(rectime, censrec)~hormone, data = gbcs, method = "breslow")
plot(survfit(model, newdata=hormone.new), xlab="Time", ylab="Survival Probability")
points(survfit(model, newdata=hormone.new), pch=c(1,2))
legend(200, 0.5, c("Hormone Absent", "Hormone Present"), pch=c(1,2), bg="transparent")

Wednesday, September 4, 2013

Survival Analysis Part 1: Descriptive Methods for Survival Data (Based on the book by Hosmer, Lemeshow, May)

rm(list = ls())
setwd("C:\\ASA")
# import data
WHAS100 = read.table("C:\\ASA\\whas100.DAT", sep = "", header = F)
names(WHAS100) = c("id",
                   "admitdate",
                   "foldate",
                   "los",
                   "lenfol",
                   "fstat",
                   "age",
                   "gender",
                   "bmi")
head(WHAS100)
# Kaplan-Meier estimate
library(survival)
whas100.surv = survfit(Surv(lenfol, fstat) ~ 1, conf.type = "none", data = WHAS100)
# Table 2.3
summary(whas100.surv)
# Figure 2.2
plot(whas100.surv, xlab="Time", ylab="survival probability")

# Life-Table estimator
t1year = floor(WHAS100$lenfol/365)
fstat = WHAS100$fstat
t1 = data.frame(t1year, fstat)
t1
library(nlme)
die = gsummary(t1, sum, groups = t1year)
total = gsummary(t1, length, groups = t1year)
rm(t1year)
ltab.data = cbind(die[,1:2], total[,2])
names(ltab.data) = c("t1year", "censor", "total")
ltab.data
attach(ltab.data)
lt = length(t1year)
t1year[lt + 1] = NA
die = censor
censored = total[,2] - censor
library(KMsurv)
lifetable = lifetab(t1year, 100, censored, die)
# Table 2.4
lifetable[,1:5]
# nlost = censored
# nevent = die
# Figure 2.3
plot(t1year[1:8], lifetable[,5], type="s",
     xlab="Survival Time (Years)", ylab="Estimated Survival Probability")
# Figure 2.4
plot(t1year[1:8], lifetable[,5], type="b",
     xlab="Survival Time (Years)", ylab="Estimated Survival Probability")
################################
## confidence interval  ########
################################

whas100.surv = survfit(Surv(lenfol, fstat) ~ 1, conf.type = "none", data = WHAS100)
library(km.ci)
ci.log = km.ci(whas100.surv, conf.level = 0.95, tl = NA, tu = NA, method = "log")
ci.loglog = km.ci(whas100.surv, conf.level = 0.95, tl = NA, tu = NA, method = "loglog")
# ci.hallwellner = km.ci(whas100.surv, conf.level = 0.95, tl = NA, tu = NA, method = "hall-wellner")
ci.hallwellner = km.ci(whas100.surv, conf.level = 0.95, tl = NA, tu = NA, method = "loghall")
# look like loghall can recreate the Figure 2.7 instead of hall-wellner
plot(whas100.surv)
plot(ci.log)
plot(ci.loglog)
plot(ci.hallwellner)
log.ci = survfit(Surv(lenfol, fstat) ~ 1, conf.type = "log", data = WHAS100)
loglog.ci = survfit(Surv(lenfol, fstat) ~ 1, conf.type = "log-log", data = WHAS100)
# Figure 2.7
par(cex = 0.8)
plot(ci.hallwellner, lty = 3, lwd = 2)
lines(log.ci, lwd = 2, lty = 1)
lines(log.ci, lwd = 2, lty = 6, conf.int = T)
legend(150, 0.4, c("Kaplan-Meier", "Hall-Wellner", "Pointwise"), lty=c(1, 4, 6))
#######################################
########  Comparison  #################
#######################################
# Table 2.12
diff.logrank = survdiff(Surv(lenfol, fstat) ~ gender, data = WHAS100, rho = 0)
print(diff.logrank)
diff.petopeto = survdiff(Surv(lenfol, fstat) ~ gender, data = WHAS100, rho = 1)
print(diff.petopeto)
# Figure 2.9
diff.drug = survfit(Surv(lenfol, fstat) ~ strata(gender), data = WHAS100, conf.type="log-log")
plot(diff.drug, lty = c(1, 2), xlab = "Time", ylab = "Survival Probability")
diff.drug
legend(200,0.4, c("Male", "Female"), lty = c(1,2))
agecat = cut(WHAS100$age, c(0, 59, 69, 79, 100))
agecat
diff.age = survfit(Surv(lenfol, fstat) ~ strata(agecat), data = WHAS100, conf.type="log-log")
# Table 2.13
print(diff.age)
# Table 2.15
survdiff(Surv(lenfol, fstat) ~ agecat, data = WHAS100, rho = 0)
plot(diff.age, lty=c(1, 2, 3, 4), xlab="Time", ylab="Survival Probability")
legend(200,0.6, lty=c(1, 2, 3, 4),
       c("Age<60", "60<=Age<69", "70<=Age<79", "Age>=80"), bg="transparent")

############################################################
#####  aalen  ################################################
############################################################
aalen = survfit(coxph(Surv(lenfol, fstat) ~ 1, data = WHAS100), type = "aalen")
summary(aalen)
H.aalen = -log(aalen$surv)
aalen.est = cbind(time=aalen$time, d=aalen$n.event, n=aalen$n.risk, H.aalen, s1 = aalen$surv)
KM = survfit(Surv(lenfol, fstat) ~ 1, data = WHAS100)
KM.est = cbind(time=KM$time, s2 = KM$surv)
all = merge(data.frame(aalen.est), data.frame(KM.est), by="time")
all
# Figure 2.12
plot(all$time, all$s1, type="s", xlab="Time", ylab="Survival Probability")
points(all$time, all$s1, pch=1)
lines(all$time, all$s2, type="s")
points(all$time, all$s2, pch=3)
legend(200,0.6, c("Nelson-Aalen", "Kaplan-Meier"), pch=c(1,3))
hazard.rate = all$d/all$n
plot(all$time, hazard.rate, type = "p", pch=20, xlab="Time", ylab="Hazard Ratio")
lines(lowess(all$time, hazard.rate, f=0.75, iter=5))

Monday, April 29, 2013

CART: predict email spam

rm(list=ls())
library(ElemStatLearn)
data(spam)
# names(spam) <- c("make", "address", "all", "3d", "our",
#                  "over", "remove", "internet", "order", "mail",
#                  "receive", "will", "people", "report", "addresses",
#                  "free", "business", "email", "you", "credit",
#                  "your", "font", "000", "money", "hp",
#                  "hpl", "george", "650", "lab", "labs",
#                  "telnet", "857", "data", "415", "85",
#                  "technology", "1999", "parts", "pm",
#                  "direct", "cs", "meeting", "original", "project",
#                  "re", "edu", "table", "conference", ";:",
#                  "(:", "[:", "!:", "$:", "#:",
#                  "CRave", "CRlong", "CRtotal", "spam")
names(spam) <- c("make", "address", "all", "x3d", "our",
                 "over", "remove", "internet", "order", "mail",
                 "receive", "will", "people", "report", "addresses",
                 "free", "business", "email", "you", "credit",
                 "your", "font", "x000", "money", "hp",
                 "hpl", "george", "x650", "lab", "labs",
                 "telnet", "x857", "data", "x415", "x85",
                 "technology", "x1999", "parts", "pm",
                 "direct", "cs", "meeting", "original", "project",
                 "re", "edu", "table", "conference", "p1",
                 "p2", "p3", "p4", "p5", "p6",
                 "CRave", "CRlong", "CRtotal", "spam")

summary(spam)
spam.test_indx = read.delim("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/spam.traintest",
                            sep="\n", header=FALSE)
#########################################################################
#########################################################################
Y = as.data.frame(matrix(rep(0,4601),nrow=4601,ncol=1))
Y[spam$spam == "spam", ] = 1
names(Y) = "spam"
Y[,1]=factor(Y[,1])
data = cbind(spam[,-58],Y)
data.train = data[spam.test_indx == 0,]
data.test = data[spam.test_indx == 1,]
summary(data.train)
summary(data.test)
rm(Y, data, spam, spam.test_indx)
###############################################################################
###############################################################################
library(rpart)
library(pROC)
names(data.train)
spam.tree = rpart(spam~.
                  , data = data.train
                  , method = "class"
                  , xval = 5
                  , cp = 0.00001
                  , minsplit = 1
                  , parms=list(split='information')
                  , na.action = na.exclude
)
printcp(spam.tree)
plotcp(spam.tree, upper = "size")
spam.prune = prune.rpart(spam.tree, 0.0025)
print(spam.prune)
plot(spam.prune)
plot(spam.prune, compress=T, uniform=T, branch=0.4, margin=0.01)
text(spam.prune)
spam.prune = prune.rpart(spam.tree, 0.003)
plot(spam.prune, compress=T, uniform=T, branch=0.4, margin=0.01)
text(spam.prune)
summary(spam.prune)
plotcp(spam.prune)
y.hat = predict(spam.prune, data.test, type="prob")[,2]
head(y.hat)
roc(data.test$spam,y.hat,plot=T)
############################################################################
spam.tree = rpart(spam~.
                  , data = data.train
                  , method = "class"
                  , xval = 5
                  , cp = 0.00001
                  , minsplit = 1
                  , parms=list(split='gini')
                  , na.action = na.exclude
)
plotcp(spam.tree)
spam.prune = prune.rpart(spam.tree, 0.002)
plotcp(spam.prune)
plot(spam.prune, compress=T, uniform=T, branch=0.4, margin=0.01)
text(spam.prune)
summary(spam.prune)
print(spam.prune)
y.hat = predict(spam.prune, data.test, type="prob")[,2]
roc(data.test$spam,y.hat,plot=T)
#Remark: for cross-entropy, smaller trees get better ROC than Gini

GAM: predict email spam

rm(list=ls())
library(ElemStatLearn)
data(spam)
# names(spam) <- c("make", "address", "all", "3d", "our",
#                  "over", "remove", "internet", "order", "mail",
#                  "receive", "will", "people", "report", "addresses",
#                  "free", "business", "email", "you", "credit",
#                  "your", "font", "000", "money", "hp",
#                  "hpl", "george", "650", "lab", "labs",
#                  "telnet", "857", "data", "415", "85",
#                  "technology", "1999", "parts", "pm",
#                  "direct", "cs", "meeting", "original", "project",
#                  "re", "edu", "table", "conference", ";:",
#                  "(:", "[:", "!:", "$:", "#:",
#                  "CRave", "CRlong", "CRtotal", "spam")
names(spam) <- c("make", "address", "all", "x3d", "our",
                 "over", "remove", "internet", "order", "mail",
                 "receive", "will", "people", "report", "addresses",
                 "free", "business", "email", "you", "credit",
                 "your", "font", "x000", "money", "hp",
                 "hpl", "george", "x650", "lab", "labs",
                 "telnet", "x857", "data", "x415", "x85",
                 "technology", "x1999", "parts", "pm",
                 "direct", "cs", "meeting", "original", "project",
                 "re", "edu", "table", "conference", "p1",
                 "p2", "p3", "p4", "p5", "p6",
                 "CRave", "CRlong", "CRtotal", "spam")

summary(spam)
spam.test_indx = read.delim("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/spam.traintest",
                            sep="\n", header=FALSE)
#########################################################################
#########################################################################
Y = as.data.frame(matrix(rep(0,4601),nrow=4601,ncol=1))
Y[spam$spam == "spam", ] = 1
names(Y) = "spam"
Y[,1]=factor(Y[,1])
data = cbind(spam[,-58],Y)
data.train = data[spam.test_indx == 0,]
data.test = data[spam.test_indx == 1,]
summary(data.train)
summary(data.test)

###########################################################################
library(gam)
# var.name = names(data)
# for (i in c(1:(length(var.name)-1))){
#   cat("+s(", var.name[i], ", 4)\n", append=TRUE, sep = "", collapse="")
# }
spamgam = gam(spam~
                +s(make, 4)
              +s(address, 4)
              +s(all, 4)
              +s(x3d, 4)
              +s(our, 4)
              +s(over, 4)
              +s(remove, 4)
              +s(internet, 4)
              +s(order, 4)
              +s(mail, 4)
              +s(receive, 4)
              +s(will, 4)
              +s(people, 4)
              +s(report, 4)
              +s(addresses, 4)
              +s(free, 4)
              +s(business, 4)
              +s(email, 4)
              +s(you, 4)
              +s(credit, 4)
              +s(your, 4)
              +s(font, 4)
              +s(x000, 4)
              +s(money, 4)
              +s(hp, 4)
              +s(hpl, 4)
              +s(george, 4)
              +s(x650, 4)
              +s(lab, 4)
              +s(labs, 4)
              +s(telnet, 4)
              +s(x857, 4)
              +s(data, 4)
              +s(x415, 4)
              +s(x85, 4)
              +s(technology, 4)
              +s(x1999, 4)
              +s(parts, 4)
              +s(pm, 4)
              +s(direct, 4)
              +s(cs, 4)
              +s(meeting, 4)
              +s(original, 4)
              +s(project, 4)
              +s(re, 4)
              +s(edu, 4)
              +s(table, 4)
              +s(conference, 4)
              +s(p1, 4)
              +s(p2, 4)
              +s(p3, 4)
              +s(p4, 4)
              +s(p5, 4)
              +s(p6, 4)
              +s(CRave, 4)
              +s(CRlong, 4)
              +s(CRtotal, 4)
              ,data = data.train
              ,family = binomial(link = "logit")
)
plot(spamgam, residual=TRUE, se=TRUE,pch=".")
summary(spamgam)
library(pROC)
y.hat = predict(spamgam, data.test, type="response")
roc(data.test$spam,y.hat,plot=T)
############################################################
for (i in c(1:(length(var.name)-1))){
  cat("+s(log(", var.name[i], "+0.1))\n", append=TRUE, sep = "", collapse="")
}
spamgam = gam(spam~
                +s(log(make+0.1))
              +s(log(address+0.1))
              +s(log(all+0.1))
              +s(log(x3d+0.1))
              +s(log(our+0.1))
              +s(log(over+0.1))
              +s(log(remove+0.1))
              +s(log(internet+0.1))
              +s(log(order+0.1))
              +s(log(mail+0.1))
              +s(log(receive+0.1))
              +s(log(will+0.1))
              +s(log(people+0.1))
              +s(log(report+0.1))
              +s(log(addresses+0.1))
              +s(log(free+0.1))
              +s(log(business+0.1))
              +s(log(email+0.1))
              +s(log(you+0.1))
              +s(log(credit+0.1))
              +s(log(your+0.1))
              +s(log(font+0.1))
              +s(log(x000+0.1))
              +s(log(money+0.1))
              +s(log(hp+0.1))
              +s(log(hpl+0.1))
              +s(log(george+0.1))
              +s(log(x650+0.1))
              +s(log(lab+0.1))
              +s(log(labs+0.1))
              +s(log(telnet+0.1))
              +s(log(x857+0.1))
              +s(log(data+0.1))
              +s(log(x415+0.1))
              +s(log(x85+0.1))
              +s(log(technology+0.1))
              +s(log(x1999+0.1))
              +s(log(parts+0.1))
              +s(log(pm+0.1))
              +s(log(direct+0.1))
              +s(log(cs+0.1))
              +s(log(meeting+0.1))
              +s(log(original+0.1))
              +s(log(project+0.1))
              +s(log(re+0.1))
              +s(log(edu+0.1))
              +s(log(table+0.1))
              +s(log(conference+0.1))
              +s(log(p1+0.1))
              +s(log(p2+0.1))
              +s(log(p3+0.1))
              +s(log(p4+0.1))
              +s(log(p5+0.1))
              +s(log(p6+0.1))
              +s(log(CRave+0.1))
              +s(log(CRlong+0.1))
              +s(log(CRtotal+0.1))
              ,data = data.train
              ,family = binomial
)
plot(spamgam, residual=TRUE, se=TRUE,pch=".")
summary(spamgam)
y.hat = predict(spamgam, data.test, type="response")
# head(y.hat,3)
# head(data.test$spam,3)
library(pROC)
roc(data.test$spam,y.hat,plot=T)

# penalized GAM is availabe in mgcv, but extremely slow.

Sunday, March 31, 2013

Spline Regression

Chapter 5
 

rm(list=ls())
library(ElemStatLearn)
library(splines)
# Basic Examples
set.seed(0)
x = seq(0, 4*pi, length.out=50)
y = cos(x) + 0.3*rnorm(length(x))
par(mfrow=c(2,3))
# linear, no interior knots
plot(x,y,type="p", main = "Deg = 1, Df = 1")
m1 = lm(y ~ bs(x,degree=1,df=1))
lines(x, fitted(m1))
# linear, 1 interior knot
plot(x,y,type="p", main = "Deg = 1, Df = 2")
m2 = lm(y ~ bs(x,degree=1,df=2))
lines(x, fitted(m2))
# linear, 2 interior knots
plot(x, y,type="p", main = "Deg = 1, Df = 3")
m3 = lm(y ~ bs(x,degree=1,df=3))
lines(x, fitted(m3))
# quadratic, no interior knots
plot(x, y,type="p", main = "Deg = 2, Df = 2")
m4 = lm( y ~ bs(x,degree=2,df=2))
lines(x, fitted(m4))
# quadratic, 1 interior knot
plot(x, y,type="p", main = "Deg = 2, Df = 3")
m5 = lm(y ~ bs(x,degree=2,df=3))
lines(x, fitted(m5))
# quadratic, 2 interior knots
plot(x, y,type="p", main = "Deg = 2, Df = 4")
m6 = lm(y ~ bs(x,degree=2,df=4))
lines(x, fitted(m6))
par(mfrow=c(1,1))

##########################################################################
##########################################################################
################   South African Heart Disease   #########################
##########################################################################
##########################################################################

form="chd~ns(sbp,df=4)+ns(tobacco,df=4)+ns(ldl,df=4)+famhist+ns(obesity,df=4)+ns(alcohol,df=4)+ns(age,df=4)"

form = formula(form)
sbp.ns = ns(SAheart$sbp,df=4)
attr(sbp.ns,"knots")
summary(SAheart$sbp)
# Natural Cubic Splines
model.ncs = glm(form, data=SAheart, family=binomial("logit"))
summary(model.ncs)
names(model.ncs)
# model.ncs$coefficients
# # model.ncs$effects
# model.ncs$xlevels
# model.ncs$method
# # model.ncs$data
# model.ncs$terms # knots
# # model.ncs$model
# recreate Table 5.1
drop1( model.ncs, scope=form, test="Chisq" )
# create the model in Table 5.1
library(MASS)
stepAIC(model.ncs, scope=form, k=2)

##########################################################################
##########################################################################
#######################   Phoneme Recognition   ##########################
############  Example: Filtering and Feature Extraction  #################
##########################################################################
##########################################################################
rm(list=ls())
data(phoneme)
set.seed(0)
aa_indx = which(phoneme$g == "aa")
ao_indx = which(phoneme$g == "ao")
AA_data = phoneme[aa_indx[1:15], 1:256]
AO_data = phoneme[ao_indx[1:15], 1:256]
min_l = min(c(min(AA_data), min(AO_data)))
max_l = max(c(max(AA_data), max(AO_data)))
# Figure 5.5 on page 149
ii=1
plot( as.double(AA_data[ii,]),
      ylim=c(min_l,max_l), type="l", col="red",
      xlab="Frequency", ylab="Log-periodogram" )
for( ii in 2:dim(AA_data)[1] ){
  lines( as.double(AA_data[ii,]), col="red" )
}
for( ii in 1:dim(AO_data)[1] ){
  lines( as.double(AO_data[ii,]), col="blue" )
}

rm(list=ls())
aa_indx = which(phoneme$g == "aa")
ao_indx = which(phoneme$g == "ao")
###########################################################
AA_data = phoneme[aa_indx,]
train_inds = grep("^train", AA_data$speaker)
test_inds = grep("^test", AA_data$speaker)
AA_data_train = AA_data[train_inds, 1:256]
AA_data_test = AA_data[test_inds, 1:256]
#####################################
###### call AA class 1  #############
#####################################
AA_data_train$Y = rep(1, dim(AA_data_train)[1])
AA_data_test$Y = rep(1, dim(AA_data_test)[1])
###########################################################
AO_data = phoneme[ao_indx,]
train_inds = grep("^train", AO_data$speaker)
test_inds = grep("^test", AO_data$speaker)
AO_data_train = AO_data[train_inds, 1:256]
AO_data_test = AO_data[test_inds, 1:256]
#####################################
###### call AO class 1  #############
#####################################
AO_data_train$Y = rep(0, dim(AO_data_train)[1])
AO_data_test$Y = rep(0, dim(AO_data_test)[1])
####################################################
##################  Combine Data  ##################
####################################################
Data_train = rbind(AA_data_train, AO_data_train)
Data_test = rbind(AA_data_test, AO_data_test)

#########################################################################
#############   Logistic Regression Without Intercept  ##################
#########################################################################
form = paste("Y ~ -1 + ", paste(colnames(Data_train)[1:256], collapse="+"))
m = glm(form, family=binomial("logit"), data=Data_train)
mc = as.double(coefficients(m))
m.smooth = smooth.spline(mc,df=15)
plot(mc, ylim=c(-0.8,+0.6),type="l",
     xlab="Frequency",ylab="Logistic Regression Coefficients")
lines(m.smooth, col="blue" )

abline(h=0)
Y_hat_train = predict(m, Data_train[,1:256], type="response")
predicted_class_label_train = as.double(Y_hat_train > 0.5)
Y_hat_test  = predict(m, Data_test[,1:256], type="response")
predicted_class_label_test = as.double(Y_hat_test > 0.5)
err_rate_train1 = mean(Data_train[,257] != predicted_class_label_train)
err_rate_test1  = mean(Data_test[,257] != predicted_class_label_test)
print(sprintf('Logistic Regression (Raw): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train1, err_rate_test1))
beta = matrix(m.smooth$y,ncol=1)
logit_train = as.matrix(Data_train[,1:256])%*%beta
Y_smooth_train = exp(logit_train)/(1+exp(logit_train))
predicted_class_smooth_train = as.double(Y_smooth_train > 0.5)
logit_test = as.matrix(Data_test[,1:256])%*%beta
Y_smooth_test = exp(logit_test)/(1+exp(logit_test))
predicted_class_smooth_test = as.double(Y_smooth_test > 0.5)
err_rate_train2 = mean(Data_train[,257] != predicted_class_smooth_train)
err_rate_test2  = mean(Data_test[,257] != predicted_class_smooth_test)
print(sprintf('Logistic Regression (Raw): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train1, err_rate_test1))
print(sprintf('Logistic Regression (Regularized): err_rate_train= %10.6f; err_rate_test= %10.5f',
              err_rate_train2, err_rate_test2))
#######################################################################
#############  BMD Example  ############################################
#######################################################################
males = bone$gender == "male"
females = bone$gender == "female"
# df is monotone in lambda for smoothing splines
boneMaleSmooth = smooth.spline( bone[bone$gender == "male","age"],
                                bone[bone$gender == "male","spnbmd"], df=12)
boneFemaleSmooth = smooth.spline( bone[bone$gender == "female","age"],
                                  bone[bone$gender == "female","spnbmd"], df=12)
plot(boneMaleSmooth, ylim=c(-0.05,0.20), col="blue", type="l", xlab="Age", ylab="spnbmd")
points(bone[males,c(2,4)], col="blue", pch=20)
lines(boneFemaleSmooth, ylim=c(-0.05,0.20), col="red")
points(bone[females,c(2,4)], col="red", pch=20)

Tuesday, March 26, 2013

Simple LDA and QDA in R

rm(list=ls())
library(MASS)
library(ElemStatLearn)
data(mixture.example)
data = mixture.example
rm(mixture.example)
x = data$x
y = data$y
summary(factor(y))
rm(data)
plot(x,col = 2*y + 1, xlab = "X1", ylab = "X2")
# LDA
model.lda = lda(x, y)
summary(model.lda)
model.lda
model.lda$prior
model.lda$means
model.lda$scaling
abline(model.lda$scaling)
points(model.lda$means, pch = 20)
# QDA
model.qda = qda(x, y)
model.qda
scaling.matrix1 = model.qda$scaling[,,1]
scaling.matrix2 = model.qda$scaling[,,2]
names(model.qda)
model2.qda = qda(x, y, CV = TRUE)
model2.qda
names(model2.qda)
model2.qda$class

Monday, March 25, 2013

Logistic and LASSO Regression

# The following code is for the book The Elements of Statistical Learning, chapter 4
# Example: South African Heart Disease (Page: 122)
# load data
rm(list=ls())
library(ElemStatLearn)
data(SAheart)
data = SAheart[,c(1:3,5,7:10)]
# to convert factor variables into dummy variables
temp = matrix(0,nrow(data),1)
for (i in c(1:nrow(data))){
  if (data[i,4] == 'Present'){temp[i] = 1}
  else {temp[i] = 0}
}
temp = as.data.frame(temp)
names(temp) = "famhist"
data.new = cbind(data[,1:3], temp, data[,5:8])
# in order to apply glmnet function
# change dataframe to matrix
Y = as.matrix(data.new[,8])
X = as.matrix(data.new[,-8])
logit = glm(chd ~ ., family=binomial("logit"),data=data.new,na.action=na.exclude)
# table 4.2 on page 122
summary(logit)
# Figure 4.12
pairs(data.new, main="South African heart disease data",pch = 21)























# reduced model: stepwise (same as the linear regression)
logit.reduced = glm(chd ~ tobacco + ldl + famhist + age,
                    family=binomial("logit"),data=data.new,na.action=na.exclude)
# table 4.3 on page 124
summary(logit.reduced)
library(glmnet)
# not work if X has a factor variable 
# This is the only package I know to apply lasso for binormial response
# work
lasso = glmnet(scale(X), Y, family = "binomial", alpha = 1, standardize = FALSE, intercept=TRUE)
# not work
lasso = glmnet(X, Y, family = "binomial", alpha = 1, standardize = TRUE, intercept=TRUE)
# FIGURE 4.13 on page 126
plot(lasso, xvar = "norm", label=TRUE)
abline(h=0)

Regression IV: Principal Components Regression

rm(list=ls())
library(ElemStatLearn)
data(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
# scale function in R substract mean,
# and divide by std dev (with df = N-1)
train.s = scale(train)
# fit ridge regression manually
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])

x.svd = svd(X)
#scatterplots of samples PCs
par(mar=c(1,1,1,1))
layout(matrix(1:64,8,8))
mycols = rainbow(length(Y))
orY = order(Y)
for(i in 1:8){
  for(j in 1:8){
    plot(x.svd$u[,i],x.svd$u[,j],type="p",pch=16,col=mycols[orY])
  }
}

# amount of variance explained
var = 0
var.cum = 0;
for(i in 1:8){
  var[i] = x.svd$d[i]/sum(x.svd$d)
  var.cum[i] = sum(var)
}
par(mfrow=c(1,2))
par(mar=c(5,4,4,2))
barplot(var,ylab="Amount of Var Explained",xlab="PCs")
barplot(var.cum,ylab="Cummulative Var Explained",xlab="PCs")

# PC direction weights
par(mfrow=c(3,3))
par(mar=c(5,4,3,2))
for(i in 1:8){
  barplot(x.svd$v[,i],names.arg=names(train)[1:8])
}

#PC regression
beta.pcr = diag(1/x.svd$d)%*%t(x.svd$u)%*%Y
Z = X%*%x.svd$v
#training error
err.pcr = 0
for(i in 1:8){
  err.pcr[i] = sum(as.numeric(Y - Z[,1:i,drop=FALSE]%*%beta.pcr[1:i,1])^2)/length(Y)
}
err.pcr

Regression III: LASSO

rm(list=ls())
library(ElemStatLearn)
data(prostate)
data = prostate[,1:9]
train = subset(prostate, train==T)[,1:9]
test = subset(prostate, train!=T)[,1:9]
# scale function in R substract mean,
# and divide by std dev (with df = N-1)
train.s = scale(train)
center = attributes(train.s)$'scaled:center'
scale = attributes(train.s)$'scaled:scale'
Y = as.numeric(train.s[,9])
X = as.matrix(train.s[,-9])
# scale testing data based on center and scale of training data
test.s = t((t(test) - center)/scale)
Y1 = as.numeric(test.s[,9])
X1 = as.matrix(test.s[,-9])
# unscaled data
Y0 = as.numeric(train[,9])
X0 = as.matrix(train[,-9])
Y01 = as.numeric(test[,9])
X01 = as.matrix(test[,-9])
#################################################################################
########### fit LASSO regression by glmnet package  #############################
#################################################################################
library(glmnet)
# Way 1
lasso.glm = glmnet(X,Y,
                   family = "gaussian",
                   alpha = 1,
                   standardize = FALSE,
                   intercept = FALSE,
                   standardize.response = FALSE
)
# # Way 2: NOT WORK
lasso.glm = glmnet(X0,Y0,
                   family = "gaussian",
                   alpha = 1,
                   standardize = TRUE,
                   intercept = TRUE,
                   standardize.response = TRUE
)
# # Way 3: NOT WORK
lasso.glm = glmnet(X0,Y0,
                   family = "gaussian",
                   alpha = 1
)
# Based on my experiments, I have to scale data first
names(lasso.glm)
plot(lasso.glm, xvar = "norm", label=TRUE)
abline(h=0)

lasso.glm$a0
lasso.glm$beta
lasso.glm$lambda
y = predict(lasso.glm, X, type="link")
y1 = predict(lasso.glm, X1, type="link")

lasso.mse = matrix(0, length(lasso.glm$lambda), 1)
lasso1.mse = matrix(0, length(lasso.glm$lambda), 1)
for (i in 1:length(lasso.glm$lambda)){
  lasso.mse[i] = mean((Y - y[,i])^2)
  lasso1.mse[i] = mean((Y1 - y1[,i])^2)
}
############################################################################
###################   Figure 3.10  on Page 70  #############################
############################################################################
plot(lasso.glm$lambda, lasso.mse)
lines(lasso.glm$lambda, lasso1.mse)
# cross-validation
lasso.cv = cv.glmnet(X,Y,
                     family = "gaussian",
                     alpha = 1,
                     nfolds = 10,
                     standardize = FALSE,
                     intercept = FALSE,
                     standardize.response = FALSE,
                     type.measure = "mse"
)
names(lasso.cv)
plot(lasso.cv)
plot(lasso.cv, -1)
lasso.cv$lambda.1se
lasso.cv$cvm
lasso.cv$cvsd
lasso.cv$glmnet.fit
y.cv = predict(lasso.cv, X, s="lambda.1se")
###########################################################################
############# fit LASSO regression by lars package  ###############################
###########################################################################
library(lars)
# cross-validation is available in this package
# scaled version
model.ridge = lars(X,Y,type="lasso",
                   trace=TRUE,
                   normalize=FALSE,
                   intercept=FALSE
)
# unscaled version
model.ridge = lars(X0,Y0,type="lasso",
                   trace=TRUE,
                   normalize=TRUE,
                   intercept=TRUE
)
summary(model.ridge)
model.ridge
plot(model.ridge,
     xvar="arc.length",
     breaks=TRUE,
     plottype="coefficients"
)
predict(model.ridge,
        X.test,
        type="fit",
        mode="step")
coef(model.ridge, s=5.6,mode="step")
# always scale X and Y manually before applying functions