# Load the e-mail spam data set from the ElemStatLearn package, give its
# columns readable names, and download the train/test split indicator.
#
# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()).
# Wiping the global environment destroys whatever the caller had loaded and
# does not reset attached packages or options, so it gives no real isolation.
library(ElemStatLearn)
data(spam)

# The name vector below has exactly 58 entries (57 predictors + the class
# label). `names<-` silently pads with NA when given too few names, so check
# the width up front.
stopifnot(ncol(spam) == 58L)

# Column names: word/character frequency predictors, three capital-run
# statistics (CRave, CRlong, CRtotal) and the class label `spam`.
# Tokens that are not syntactic R names are replaced as follows:
#   "3d" -> "x3d",  "000" -> "x000", "650" -> "x650", "857" -> "x857",
#   "415" -> "x415", "85" -> "x85",  "1999" -> "x1999",
#   ";:" -> "p1", "(:" -> "p2", "[:" -> "p3",
#   "!:" -> "p4", "$:" -> "p5", "#:" -> "p6"
names(spam) <- c(
  "make", "address", "all", "x3d", "our",
  "over", "remove", "internet", "order", "mail",
  "receive", "will", "people", "report", "addresses",
  "free", "business", "email", "you", "credit",
  "your", "font", "x000", "money", "hp",
  "hpl", "george", "x650", "lab", "labs",
  "telnet", "x857", "data", "x415", "x85",
  "technology", "x1999", "parts", "pm",
  "direct", "cs", "meeting", "original", "project",
  "re", "edu", "table", "conference", "p1",
  "p2", "p3", "p4", "p5", "p6",
  "CRave", "CRlong", "CRtotal", "spam"
)
summary(spam)

# Train/test indicator, read as a one-column data frame (column V1) with one
# value per line of the remote file. The rest of the script treats 0 as
# "training row" and 1 as "test row".
# NOTE(review): presumably one line per row of `spam` -- confirm the lengths
# match before splitting.
spam.test_indx <- read.delim(
  "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/spam.traintest",
  sep = "\n", header = FALSE
)
#########################################################################
#########################################################################
# Build the modelling data: recode the class label as a 0/1 factor and split
# the rows into training and test sets using the downloaded indicator.

# Predictors are every column except the label; the label is rebuilt as a
# factor with levels "0" (not spam) and "1" (spam). Deriving it directly from
# spam$spam avoids hard-coding the number of rows, and fixing the levels
# guarantees both classes exist even if a subset contains only one of them.
spam.coded <- spam[, names(spam) != "spam", drop = FALSE]
spam.coded$spam <- factor(as.integer(spam$spam == "spam"), levels = 0:1)

# The indicator arrives as a one-column data frame; flatten it to a plain
# vector so row selection uses an ordinary logical vector.
split.flag <- unlist(spam.test_indx, use.names = FALSE)

# Fail loudly instead of silently recycling or producing NA rows when the
# indicator does not line up with the data.
stopifnot(
  length(split.flag) == nrow(spam.coded),
  all(split.flag %in% c(0, 1))
)

data.train <- spam.coded[split.flag == 0, ] # 0 marks training rows
data.test <- spam.coded[split.flag == 1, ]  # 1 marks test rows
summary(data.train)
summary(data.test)

# Drop the intermediates; only data.train and data.test are used from here on.
rm(spam.coded, split.flag, spam, spam.test_indx)
###############################################################################
###############################################################################
# Classification tree with the information (cross-entropy) splitting rule:
# grow a very deep tree, inspect the cost-complexity table, prune, and
# evaluate the pruned tree on the test set with a ROC curve.
library(rpart)
library(pROC)
names(data.train)

# rpart assigns cross-validation folds at random; fixing the seed makes the
# xerror columns shown by printcp()/plotcp() reproducible between runs.
set.seed(1)

# cp = 1e-5 and minsplit = 1 let the tree grow almost without restriction so
# that pruning, not the stopping rule, decides the final size. xval = 5 asks
# for 5-fold cross-validation of the complexity table.
spam.tree <- rpart(spam ~ .,
  data = data.train,
  method = "class",
  xval = 5,
  cp = 0.00001,
  minsplit = 1,
  parms = list(split = "information"),
  na.action = na.exclude
)
printcp(spam.tree)
plotcp(spam.tree, upper = "size")

# First candidate: prune at cp = 0.0025 and look at the resulting tree.
spam.prune <- prune.rpart(spam.tree, cp = 0.0025)
print(spam.prune)
plot(spam.prune) # default layout, drawn without labels
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)

# Second candidate: prune at cp = 0.003. This replaces the first candidate and
# is the tree evaluated below.
spam.prune <- prune.rpart(spam.tree, cp = 0.003)
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)
summary(spam.prune)
plotcp(spam.prune)

# Column 2 of the probability matrix belongs to the second level of the
# response factor, i.e. the class coded "1" (spam).
y.hat <- predict(spam.prune, data.test, type = "prob")[, 2]
head(y.hat)

# State the control/case levels and the comparison direction explicitly so
# pROC cannot auto-select a direction that flips the curve; "<" means
# controls ("0") are expected to score lower than cases ("1").
roc(data.test$spam, y.hat, levels = c("0", "1"), direction = "<", plot = TRUE)
############################################################################
# Classification tree with the Gini splitting rule, built and evaluated the
# same way as the information-criterion tree so the two can be compared.
# spam.tree, spam.prune and y.hat are overwritten with the Gini results.

# rpart assigns cross-validation folds at random; fixing the seed makes the
# xerror values shown by plotcp() reproducible between runs.
set.seed(1)

# cp = 1e-5 and minsplit = 1 let the tree grow almost without restriction;
# xval = 5 asks for 5-fold cross-validation of the complexity table.
spam.tree <- rpart(spam ~ .,
  data = data.train,
  method = "class",
  xval = 5,
  cp = 0.00001,
  minsplit = 1,
  parms = list(split = "gini"),
  na.action = na.exclude
)
plotcp(spam.tree)

# Prune at cp = 0.002 and inspect the pruned tree.
spam.prune <- prune.rpart(spam.tree, cp = 0.002)
plotcp(spam.prune)
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)
summary(spam.prune)
print(spam.prune)

# Column 2 of the probability matrix belongs to the second level of the
# response factor, i.e. the class coded "1" (spam).
y.hat <- predict(spam.prune, data.test, type = "prob")[, 2]

# State the control/case levels and the comparison direction explicitly so
# pROC cannot auto-select a direction that flips the curve; "<" means
# controls ("0") are expected to score lower than cases ("1").
roc(data.test$spam, y.hat, levels = c("0", "1"), direction = "<", plot = TRUE)
# Remark: with the cross-entropy (information) split, smaller trees achieve a
# better ROC than with the Gini split.
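# (Web-page footer text "No comments:" / "Post a Comment" from the original
# blog post; commented out because it is not R code.)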