# Monday, April 29, 2013
#
# CART: predict email spam

# Load the `spam` data set from the ElemStatLearn package, give its 58 columns
# syntactically valid names, and download the train/test indicator file.
# The script used to begin with rm(list = ls()); that call was removed because
# it wipes the caller's whole workspace and nothing below depends on it.
library(ElemStatLearn)
data(spam)

# Earlier naming attempt, kept for reference. Several entries ("3d", "000",
# ";:", "(:", ...) are not syntactically valid R names, which makes them
# awkward to use in model formulas, so the cleaned-up vector below is used.
# names(spam) <- c("make", "address", "all", "3d", "our",
#                  "over", "remove", "internet", "order", "mail",
#                  "receive", "will", "people", "report", "addresses",
#                  "free", "business", "email", "you", "credit",
#                  "your", "font", "000", "money", "hp",
#                  "hpl", "george", "650", "lab", "labs",
#                  "telnet", "857", "data", "415", "85",
#                  "technology", "1999", "parts", "pm",
#                  "direct", "cs", "meeting", "original", "project",
#                  "re", "edu", "table", "conference", ";:",
#                  "(:", "[:", "!:", "$:", "#:",
#                  "CRave", "CRlong", "CRtotal", "spam")
spam.col.names <- c(
  "make", "address", "all", "x3d", "our",
  "over", "remove", "internet", "order", "mail",
  "receive", "will", "people", "report", "addresses",
  "free", "business", "email", "you", "credit",
  "your", "font", "x000", "money", "hp",
  "hpl", "george", "x650", "lab", "labs",
  "telnet", "x857", "data", "x415", "x85",
  "technology", "x1999", "parts", "pm",
  "direct", "cs", "meeting", "original", "project",
  "re", "edu", "table", "conference", "p1",
  "p2", "p3", "p4", "p5", "p6",
  "CRave", "CRlong", "CRtotal", "spam"
)
# Fail loudly instead of silently mislabelling columns if the data set does
# not have exactly one column per name.
stopifnot(length(spam.col.names) == ncol(spam))
names(spam) <- spam.col.names
rm(spam.col.names)

summary(spam)

# Train/test indicator: one value per line, read into a one-column data frame.
# Further down, rows flagged 0 become the training set and rows flagged 1 the
# test set.
spam.test_indx <- read.delim(
  "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/spam.traintest",
  sep = "\n", header = FALSE
)
# The indicator is used as a row mask on `spam`, so the lengths must agree.
stopifnot(nrow(spam.test_indx) == nrow(spam))
#########################################################################
#########################################################################
# Recode the response as a factor with levels "0"/"1" (1 = "spam") and split
# the rows into training and test sets with the downloaded indicator
# (0 = training row, 1 = test row).

# The indicator is used as a row mask, so it must have one entry per row.
spam.is.test <- spam.test_indx[[1]]
stopifnot(length(spam.is.test) == nrow(spam))

# Drop the original response (column 58) and append the recoded one under the
# same name, so it stays the last column, called "spam".
spam.recoded <- spam[, -58]
spam.recoded$spam <- factor(
  as.integer(spam$spam == "spam"),
  levels = c(0L, 1L)
)

data.train <- spam.recoded[spam.is.test == 0, ]
data.test <- spam.recoded[spam.is.test == 1, ]
summary(data.train)
summary(data.test)

# Only data.train and data.test are needed from here on.
rm(spam.recoded, spam.is.test, spam, spam.test_indx)
###############################################################################
###############################################################################
library(rpart)
library(pROC)
# Classification tree using the information (cross-entropy) split criterion.
# The tree is grown with a very small cp and minsplit = 1, then pruned back
# with cost-complexity pruning and scored on the test set with a ROC curve.
names(data.train)

# rpart assigns its cross-validation folds at random; fixing the seed makes
# the xerror values shown by printcp()/plotcp() reproducible between runs.
set.seed(20130429)
spam.tree <- rpart(
  spam ~ .,
  data = data.train,
  method = "class",
  xval = 5,
  cp = 0.00001,
  minsplit = 1,
  parms = list(split = "information"),
  na.action = na.exclude
)
printcp(spam.tree)
plotcp(spam.tree, upper = "size")

# First candidate: prune at cp = 0.0025 and inspect the tree.
spam.prune <- prune.rpart(spam.tree, cp = 0.0025)
print(spam.prune)
plot(spam.prune)
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)

# Second candidate: prune at cp = 0.003. This is the tree that is evaluated.
spam.prune <- prune.rpart(spam.tree, cp = 0.003)
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)
summary(spam.prune)
plotcp(spam.prune)

# Predicted probability of class "1" (spam) for each test row; the column is
# selected by name so the result does not depend on column order.
y.hat <- predict(spam.prune, data.test, type = "prob")[, "1"]
head(y.hat)

# State controls/cases and the direction explicitly so pROC cannot auto-select
# (and possibly flip) the comparison direction.
roc(data.test$spam, y.hat, levels = c("0", "1"), direction = "<", plot = TRUE)
############################################################################
# Same workflow with the Gini split criterion: grow a large tree, prune it at
# cp = 0.002, and score it on the test set with a ROC curve.

# rpart assigns its cross-validation folds at random; fixing the seed makes
# the xerror values shown by plotcp() reproducible between runs.
set.seed(20130429)
spam.tree <- rpart(
  spam ~ .,
  data = data.train,
  method = "class",
  xval = 5,
  cp = 0.00001,
  minsplit = 1,
  parms = list(split = "gini"),
  na.action = na.exclude
)
plotcp(spam.tree)

spam.prune <- prune.rpart(spam.tree, cp = 0.002)
plotcp(spam.prune)
plot(spam.prune, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.01)
text(spam.prune)
summary(spam.prune)
print(spam.prune)

# Predicted probability of class "1" (spam) for each test row; the column is
# selected by name so the result does not depend on column order.
y.hat <- predict(spam.prune, data.test, type = "prob")[, "1"]

# State controls/cases and the direction explicitly so pROC cannot auto-select
# (and possibly flip) the comparison direction.
roc(data.test$spam, y.hat, levels = c("0", "1"), direction = "<", plot = TRUE)
# Remark: with the cross-entropy split, smaller trees get a better ROC than
# with the Gini split.

# No comments:
#
# Post a Comment