Data Science Blog: CART: rpart in R

setwd("C:/TreeModel")
library(rpart)
library(pROC)
##################################################################################
###########    Regression Tree   #################################################
##################################################################################
data1 = read.table("C:/TreeModel/Pollute.txt", header=T)
# attach(data1)
# detach(data1)
summary(data1)
names(data1)
#plot to check nonlinear relationships
pairs(data1, pch=16, cex=1)
#build regression model
tree.data1 = rpart(Pollution ~ .,
                   method = "anova",
                   data = data1,
                   control=rpart.control(minsplit=2, cp=0.01),
                   na.action = na.exclude
                   )
print(tree.data1)
# plot(tree.data1)
# text(tree.data1)
#plot easier to read
plot(tree.data1, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree.data1)
summary(tree.data1)
tree.data1$terms
names(tree.data1)
#diagnostics (check errors)
plot(predict(tree.data1), residuals(tree.data1), xlab="Fitted", ylab="Residuals")
qqnorm(residuals(tree.data1))
#predict
x0 = apply(data1[,-1], 2, median)
x0
predict(tree.data1, data.frame(t(x0)))
#pruning tree
#initially make a big tree, choose cp very small
tree.data1 = rpart(Pollution ~ .,
                   method = "anova",
                   data = data1,
                   control=rpart.control(minsplit=2, cp=0.0001),
                   na.action = na.exclude
)
#select the best splits based on the cross-validation reslut
printcp(tree.data1)
plotcp(tree.data1)
#choose cp as 0.05739633
data.prune = prune.rpart(tree.data1, 0.05739633)
plot(data.prune, compress=T, uniform=T, branch=0.4, margin=0.1)
text(data.prune)
print(data.prune)
predict(data.prune, data.frame(t(x0)))
par(mfrow=c(1,2))
rsq.rpart(data.prune)
par(mfrow=c(1,1))
summary(data.prune)
# ROC curve
data.predict = predict(data.prune)
roc(data1$Pollution,data.predict,plot=T)
###############################################################################
############ Classification Tree #############################################
###############################################################################
# based on information (entropy)
data2 = read.table("C:/TreeModel/epilobium.txt", header=T)
tree.data2 = rpart(species ~ .,
                   method = "class",
                   data = data2,
                   control=rpart.control(minsplit=2, cp=0.01),
                   parms=list(split='information'),
                   na.action = na.exclude
)
plot(tree.data2, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree.data2)
print(tree.data2)
# based on Gini index
tree2.data2 = rpart(species ~ .,
                   method = "class",
                   data = data2,
                   control=rpart.control(minsplit=2, cp=0.01),
                   parms=list(split='gini'),
                   na.action = na.exclude
)
plot(tree2.data2, compress=T, uniform=T, branch=0.4, margin=0.1)
text(tree2.data2)
print(tree2.data2)

########################################################
###################     DATA      ##########################
########################################################
Pollution Temp Industry Population Wind Rain Wet.days
24 61.5 368 497 9.1 48.34 115
30 55.6 291 593 8.3 43.11 123
56 55.9 775 622 9.5 35.89 105
28 51 137 176 8.7 15.17 89
14 68.4 136 529 8.8 54.47 116
46 47.6 44 116 8.8 33.36 135
9 66.2 641 844 10.9 35.94 78
35 49.9 1064 1513 10.1 30.96 129
26 57.8 197 299 7.6 42.59 115
61 50.4 347 520 9.4 36.22 147
29 57.3 434 757 9.3 38.98 111
28 52.3 361 746 9.7 38.74 121
14 51.5 181 347 10.9 30.18 98
18 59.4 275 448 7.9 46 119
17 51.9 454 515 9 12.95 86
23 54 462 453 7.1 39.04 132
47 55 625 905 9.6 41.31 111
13 61 91 132 8.2 48.52 100
31 55.2 35 71 6.6 40.75 148
12 56.7 453 716 8.7 20.66 67
10 70.3 213 582 6 7.05 36
110 50.6 3344 3369 10.4 34.44 122
56 49.1 412 158 9 43.37 127
10 68.9 721 1233 10.8 48.19 103
69 54.6 1692 1950 9.6 39.93 115
8 56.6 125 277 12.7 30.58 82
36 54 80 80 9 40.25 114
16 45.7 569 717 11.8 29.07 123
29 51.1 379 531 9.4 38.79 164
29 43.5 669 744 10.6 25.94 137
65 49.7 1007 751 10.9 34.99 155
9 68.3 204 361 8.4 56.77 113
10 75.5 207 335 9 59.8 128
26 51.5 266 540 8.6 37.01 134
31 59.3 96 308 10.6 44.68 116
10 61.6 337 624 9.2 49.1 105
11 47.1 391 463 12.4 36.11 166
14 54.5 381 507 10 37 99
17 49 104 201 11.2 30.85 103
11 56.8 46 244 8.9 7.77 58
94 50 343 179 10.6 42.75 125

Data Science Blog

Thursday, March 14, 2013

CART: rpart in R

No comments:

Post a Comment