# week6 in-class examples...
# review of trees
# especially with two other R packages for trees
# ... that can produce nicer _visual_ results!

# use the iris data
set.seed(1234)
data(iris)
ind <- sample(2, nrow(iris), replace=TRUE, prob=c(0.7, 0.3))
trainData <- iris[ind==1,]
testData  <- iris[ind==2,]

# Load package party, build a decision tree, and check the prediction.
library(party)
myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
# create a "ctree" (from the party package)
iris_ctree <- ctree(myFormula, data=trainData)
# check the prediction on the training data (resubstitution)
table(predict(iris_ctree), trainData$Species)
# print the tree in text form
print(iris_ctree)
# show it graphically
plot(iris_ctree)
# a "simpler" plot...
plot(iris_ctree, type="simple")
# predict on test data
testPred <- predict(iris_ctree, newdata=testData)
table(testPred, testData$Species)

# 3d scatterplot examples
# this worked under Windows, but not my Mac ... (!)
data(iris); attach(iris)
library("rgl")
plot3d(Petal.Width, Sepal.Width, Petal.Length, col=as.numeric(Species), size=5)

# Bootstrap examples...
set.seed(333); x <- rnorm(30)
bootMean <- rep(NA,1000); sampledMean <- rep(NA,1000)
for(i in 1:1000){bootMean[i] <- mean(sample(x, replace=TRUE))}
for(i in 1:1000){sampledMean[i] <- mean(rnorm(30))}
plot(density(bootMean)); lines(density(sampledMean), col="red")

# with the boot package...
library(boot)  # needed for boot() below
set.seed(333); x <- rnorm(30); sampledMean <- rep(NA,1000)
for(i in 1:1000){sampledMean[i] <- mean(rnorm(30))}
meanFunc <- function(x,i){mean(x[i])}
bootMean <- boot(x, meanFunc, 1000)
bootMean
# plotting the result
plot(density(bootMean$t)); lines(density(sampledMean), col="red")

# nuclear data
library(boot); data(nuclear)
nuke.lm <- lm(log(cost) ~ date, data=nuclear)
plot(nuclear$date, log(nuclear$cost), pch=19)
abline(nuke.lm, col="red", lwd=3)

# bootstrap distribution of the regression coefficients
bs <- function(data, indices, formula) {
  d <- data[indices,]
  fit <- lm(formula, data=d)
  return(coef(fit))
}
results <- boot(data=nuclear, statistic=bs, R=1000, formula=log(cost) ~ date)
plot(density(results$t[,2]), col="red", lwd=3)
lines(rep(nuke.lm$coeff[2],10), seq(0,8,length=10), col="blue", lwd=3)

# plotting a few bootstrap resamples and their fitted lines...
par(mfrow=c(1,3))
for(i in 1:3){
  nuclear0 <- nuclear[sample(1:dim(nuclear)[1], replace=TRUE),]
  nuke.lm0 <- lm(log(cost) ~ date, data=nuclear0)
  plot(nuclear0$date, log(nuclear0$cost), pch=19)
  abline(nuke.lm0, col="red", lwd=3)
}

# bootstrapping non-linear statistics
set.seed(555); x <- rnorm(30); sampledMed <- rep(NA,1000)
for(i in 1:1000){sampledMed[i] <- median(rnorm(30))}
medFunc <- function(x,i){median(x[i])}
bootMed <- boot(x, medFunc, 1000)
plot(density(bootMed$t), col="red", lwd=3)
lines(density(sampledMed), lwd=3)

# things you can't bootstrap!
set.seed(333); x <- rnorm(30); sampledMax <- rep(NA,1000)
for(i in 1:1000){sampledMax[i] <- max(rnorm(30))}
maxFunc <- function(x,i){max(x[i])}
bootMax <- boot(x, maxFunc, 1000)
plot(density(bootMax$t), col="red", lwd=3, xlim=c(1,3))
lines(density(sampledMax), lwd=3)

# medley library - first we need to install some extras...
library(devtools)
install_github("medley","mewo2")  # this takes a while...
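# NOTE (editor's addition, not from the original notes): the two-argument call
# install_github("medley","mewo2") above matches the older devtools API that was
# current when this script was written; newer devtools/remotes releases expect
# the "user/repo" form instead, along the lines of install_github("mewo2/medley").
# Treat the exact call as an assumption and check your installed devtools version.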
library(medley)
set.seed(453234)
# our dataframe: a response y plus six predictors built from it
y <- rnorm(1000)
x1 <- (y > 0); x2 <- y*rnorm(1000)
x3 <- rnorm(1000, mean=y, sd=1); x4 <- (y > 0) & (y < 3)
x5 <- rbinom(1000, size=4, prob=exp(y)/(1+exp(y)))
x6 <- (y < -2) | (y > 2)
data <- data.frame(y=y, x1=x1, x2=x2, x3=x3, x4=x4, x5=x5, x6=x6)
# training and testing subsets
train <- sample(1:1000, size=500)
trainData <- data[train,]; testData <- data[-train,]

# basic linear model
lm1 <- lm(y ~ ., data=trainData)  # "y ~ ." is the special formula to say "everything else"
rmse(predict(lm1, newdata=testData), testData$y)  # rmse() comes from medley; note predict() takes newdata=, not data=
# basic tree model
library(tree)
tree1 <- tree(y ~ ., data=trainData)
rmse(predict(tree1, newdata=testData), testData$y)
# combine lm1 with tree1 by averaging their predictions
combine1 <- predict(lm1, newdata=testData)/2 + predict(tree1, newdata=testData)/2
rmse(combine1, testData$y)
# another tree ~ fit to a training set sampled with replacement...
tree2 <- tree(y ~ ., data=trainData[sample(1:dim(trainData)[1], replace=TRUE),])
# another combination ~ now averaging three models...
combine2 <- (predict(lm1, newdata=testData)/3 +
             predict(tree1, newdata=testData)/3 +
             predict(tree2, newdata=testData)/3)
rmse(combine2, testData$y)

# bootstrap-aggregated trees (bagged trees)
library(ipred)
bagTree <- bagging(Species ~ ., data=iris, coob=TRUE)  # 25 trees by default
print(bagTree)
# is this correct? ~ yes!
testPred <- predict(bagTree, newdata=iris)
table(iris$Species, testPred)
# you'll note that the overall (resubstitution) error can be better than the oob error!

# Let's push it further...
train <- sample(1:150, size=75)  # 75 indices...
train
itr <- iris[train,]   # those IN the training set
ite <- iris[-train,]  # those OUT of the training set
# re-train
bagTree <- bagging(Species ~ ., data=itr, coob=TRUE)
testPred <- predict(bagTree, newdata=ite)
table(ite$Species, testPred)  # missed one for me...

# Random forests
library(randomForest)
forestIris <- randomForest(Species ~ Petal.Width + Petal.Length, data=iris, prox=TRUE)
forestIris
# again, let's check the resubstitution error (training-set prediction)
testPred <- predict(forestIris, newdata=iris)
table(iris$Species, testPred)
# again, the oob error can be greater than the overall (resubstitution) error...
# getting a single tree from a random forest (not super useful)
getTree(forestIris, k=2)

# assessing the variables' importance (see the sketch at the end of the script)
# plotting the class "centers"
iris.p <- classCenter(iris[,c(3,4)], iris$Species, forestIris$prox)
plot(iris[,3], iris[,4], pch=21, xlab=names(iris)[3], ylab=names(iris)[4],
     bg=c("red", "blue", "green")[as.numeric(factor(iris$Species))],
     main="Iris Data with Prototypes")
points(iris.p[,1], iris.p[,2], pch=21, cex=2, bg=c("red", "blue", "green"))

# next, use rfImpute...
# (rfImpute fills in missing predictor values using random-forest proximities,
#  so it is really meant for data that contains NAs)
y <- iris[,5]
x <- iris[,-5]
res <- rfImpute(x=x, y=y)
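# Editor's addition (a minimal sketch, not part of the original notes):
# the "assessing the variables' importance" comment above has no accompanying
# code. One common way to do it with the randomForest package is to refit with
# importance=TRUE and then tabulate/plot the importance measures. The object
# name forestIris2 is mine; importance() and varImpPlot() are standard
# randomForest functions.
forestIris2 <- randomForest(Species ~ ., data=iris, importance=TRUE)
importance(forestIris2)   # mean decrease in accuracy / Gini, per variable
varImpPlot(forestIris2)   # dot chart of the same importance measures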