# week6 in-class examples...
# review of trees
# especially with two other R packages for trees
# ... that can produce nicer _visual_ results!

# use the iris data
set.seed(1234)
data(iris)
ind <- sample(2, nrow(iris), replace=TRUE, prob=c(0.7, 0.3))
trainData <- iris[ind==1,]
testData  <- iris[ind==2,]

# Load package party, build a decision tree, and check the prediction.
library(party)
myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
# create a "ctree" (from the party package)
iris_ctree <- ctree(myFormula, data=trainData)
# check the prediction on the training data (resubstitution)
table(predict(iris_ctree), trainData$Species)
# print the tree in text form
print(iris_ctree)
# show it graphically
plot(iris_ctree)
# a "simpler" plot...
plot(iris_ctree, type="simple")
# predict on test data
testPred <- predict(iris_ctree, newdata=testData)
table(testPred, testData$Species)

# 3d scatterplot examples
# this worked under Windows, but not my Mac ... (!)
data(iris); attach(iris)
library("rgl")
plot3d(Petal.Width, Sepal.Width, Petal.Length, col=as.numeric(Species), size=5)

# Bootstrap examples...
set.seed(333); x <- rnorm(30)
bootMean <- rep(NA,1000); sampledMean <- rep(NA,1000)
for(i in 1:1000){bootMean[i] <- mean(sample(x, replace=TRUE))}
for(i in 1:1000){sampledMean[i] <- mean(rnorm(30))}
plot(density(bootMean)); lines(density(sampledMean), col="red")

# with the boot package...
library(boot)  # needed for boot() below
set.seed(333); x <- rnorm(30); sampledMean <- rep(NA,1000)
for(i in 1:1000){sampledMean[i] <- mean(rnorm(30))}
meanFunc <- function(x,i){mean(x[i])}
bootMean <- boot(x, meanFunc, 1000)
bootMean
# plotting the result
plot(density(bootMean$t)); lines(density(sampledMean), col="red")

# nuclear data
library(boot); data(nuclear)
nuke.lm <- lm(log(cost) ~ date, data=nuclear)
plot(nuclear$date, log(nuclear$cost), pch=19)
abline(nuke.lm, col="red", lwd=3)

# bootstrap distribution of the regression coefficients
bs <- function(data, indices, formula) {
  d <- data[indices,]
  fit <- lm(formula, data=d)
  return(coef(fit))
}
results <- boot(data=nuclear, statistic=bs, R=1000, formula=log(cost) ~ date)
plot(density(results$t[,2]), col="red", lwd=3)
lines(rep(nuke.lm$coeff[2],10), seq(0,8,length=10), col="blue", lwd=3)

# plotting a few bootstrap resamples and their fitted lines...
par(mfrow=c(1,3))
for(i in 1:3){
  nuclear0 <- nuclear[sample(1:dim(nuclear)[1], replace=TRUE),]
  nuke.lm0 <- lm(log(cost) ~ date, data=nuclear0)
  plot(nuclear0$date, log(nuclear0$cost), pch=19)
  abline(nuke.lm0, col="red", lwd=3)
}

# bootstrapping non-linear statistics
set.seed(555); x <- rnorm(30); sampledMed <- rep(NA,1000)
for(i in 1:1000){sampledMed[i] <- median(rnorm(30))}
medFunc <- function(x,i){median(x[i])}
bootMed <- boot(x, medFunc, 1000)
plot(density(bootMed$t), col="red", lwd=3)
lines(density(sampledMed), lwd=3)

# things you can't bootstrap!
set.seed(333); x <- rnorm(30); sampledMax <- rep(NA,1000)
for(i in 1:1000){sampledMax[i] <- max(rnorm(30))}
maxFunc <- function(x,i){max(x[i])}
bootMax <- boot(x, maxFunc, 1000)
plot(density(bootMax$t), col="red", lwd=3, xlim=c(1,3))
lines(density(sampledMax), lwd=3)

# medley library - first we need to install some extras...
library(devtools)
install_github("medley","mewo2")  # this takes a while...
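# NOTE (editor's addition, not from the original notes): the two-argument call
# install_github("medley","mewo2") above matches the older devtools API that was
# current when this script was written; newer devtools/remotes releases expect
# the "user/repo" form instead, along the lines of install_github("mewo2/medley").
# Treat the exact call as an assumption and check your installed devtools version.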
library(medley)
set.seed(453234)
# our dataframe: a response y plus six predictors built from it
y <- rnorm(1000)
x1 <- (y > 0); x2 <- y*rnorm(1000)
x3 <- rnorm(1000, mean=y, sd=1); x4 <- (y > 0) & (y < 3)
x5 <- rbinom(1000, size=4, prob=exp(y)/(1+exp(y)))
x6 <- (y < -2) | (y > 2)
data <- data.frame(y=y, x1=x1, x2=x2, x3=x3, x4=x4, x5=x5, x6=x6)
# training and testing subsets
train <- sample(1:1000, size=500)
trainData <- data[train,]; testData <- data[-train,]

# basic linear model
lm1 <- lm(y ~ ., data=trainData)  # "y ~ ." is the special formula to say "everything else"
rmse(predict(lm1, newdata=testData), testData$y)  # rmse() comes from medley; note predict() takes newdata=, not data=
# basic tree model
library(tree)
tree1 <- tree(y ~ ., data=trainData)
rmse(predict(tree1, newdata=testData), testData$y)
# combine lm1 with tree1 by averaging their predictions
combine1 <- predict(lm1, newdata=testData)/2 + predict(tree1, newdata=testData)/2
rmse(combine1, testData$y)
# another tree ~ fit to a training set sampled with replacement...
tree2 <- tree(y ~ ., data=trainData[sample(1:dim(trainData)[1], replace=TRUE),])
# another combination ~ now averaging three models...
combine2 <- (predict(lm1, newdata=testData)/3 +
             predict(tree1, newdata=testData)/3 +
             predict(tree2, newdata=testData)/3)
rmse(combine2, testData$y)

# bootstrap-aggregated trees (bagged trees)
library(ipred)
bagTree <- bagging(Species ~ ., data=iris, coob=TRUE)  # 25 trees by default
print(bagTree)
# is this correct? ~ yes!
testPred <- predict(bagTree, newdata=iris)
table(iris$Species, testPred)
# you'll note that the overall (resubstitution) error can be better than the oob error!

# Let's push it further...
train <- sample(1:150, size=75)  # 75 indices...
train
itr <- iris[train,]   # those IN the training set
ite <- iris[-train,]  # those OUT of the training set
# re-train
bagTree <- bagging(Species ~ ., data=itr, coob=TRUE)
testPred <- predict(bagTree, newdata=ite)
table(ite$Species, testPred)  # missed one for me...

# Random forests
library(randomForest)
forestIris <- randomForest(Species ~ Petal.Width + Petal.Length, data=iris, prox=TRUE)
forestIris
# again, let's check the resubstitution error (training-set prediction)
testPred <- predict(forestIris, newdata=iris)
table(iris$Species, testPred)
# again, the oob error can be greater than the overall (resubstitution) error...
# getting a single tree from a random forest (not super useful)
getTree(forestIris, k=2)

# assessing the variables' importance (see the sketch at the end of the script)
# plotting the class "centers"
iris.p <- classCenter(iris[,c(3,4)], iris$Species, forestIris$prox)
plot(iris[,3], iris[,4], pch=21, xlab=names(iris)[3], ylab=names(iris)[4],
     bg=c("red", "blue", "green")[as.numeric(factor(iris$Species))],
     main="Iris Data with Prototypes")
points(iris.p[,1], iris.p[,2], pch=21, cex=2, bg=c("red", "blue", "green"))

# next, use rfImpute...
# (rfImpute fills in missing predictor values using random-forest proximities,
#  so it is really meant for data that contains NAs)
y <- iris[,5]
x <- iris[,-5]
res <- rfImpute(x=x, y=y)
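# Editor's addition (a minimal sketch, not part of the original notes):
# the "assessing the variables' importance" comment above has no accompanying
# code. One common way to do it with the randomForest package is to refit with
# importance=TRUE and then tabulate/plot the importance measures. The object
# name forestIris2 is mine; importance() and varImpPlot() are standard
# randomForest functions.
forestIris2 <- randomForest(Species ~ ., data=iris, importance=TRUE)
importance(forestIris2)   # mean decrease in accuracy / Gini, per variable
varImpPlot(forestIris2)   # dot chart of the same importance measures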