# notes for Lecture 5 of IST 380

# correlations with R
?cor

# colors...
colors()
# two ways to specify a color for a plot's col= argument:
col=rgb(1.0,1.0,0.5)
col="thistle3"

# to try things out:
plot( 0,0 )
rect( 0,0,100,100,col="red" )
rect( 0,0,100,100,col="thistle3" )

# strings and regular expressions
library(stringr)
str_length( "hi there!" )
str_count( "mississippi", "s" )        # count the "s"s in "mississippi"
str_replace_all( "sing", "s", "str" )  # replace each "s" in "sing" with "str"

# test of matching and extracting...
strings <- c( " 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
              "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
              "239 923 8115", "842 566 4692", "Work: 579-499-7527", "$1000",
              "Home: 543.355.3679" )

# a regular expression pattern:
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

# try it...
str_match( strings, phone )
str_extract( strings, phone )

#
# next section:
#
# prediction with trees
#

# Clinton vs. Obama counties: http://graphics8.nytimes.com/images/2008/04/16/us/0416-nat-subOBAMA.jpg

# Iris data...
data(iris)
names(iris)
View(iris)   # to show the data in RStudio's data viewer

# plotting the Iris data
plot(iris$Petal.Width,iris$Sepal.Width,pch=19,col=as.numeric(iris$Species))
legend(1,4.5,legend=unique(iris$Species),col=unique(as.numeric(iris$Species)),pch=19)

# creating the first tree
library(tree)   # you may need to install this package...
tree1 <- tree(Species ~ Sepal.Width + Petal.Width, data=iris)
summary(tree1)

# plotting the tree
plot(tree1)
text(tree1)   # the text overlaps too much here...
# next week we will look at the art and science of "nice" tree plots

# here, we'll show an alternative plot, e.g., a partition graph
plot(iris$Petal.Width,iris$Sepal.Width,pch=18,col=as.numeric(iris$Species))
partition.tree(tree1,label="Species",add=TRUE)
legend(1.75,4.5,legend=unique(iris$Species),col=unique(as.numeric(iris$Species)),pch=18)

# this creates random data for Petal.Width and Sepal.Width
# and then uses the tree to predict which species each point would be...
#
# below, note that "runif" stands for "random uniform," so that runif(20,0,2.5)
# produces 20 random numbers drawn uniformly from the interval 0 to 2.5
set.seed(32313)
newdata <- data.frame(Petal.Width = runif(20,0,2.5), Sepal.Width = runif(20,2,4.5))

# now, we predict the output on those random data...
pred1 <- predict(tree1, newdata)
pred1

# here we plot the random data and the predictions
# the argument type="class" below indicates that R should not
# yield probabilities (as in the table above), but should choose
# the single value ("class") with the highest probability as its prediction
#
# take a look at pred1 after this next line to see this "class"ifier...
pred1 <- predict(tree1, newdata, type="class")
plot(newdata$Petal.Width,newdata$Sepal.Width,col=as.numeric(pred1),pch=18)
partition.tree(tree1,"Species",add=TRUE)

# a table of classifications
table(iris$Species, predict(tree1,type="class"))

# Cross-validation aside...

# our data - no relationship between z's coinflips and x,y
set.seed(12345)
x <- rnorm(10); y <- rnorm(10); z <- rbinom(10,size=1,prob=0.5)
plot(x,y,pch=19,col=(z+3))

# classifier: a sketch of the kind of rule a small tree would find
# (not necessarily the exact rule from the slides)
x1 <- x; y1 <- y; z1 <- z                                            # sample 1: the data above
x2 <- rnorm(10); y2 <- rnorm(10); z2 <- rbinom(10,size=1,prob=0.5)   # sample 2: a fresh draw
zhat2 <- as.numeric( (-0.2 < y2) & (y2 < 0.5) )   # a rule picked to fit sample 1, applied to sample 2
par(mfrow=c(1,3))
plot(x1,y1,col=(z1+3),pch=19); plot(x2,y2,col=(z2+3),pch=19); plot(x2,y2,col=(zhat2+3),pch=19)

# back to the tree-building...
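
# (a quick sketch, not from the slides) the classification table above shows the
# tree's performance on the very data it was fit to, so the error rate it implies
# is optimistic; here is one way to compute that "resubstitution" error rate,
# before letting cross-validation give a more honest estimate below:
tab <- table(iris$Species, predict(tree1, type="class"))
1 - sum(diag(tab))/sum(tab)   # fraction of training rows the full tree misclassifies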
# using cross-validation for checking the Iris tree-building procedure
par(mfrow=c(1,1))
plot(cv.tree(tree1,FUN=prune.tree,method="misclass"))

# with deviance instead of misclassification
plot(cv.tree(tree1,FUN=prune.tree))

# build a three-leaf classification tree:
t3 <- prune.tree(tree1, best=3)

# see how it does (resubstitution error) versus the whole tree
table(iris$Species, predict(t3,type="class"))
table(iris$Species, predict(tree1,type="class"))

# to see the text description in R
t3

#
# another tree-based predictive model, with the Cars93 dataset
#

# read about the dataset with ??Cars93 ...
data(Cars93, package="MASS")   # get the dataset
head(Cars93)
??Cars93

# build a tree model of the drive-train type, based on... EVERYTHING
treeCars <- tree(DriveTrain ~ MPG.city + MPG.highway + AirBags + EngineSize +
                   Width + Length + Weight + Price + Cylinders + Horsepower +
                   Wheelbase, data=Cars93)
plot(treeCars)
text(treeCars)   # messy!

# plotting the cross-validated errors for the tree... (both kinds: misclassification and deviance)
par(mfrow=c(1,2))
plot(cv.tree(treeCars,FUN=prune.tree,method="misclass"))
plot(cv.tree(treeCars))

# pruning the tree to four leaves
pruneTree <- prune.tree(treeCars, best=4)
plot(pruneTree)
text(pruneTree)

# showing the pruned tree's and original tree's resubstitution errors:
table(Cars93$DriveTrain, predict(pruneTree,type="class"))
table(Cars93$DriveTrain, predict(treeCars,type="class"))

#
# extra!  Logistic Regression...
#

# get the Ravens data (ravensData.rda should be in the working directory)
load("ravensData.rda")
head(ravensData)

# run an ordinary linear regression
lmRavens <- lm(ravensData$ravenWinNum ~ ravensData$ravenScore)
summary(lmRavens)

# plot the results
plot(ravensData$ravenScore, lmRavens$fitted, pch=19, col="blue",
     ylab="Prob Win", xlab="Ravens Score")

# a logistic regression (fitting log-odds instead):
logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore, family="binomial")
summary(logRegRavens)

# Note on AIC: AIC is the "Akaike information criterion."
# Given a set of candidate models for the same data, the preferred model is
# the one with the minimum AIC value.

# plot of the logistic fit:
plot(ravensData$ravenScore, logRegRavens$fitted, pch=19, col="blue",
     xlab="Score", ylab="Prob Ravens Win")

# finding the results in terms of the _odds_ of a win...
exp(logRegRavens$coeff)
exp(confint(logRegRavens))   # and 95% confidence intervals

# finding the statistical strength via anova (analysis of deviance)
anova(logRegRavens, test="Chisq")

# here are the missing-data examples at the end of the slides...
i <- iris                  # copy iris into i
i[30,]$Sepal.Width <- NA   # blank out one value

# try with na.omit
tr_omit <- tree(Species ~ Sepal.Width + Petal.Width, data=i, na.action=na.omit)
summary(tr_omit)

# try with na.pass
tr_pass <- tree(Species ~ Sepal.Width + Petal.Width, data=i, na.action=na.pass)
summary(tr_pass)
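
# (a quick check, not from the slides) na.omit removes rows with missing values
# from the model frame before the tree is grown, while na.pass leaves the
# incomplete row in; counting the complete cases shows what na.omit works with:
sum(complete.cases(i[, c("Species","Sepal.Width","Petal.Width")]))   # 149 complete rows
nrow(i)                                                              # 150 rows in all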