# notes for Lecture 5 of IST 380

# correlations with R
?cor

# colors...
colors()
# two ways to specify a color for a plot's col= argument:
col=rgb(1.0,1.0,0.5)
col="thistle3"

# to try things out:
plot( 0,0 )
rect( 0,0,100,100,col="red" )
rect( 0,0,100,100,col="thistle3" )

# strings and regular expressions
library(stringr)
str_length( "hi there!" )
str_count( "mississippi", "s" )        # count the "s"s in "mississippi"
str_replace_all( "sing", "s", "str" )  # replace each "s" in "sing" with "str"

# test of matching and extracting...
strings <- c( " 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
              "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
              "239 923 8115", "842 566 4692", "Work: 579-499-7527", "$1000",
              "Home: 543.355.3679" )

# a regular expression pattern:
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

# try it...
str_match( strings, phone )
str_extract( strings, phone )

#
# next section:
#
# prediction with trees
#

# Clinton vs. Obama counties: http://graphics8.nytimes.com/images/2008/04/16/us/0416-nat-subOBAMA.jpg

# Iris data...
data(iris)
names(iris)
View(iris)   # to show the data in RStudio's data viewer

# plotting the Iris data
plot(iris$Petal.Width,iris$Sepal.Width,pch=19,col=as.numeric(iris$Species))
legend(1,4.5,legend=unique(iris$Species),col=unique(as.numeric(iris$Species)),pch=19)

# creating the first tree
library(tree)   # you may need to install this package...
tree1 <- tree(Species ~ Sepal.Width + Petal.Width, data=iris)
summary(tree1)

# plotting the tree
plot(tree1)
text(tree1)   # the text overlaps too much here...
# next week we will look at the art and science of "nice" tree plots

# here, we'll show an alternative plot, e.g., a partition graph
plot(iris$Petal.Width,iris$Sepal.Width,pch=18,col=as.numeric(iris$Species))
partition.tree(tree1,label="Species",add=TRUE)
legend(1.75,4.5,legend=unique(iris$Species),col=unique(as.numeric(iris$Species)),pch=18)

# this creates random data for Petal.Width and Sepal.Width
# and then uses the tree to predict which species each point would be...
#
# below, note that "runif" stands for "random uniform," so that runif(20,0,2.5)
# produces 20 random numbers drawn uniformly from the interval 0 to 2.5
set.seed(32313)
newdata <- data.frame(Petal.Width = runif(20,0,2.5), Sepal.Width = runif(20,2,4.5))

# now, we predict the output on those random data...
pred1 <- predict(tree1, newdata)
pred1

# here we plot the random data and the predictions
# the argument type="class" below indicates that R should not
# yield probabilities (as in the table above), but should choose
# the single value ("class") with the highest probability as its prediction
#
# take a look at pred1 after this next line to see this "class"ifier...
pred1 <- predict(tree1, newdata, type="class")
plot(newdata$Petal.Width,newdata$Sepal.Width,col=as.numeric(pred1),pch=18)
partition.tree(tree1,"Species",add=TRUE)

# a table of classifications
table(iris$Species, predict(tree1,type="class"))

# Cross-validation aside...

# our data - no relationship between z's coinflips and x,y
set.seed(12345)
x <- rnorm(10); y <- rnorm(10); z <- rbinom(10,size=1,prob=0.5)
plot(x,y,pch=19,col=(z+3))

# classifier: a sketch of the kind of rule a small tree would find
# (not necessarily the exact rule from the slides)
x1 <- x; y1 <- y; z1 <- z                                            # sample 1: the data above
x2 <- rnorm(10); y2 <- rnorm(10); z2 <- rbinom(10,size=1,prob=0.5)   # sample 2: a fresh draw
zhat2 <- as.numeric( (-0.2 < y2) & (y2 < 0.5) )   # a rule picked to fit sample 1, applied to sample 2
par(mfrow=c(1,3))
plot(x1,y1,col=(z1+3),pch=19); plot(x2,y2,col=(z2+3),pch=19); plot(x2,y2,col=(zhat2+3),pch=19)

# back to the tree-building...
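
# (a quick sketch, not from the slides) the classification table above shows the
# tree's performance on the very data it was fit to, so the error rate it implies
# is optimistic; here is one way to compute that "resubstitution" error rate,
# before letting cross-validation give a more honest estimate below:
tab <- table(iris$Species, predict(tree1, type="class"))
1 - sum(diag(tab))/sum(tab)   # fraction of training rows the full tree misclassifies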
# using cross-validation for checking the Iris tree-building procedure
par(mfrow=c(1,1))
plot(cv.tree(tree1,FUN=prune.tree,method="misclass"))

# with deviance instead of misclassification
plot(cv.tree(tree1,FUN=prune.tree))

# build a three-leaf classification tree:
t3 <- prune.tree(tree1, best=3)

# see how it does (resubstitution error) versus the whole tree
table(iris$Species, predict(t3,type="class"))
table(iris$Species, predict(tree1,type="class"))

# to see the text description in R
t3

#
# another tree-based predictive model, with the Cars93 dataset
#

# read about the dataset with ??Cars93 ...
data(Cars93, package="MASS")   # get the dataset
head(Cars93)
??Cars93

# build a tree model of the drive-train type, based on... EVERYTHING
treeCars <- tree(DriveTrain ~ MPG.city + MPG.highway + AirBags + EngineSize +
                   Width + Length + Weight + Price + Cylinders + Horsepower +
                   Wheelbase, data=Cars93)
plot(treeCars)
text(treeCars)   # messy!

# plotting the cross-validated errors for the tree... (both kinds: misclassification and deviance)
par(mfrow=c(1,2))
plot(cv.tree(treeCars,FUN=prune.tree,method="misclass"))
plot(cv.tree(treeCars))

# pruning the tree to four leaves
pruneTree <- prune.tree(treeCars, best=4)
plot(pruneTree)
text(pruneTree)

# showing the pruned tree's and original tree's resubstitution errors:
table(Cars93$DriveTrain, predict(pruneTree,type="class"))
table(Cars93$DriveTrain, predict(treeCars,type="class"))

#
# extra!  Logistic Regression...
#

# get the Ravens data (ravensData.rda should be in the working directory)
load("ravensData.rda")
head(ravensData)

# run an ordinary linear regression
lmRavens <- lm(ravensData$ravenWinNum ~ ravensData$ravenScore)
summary(lmRavens)

# plot the results
plot(ravensData$ravenScore, lmRavens$fitted, pch=19, col="blue",
     ylab="Prob Win", xlab="Ravens Score")

# a logistic regression (fitting log-odds instead):
logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore, family="binomial")
summary(logRegRavens)

# Note on AIC: AIC is the "Akaike information criterion."
# Given a set of candidate models for the same data, the preferred model is
# the one with the minimum AIC value.

# plot of the logistic fit:
plot(ravensData$ravenScore, logRegRavens$fitted, pch=19, col="blue",
     xlab="Score", ylab="Prob Ravens Win")

# finding the results in terms of the _odds_ of a win...
exp(logRegRavens$coeff)
exp(confint(logRegRavens))   # and 95% confidence intervals

# finding the statistical strength via anova (analysis of deviance)
anova(logRegRavens, test="Chisq")

# here are the missing-data examples at the end of the slides...
i <- iris                  # copy iris into i
i[30,]$Sepal.Width <- NA   # blank out one value

# try with na.omit
tr_omit <- tree(Species ~ Sepal.Width + Petal.Width, data=i, na.action=na.omit)
summary(tr_omit)

# try with na.pass
tr_pass <- tree(Species ~ Sepal.Width + Petal.Width, data=i, na.action=na.pass)
summary(tr_pass)
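
# (a quick check, not from the slides) na.omit removes rows with missing values
# from the model frame before the tree is grown, while na.pass leaves the
# incomplete row in; counting the complete cases shows what na.omit works with:
sum(complete.cases(i[, c("Species","Sepal.Width","Petal.Width")]))   # 149 complete rows
nrow(i)                                                              # 150 rows in all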