# Part 0: using predict to predict from a linear model # just one slide... y <- seq(-6,6,1) x <- seq(-3,3,.5) d <- data.frame( y, x ) attach(d) plot(x,y) m <- lm( y ~ x ) predict( m, newdata=data.frame(x=42)) predict( m, newdata=data.frame(x=42), interval="confidence") summary(m) detach() # Part 1: inference and p-values... # slide 1 library(UsingR); data(galton); plot(galton$parent,galton$child,pch=19,col="blue") lm1 <- lm(galton$child ~ galton$parent) lines(galton$parent,lm1$fitted,col="red",lwd=3) # null t distribution x <- seq(-20,20,length=100) plot(x,dt(x,df=(928-2)),col="blue",lwd=3,type="l") # with arrow x <- seq(-20,20,length=100) plot(x,dt(x,df=(928-2)),col="blue",lwd=3,type="l") arrows(summary(lm1)$coeff[2,3],0.25,summary(lm1)$coeff[2,3],0,col="red",lwd=4) # a quick simulated example set.seed(9898324) yValues <- rnorm(10); xValues <- rnorm(10) lm2 <- lm(yValues ~ xValues) summary(lm2) # with arrow x <- seq(-5,5,length=100) plot(x,dt(x,df=(10-2)),col="blue",lwd=3,type="l") arrows(summary(lm2)$coeff[2,3],0.25,summary(lm2)$coeff[2,3],0,col="red",lwd=4) # with polygons to show how common this is... xCoords <- seq(-5,5,length=100) plot(xCoords,dt(xCoords,df=(10-2)),col="blue",lwd=3,type="l") xSequence <- c(seq(summary(lm2)$coeff[2,3],5,length=10),summary(lm2)$coeff[2,3]) ySequence <- c(dt(seq(summary(lm2)$coeff[2,3],5,length=10),df=8),0) polygon(xSequence,ySequence,col="red"); polygon(-xSequence,ySequence,col="red") # lots of data without signal set.seed(8323); pValues <- rep(NA,100) for(i in 1:100){ xValues <- rnorm(20);yValues <- rnorm(20) pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4] } hist(pValues,col="blue",main="",freq=F) abline(h=1,col="red",lwd=3) # lots of data with signal set.seed(8323); pValues <- rep(NA,100) for(i in 1:100){ xValues <- rnorm(20);yValues <- 0.2 * xValues + rnorm(20) pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4] } hist(pValues,col="blue",main="",freq=F,xlim=c(0,1)); abline(h=1,col="red",lwd=3) # again, more with signal! set.seed(8323); pValues <- rep(NA,100) for(i in 1:100){ xValues <- rnorm(100);yValues <- 0.2* xValues + rnorm(100) pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4] } hist(pValues,col="blue",main="",freq=F,xlim=c(0,1)); abline(h=1,col="red",lwd=3) # interpret the results... summary(lm(galton$child ~ galton$parent))$coeff # Part 2 - regression with factor variables # getting the movie data! download.file("http://www.rossmanchance.com/iscam2/data/movies03RT.txt",destfile="./movies.txt") movies <- read.table("./movies.txt",sep="\t",header=T,quote="") head(movies) # plot review score vs. movie rating plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19) axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) # plot average score by rating plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19) axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) meanRatings <- tapply(movies$score,movies$rating,mean) points(1:4,meanRatings,col="red",pch="-",cex=5) # getting the summary in R lm1 <- lm(movies$score ~ as.factor(movies$rating)) summary(lm1) # plotting the fitted values plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19) axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) points(1:4,lm1$coeff[1] + c(0,lm1$coeff[2:4]),col="red",pch="-",cex=5) # Question 1 in R lm1 <- lm(movies$score ~ as.factor(movies$rating)) summary(lm1) # Question 1 in R (confidence intervals) lm1 <- lm(movies$score ~ as.factor(movies$rating)) confint(lm1) # Question 2 in R lm2 <- lm(movies$score ~ relevel(movies$rating,ref="R")) summary(lm2) # Question 2 in R (confidence intervals) lm2 <- lm(movies$score ~ relevel(movies$rating,ref="R")) confint(lm2) # Question 3 in R lm1 <- lm(movies$score ~ as.factor(movies$rating)) anova(lm1) # sum of squares of G movies gMovies <- movies[movies$rating=="G",] xVals <- seq(0.2,0.8,length=4) plot(xVals,gMovies$score,ylab="Score",xaxt="n",xlim=c(0,1),pch=19) abline(h=mean(gMovies$score),col="blue",lwd=3) abline(h=mean(movies$score),col="red",lwd=3) segments(xVals+0.01,rep(mean(gMovies$score),length(xVals)), xVals+0.01,rep(mean(movies$score),length(xVals)),col="red",lwd=2) segments(xVals-0.01,gMovies$score,xVals-0.01,rep(mean(gMovies$score),length(xVals)),col="blue") # Tukey's HSD test lm1 <- aov(movies$score ~ as.factor(movies$rating)) TukeyHSD(lm1) # Part 3 ~ graphs and data exploration... # reading the data pData <- read.csv("ss06pid.csv") # boxplot boxplot(pData$AGEP,col="blue") # two boxplots boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue") # comparative scales boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE) # barplot barplot(table(pData$CIT),col="blue") # hist hist(pData$AGEP,col="blue") # thinner bins hist(pData$AGEP,col="blue",breaks=100,main="Age") # density plots - using a density object dens <- density(pData$AGEP) plot(dens,lwd=3,col="blue") # multiple distributions for density plots... dens <- density(pData$AGEP) densMales <- density(pData$AGEP[which(pData$SEX==1)]) plot(dens,lwd=3,col="blue") lines(densMales,lwd=3,col="orange") # scatterplot plot(pData$JWMNP,pData$WAGP,pch=19,col="blue") # scatterplot - changing the size plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5) # scatterplot with color plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$SEX,cex=0.5) # scatterplot with size percentMaxAge <- pData$AGEP/max(pData$AGEP) plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=percentMaxAge*0.5) # overlaying lines and points plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5) lines(rep(100,dim(pData)[1]),pData$WAGP,col="grey",lwd=5) points(seq(0,200,length=100),seq(0,20e5,length=100),col="red",pch=19) # using numeric values as factors with cut2 # first - be sure that package Hmisc is installed! library(Hmisc) ageGroups <- cut2(pData$AGEP,g=5) plot(pData$JWMNP,pData$WAGP,pch=19,col=ageGroups,cex=0.5) # if you have a lot of points: x <- rnorm(1e5) y <- rnorm(1e5) plot(x,y,pch=19) # you can sample them x <- rnorm(1e5) y <- rnorm(1e5) sampledValues <- sample(1:1e5,size=1000,replace=FALSE) plot(x[sampledValues],y[sampledValues],pch=19) # we've seen this before, too... x <- rnorm(1e5) y <- rnorm(1e5) smoothScatter(x,y) # here's anotehr one that uses intensity library(hexbin) x <- rnorm(1e5) y <- rnorm(1e5) hbo <- hexbin(x,y) plot(hbo) # Q-Q plots to compare distributions x <- rnorm(20); y <- rnorm(20) qqplot(x,y) abline(c(0,1)) # matrix plotting by row... difficult to read... X <- matrix(rnorm(20*5),nrow=20) matplot(X,type="b") # heatmaps may be better... image(1:10,161:236,as.matrix(pData[1:10,161:236])) # there had been a question about transpose... newMatrix <- as.matrix(pData[1:10,161:236]) newMatrix <- t(newMatrix)[,nrow(newMatrix):1] image(161:236, 1:10, newMatrix) # not as good as ESRI's, but there are still maps... library(maps) map("world") lat <- runif(40,-180,180); lon <- runif(40,-90,90) points(lat,lon,col="blue",pch=19) # missing values in plots x <- c(NA,NA,NA,4,5,6,7,8,9,10) y <- 1:10 plot(x,y,pch=19,xlim=c(0,11),ylim=c(0,11)) # more missing values... x <- rnorm(100) y <- rnorm(100) y[x < 0] <- NA boxplot(x ~ is.na(y)) # see R Graph Gallery!