# Part 0: using predict to predict from a linear model

# just one slide...
y <- seq(-6,6,1)
x <- seq(-3,3,.5)
d <- data.frame( y, x )
attach(d)
plot(x,y)
m <- lm( y ~ x )
predict( m, newdata=data.frame(x=42))
predict( m, newdata=data.frame(x=42), interval="confidence")
summary(m)
detach()


# Part 1: inference and p-values...

# slide 1
library(UsingR); data(galton);
plot(galton$parent,galton$child,pch=19,col="blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)

# null t distribution
x <- seq(-20,20,length=100)
plot(x,dt(x,df=(928-2)),col="blue",lwd=3,type="l")

# with arrow
x <- seq(-20,20,length=100)
plot(x,dt(x,df=(928-2)),col="blue",lwd=3,type="l")
arrows(summary(lm1)$coeff[2,3],0.25,summary(lm1)$coeff[2,3],0,col="red",lwd=4)

# a quick simulated example
set.seed(9898324)
yValues <- rnorm(10); xValues <- rnorm(10)
lm2 <- lm(yValues ~ xValues)
summary(lm2)

# with arrow
x <- seq(-5,5,length=100)
plot(x,dt(x,df=(10-2)),col="blue",lwd=3,type="l")
arrows(summary(lm2)$coeff[2,3],0.25,summary(lm2)$coeff[2,3],0,col="red",lwd=4)

# with polygons to show how common this is...
xCoords <- seq(-5,5,length=100)
plot(xCoords,dt(xCoords,df=(10-2)),col="blue",lwd=3,type="l")
xSequence <- c(seq(summary(lm2)$coeff[2,3],5,length=10),summary(lm2)$coeff[2,3])
ySequence <- c(dt(seq(summary(lm2)$coeff[2,3],5,length=10),df=8),0)
polygon(xSequence,ySequence,col="red"); polygon(-xSequence,ySequence,col="red")

# lots of data without signal
set.seed(8323); pValues <- rep(NA,100)
for(i in 1:100){
  xValues <- rnorm(20);yValues <- rnorm(20)
  pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4]
}
hist(pValues,col="blue",main="",freq=F)
abline(h=1,col="red",lwd=3)

# lots of data with signal
set.seed(8323); pValues <- rep(NA,100)
for(i in 1:100){
  xValues <- rnorm(20);yValues <- 0.2 * xValues + rnorm(20)
  pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4]
}
hist(pValues,col="blue",main="",freq=F,xlim=c(0,1)); abline(h=1,col="red",lwd=3)

# again, more with signal!
set.seed(8323); pValues <- rep(NA,100)
for(i in 1:100){
  xValues <- rnorm(100);yValues <- 0.2* xValues + rnorm(100)
  pValues[i] <- summary(lm(yValues ~ xValues))$coeff[2,4]
}
hist(pValues,col="blue",main="",freq=F,xlim=c(0,1)); abline(h=1,col="red",lwd=3)

# interpret the results...
summary(lm(galton$child ~ galton$parent))$coeff


# Part 2 - regression with factor variables

# getting the movie data!
download.file("http://www.rossmanchance.com/iscam2/data/movies03RT.txt",destfile="./movies.txt")
movies <- read.table("./movies.txt",sep="\t",header=T,quote="")
head(movies)

# plot review score vs. movie rating
plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19)
axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) 

# plot average score by rating
plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19) 
axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) 
meanRatings <- tapply(movies$score,movies$rating,mean) 
points(1:4,meanRatings,col="red",pch="-",cex=5)

# getting the summary in R
lm1 <- lm(movies$score ~ as.factor(movies$rating)) 
summary(lm1)


# plotting the fitted values
plot(movies$score ~ jitter(as.numeric(movies$rating)),col="blue",xaxt="n",pch=19) 
axis(side=1,at=unique(as.numeric(movies$rating)),labels=unique(movies$rating)) 
points(1:4,lm1$coeff[1] + c(0,lm1$coeff[2:4]),col="red",pch="-",cex=5)

# Question 1 in R
lm1 <- lm(movies$score ~ as.factor(movies$rating)) 
summary(lm1)

# Question 1 in R (confidence intervals)
lm1 <- lm(movies$score ~ as.factor(movies$rating)) 
confint(lm1)

# Question 2 in R
lm2 <- lm(movies$score ~ relevel(movies$rating,ref="R")) 
summary(lm2)

# Question 2 in R (confidence intervals)
lm2 <- lm(movies$score ~ relevel(movies$rating,ref="R")) 
confint(lm2)

# Question 3 in R
lm1 <- lm(movies$score ~ as.factor(movies$rating)) 
anova(lm1)

# sum of squares of G movies
gMovies <- movies[movies$rating=="G",]
xVals <- seq(0.2,0.8,length=4)
plot(xVals,gMovies$score,ylab="Score",xaxt="n",xlim=c(0,1),pch=19)
abline(h=mean(gMovies$score),col="blue",lwd=3)
abline(h=mean(movies$score),col="red",lwd=3)
segments(xVals+0.01,rep(mean(gMovies$score),length(xVals)),
         xVals+0.01,rep(mean(movies$score),length(xVals)),col="red",lwd=2)
segments(xVals-0.01,gMovies$score,xVals-0.01,rep(mean(gMovies$score),length(xVals)),col="blue")

# Tukey's HSD test
lm1 <- aov(movies$score ~ as.factor(movies$rating)) 
TukeyHSD(lm1)


# Part 3 ~ graphs and data exploration...

# reading the data
pData <- read.csv("ss06pid.csv") 

# boxplot
boxplot(pData$AGEP,col="blue")

# two boxplots
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")

# comparative scales
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)

# barplot
barplot(table(pData$CIT),col="blue")

# hist
hist(pData$AGEP,col="blue")

# thinner bins
hist(pData$AGEP,col="blue",breaks=100,main="Age")

# density plots - using a density object 
dens <- density(pData$AGEP)
plot(dens,lwd=3,col="blue")

# multiple distributions for density plots...
dens <- density(pData$AGEP)
densMales <- density(pData$AGEP[which(pData$SEX==1)])
plot(dens,lwd=3,col="blue")
lines(densMales,lwd=3,col="orange")

# scatterplot
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue")

# scatterplot - changing the size 
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)

# scatterplot with color
plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$SEX,cex=0.5)

# scatterplot with size
percentMaxAge <- pData$AGEP/max(pData$AGEP)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=percentMaxAge*0.5)

# overlaying lines and points
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)
lines(rep(100,dim(pData)[1]),pData$WAGP,col="grey",lwd=5)
points(seq(0,200,length=100),seq(0,20e5,length=100),col="red",pch=19)

# using numeric values as factors with cut2
# first - be sure that package Hmisc is installed! 
library(Hmisc)
ageGroups <- cut2(pData$AGEP,g=5)
plot(pData$JWMNP,pData$WAGP,pch=19,col=ageGroups,cex=0.5)

# if you have a lot of points:
x <- rnorm(1e5)
y <- rnorm(1e5)
plot(x,y,pch=19)

# you can sample them
x <- rnorm(1e5)
y <- rnorm(1e5)
sampledValues <- sample(1:1e5,size=1000,replace=FALSE)
plot(x[sampledValues],y[sampledValues],pch=19)

# we've seen this before, too...
x <- rnorm(1e5)
y <- rnorm(1e5)
smoothScatter(x,y)

# here's anotehr one that uses intensity
library(hexbin)
x <- rnorm(1e5)
y <- rnorm(1e5)
hbo <- hexbin(x,y)
plot(hbo)

# Q-Q plots to compare distributions
x <- rnorm(20); y <- rnorm(20)
qqplot(x,y)
abline(c(0,1))

# matrix plotting by row... difficult to read...
X <- matrix(rnorm(20*5),nrow=20)
matplot(X,type="b")

# heatmaps may be better...
image(1:10,161:236,as.matrix(pData[1:10,161:236]))

# there had been a question about transpose...
newMatrix <- as.matrix(pData[1:10,161:236])
newMatrix <- t(newMatrix)[,nrow(newMatrix):1]
image(161:236, 1:10, newMatrix)

# not as good as ESRI's, but there are still maps...
library(maps)
map("world")
lat <- runif(40,-180,180); lon <- runif(40,-90,90)
points(lat,lon,col="blue",pch=19)

# missing values in plots
x <- c(NA,NA,NA,4,5,6,7,8,9,10)
y <- 1:10
plot(x,y,pch=19,xlim=c(0,11),ylim=c(0,11))

# more missing values...
x <- rnorm(100)
y <- rnorm(100)
y[x < 0] <- NA
boxplot(x ~ is.na(y))

# see R Graph Gallery!