# load Galton data library(UsingR); data(galton) par(mfrow=c(1,2)) hist(galton$child,col="blue",breaks=100) hist(galton$parent,col="blue",breaks=100) # average children's heights hist(galton$child,col="blue",breaks=100) meanChild <- mean(galton$child) lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5) # plot plot(galton$parent,galton$child,pch=19,col="blue") # jittered plot set.seed(1234) plot(jitter(galton$parent,factor=2),jitter(galton$child,factor=2),pch=19,col="blue") # average at 65 plot(galton$parent,galton$child,pch=19,col="blue") near65 <- galton[abs(galton$parent - 65)<1, ] points(near65$parent,near65$child,pch=19,col="red") lines(seq(64,66,length=100),rep(mean(near65$child),100),col="red",lwd=4) # average at 71 plot(galton$parent,galton$child,pch=19,col="blue") near71 <- galton[abs(galton$parent - 71)<1, ] points(near71$parent,near71$child,pch=19,col="red") lines(seq(70,72,length=100),rep(mean(near71$child),100),col="red",lwd=4) # plot and fit plot(galton$parent,galton$child,pch=19,col="blue") lm1 <- lm(galton$child ~ galton$parent) lines(galton$parent,lm1$fitted,col="red",lwd=3) # not this line plot(galton$parent,galton$child,pch=19,col="blue") lines(galton$parent, 26 + 0.646*galton$parent) # plot plot(galton$parent,galton$child,pch=19,col="blue") lines(galton$parent,lm1$fitted,col="red",lwd=3) # plot residuals par(mfrow=c(1,2)) plot(galton$parent,galton$child,pch=19,col="blue") lines(galton$parent,lm1$fitted,col="red",lwd=3) plot(galton$parent,lm1$residuals,col="blue",pch=19) abline(c(0,0),col="red",lwd=3) # fit a line library(UsingR); data(galton); plot(galton$parent,galton$child,pch=19,col="blue") lm1 <- lm(galton$child ~ galton$parent) lines(galton$parent,lm1$fitted,col="red",lwd=3) # 1 million newGalton <- data.frame(parent=rep(NA,1e6),child=rep(NA,1e6)) newGalton$parent <- rnorm(1e6,mean=mean(galton$parent),sd=sd(galton$parent)) newGalton$child <- lm1$coeff[1] + lm1$coeff[2]*newGalton$parent + rnorm(1e6,sd=sd(lm1$residuals)) smoothScatter(newGalton$parent,newGalton$child) abline(lm1,col="red",lwd=3) # sample set.seed(134325); sampleGalton1 <- newGalton[sample(1:1e6,size=50,replace=F),] sampleLm1 <- lm(sampleGalton1$child ~ sampleGalton1$parent) plot(sampleGalton1$parent,sampleGalton1$child,pch=19,col="blue") lines(sampleGalton1$parent,sampleLm1$fitted,lwd=3,lty=2) abline(lm1,col="red",lwd=3) # another sample sampleGalton2 <- newGalton[sample(1:1e6,size=50,replace=F),] sampleLm2 <- lm(sampleGalton2$child ~ sampleGalton2$parent) plot(sampleGalton2$parent,sampleGalton2$child,pch=19,col="blue") lines(sampleGalton2$parent,sampleLm2$fitted,lwd=3,lty=2) abline(lm1,col="red",lwd=3) # another sampleGalton3 <- newGalton[sample(1:1e6,size=50,replace=F),] sampleLm3 <- lm(sampleGalton3$child ~ sampleGalton3$parent) plot(sampleGalton3$parent,sampleGalton3$child,pch=19,col="blue") lines(sampleGalton3$parent,sampleLm3$fitted,lwd=3,lty=2) abline(lm1,col="red",lwd=3) # many samples sampleLm <- vector(100,mode="list") for(i in 1:100){ sampleGalton <- newGalton[sample(1:1e6,size=50,replace=F),] sampleLm[[i]] <- lm(sampleGalton$child ~ sampleGalton$parent) } # smooth scatter smoothScatter(newGalton$parent,newGalton$child) for(i in 1:100){abline(sampleLm[[i]],lwd=3,lty=2)} abline(lm1,col="red",lwd=3) # histogram of estimates par(mfrow=c(1,2)) hist(sapply(sampleLm,function(x){coef(x)[1]}),col="blue",xlab="Intercept",main="") hist(sapply(sampleLm,function(x){coef(x)[2]}),col="blue",xlab="Slope",main="") # more conf int par(mar=c(4,4,0,2));plot(1:10,type="n",xlim=c(0,1.5),ylim=c(0,100), xlab="Coefficient Values",ylab="Replication") for(i in 1:100){ ci <- confint(sampleLm[[i]]); color="red"; if((ci[2,1] < lm1$coeff[2]) & (lm1$coeff[2] < ci[2,2])){color = "grey"} segments(ci[2,1],i,ci[2,2],i,col=color,lwd=3) } lines(rep(lm1$coeff[2],100),seq(0,100,length=100),lwd=3) # confidence intervals sampleGalton4 <- newGalton[sample(1:1e6,size=50,replace=F),] sampleLm4 <- lm(sampleGalton4$child ~ sampleGalton4$parent) summary(sampleLm4) confint(sampleLm4,level=0.95)