# file of R code for IST380's week 7... # hierarchical clustering # create three clusters of data and plot it... set.seed(1234); par(mar=c(0,0,0,0)) x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2) y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) # using dist to get all pairwise distances... dataFrame <- data.frame(x=x,y=y) dists <- dist(dataFrame) # using hclust and plotting the dendrogram dataFrame <- data.frame(x=x,y=y) distxy <- dist(dataFrame,"euclidean") # different distances... # "manhattan", "maximum", see ?hclust for more hClustering <- hclust(distxy) par(oma=c(2,2,2,2)) par(mai=c(1,1,1,1)) plot(hClustering) # function "myplclust" for colored-leaf dendrograms myplclust <- function( hclust, lab.col, hang=0.1, ... ) { ## modifiction of plclust for plotting hclust objects *in colour*! ## Copyright Eva KF Chan 2009 ## Arguments: ## hclust: hclust object ## lab.col: colour for the labels; NA=default device foreground colour ## hang: as in hclust & plclust ## Side effect: A display of hierarchical cluster with coloured leaf labels. y <- rep(hclust$height,2); x <- as.numeric(hclust$merge) y <- y[which(x<0)]; x <- x[which(x<0)]; x <- abs(x) y <- y[order(x)]; x <- x[order(x)] plot( hclust, labels=FALSE, hang=hang, ... ) text( x=x, y=y[hclust$order]-(max(hclust$height)*hang), labels=hclust$order, col=lab.col[hclust$order], srt=90, adj=c(1,0.5), xpd=NA, ... ) } # example of that function in action... dataFrame <- data.frame(x=x,y=y) distxy <- dist(dataFrame) hClustering <- hclust(distxy) colors <- rep( c(1,2,3), each=4 ) # 1 1 1 1 2 2 2 2 3 3 3 3 myplclust(hClustering,lab.col=colors) # example using cutree... dataFrame <- data.frame(x=x,y=y) distxy <- dist(dataFrame) hClustering <- hclust(distxy) clusters <- cutree(hClustering,k=5) myplclust(hClustering,lab.col=clusters) # plotting the original points with cutree determining colors... par(oma=c(2,2,2,2)) par(mai=c(1,1,1,1)) plot(x,y,col=cutree(hClustering,k=5),pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12),cex=0.75) # here is the R-gallery dendrogram... http://gallery.r-enthusiasts.com/graph/Colored_Dendrogram_79 # and here is the code for it... require(fpc) require(A2R) # get that package from # http://addictedtor.free.fr/packages d.usa <- dist(USArrests, "euc") h.usa <- hclust(d.usa, method="ward") set.seed(1) some.factor <- letters[1:4][rbinom(50, prob=0.5, size=3)+1] hubertgamma <- rep(0,10) for(i in 1:10){ hubertgamma[i] <- cluster.stats(d.usa, cutree(h.usa, k=i+1), G2 = FALSE, G3 = FALSE, silhouette = FALSE)$hubertgamma } A2Rplot(h.usa, k=3, fact.sup=some.factor, criteria=hubertgamma, boxes = FALSE, col.up = "gray", col.down = c("orange","royalblue","green3")) # k-means examples in R # Here is a set of functions to see the algorithm # Run run_demo() to try it... pause <- function(message="") { cat( message, "") user <- readline("") } showall <- function(message,x,y,cluster.labels,cl_centers) { plot(x,y,col=cluster.labels,pch=19,cex=2) points(cl_centers,col=1:3,pch=3,cex=3,lwd=3) # old points pause(message) } dists <- function(df,p) { diffs <- apply( df, c(1), function(row){row-p} ) # works m <- matrix( unlist(diffs), nrow=dim(df)[1], byrow=T ) # ludicrous sqdists <- apply( m*m, c(1), sum ) # square dists dists <- vapply( sqdists, sqrt, 42.0 ) return(dists) } run_demo <- function() { set.seed(1234) par(mar=c(0,0,0,0)) x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2) y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) pause("Initial Points") # centers c(1,2), c(1.8,1), c(2.5,1.5) cl_centers = data.frame( rbind( c(1,2), c(1.8,1), c(2.5,1.5) ) ) names(cl_centers) <- c("x","y") cluster.labels <- rep(c(4),12) # all blue showall("Starting centers",x,y,cluster.labels,cl_centers) # compute distance to each center - add as a column! df <- data.frame(x,y) while (TRUE) { d1 <- dists(df,cl_centers[1,]) d2 <- dists(df,cl_centers[2,]) d3 <- dists(df,cl_centers[3,]) ddf <- data.frame(d1,d2,d3) old.cluster.labels <- cluster.labels cluster.labels <- apply( ddf, c(1), which.min ) if (all(old.cluster.labels == cluster.labels)) { cat("No points changed cluster... stopping\n") break } showall("Assign groups",x,y,cluster.labels,cl_centers) # need to get the new cluster centers... c1 <- apply( df[cluster.labels==1,], c(2), mean ) c2 <- apply( df[cluster.labels==2,], c(2), mean ) c3 <- apply( df[cluster.labels==3,], c(2), mean ) cl_centers <- rbind( c1, c2, c3 ) showall("Update centers",x,y,cluster.labels,cl_centers) # print the ss errors } # end of loop } # k-means with iris data in R # original data par(oma=c(1,1,1,1)); par(mai=c(1,1,1,1)) Xcol <- 1 # Sepal.Length Xname <- names(iris)[Xcol] Ycol <- 3 # Petal.Length Yname <- names(iris)[Ycol] plot(iris[,Xcol],iris[,Ycol],col=iris[,5], xlab=Xname, ylab=Yname) # with three centers km <- kmeans(iris[,-5],centers=3) Xcol <- 1 Xname <- names(iris)[Xcol] Ycol <- 3 Yname <- names(iris)[Ycol] plot(iris[,Xcol],iris[,Ycol],col=km$cluster, xlab=Xname, ylab=Yname) # amount of variance explained by three centers var_expl <- km$betweenss/km$totss cat("variance explained: ", var_expl) # try it with other numbers of centers # or even the same # of centers! # demo of how to create an "elbow plot" of the variance explained # by different values of k = number of centers # Note that this uses the iris data data(iris) # for "elbow" plot method of choosing # of centers var_explained <- function(num_centers) { km <- kmeans(iris[,-5],centers=num_centers) Xcol <- 1 Xname <- names(iris)[Xcol] Ycol <- 3 Yname <- names(iris)[Ycol] plot(iris[,Xcol],iris[,Ycol],col=km$cluster, xlab=Xname, ylab=Yname) var_expl <- km$betweenss/km$totss return(var_expl) } # demo for multiple centers... run_ncenters_demo <- function() { data(iris) km <- kmeans(iris[,-5],centers=3) Xcol <- 1 Xname <- names(iris)[Xcol] Ycol <- 3 Yname <- names(iris)[Ycol] #plot(iris[,Xcol],iris[,Ycol],col=km$cluster, # xlab=Xname, ylab=Yname) NUM_CENTERS_TESTED <- 7 results <- vector(mode="numeric",length=7) for (num_centers in 1:NUM_CENTERS_TESTED) { var_expl <- var_explained(num_centers) results[num_centers] <- var_expl } # plot variance explained for each number of centers... plot(1:NUM_CENTERS_TESTED, results, type="b",pch=16,col="black", xlab="number of centers",ylab="variance explained") } # here are two functions to plot the 784 pixels # and to plot a row of the above dataframe, respectively # example call: plot.data( d ) # d should be a row of 784 pixel values plot.data <- function( data ) { D = 28 # image is square with D pixels on a side im1 <- as.single(1.0-data/255) # adjust pixels im1 <- matrix( im1, nrow=D, ncol=D, byrow=T ) # make 2d image <- as.raster(im1) # make a raster (an image) # create a plot... something plot(c(1,D),c(1,D), type = "n", xlab="", ylab="") rasterImage(image,1,1,D,D,interpolate=F) } # example call: plot.digit( df, 42 ) # df is a dataframe of digits (with no labels/initial columns) # the rownumber is just that... plot.digit <- function( digs, rownumber=42 ) { D = 28 # image is square with D pixels on a side im1 <- digs[rownumber,] # get first row #im1 <- im1[,-c(1,2)] # remove first two columns plot.data(im1) }