# K-means clustering

# made-up data
cl1 = matrix(rnorm(100, sd = 0.3), ncol = 2)
cl2 = matrix(rnorm(100, mean = 1.5, sd = 0.3), ncol = 2)
plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2))
points(cl2)

# combine the two datasets, and let's see
# whether the clustering method can take them apart again
data = rbind(cl1, cl2)
clustering = kmeans(data, 2)
clustering

# how good is the clustering?
clustering$withinss
clustering$betweenss

# visualizing the results of the clustering
plot(data, col = clustering$cluster)
points(clustering$centers, col = 1:2, pch = 8, cex = 2)

# For a more challenging dataset, set the second mean to 1.0,
# or try 3-4 clusters. In that case, use nstart = 25.

# k-means clustering, now with real data
nyt = read.table("/Users/katrinerk/Teaching/2012/spring/ald/assignments/hw1/nyt_opinion.txt",
                 header = TRUE)
# content words only
nyt.cl = kmeans(nyt[, 13:29], 4, nstart = 25)

# inspecting the clusters
nyt$cl = nyt.cl$cluster
xtabs(~ nyt$cl + nyt$Author)
aggregate(nyt[, 13:29], list(nyt$cl), sum)

# hierarchical clustering: bottom-up (agglomerative)
library(languageR)
head(lexicalMeasures)

# find pairwise correlations between measures,
# collected in a correlation matrix.
# use a non-parametric, rank-based correlation: Spearman
lexicalMeasures.cor = cor(lexicalMeasures[, -1], method = "spearman")

# we only want positive distances for the hierarchical clustering,
# so square the correlations
lexicalMeasures.cor = lexicalMeasures.cor^2

# convert to distances:
# this computes the Euclidean distance between the rows of the
# correlation matrix, that is, the distance between the vectors
# of correlations for any two lexical measures
lexicalMeasures.dist = dist(lexicalMeasures.cor)

# this can now be used as input to hierarchical clustering.
# the default linkage method is "complete"
lexicalMeasures.clust = hclust(lexicalMeasures.dist)
# and visualize (older scripts use plclust(), which is now defunct)
plot(lexicalMeasures.clust)

# the choice of linkage method does make a difference, though
par(mfrow = c(1, 2))
plot(hclust(lexicalMeasures.dist, method = "complete"))
plot(hclust(lexicalMeasures.dist, method = "ward.D"))  # "ward" in R < 3.1.0

# The function varclus() from the Hmisc package (attached by rms)
# combines all the above steps for an inspection of the relations
# between variables: it computes the pairwise correlations and does
# hierarchical clustering on the vectors that hold the squared
# correlations for each variable
library(rms)
plot(varclus(as.matrix(lexicalMeasures[, -1])))

# hierarchical clustering: top-down (divisive)
library(cluster)
pltree(diana(lexicalMeasures.dist))

# doing actual clustering with divisive clustering:
# we again need to decide on the number of clusters
cutree(diana(lexicalMeasures.dist), 5)

# analyzing the clusters
x = data.frame(measure = rownames(lexicalMeasures.cor),
               cluster = cutree(diana(lexicalMeasures.dist), 5),
               class = lexicalMeasuresClasses$Class)
x = x[order(x$cluster), ]
x

# Baayen also uses divisive clustering to induce phylogeny trees
plotnames = as.character(phylogeny$Language)
plot(diana(dist(phylogeny[, 3:ncol(phylogeny)], method = "binary")),
     labels = plotnames, cex = 0.8, which.plots = 2)
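
# A sketch of the elbow heuristic for choosing k, assuming the
# `data` matrix built above: run kmeans() for a range of k and plot
# the total within-cluster sum of squares; look for the bend ("elbow")
# where adding another cluster stops paying off.
wss = sapply(1:8, function(k) kmeans(data, k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b",
     xlab = "number of clusters k",
     ylab = "total within-cluster sum of squares")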
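
# A sketch of silhouette-based quality checking for the k-means solution,
# assuming `clustering` and `data` from above. Each point gets a score in
# [-1, 1]; an average silhouette width close to 1 means tight,
# well-separated clusters.
library(cluster)
sil = silhouette(clustering$cluster, dist(data))
summary(sil)
plot(sil)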
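
# A sketch comparing flat clusterings from the two hierarchies, assuming
# lexicalMeasures.clust and lexicalMeasures.dist from above: cutree() also
# works on hclust output, so we can cross-tabulate the agglomerative and
# divisive 5-cluster solutions to see where they agree.
agg = cutree(lexicalMeasures.clust, 5)
div = cutree(diana(lexicalMeasures.dist), 5)
table(agglomerative = agg, divisive = div)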