Home > Schedule >

R commands for April 3

# K-means clustering,
# made-up data

# Fix the RNG seed so the demo is reproducible across runs
set.seed(1)

# Two 50-point, 2-dimensional Gaussian clouds:
# one centered at (0, 0), one at (1.5, 1.5), both with sd 0.3
cl1 <- matrix(rnorm(100, sd = 0.3), ncol = 2)
cl2 <- matrix(rnorm(100, mean = 1.5, sd = 0.3), ncol = 2)

# plot both clouds on a common scale
plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2))
points(cl2)

# combine the two datasets, and let's see
# if the clustering method can take them apart again
# (named 'points2d' rather than 'data' to avoid shadowing utils::data)
points2d <- rbind(cl1, cl2)

clustering <- kmeans(points2d, 2)
clustering

# how good is the clustering?
# within-cluster and between-cluster sums of squares
clustering$withinss
clustering$betweenss

# visualizing the results of clustering:
# color points by assigned cluster, mark the centers with stars
plot(points2d, col = clustering$cluster)
points(clustering$centers, col = 1:2, pch = 8, cex = 2)

# for a more challenging dataset, set the second mean to 1.0,
# or try 3-4 clusters
# In that case do nstart = 25

# k-means clustering, now with real data
# NOTE(review): hardcoded absolute path from the instructor's machine --
# adjust this to point at your own copy of nyt_opinion.txt
nyt_file <-
  "/Users/katrinerk/Teaching/2012/spring/ald/assignments/hw1/nyt_opinion.txt"
nyt <- read.table(nyt_file, header = TRUE)

# content words only
# (assumes columns 13-29 hold the content-word counts -- verify
# against the data file's layout)
nyt.cl <- kmeans(nyt[, 13:29], 4, nstart = 25)

# inspecting the clusters:
# cross-tabulate cluster assignment against author,
# and sum the content-word columns within each cluster
nyt$cl <- nyt.cl$cluster
xtabs(~nyt$cl + nyt$Author)
aggregate(nyt[, 13:29], list(nyt$cl), sum)

# hierarchical clustering: bottom-up
# agglomerative
library(languageR)
head(lexicalMeasures)

# find pairwise correlations between measures
# in correlation matrix.
# use non-parametric, rank-based correlation: spearman
# (column 1 is the word itself, so drop it)
lexicalMeasures.cor <- cor(lexicalMeasures[, -1], method = "spearman")

# we only want positive distances for the hierarchical clustering,
# so square the correlations
lexicalMeasures.cor <- lexicalMeasures.cor^2

# convert to distances:
# This computes Euclidean distance between
# the rows of the cor matrix
# That is, it computes the distance between the vector of correlations
# for any two lexical measures
lexicalMeasures.dist <- dist(lexicalMeasures.cor)

# this can now be used as input to hierarchical clustering
# default linking method is "complete"
lexicalMeasures.clust <- hclust(lexicalMeasures.dist)

# and visualize
# (plclust() is defunct in current R; plot() on an hclust object
# draws the same dendrogram)
plot(lexicalMeasures.clust)

# the choice of linking method does make a difference, though
par(mfrow = c(1, 2))
plot(hclust(lexicalMeasures.dist, method = "complete"))
# "ward" was renamed "ward.D" in R >= 3.1.0; hclust also now offers
# "ward.D2", which applies Ward's criterion to squared distances
plot(hclust(lexicalMeasures.dist, method = "ward.D"))

# The package rms has a method that combines all the above steps
# for an inspection of the relation between variables:
# computing pairwise correlation,
# and doing hierarchical clustering on the vectors
# that hold the squared correlations for each variable
library(rms)
measures.varclus <- varclus(as.matrix(lexicalMeasures[, -1]))
plot(measures.varclus)


# hierarchical clustering: top-down
# divisive
library(cluster)
# compute the divisive clustering once and reuse it below,
# instead of re-running diana() for every step
lexicalMeasures.diana <- diana(lexicalMeasures.dist)
pltree(lexicalMeasures.diana)

# doing actual clustering with divisive clustering:
# we again need to decide on the number of clusters
cutree(lexicalMeasures.diana, 5)

# analyzing the clusters:
# pair each measure with its cluster number and its linguistic
# class (as given in languageR's lexicalMeasuresClasses)
x <- data.frame(measure = rownames(lexicalMeasures.cor),
                cluster = cutree(lexicalMeasures.diana, 5),
                class = lexicalMeasuresClasses$Class)

# sort by cluster for easier reading
x <- x[order(x$cluster), ]
x

# Baayen also does divisive clustering to induce phylogeny trees
# (the phylogeny data set comes with languageR; columns 3 onward are
# binary features, hence the "binary" distance measure)
plotnames <- as.character(phylogeny$Language)
# which.plot = 2 selects the dendrogram (banner plot suppressed)
plot(diana(dist(phylogeny[, 3:ncol(phylogeny)], method = "binary")),
     labels = plotnames, cex = 0.8, which.plot = 2)

Comments