# Home > Schedule >

# R commands for April 5

# Principal Component Analysis (PCA) on the affixProductivity data:
# finding latent groups of texts that behave similarly in terms of
# affix productivity.
# The last 3 columns of affixProductivity are metadata, not productivity
# scores, so we drop them before running PCA.

affixes.pr <- prcomp(affixProductivity[, 1:(ncol(affixProductivity) - 3)])

# Components of the prcomp result (sdev, rotation, center, scale, x)
names(affixes.pr)

summary(affixes.pr)

# Proportion of variance for each principal component:
# squared sdev divided by the sum of squared standard deviations
affixes.pr$sdev
affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)

# Visualization (scree plot), to decide which principal components to keep
plot(affixes.pr)

# Viewing the coordinates of 4 books on the first 3 dimensions
affixes.pr$x[c("Mormon", "Austen", "Carroll", "Gao"), 1:3]

# Interpreting principal components
# by visualizing different genres on the relevant components
#
# affixProductivity$Registers: Biblical, Children's books, Literary texts, Other
library(lattice)
super.sym <- trellis.par.get("superpose.symbol")

# splom: conditional scatter plot matrices of the first 3 PC scores,
# one symbol/colour per register
splom(data.frame(affixes.pr$x[, 1:3]),
      groups = affixProductivity$Registers,
      panel = panel.superpose,
      key = list(
        title = "texts in productivity space",
        text = list(c("Religious", "Children", "Literary", "Other")),
        points = list(pch = super.sym$pch[1:4],
                      col = super.sym$col[1:4])))

# Loadings of affixes:
# how strongly is each affix associated with each principal component?
affixes.pr$rotation[1:10, 1:3]

# Visualizing clusterings of texts and clusterings of affixes in the same plot
biplot(affixes.pr, scale = 0, var.axes = FALSE,
       col = c("darkgrey", "black"), cex = c(0.9, 1.2))

# The effects of scaling: redo the PCA with centering and scaling.
# Same predictor columns as before (all but the last 3 metadata columns,
# i.e. columns 1:27) — written with ncol() for consistency with the
# unscaled analysis above.
affixes.pr.sc <- prcomp(affixProductivity[, 1:(ncol(affixProductivity) - 3)],
                        scale = TRUE, center = TRUE)

# Same register visualization as before, now on the scaled PC scores
splom(data.frame(affixes.pr.sc$x[, 1:3]),
      groups = affixProductivity$Registers,
      panel = panel.superpose,
      key = list(
        title = "texts in productivity space",
        text = list(c("Religious", "Children", "Literary", "Other")),
        points = list(pch = super.sym$pch[1:4],
                      col = super.sym$col[1:4])))

# Biplot of the scaled solution, for comparison with the unscaled one
biplot(affixes.pr.sc, scale = 0, var.axes = FALSE,
       col = c("darkgrey", "black"), cex = c(0.9, 1.2))

#####################################
# Collinearity tests on the 'english' dataset
head(english)
colnames(english)

# collin.fnc accepts only numeric predictors
# first argument: data frame
# second argument: column indices to be used
# $cnumber extracts the condition number
collin.fnc(english[english$AgeSubject == "young", ], 7:29)$cnumber

# High condition number, so we have a collinearity problem

# Again inspecting the variables, using hierarchical clustering
# (varclus groups predictors by mutual correlation)
library(rms)
grouped.predictors <- varclus(
  as.matrix(english[english$AgeSubject == "young", 7:29]))
plot(grouped.predictors)

# Transforming variables 18:27 of the 'english' dataset,
# which have high collinearity,
# into a new set of orthogonal variables using PCA
collin.fnc(english[english$AgeSubject == "young", ], 18:27)$cnumber

items <- english[english$AgeSubject == "young", ]

# Note: PCA with centering and scaling!
items.pca <- prcomp(items[, 18:27], center = TRUE, scale = TRUE)

summary(items.pca)

# Inspecting the rotation matrix to see the relation between
# the original and the new variables.
# Component 4 seems to distinguish type and token measures.
x <- as.data.frame(items.pca$rotation[, 1:4])
x[order(x$PC4), ]

# No collinearity between principal components:
# the condition number of the PC scores is low
collin.fnc(items.pca$x, 1:4)$cnumber

# Comments