# Doing Principal Component Analysis
# on the affixProductivity data:
# finding latent groups of texts
# that behave similarly in terms of affix productivity

# the affixProductivity and english data sets and collin.fnc()
# come with the languageR package
library(languageR)

affixes.pr = prcomp(affixProductivity[, 1:(ncol(affixProductivity) - 3)])
names(affixes.pr)
summary(affixes.pr)

# proportion of variance for each principal component:
# squared sdev divided by the sum of the squared standard deviations
affixes.pr$sdev
affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)

# visualization, to decide on which principal components to keep
plot(affixes.pr)

# viewing the coordinates of 4 books on the first 3 dimensions
affixes.pr$x[c("Mormon", "Austen", "Carroll", "Gao"), 1:3]

# Interpreting principal components
# by visualizing different genres on the relevant components
#
# affixProductivity$Registers: Biblical, Children's books, Literary texts, Other
library(lattice)
super.sym = trellis.par.get("superpose.symbol")

# splom: scatterplot matrix (lattice)
splom(data.frame(affixes.pr$x[, 1:3]),
  groups = affixProductivity$Registers,
  panel = panel.superpose,
  key = list(
    title = "texts in productivity space",
    text = list(c("Religious", "Children", "Literary", "Other")),
    points = list(pch = super.sym$pch[1:4],
                  col = super.sym$col[1:4])))

# Loadings of affixes:
# how strongly is each affix associated with each principal component?
affixes.pr$rotation[1:10, 1:3]

# Visualizing clusterings of texts and clusterings of affixes in the same plot
biplot(affixes.pr, scale = 0, var.axes = F,
  col = c("darkgrey", "black"), cex = c(0.9, 1.2))

# The effects of scaling
affixes.pr.sc = prcomp(affixProductivity[, 1:27], scale = T, center = T)
splom(data.frame(affixes.pr.sc$x[, 1:3]),
  groups = affixProductivity$Registers,
  panel = panel.superpose,
  key = list(
    title = "texts in productivity space",
    text = list(c("Religious", "Children", "Literary", "Other")),
    points = list(pch = super.sym$pch[1:4],
                  col = super.sym$col[1:4])))
biplot(affixes.pr.sc, scale = 0, var.axes = F,
  col = c("darkgrey", "black"), cex = c(0.9, 1.2))

#####################################
# collinearity tests
head(english)
colnames(english)

# collin.fnc() only handles numeric predictors
# first argument: data frame
# second argument: column indices to be used
collin.fnc(english[english$AgeSubject == "young", ], 7:29)$cnumber
# the condition number is high, so collinearity is a problem

# again inspecting the variables, using hierarchical clustering
# (varclus() comes from Hmisc, which is loaded along with rms)
library(rms)
grouped.predictors = varclus(as.matrix(english[english$AgeSubject == "young", 7:29]))
plot(grouped.predictors)

# transforming variables 18:27 of the 'english' data set,
# which show high collinearity,
# into a new set of orthogonal variables
# using PCA
collin.fnc(english[english$AgeSubject == "young", ], 18:27)$cnumber
items = english[english$AgeSubject == "young", ]
# note: PCA with centering and scaling!
items.pca = prcomp(items[, 18:27], center = T, scale = T)
summary(items.pca)

# inspecting the rotation matrix to see how the original variables
# relate to the new ones
# Component 4 seems to distinguish type and token measures
x = as.data.frame(items.pca$rotation[, 1:4])
x[order(x$PC4), ]

# no collinearity among the principal components
collin.fnc(items.pca$x, 1:4)$cnumber
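
# A possible follow-up (a sketch, not part of the original script):
# once the collinear measures in columns 18:27 have been compressed
# into orthogonal principal components, the component scores can be
# used as predictors in a regression. The model below assumes that
# RTlexdec (visual lexical decision latency) in 'english' is the
# response of interest; keeping four components follows the
# summary(items.pca) output above.
items$PC1 = items.pca$x[, 1]
items$PC2 = items.pca$x[, 2]
items$PC3 = items.pca$x[, 3]
items$PC4 = items.pca$x[, 4]
items.lm = lm(RTlexdec ~ PC1 + PC2 + PC3 + PC4, data = items)
summary(items.lm)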