#############
# Baayen's demo of different degrees of correlation
library(languageR)
mvrnormplot.fnc(r = -0.4)
mvrnormplot.fnc(r = 0.1)
mvrnormplot.fnc(r = 0.6)
mvrnormplot.fnc(r = 0.95)

############
# Hinton's study time data
studytimedata = data.frame(study.time = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
                           exam.score = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69))
plot(studytimedata[, 1:2])

# we want to model exam score as a linear function of study time.
# a line is determined by its intercept a (the y-value at x = 0)
# and its slope b
my.line = function(x, a, b) {
  a + b * x
}

# here are some guesses
xvals = 0:50
lines(xvals, my.line(xvals, 25, 1.1), col = "blue")
lines(xvals, my.line(xvals, 23, 1.2), col = "yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col = "green")
lines(xvals, my.line(xvals, 27, 1.003), col = "turquoise1")

# and here is the actual least-squares solution:
study.lm = lm(exam.score ~ study.time, data = studytimedata)

# we can extract the coefficients from study.lm using coef():
# both
coef(study.lm)
# just the intercept
coef(study.lm)[1]
# just the slope
coef(study.lm)[2]

# we can plot the line using the function my.line that we defined earlier
lines(xvals, my.line(xvals, coef(study.lm)[1], coef(study.lm)[2]), col = "red")
# or, more simply, using abline(),
# which extracts the coefficients from the study.lm object itself
abline(study.lm, col = "tomato")

#######
# We now study the output of lm() in more detail.
summary(study.lm)

# first come the residuals: the differences between the actual exam scores
# and the exam scores predicted by the model.
# they are the same as the following:
res = studytimedata$exam.score - predict(study.lm)
summary(res)
# the residuals should be approximately normally distributed
# (a Q-Q plot check follows at the end of this script)
plot(density(res))
# the residuals are also stored in the fitted object
study.lm$residuals   # equivalently: residuals(study.lm)
# extracting fitted values; without new data, predict() returns the same thing
# (predicting unseen study times is demonstrated at the end of this script)
fitted(study.lm)
predict(study.lm)

# testing whether the slope is significantly different from zero:
# t.value = coefficient / standard error
# (0.7446 and 0.2532 are read off the coefficient table of summary(study.lm);
#  the end of this script shows how to extract them programmatically)
t.value.studytime = 0.7446 / 0.2532
# degrees of freedom: N - 2
study.lm$df.residual
# 2-tailed t-test
2 * pt(t.value.studytime, lower.tail = FALSE, df = study.lm$df.residual)

# How close are the points to the regression line?
# The typical distance from Y to the regression line is the
# residual standard error:
# sqrt( SS_error / (N - 2) )
sqrt(sum(study.lm$residuals^2) / study.lm$df.residual)
# R-squared: this is Pearson's r, squared
cor(studytimedata$study.time, studytimedata$exam.score)^2

##############
# Hinton, problems with regression: "smiling" example
hinton.smile = data.frame(smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
                          sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))
plot(hinton.smile)
# a sizable positive correlation when all nine salespeople are included
# (significance tests with cor.test() follow at the end of this script)
cor(hinton.smile$smile.time, hinton.smile$sold)
# a much weaker correlation, no longer significant, when we leave out
# the last salesperson
cor(hinton.smile[-9, ]$smile.time, hinton.smile[-9, ]$sold)
# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4, ]$smile.time, hinton.smile[1:4, ]$sold)
# a perfect negative correlation for salespeople 5-8
cor(hinton.smile[5:8, ]$smile.time, hinton.smile[5:8, ]$sold)
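
#######
# Extra: a second normality check for the residuals, complementing the
# density plot above. A minimal sketch using base-R qqnorm()/qqline();
# nothing here is specific to this dataset, it works for the residuals
# of any fitted lm object.
qqnorm(res)   # points near the line indicate approximately normal residuals
qqline(res)
# shapiro.test() gives a formal test of normality; with only 10
# observations it has little power, so treat its p-value with caution
shapiro.test(res)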
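
#######
# Extra: pulling the numbers we hard-coded above (0.7446 and 0.2532) out of
# summary(study.lm) programmatically. This assumes the model was fitted with
# the formula exam.score ~ study.time as above, so that the slope's row in
# the coefficient table is named "study.time".
coefs = coef(summary(study.lm))    # the coefficient table as a matrix
coefs["study.time", "Estimate"]    # the slope, approx. 0.7446
coefs["study.time", "Std. Error"]  # its standard error, approx. 0.2532
coefs["study.time", "t value"]     # the same t value we computed by hand
coefs["study.time", "Pr(>|t|)"]    # the same 2-tailed p-value
# the residual standard error and R-squared are also stored in the summary
summary(study.lm)$sigma
summary(study.lm)$r.squared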
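
#######
# Extra: predicting exam scores for study times we did not observe.
# This relies on the data = studytimedata fitting style used above, so that
# predict() can match the column name study.time in newdata. The study times
# 20 and 35 are made-up illustration values.
predict(study.lm, newdata = data.frame(study.time = c(20, 35)))
# the same predictions with a 95% confidence interval around the fitted means
predict(study.lm, newdata = data.frame(study.time = c(20, 35)),
        interval = "confidence")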
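
#######
# Extra: the comments on the smiling data above talk about significance, but
# cor() only returns the correlation coefficient. cor.test() (base R)
# performs the actual significance test.
# significant when all nine salespeople are included ...
cor.test(hinton.smile$smile.time, hinton.smile$sold)
# ... but no longer significant once salesperson 9 is removed
cor.test(hinton.smile[-9, ]$smile.time, hinton.smile[-9, ]$sold)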
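
#######
# Extra: visualizing why salesperson 9 matters so much. A sketch that refits
# the regression with and without the ninth row and draws both lines on the
# same scatterplot; the colors and line types are arbitrary choices.
plot(hinton.smile)
abline(lm(sold ~ smile.time, data = hinton.smile), col = "red")  # all nine
abline(lm(sold ~ smile.time, data = hinton.smile[-9, ]),         # without #9
       col = "blue", lty = 2)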