# Home > Schedule >
#
# R commands for Feb 28

#############
# Baayen's demo of different degrees of correlation
library(languageR)

# Scatterplots for a range of correlation strengths, from moderately
# negative through weak to almost perfectly positive.
for (r in c(-0.4, 0.1, 0.6, 0.95)) {
  mvrnormplot.fnc(r = r)
}


############
# Hinton's study time data:
# ten participants, each with hours of study time and the exam score obtained.
studytimedata <- data.frame(
  study.time = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
  exam.score = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69)
)

# Scatterplot of exam score against study time.
plot(studytimedata[, 1:2])

# We want a linear relation between study time and exam score.
# A line is determined by its intercept a (y-value at x = 0) and its slope b.
#
# my.line: evaluate the line a + b * x at the given x positions.
#   x - numeric vector of x positions
#   a - intercept
#   b - slope
# Returns a numeric vector the same length as x.
my.line <- function(x, a, b) {
  a + b * x
}

# Here are some guesses: overlay candidate lines with hand-picked
# intercepts and slopes on the scatterplot.
xvals <- 0:50
lines(xvals, my.line(xvals, 25, 1.1), col = "blue")
lines(xvals, my.line(xvals, 23, 1.2), col = "yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col = "green")
lines(xvals, my.line(xvals, 27, 1.003), col = "turquoise1")

# And here is the actual solution: the least-squares fit of exam score
# on study time. Passing the data frame via `data =` (rather than
# lm(studytimedata$exam.score ~ studytimedata$study.time)) keeps the
# coefficient names readable and lets predict() accept new data frames.
study.lm <- lm(exam.score ~ study.time, data = studytimedata)
# we can extract the coefficients from study.lm using coef():
# both
coef(study.lm)
# just the intercept
coef(study.lm)[1]
# just the slope
coef(study.lm)[2]

# We can plot the fitted line using the function my.line defined earlier,
# fed with the estimated intercept and slope.
lines(xvals, my.line(xvals, coef(study.lm)[1], coef(study.lm)[2]), col = "red")
# or, more simply, using abline(),
# which extracts the coefficients from the study.lm object itself
abline(study.lm, col = "tomato")

#######
# We now study the output of lm() in more detail.
summary(study.lm)

# First come the residuals: the actual exam scores minus the exam scores
# predicted by the fitted line (actual - predicted, not the other way
# around). This is the same as the following:
res <- studytimedata$exam.score - predict(study.lm)
summary(res)
# residuals should be approximately normally distributed
plot(density(res))

# the residuals are also stored in the fitted-model object
study.lm$residuals

# two equivalent ways of extracting the fitted (predicted) values
fitted(study.lm)
predict(study.lm)

# Testing whether the slope is significantly different from zero:
# t value = coefficient estimate / its standard error.
# Read both numbers from the model summary (row 2 = the slope) instead of
# hard-coding values copied off the console, which lose precision and go
# stale if the data change.
slope.row <- coef(summary(study.lm))[2, ]
t.value.studytime <- slope.row["Estimate"] / slope.row["Std. Error"]

# degrees of freedom: N - 2
study.lm$df.residual

# 2-tailed t-test: double the upper-tail probability
2 * pt(t.value.studytime, lower.tail = FALSE, df = study.lm$df.residual)

# How close are the points to the regression line?
# The typical vertical distance from Y to the regression line is the
# residual standard error:
# sqrt( sum_of_squared_residuals / (N - 2) )
sqrt(sum(study.lm$residuals^2) / study.lm$df.residual)

# R-squared ("Multiple R-squared" in the summary) is Pearson's r, squared
cor(studytimedata$study.time, studytimedata$exam.score)^2

##############
# Hinton, problems with regression: "smiling" example
# (does a salesperson's smiling time predict how many items they sell?)
hinton.smile <- data.frame(
  smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
  sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38)
)
plot(hinton.smile)

# a sizeable positive correlation when all salespeople are taken into account
cor(hinton.smile$smile.time, hinton.smile$sold)

# much weaker when we leave out the last salesperson (row 9)
cor(hinton.smile[-9, ]$smile.time, hinton.smile[-9, ]$sold)

# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4, ]$smile.time, hinton.smile[1:4, ]$sold)

# perfect negative correlation for salespeople 5-8
cor(hinton.smile[5:8, ]$smile.time, hinton.smile[5:8, ]$sold)


# Comments