Home‎ > ‎Schedule‎ > ‎

R commands for March 6

# Regression Equation
# Y = Intercept + Slope*X

studytimedata = data.frame(study.time=c(40,43,18,10,25,33,27,17,30,47),exam.score=c(58,73,56,47,58,54,45,32,68,69))
score.studytime.lm = lm(exam.score ~ study.time,data=studytimedata)

#Understanding the output of lm
#Understanding R2

score.studytime_SSres = sum(residuals(score.studytime.lm)^2)
SStot = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)
score.studytime.r2 = 1 - (score.studytime_SSres/SStot)

#Understanding the F statistic

# F-test for model evaluation

F = (SStot - score.studytime_SSres / 1) / (score.studytime_SSres/8)
plot(density(rf(100,1,7)))
1- pf(F,1,8)

# Can also be calculated using anova command

null_model = lm(exam.score ~ 1,data=studytimedata)
anova(null_model,score.studytime.lm)

# Multiple Regression Equation
# Y = Intercept + Slope1*X1 + Slope2*X2

studytimedata.mreg = data.frame(participant = c(1,2,3,4,5,6,7,8,9,10),
study.time = c(40,43,18,10,25,33,27,17,30,47),
exam.score=c(58,73,56,47,58,54,45,32,68,69),
iq = c(118,128,110,114,138,120,106,124,132,130))

score.iq.studytime.lm = lm(exam.score ~ iq + study.time,data=studytimedata.mreg)

# Score is related to both study time and intelligence.  What variance is uniquely explained by each? The problem can be well represented by a Venn diagram

#Teasing the relationships apart

# A first kind of analysis is a "semi-partial correlation" and it leaves the  outcome variable (score in this case) unchanged so it is good for prediction.

# Using separate models

residualstudytime = residuals(lm(study.time ~ iq,data=studytimedata.mreg))
cor(studytimedata$exam.score,residualstudytime)

# Can also be done with single regression model

score.iq.lm = lm(exam.score ~ iq,data=studytimedata.mreg)
score.iq_SSres = sum(residuals(score.iq.lm)^2)
SStot = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)
score.iq.r2 = 1 - (score.iq_SSres/SStot)

score.iq.studytime_SSres = sum(residuals(score.iq.studytime.lm)^2)
SStot = sum((studytimedata.mreg$exam.score - mean(studytimedata.mreg$exam.score))^2)
score.iq.studytime.r2 = 1 - (score.iq.studytime_SSres/SStot)

SemiPartialR2 = score.iq.studytime.r2 - score.iq.r2

# If we want to quantify the proportion of exam score that is explained once IQ is completely removed from the picture, we can do a full partial correlation. As this transforms the outcome variable it is not so useful for prediction.

residualscore = residuals(lm(exam.score ~ iq,data=studytimedata.mreg))
cor(residualscore,residualstudytime)

#demo of how adding variables to data can spuriously improve fit. 

var1 = rnorm(100)
var2 = rnorm(100)
var3 = rnorm(100)
var4 = rnorm(100)
var5 = rnorm(100)

summary(lm(var1 ~ var2))
summary(lm(var1 ~ var2 + var3))
summary(lm(var1 ~ var2 + var3 + var4))
summary(lm(var1 ~ var2 + var3 + var4 + var5))

#Adjusted R-squared corrects for this.

df = 9
numberofpredictors = 2
score.iq.studytime.adjustedR2 = 1 - (score.iq.studytime_SSres/SStot) * (df/(df-numberofpredictors))

# F-test for model comparison
F = ((score.iq_SSres - score.iq.studytime_SSres)/1) / (score.iq.studytime_SSres/7)
1- pf(F,1,7)

# Also possible using anova command
anova(score.iq.lm,score.iq.studytime.lm)


Comments