Home‎ > ‎Schedule‎ > ‎

### R commands for March 6

 # Regression Equation# Y = Intercept + Slope*Xstudytimedata = data.frame(study.time=c(40,43,18,10,25,33,27,17,30,47),exam.score=c(58,73,56,47,58,54,45,32,68,69))score.studytime.lm = lm(exam.score ~ study.time,data=studytimedata)#Understanding the output of lm#Understanding R2score.studytime_SSres = sum(residuals(score.studytime.lm)^2)SStot = sum((studytimedata\$exam.score - mean(studytimedata\$exam.score))^2)score.studytime.r2 = 1 - (score.studytime_SSres/SStot)#Understanding the F statistic# F-test for model evaluationF = (SStot - score.studytime_SSres / 1) / (score.studytime_SSres/8)plot(density(rf(100,1,7)))1- pf(F,1,8)# Can also be calculated using anova commandnull_model = lm(exam.score ~ 1,data=studytimedata)anova(null_model,score.studytime.lm)# Multiple Regression Equation# Y = Intercept + Slope1*X1 + Slope2*X2studytimedata.mreg = data.frame(participant = c(1,2,3,4,5,6,7,8,9,10),study.time = c(40,43,18,10,25,33,27,17,30,47),exam.score=c(58,73,56,47,58,54,45,32,68,69),iq = c(118,128,110,114,138,120,106,124,132,130))score.iq.studytime.lm = lm(exam.score ~ iq + study.time,data=studytimedata.mreg)# Score is related to both study time and intelligence.  What variance is uniquely explained by each? The problem can be well represented by a Venn diagram#Teasing the relationships apart# A first kind of analysis is a "semi-partial correlation" and it leaves the  outcome variable (score in this case) unchanged so it is good for prediction.# Using separate modelsresidualstudytime = residuals(lm(study.time ~ iq,data=studytimedata.mreg))cor(studytimedata\$exam.score,residualstudytime)# Can also be done with single regression modelscore.iq.lm = lm(exam.score ~ iq,data=studytimedata.mreg)score.iq_SSres = sum(residuals(score.iq.lm)^2)SStot = sum((studytimedata\$exam.score - mean(studytimedata\$exam.score))^2)score.iq.r2 = 1 - (score.iq_SSres/SStot)score.iq.studytime_SSres = sum(residuals(score.iq.studytime.lm)^2)SStot = sum((studytimedata.mreg\$exam.score - mean(studytimedata.mreg\$exam.score))^2)score.iq.studytime.r2 = 1 - (score.iq.studytime_SSres/SStot)SemiPartialR2 = score.iq.studytime.r2 - score.iq.r2# If we want to quantify the proportion of exam score that is explained once IQ is completely removed from the picture, we can do a full partial correlation. As this transforms the outcome variable it is not so useful for prediction.residualscore = residuals(lm(exam.score ~ iq,data=studytimedata.mreg))cor(residualscore,residualstudytime)#demo of how adding variables to data can spuriously improve fit. var1 = rnorm(100)var2 = rnorm(100)var3 = rnorm(100)var4 = rnorm(100)var5 = rnorm(100)summary(lm(var1 ~ var2))summary(lm(var1 ~ var2 + var3))summary(lm(var1 ~ var2 + var3 + var4))summary(lm(var1 ~ var2 + var3 + var4 + var5))#Adjusted R-squared corrects for this.df = 9numberofpredictors = 2score.iq.studytime.adjustedR2 = 1 - (score.iq.studytime_SSres/SStot) * (df/(df-numberofpredictors))# F-test for model comparisonF = ((score.iq_SSres - score.iq.studytime_SSres)/1) / (score.iq.studytime_SSres/7)1- pf(F,1,7)# Also possible using anova commandanova(score.iq.lm,score.iq.studytime.lm)