# Regression Equation # Y = Intercept + Slope*X studytimedata = data.frame(study.time=c(40,43,18,10,25,33,27,17,30,47),exam.score=c(58,73,56,47,58,54,45,32,68,69)) score.studytime.lm = lm(exam.score ~ study.time,data=studytimedata) #Understanding the output of lm #Understanding R2 score.studytime_SSres = sum(residuals(score.studytime.lm)^2) SStot = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2) score.studytime.r2 = 1 - (score.studytime_SSres/SStot) #Understanding the F statistic # F-test for model evaluation F = (SStot - score.studytime_SSres / 1) / (score.studytime_SSres/8) plot(density(rf(100,1,7))) 1- pf(F,1,8) # Can also be calculated using anova command null_model = lm(exam.score ~ 1,data=studytimedata) anova(null_model,score.studytime.lm) # Multiple Regression Equation # Y = Intercept + Slope1*X1 + Slope2*X2 studytimedata.mreg = data.frame(participant = c(1,2,3,4,5,6,7,8,9,10), study.time = c(40,43,18,10,25,33,27,17,30,47), exam.score=c(58,73,56,47,58,54,45,32,68,69), iq = c(118,128,110,114,138,120,106,124,132,130)) score.iq.studytime.lm = lm(exam.score ~ iq + study.time,data=studytimedata.mreg) # Score is related to both study time and intelligence. What variance is uniquely explained by each? The problem can be well represented by a Venn diagram #Teasing the relationships apart # A first kind of analysis is a "semi-partial correlation" and it leaves the outcome variable (score in this case) unchanged so it is good for prediction. # Using separate models residualstudytime = residuals(lm(study.time ~ iq,data=studytimedata.mreg)) cor(studytimedata$exam.score,residualstudytime) # Can also be done with single regression model score.iq.lm = lm(exam.score ~ iq,data=studytimedata.mreg) score.iq_SSres = sum(residuals(score.iq.lm)^2) SStot = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2) score.iq.r2 = 1 - (score.iq_SSres/SStot) score.iq.studytime_SSres = sum(residuals(score.iq.studytime.lm)^2) SStot = sum((studytimedata.mreg$exam.score - mean(studytimedata.mreg$exam.score))^2) score.iq.studytime.r2 = 1 - (score.iq.studytime_SSres/SStot) SemiPartialR2 = score.iq.studytime.r2 - score.iq.r2 # If we want to quantify the proportion of exam score that is explained once IQ is completely removed from the picture, we can do a full partial correlation. As this transforms the outcome variable it is not so useful for prediction. residualscore = residuals(lm(exam.score ~ iq,data=studytimedata.mreg)) cor(residualscore,residualstudytime) #demo of how adding variables to data can spuriously improve fit. var1 = rnorm(100) var2 = rnorm(100) var3 = rnorm(100) var4 = rnorm(100) var5 = rnorm(100) summary(lm(var1 ~ var2)) summary(lm(var1 ~ var2 + var3)) summary(lm(var1 ~ var2 + var3 + var4)) summary(lm(var1 ~ var2 + var3 + var4 + var5)) #Adjusted R-squared corrects for this. df = 9 numberofpredictors = 2 score.iq.studytime.adjustedR2 = 1 - (score.iq.studytime_SSres/SStot) * (df/(df-numberofpredictors)) # F-test for model comparison F = ((score.iq_SSres - score.iq.studytime_SSres)/1) / (score.iq.studytime_SSres/7) 1- pf(F,1,7) # Also possible using anova command anova(score.iq.lm,score.iq.studytime.lm) |