# Home > Schedule >
#
# R commands for March 20

# Logit: the logarithm of the odds,
# where the odds of a probability p is p / (1 - p).
#
# The logit is
# f(p) = log( p / (1-p) )
#
# Defined for p in (0, 1); returns -Inf at p = 0, Inf at p = 1,
# and NaN (with a warning) for p outside [0, 1].
logit <- function(p) {
  log(p / (1 - p))
}

# Plot the logit over (0, 1); note the sigmoid shape of the curve.
xvals <- seq(from = 0.001, to = 0.999, by = 0.001)
plot(xvals, logit(xvals), type = "l")

# The inverse logit function (aka logistic function, aka sigmoid
# function) maps any real value into a value between 0 and 1.
# Note: exp(x) is e (the Euler number) to the power of x.
#
# The logistic function is
# f(x) = exp(x) / (exp(x) + 1) = 1 / (1 + exp(-x))
#
# The 1 / (1 + exp(-x)) form is preferred here: for large positive
# x, exp(x) in the first form would overflow to Inf, while exp(-x)
# simply underflows to 0 and the result is a clean 1.
invlogit <- function(x) {
  1 / (1 + exp(-x))
}

# logit and invlogit are inverse functions:
# logit(0.8) is about 1.386294, and invlogit maps that back to 0.8.
logit(0.8)
invlogit(1.386294)


# Again, note the sigmoid shape of the plot.
xvals <- -6:6
plot(xvals, invlogit(xvals), type = "l")

# Now remember linear regression: We model a dependent variable Y as
# linearly dependent on a predictor X:
#
# Y = Beta_0 + Beta_1 X
#
# Beta_0 is the intercept, and Beta_1 is the slope.
#
# When we do logistic regression, we predict Y to be
# Y = invlogit(Beta_0 + Beta_1 X)
#
# where invlogit is the logistic function.
# So Y will be a value between 0 and 1.

## An example using a languageR dataset.
# regularity: languageR dataset containing information on regular
# and irregular Dutch verbs.
# regularity$Regularity is a factor with values "regular" and
# "irregular".  We recode it as a number: 0 for "irregular",
# 1 for "regular".
library(languageR)
reg <- regularity
reg$Regularity <- ifelse(regularity$Regularity == "regular", 1, 0)

# Relation between regularity and written frequency of a verb:
# irregular verbs tend to be more frequent.
plot(Regularity ~ WrittenFrequency, data = reg)

# Regularity takes only the values 0 and 1.  As a first attempt,
# we fit an ordinary linear model anyway:
reg.lm <- lm(Regularity ~ WrittenFrequency, data = reg)
summary(reg.lm)


# Re-plot regularity against written frequency and overlay the
# fitted model as a line.  The line can be read as a boundary
# between items the model predicts to be regular and items it
# predicts to be irregular.
plot(Regularity ~ WrittenFrequency, data = reg)
abline(reg.lm)

# Now we use logistic regression.
# Parameters are estimated via maximum likelihood estimation.
# NEW: using library rms instead of the outdated library Design
# used in the Baayen book.
library(rms)
reg.dd <- datadist(reg)
options(datadist = "reg.dd")

reg.lrm <- lrm(Regularity ~ WrittenFrequency, data = reg)
# Inspect the fit by just typing reg.lrm.
# The coefficient for WrittenFrequency is negative: the higher
# WrittenFrequency, the closer the predicted Regularity is to zero
# (that is, the more likely Regularity is "irregular").

# Use the fitted model to obtain predictions of Regularity for
# the WrittenFrequency values in the dataset.
reg.predicted <- predict(reg.lrm)
# Inspecting the result shows that predict() returns the linear
# predictor Beta_0 + Beta_1 X, not invlogit(Beta_0 + Beta_1 X).
range(reg.predicted)

# Compare:
# > reg.predicted[1]
#       1
# 4.278845
# > coef(reg.lrm)[1] + coef(reg.lrm)[2] * reg[1,]$WrittenFrequency
# Intercept
#  4.278845
# > range(invlogit(reg.predicted))
# [1] 0.1029522 0.9944239

# Plotting those predictions gives the same line as abline():
plot(Regularity ~ WrittenFrequency, data = reg)
abline(reg.lrm)
points(reg$WrittenFrequency, predict(reg.lrm), col = "blue")

# Now plot the invlogit() of the same predicted points.
points(reg$WrittenFrequency, invlogit(predict(reg.lrm)), col = "orange")

# (Comments section of the original web page omitted.)