## NWAV 42, October 17, 2013
## Martijn Wieling, http://www.martijnwieling.nl
## Harald Baayen, http://www.sfs.uni-tuebingen.de/~hbaayen/

# A0. Start R 3.0.2
# Start R and make sure the libraries mgcv, lme4, car and rms are installed
# Menu: Packages -> Install package(s)...  -> Select mirror -> Select packages
# or: 
# install.packages(c("mgcv","lme4","car","rms"),repos='http://cran.r-project.org')

# A1. These commands should work when the packages are installed
library(mgcv)
library(lme4)
library(car)
library(rms)

# A2. Now download the following files and load the custom functions and data
download.file('http://www.let.rug.nl/wieling/NWAV/files/myFunctions.R', 'myFunctions.R')
download.file('http://www.let.rug.nl/wieling/NWAV/files/dialectdata.rda', 'dialectdata.rda')

source('myFunctions.R') # custom functions
load('dialectdata.rda') # Dutch dialect data, subset of 48 words

# Note that predictors were standardized and log-transformed when necessary.
# The dependent (LD) was centered. When predictors correlated with each other,
# we decorrelated such predictors (e.g. population age and income). 
# 
# Standardization: scaledA = scale(A)
# Centering: centeredA = scale(A,scale=F)
# Log-transform: logA = log(A)
#                check if necessary with: plot(density(A))
# Decorrelation: newPredA = resid(lm(predA ~ predB, data=dialectdata))
#                must be positive: cor(dialectdata$predA,dialectdata$newPredA)

# EXERCISE 1: Investigate the data using head, str, summary
# e.g., str(dialectdata) will show you the structure of the dataset


##### GENERALIZED ADDITIVE MODELING

# B1. our first generalized additive model, 
# Longitude and Latitude have the same scale, so we use s()
model1 = gam(LD ~ s(Longitude,Latitude), data=dialectdata)
summary(model1)

# B2. let's test the speed difference between bam and gam
# For large datasets and complex analysis bam is quicker,
# but here gam will be quicker as the analysis is simple
system.time(model1 <- bam(LD ~ s(Longitude,Latitude), data=dialectdata))
system.time(model1 <- gam(LD ~ s(Longitude,Latitude), data=dialectdata))

# B3. we can visualize the effect of geography using the function vis.gam
vis.gam(model1, plot.type="contour", color="terrain", too.far=0.05, 
view=c("Longitude","Latitude"),main="General effect of Geography")

# B4. results with s() use thin plate regression splines as basis functions 
# and these are generally better (and slower!) than the cubic regression 
# splines default used by te()
model1b = gam(LD ~ te(Longitude,Latitude,d=c(2)), data=dialectdata)
summary(model1b)

# B5. comparing gam models via Akaike Information Criterion (AIC): 
# a more complex model should be at least 2 units lower than the simpler one, 
# if we want to consider it a real improvement 
AIC(model1) - AIC(model1b) # negative value, so model1 is better (AIC lower)!

# B6. anova can also be used for comparing models, but only when no random-effect 
# structure is present, otherwise AIC should be used.
anova(model1b,model1,test='F') # the model with s() is better

# B7. We can investigate if the influence of geography differs for different 
# values of word frequency d=c(2,1) indicates that Longitude and Latitude 
# (the first two parameters) are on the same scale, but the next one (1) is not
model2 = gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata)
summary(model2)
anova(model1,model2) 
AIC(model1) - AIC(model2) # model2 is better as the AIC is lower of model2

# B8. We can investigate the geographical pattern for geography varying by word frequency. 
#     Note that WordFreq is z-transformed and -2 indicates low-frequency words 
#     (2 SD below the mean) and +2 indicates high-frequency words
# REMARK: these results seem very different from those reported in the PLOS ONE paper
#         but remember that this is only a random subset of 48 words (out of > 550)
par(mfrow=c(1,3)) # show 3 plots next to each other
# in cond=list() the values you'd like to see can be specified 
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2),view=c("Longitude","Latitude"),
	main="Effect of Geography for LF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=0),view=c("Longitude","Latitude"),
	main="Effect of Geography for MF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2),view=c("Longitude","Latitude"),
	main="Effect of Geography for HF words")

# EXERCISE 2: Create a model to investigate the varying effect of geography 
#             only depending on population age (name the model: model3)

# EXERCISE 3: Visualize the results of the model

# EXERCISE 4: Create a model (use bam!) to investigate the geographic variability 
#             in the influence of word frequency and population age. 
#             Name the model: model4.
#     Hint, make sure to specify that Longitude and Latitude are on the same scale,
#           but word frequency and population age each have their own scale
#           To reduce memory requirements use the option: gc.level=2 for bam
#     Depending on the speed of your laptop this might take a few minutes 
#     (on my laptop: 70 seconds)

# (OPTIONAL) EXERCISE 5: rerun the model using gam to see the speed-difference.

# B9. We can investigate the pattern of model4 by making a more complex series of plots
par(mfrow=c(2,2)) # show 2x2 plots 

# in cond=list() the values you'd like to see can be specified 
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2,PopAge=-2),view=c("Longitude","Latitude"),
        main="Geography: LF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2,PopAge=-2),view=c("Longitude","Latitude"),
        main="Geography: HF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2,PopAge=2),view=c("Longitude","Latitude"),
        main="Geography: LF + Old ")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2,PopAge=2),view=c("Longitude","Latitude"),
        main="Geography: HF + Old")

# B11. Now we can test if other variables should be added (not interacting 
# with geography). For efficiency purposes, we continue working with model2). 
# We first test population count:
model5 = gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata)
summary(model5)
anova(model2,model5,test='F') # model5 is better
AIC(model2) - AIC(model5)

# EXERCISE 6: What is the interpretation of the effect of population count?

# EXERCISE 7: Test some other potential variables for inclusion 
#             (make sure you have enough time for part C, below.)


##### Generalized Additive Mixed-Effects Regression

# C1. We first test if we need a random intercept for word (we use bam for speed)
model6 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt + s(Word,bs="re"), data=dialectdata)

# C2. We can see immediately if the random intercept is necessary via the summary:
summary(model6)

# C3. We can visualize the random intercepts:
par(mfrow=c(1,1)) # only one plot in a window
myCoefPlot.gam(model6,"Word","Intercept")

# C4. The following illustrates that anova does not work when having random effects
# model6b is more complex than model6 as it contains one additional parameter
# but anova shows that model6b is actually a little bit less complex.
# This is caused by the implementation of the random intercepts and slopes in gam/bam
# which have their own degrees of freedom. Adding a lexical variable will likely reduce 
# the number of degrees of freedom needed for the by-word random intercepts. 
# So: do not use anova to compare models when they have a random-effects structure.
# Instead, use AIC to evaluate the models and look at the significance of the variables
# (an AIC reduction of at least 2 indicates a significant improvement)
model6b = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + VCratio + s(Word,bs="re"),
                   data=dialectdata)
anova(model6,model6b,test='F')

# As indicated, AIC's can be usefully compared
AIC(model6) - AIC(model6b) # positive, but less than two, so no real improvement

# C5. We include a random slope for population count per word
model7 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re"), data=dialectdata)
summary(model7)

# C6. We can visualize the random slopes:
myCoefPlot.gam(model7,"Word","PopCnt")

# EXERCISE 8: Add a random intercept for Location and visualize the adaptations

# EXERCISE 9: Test if other variables need to be added as predictors to the model, 
#             and test them also as random slopes. Think carefully about if you 
#             need to test them as random slopes per word or location. 

# FINAL REMARK: note that the (geographical) results are different than reported in the paper
# (http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0023613). This is due to 
# the relatively small dataset (containing less than 10% of the words).