## MIXED-EFFECTS REGRESSION LAB SESSION, MARCH 31, 2011, 9:15 - 11:00
## Martijn Wieling, http://www.martijnwieling.nl


# A0. Boot in Windows (not Linux)
# Start R (Start Menu -> RUG Menu -> Mathematics & Statistics -> R for Windows)
# Make sure the libraries lme, gamm4, languageR and Design are installed
# Menu: Packages -> Install package(s)...  -> Select mirror -> Select packages


# A1. These commands should work when the packages are installed
library(lme4)
library(gamm4)
library(languageR)
library(Design)


# A2. Now download the two files: 
# http://www.let.rug.nl/~wieling/R/myFunctions.R
# http://www.let.rug.nl/~wieling/R/dialectdata.rda
# And make sure that your R directory points to where these files are 
# located (File -> Change dir...)


# A3. load several custom functions which make the analysis easier
source('myFunctions.R')


# A4. load cleaned data set (subset of 48 words) 
load('dialectdata.rda')


# Note that predictors were standardized and log-transformed when necessary.
# The dependent (LD) was centered. When predictors correlated with each other,
# we decorrelated such predictors (e.g. population age and income). 
# 
# Standardization: scaledA = scale(A)
# Centering: centeredA = scale(A,scale=F)
# Log-transform: logA = log(A)
#                check if necessary with: plot(density(A))
# Decorrelation: newPredA = resid(lm(predA ~ predB,data=dialectdata))
#                must be positive: cor(dialectdata$predA,dialectdata$newPredA)
# ** if you need more information about the above procedure, you can ask me **


# A5. data investigation
head(dialectdata)
str(dialectdata)
nrow(dialectdata) # 19446 word-place combinations
plot(density(dialectdata$PopIncome)) # e.g., to investigate distributions


# A6. we model geography as the fitted values of a generalized additive model
geographyModel = gam(LD ~ s(Longitude,Latitude), data=dialectdata)
dialectdata$GeoGAM = fitted(geographyModel)


# A7. we can visualize the effect of geography using the function vis.gam
vis.gam(geographyModel, plot.type="contour", color="terrain", too.far=0.05, view=c("Longitude","Latitude"),main="General effect of Geography")


# A8. We create our first multiple regression model (no mixed-effects yet)
linearModel = lm(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength, data=dialectdata)


# A9. Look at the results of the model and note the significant variables
summary(linearModel)


###### MIXED-EFFECTS REGRESSION: RANDOM INTERCEPTS ######

# B1. Since we want our results to be generalizable across words, locations
# and transcribers, we add random intercepts for these factors.
# We start with a random intercept for Location: (1|Location) 
mixedModel1 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location), data=dialectdata)


# B2. look at the results, note that p-values are not present anymore, 
# but |t| > 1.65 equals p < 0.05 (one-tailed) and |t| > 2 equals p < 0.05
# (for a large data set)
print(mixedModel1,cor=F) # suppresses the correlation table (cor=F)


# B3. Test if the random intercept is necessary (significant improvement)
# note that simply 'anova' can be used when both models contain random effects
myAnova(linearModel,mixedModel1)


# B4. We now add random intercepts for Word and display the results
mixedModel2 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location) + (1|Word), data=dialectdata)
print(mixedModel2,cor=F)


# B5. To see how the intercepts for word vary, we can plot them
# Since there are 424 places, this was not sensible for Location
myInterceptPlot(dialectdata,"LD","PopAge","Word",mixedModel2)


# B6. We test if the inclusion of this random intercept is an improvement
anova(mixedModel1,mixedModel2)


# B7. Look a the t-values of the fixed effects of mixedModel2
# We clearly see that WordLength is not significant anymore, 
# which also becomes clear from the following test
# (investing 1 df does not give a better fit)
mixedModel2a = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location) + (1|Word), data=dialectdata, REML=F)
mixedModel2b = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio  + (1|Location) + (1|Word), data=dialectdata, REML=F)
anova(mixedModel2a,mixedModel2b)


# B8. we therefore exclude WordLength from the remaining analysis.
# We keep PopIncome, as we will include this factor in the random effects
mixedModel2 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word), data=dialectdata)
print(mixedModel2,cor=F)


# EXERCISE 1: add the random intercept for Transcriber and store the model in
#             the variable mixedModel3. Display the results of the model and test 
#             if the inclusion of this intercept is necessary. 


###### MIXED-EFFECTS REGRESSION: RANDOM SLOPES ######

# C1. We will test if the effect of population size varies per word
# We therefore add: (0+PopCnt|Word)
mixedModel4 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt|Word) + (1|Transcriber), data=dialectdata)
print(mixedModel4,cor=F)
anova(mixedModel3,mixedModel4)


# C2. To see how the slopes vary compared to the model slope (dashed lines)
# we can plot the random slopes (solid lines)
myXYPlot(dialectdata,"LD","PopCnt","Word",mixedModel3,mixedModel4)


# C3. Now we test if the correlation parameter between the by-word adjustment
# of the intercept and the by-word adjustments to the slope
# of population size is necessary by replacing (1|Word) + (0+PopCnt|Word) with
# (1+PopCnt|Word). Note that adding variables and intercepts
# in one group (between parentheses) will include their correlation.
mixedModel4b = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1+PopCnt|Word) + (1|Transcriber), data=dialectdata)
print(mixedModel4b,cor=F)
anova(mixedModel4,mixedModel4b) # not significant


# C4. We now add by-word adjustments to the slope of population income (separate),
# test the necessity of this addition and plot the slopes
mixedModel5 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt|Word) + (0+PopIncome|Word) + (1|Transcriber), data=dialectdata)
print(mixedModel5,cor=F)
anova(mixedModel4,mixedModel5)
myXYPlot(dialectdata,"LD","PopIncome","Word",mixedModel4,mixedModel5)


# C5. Now we test if the correlation parameter between the two by-word slope 
# adjustments is necessary
mixedModel6 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt+PopIncome|Word) + (1|Transcriber), data=dialectdata)
print(mixedModel6,cor=F)
anova(mixedModel5,mixedModel6)


# EXERCISE 2: Check if an uncorrelated random slope for PopAge is necessary
#             visualize the random slopes. 


# EXERCISE 3: Check if the random slope for PopAge should be correlated with
#             the other random slopes (this will take a few minutes to run)


# EXERCISE 4: Use your last model to see if a random slope
# for IsNoun is necessary to include. Also check if it has to be correlated
# with the random intercept. First think about if the slope of IsNoun varies
# per Word or Location?


# Of course there is a more complex random-effect structure possible,
# but we will not investigate it here further (by now, you should know how
# to do this though). 


###### INVESTIGATING BY-WORD AND BY-LOCATION RANDOM SLOPES ######

# D1. We can visualize the by-word random slopes of a variable as follows:
myCoefPlot(mixedModel10,"Word","PopAge")


# D2. We can visualize the by-word random intercept of a variable as follows:
myCoefPlot(mixedModel10,"Word","Intercept")


# D3. To visualize the correlational structure between the by-word random
# slopes for 2 variables, we proceed as follows (note the different function!):
myCoefPlot2(mixedModel10,"Word","PopAge","PopIncome")


# D4. To see the relationship between the random by-word intercepts and random 
# by-word slopes for PopCnt:
myCoefPlot2(mixedModel10,"Word","Intercept","PopCnt")


# D5. We can visualize the by-location random slopes in similar fashion
myCoefPlot(mixedModel10,"Location","IsNoun")


# D6. Of course, it is more informative to show this geographically
myCoefPlotGeo(mixedModel10,dialectdata,"IsNoun") 


# EXERCISE 5: visualize separately the by-word random slopes for 
#             PopCnt and PopIncome


# EXERCISE 6: visualize all remaining pairs of by-word random slopes


# EXERCISE 7: If interested, take a look at the files
#             datacreation.R and myFunctions.R located in
#             http://www.let.rug.nl/~wieling/R/


## FINAL REMARK: lmer can decide by itself if your design is nested or crossed,
# but you do need to use correct factor labels, i.e. having 8 schools with 4
# groups in each school, you have schoollabels S1 to S8, but need
# group-school labels S1G1, S1G2, ..., S8G4 as opposed to G1, ..., G4 
# (groups G1 in two schools are not the same)
# See also http://lme4.r-forge.r-project.org/book/ (Ch2.pdf).
# See Chapter 3 for a longitudinal design (repeated measures) and 
# Chapter 1 for a more general introduction to mixed models