## NWAV 42, October 17, 2013 ## Martijn Wieling, http://www.martijnwieling.nl ## Harald Baayen, http://www.sfs.uni-tuebingen.de/~hbaayen/ # A0. Start R 3.0.2 # Start R and make sure the libraries mgcv, lme4, car and rms are installed # Menu: Packages -> Install package(s)... -> Select mirror -> Select packages # or: # install.packages(c("mgcv","lme4","car","rms"),repos='http://cran.r-project.org') # A1. These commands should work when the packages are installed library(mgcv) library(lme4) library(car) library(rms) # A2. Now download the following files and load the custom functions and data download.file('http://www.let.rug.nl/wieling/NWAV/files/myFunctions.R', 'myFunctions.R') download.file('http://www.let.rug.nl/wieling/NWAV/files/dialectdata.rda', 'dialectdata.rda') source('myFunctions.R') # custom functions load('dialectdata.rda') # Dutch dialect data, subset of 48 words # Note that predictors were standardized and log-transformed when necessary. # The dependent (LD) was centered. When predictors correlated with each other, # we decorrelated such predictors (e.g. population age and income). # # Standardization: scaledA = scale(A) # Centering: centeredA = scale(A,scale=F) # Log-transform: logA = log(A) # check if necessary with: plot(density(A)) # Decorrelation: newPredA = resid(lm(predA ~ predB, data=dialectdata)) # must be positive: cor(dialectdata$predA,dialectdata$newPredA) # EXERCISE 1: Investigate the data using head, str, summary # e.g., str(dialectdata) will show you the structure of the dataset ##### GENERALIZED ADDITIVE MODELING # B1. our first generalized additive model, # Longitude and Latitude have the same scale, so we use s() model1 = gam(LD ~ s(Longitude,Latitude), data=dialectdata) summary(model1) # B2. let's test the speed difference between bam and gam # For large datasets and complex analysis bam is quicker, # but here gam will be quicker as the analysis is simple system.time(model1 <- bam(LD ~ s(Longitude,Latitude), data=dialectdata)) system.time(model1 <- gam(LD ~ s(Longitude,Latitude), data=dialectdata)) # B3. we can visualize the effect of geography using the function vis.gam vis.gam(model1, plot.type="contour", color="terrain", too.far=0.05, view=c("Longitude","Latitude"),main="General effect of Geography") # B4. results with s() use thin plate regression splines as basis functions # and these are generally better (and slower!) than the cubic regression # splines default used by te() model1b = gam(LD ~ te(Longitude,Latitude,d=c(2)), data=dialectdata) summary(model1b) # B5. comparing gam models via Akaike Information Criterion (AIC): # a more complex model should be at least 2 units lower than the simpler one, # if we want to consider it a real improvement AIC(model1) - AIC(model1b) # negative value, so model1 is better (AIC lower)! # B6. anova can also be used for comparing models, but only when no random-effect # structure is present, otherwise AIC should be used. anova(model1b,model1,test='F') # the model with s() is better # B7. We can investigate if the influence of geography differs for different # values of word frequency d=c(2,1) indicates that Longitude and Latitude # (the first two parameters) are on the same scale, but the next one (1) is not model2 = gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata) summary(model2) anova(model1,model2) AIC(model1) - AIC(model2) # model2 is better as the AIC is lower of model2 # B8. We can investigate the geographical pattern for geography varying by word frequency. # Note that WordFreq is z-transformed and -2 indicates low-frequency words # (2 SD below the mean) and +2 indicates high-frequency words # REMARK: these results seem very different from those reported in the PLOS ONE paper # but remember that this is only a random subset of 48 words (out of > 550) par(mfrow=c(1,3)) # show 3 plots next to each other # in cond=list() the values you'd like to see can be specified vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2),view=c("Longitude","Latitude"), main="Effect of Geography for LF words") vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=0),view=c("Longitude","Latitude"), main="Effect of Geography for MF words") vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2),view=c("Longitude","Latitude"), main="Effect of Geography for HF words") # EXERCISE 2: Create a model to investigate the varying effect of geography # only depending on population age (name the model: model3) # EXERCISE 3: Visualize the results of the model # EXERCISE 4: Create a model (use bam!) to investigate the geographic variability # in the influence of word frequency and population age. # Name the model: model4. # Hint, make sure to specify that Longitude and Latitude are on the same scale, # but word frequency and population age each have their own scale # To reduce memory requirements use the option: gc.level=2 for bam # Depending on the speed of your laptop this might take a few minutes # (on my laptop: 70 seconds) # (OPTIONAL) EXERCISE 5: rerun the model using gam to see the speed-difference. # B9. We can investigate the pattern of model4 by making a more complex series of plots par(mfrow=c(2,2)) # show 2x2 plots # in cond=list() the values you'd like to see can be specified vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2,PopAge=-2),view=c("Longitude","Latitude"), main="Geography: LF + Young") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2,PopAge=-2),view=c("Longitude","Latitude"), main="Geography: HF + Young") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2,PopAge=2),view=c("Longitude","Latitude"), main="Geography: LF + Old ") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2,PopAge=2),view=c("Longitude","Latitude"), main="Geography: HF + Old") # B11. Now we can test if other variables should be added (not interacting # with geography). For efficiency purposes, we continue working with model2). # We first test population count: model5 = gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata) summary(model5) anova(model2,model5,test='F') # model5 is better AIC(model2) - AIC(model5) # EXERCISE 6: What is the interpretation of the effect of population count? # EXERCISE 7: Test some other potential variables for inclusion # (make sure you have enough time for part C, below.) ##### Generalized Additive Mixed-Effects Regression # C1. We first test if we need a random intercept for word (we use bam for speed) model6 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + s(Word,bs="re"), data=dialectdata) # C2. We can see immediately if the random intercept is necessary via the summary: summary(model6) # C3. We can visualize the random intercepts: par(mfrow=c(1,1)) # only one plot in a window myCoefPlot.gam(model6,"Word","Intercept") # C4. The following illustrates that anova does not work when having random effects # model6b is more complex than model6 as it contains one additional parameter # but anova shows that model6b is actually a little bit less complex. # This is caused by the implementation of the random intercepts and slopes in gam/bam # which have their own degrees of freedom. Adding a lexical variable will likely reduce # the number of degrees of freedom needed for the by-word random intercepts. # So: do not use anova to compare models when they have a random-effects structure. # Instead, use AIC to evaluate the models and look at the significance of the variables # (an AIC reduction of at least 2 indicates a significant improvement) model6b = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + VCratio + s(Word,bs="re"), data=dialectdata) anova(model6,model6b,test='F') # As indicated, AIC's can be usefully compared AIC(model6) - AIC(model6b) # positive, but less than two, so no real improvement # C5. We include a random slope for population count per word model7 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re"), data=dialectdata) summary(model7) # C6. We can visualize the random slopes: myCoefPlot.gam(model7,"Word","PopCnt") # EXERCISE 8: Add a random intercept for Location and visualize the adaptations # EXERCISE 9: Test if other variables need to be added as predictors to the model, # and test them also as random slopes. Think carefully about if you # need to test them as random slopes per word or location. # FINAL REMARK: note that the (geographical) results are different than reported in the paper # (http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0023613). This is due to # the relatively small dataset (containing less than 10% of the words).