## STATISTICS COURSE - LAB SESSION 3 ## Martijn Wieling, http://www.martijnwieling.nl # A0. Start R 3.1.2 # Start R and make sure the libraries mgcv and car are installed # Menu: Packages -> Install package(s)... -> Select mirror -> Select packages # or: # install.packages(c("mgcv","car"),repos='http://cran.r-project.org') # A1. These commands should work when the packages are installed library(mgcv) library(car) packageVersion('mgcv') # tested with version 1.8.4 # A2. Now download the following files and load the custom functions and data download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/myFunctions.R', 'myFunctions.R') download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/compareML.R', 'compareML.R') download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/dialectdata.rda', 'dialectdata.rda') source('myFunctions.R') # custom functions source('compareML.R') # custom function to compare models load('dialectdata.rda') # Dutch dialect data, subset of 48 words # Note that predictors were standardized and log-transformed when necessary. # The dependent (LD) was centered. # # Standardization: scaledA = scale(A) # Centering: centeredA = scale(A,scale=F) # Log-transform: logA = log(A) # check if necessary with: plot(density(A)) # EXERCISE 1. Investigate the data yourself (use: head, str, nrow) # e.g., str(dialectdata) will show you the structure of the dataset ##### GENERALIZED ADDITIVE MODELING # B1. our first generalized additive model, # Longitude and Latitude have the same scale, so we use s() # LD: Levenshtein distance (pronunciation distance) from standard Dutch model1 = gam(LD ~ s(Longitude,Latitude), data=dialectdata, method='ML') summary(model1) # B2. we can visualize the partial effect of geography using the function plot plot.gam(model1) # shows confidence intervals # nicer visualization including all effects (note that the intercept is 0, # so this will show the same as the partial effect plot) vis.gam(model1, plot.type="contour", color="terrain", too.far=0.05, view=c("Longitude","Latitude"),main="General effect of Geography") # B3. s() uses thin plate regression splines as basis functions and these # are generally better (and slower!) than the cubic regression splines used as default by te() # also note that the default wigglyness (specified via 'k') of s(..) is different than that of te(). # The default maximum degrees of freedom per dimension in te() is 5, whereas the default total maximum # degrees of freedom of s() is 10 (1 dimension), 30 (2 dimensions) and 110 (3 dimensions). # A te(..) can be made identical to an s(..) by changing the default basis function and wigglyness # (te(..) is more flexible though, since it also allows combinations of non-isotropic predictors) equal1 = gam(LD ~ s(Longitude,Latitude,k=10), data=dialectdata, method='ML') equal2 = gam(LD ~ te(Longitude,Latitude,d=c(2),bs="tp",k=10), data=dialectdata, method='ML') summary(equal1) summary(equal2) # identical to that of equal1 AIC(equal1) - AIC(equal2) # 0 # B4. We can investigate if the influence of geography differs for different values of word frequency # d=c(2,1) indicates that Longitude and Latitude (the first two parameters) are on the same scale, # but the next one (,1) is not system.time(model2 <- gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 30 seconds summary(model2) # B5. Models can be compared using AIC AIC(model1) - AIC(model2) # model2 is better as it has lower AIC (> 2 difference) # B6. Note that bam is much faster here: system.time(model2b <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 3 seconds summary(model2b) # However, the fit is slightly worse than that of gam. So were losing some precision at the benefit of # increased speed and less memory-intensive computations. As soon as you have a reasonably-sized dataset (i.e. > 10,000 rows) # you will always use bam. AIC(model2) - AIC(model2b) # B7. Note that AIC cannot always be used to compare gam/bam models. # When the rho-parameter is used in bam for correcting autocorrelation # (this will be discussed in lecture 4) comparing fixed effects is done via # ML score comparison and random effects via REML score comparison (lower is better). # The custom function compareML will test these differences for you. It also shows # the AIC difference when appropriate. compareML(model2,model2b) # model2 is better # B8. We can investigate the geographical pattern for word frequency. # Note that WordFreq is z-transformed and -2 indicates low-frequency words # (2 SD below the mean) and +2 indicates high-frequency words # REMARK: these results seem very different from those reported in the PLOS ONE paper # but remember that this is only a subset of 48 words (out of > 550) par(mfrow=c(1,3)) # show 3 plots next to each other # in cond=list() the values you'd like to see can be specified vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2),view=c("Longitude","Latitude"),main="Effect of Geography for LF words") vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=0),view=c("Longitude","Latitude"),main="Effect of Geography for MF words") vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2),view=c("Longitude","Latitude"),main="Effect of Geography for HF words") # EXERCISE 2. Create a model to investigate the varying effect of geography only depending # on population age (name the model: model3) # EXERCISE 3. Visualize the results of the model # EXERCISE 4. Create a model (don't forget to use bam!) to investigate the geographic variability in the influence # of word frequency and population age. Name the model: model4. # Hint, make sure to specify that Longitude and Latitude are in the same scale, # but word frequency and population age each have their own scale # To reduce memory requirements use the option: gc.level=2 for bam # Depending on the speed of your laptop this might take a few minutes (on my desktop: 90 seconds) # B9. We can investigate this pattern by making a more complex series of plots par(mfrow=c(2,2)) # show 2x2 plots # in cond=list() the values you'd like to see can be specified vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: LF + Young") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: HF + Young") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=-2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: LF + Old ") vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, cond=list(WordFreq=+2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: HF + Old") # B10. Now we can test if other variables should be added (not interacting with geography). # (For efficiency purposes, we continue working with model2 in this lab session). We first test population count: model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata, method='ML') summary(model5) compareML(model2,model5) # model5 has lower AIC and also significantly lower ML scores # EXERCISE 5. What is the interpretation of the effect of population count? # EXERCISE 6. Test some other potential variables for inclusion # (make sure you have enough time for part C.) ##### Generalized Additive Mixed-Effects Regression # C1. We first test if we need a random intercept for word (we use bam for speed) # Note that method='fREML' does the same as method='REML', it's (generally) just faster. model6 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + s(Word,bs="re"), data=dialectdata, method='fREML') # C2. We can see immediately if the random intercept is necessary via the summary: summary(model6) # C3. We can also compare the REML scores: model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata, method='fREML') compareML(model5,model6) # C3. We can visualize the random intercepts: par(mfrow=c(1,1)) # only one plot in a window myCoefPlot.gam(model6,"Word","Intercept") # C4. We include a random slope for population count per word and evaluate if it improves the model model7 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re"), data=dialectdata, method='fREML') summary(model7) # the random slope is significant compareML(model6,model7) # shows the same pattern, but I generally follow the summary to decide # if a random slope/intercept needs to be retained # C5. We can visualize the random slopes: myCoefPlot.gam(model7,"Word","PopCnt") # EXERCISE 7. Add a random intercept for Location and visualize the random intercepts # EXERCISE 8. Test if other variables need to be added as predictors to the model, # and test them also as random slopes. Think carefully about if you # need to test them as random slopes by word or location. # FINAL REMARKS: note that the (geographical) results are different than reported in the paper # (http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0023613). This is due to # the relatively small dataset (containing less than 10% of the locations). # # The original dataset and all analyses steps can be found in the paper package: # http://openscience.uni-leipzig.de/index.php/mr2/article/view/44 # # For the Tuscan data presented during the lecture, these can be found here: # http://openscience.uni-leipzig.de/index.php/mr2/article/view/41