STATISTICS COURSE - LAB SESSION 3

Martijn Wieling, http://www.martijnwieling.nl

## Generated on: December 25, 2014 - 11:04:59

# A0. Start R 3.1.2
# Start R and make sure the libraries mgcv and car are installed
# Menu: Packages -> Install package(s)...  -> Select mirror -> Select packages
# or: 
# install.packages(c("mgcv","car"),repos='http://cran.r-project.org')

# A1. These commands should work when the packages are installed
library(mgcv)
library(car)

# version information
R.version.string

## [1] "R version 3.1.2 (2014-10-31)"

packageVersion('mgcv')

## [1] '1.8.4'

packageVersion('car')

## [1] '2.0.22'

# A2. Now download the following files and load the custom functions and data
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/myFunctions.R', 'myFunctions.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/compareML.R', 'compareML.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/dialectdata.rda', 'dialectdata.rda')

source('myFunctions.R') # custom functions
source('compareML.R') # custom function to compare models
load('dialectdata.rda') # Dutch dialect data, subset of 48 words

# Note that predictors were standardized and log-transformed when necessary.
# The dependent (LD) was centered. 
# 
# Standardization: scaledA = scale(A)
# Centering: centeredA = scale(A,scale=F)
# Log-transform: logA = log(A)
#                check if necessary with: plot(density(A))

# EXERCISE 1. Investigate the data yourself (use: head, str, nrow)
# e.g., str(dialectdata) will show you the structure of the dataset

# ANSWER 1: 
str(dialectdata)

## 'data.frame':    19446 obs. of  13 variables:
##  $ LD         : num [1:19446, 1] 0.1134 -0.4989 -0.0759 0.2456 0.3063 ...
##  $ Location   : Factor w/ 424 levels "Aalsmeer NH",..: 117 38 230 107 422 195 7 378 280 165 ...
##  $ Word       : Factor w/ 48 levels "benen","best",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transcriber: Factor w/ 30 levels "AC","AG","AJ",..: 15 11 30 30 15 19 19 15 30 24 ...
##  $ Longitude  : num  3.88 4.46 6.62 7.1 6.07 ...
##  $ Latitude   : num  51.5 52 52.1 53.2 52.6 ...
##  $ PopCnt     : num [1:19446, 1] 1.313 -0.707 0.624 -0.31 0.188 ...
##  $ PopAge     : num [1:19446, 1] 0.197 -2.3897 0.3881 1.6554 -0.0491 ...
##  $ PopIncome  : num [1:19446, 1] 0.6435 0.4112 0.0949 -0.2032 -0.6164 ...
##  $ IsNoun     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WordFreq   : num [1:19446, 1] 0.665 0.665 0.665 0.665 0.665 ...
##  $ VCratio    : num [1:19446, 1] 1.15 1.15 1.15 1.15 1.15 ...
##  $ WordLength : num [1:19446, 1] 0.423 0.423 0.423 0.423 0.423 ...

head(dialectdata)

##            LD        Location  Word Transcriber Longitude Latitude     PopCnt      PopAge   PopIncome IsNoun  WordFreq
## 1  0.11336890         Goes Ze benen          EH  3.877739 51.50081  1.3132947  0.19696746  0.64348160      1 0.6652531
## 2 -0.49887820   Berkel-Oud ZH benen          CA  4.464149 51.98904 -0.7069466 -2.38967785  0.41123039      1 0.6652531
## 3 -0.07588232        Neede Gl benen          PV  6.622521 52.14262  0.6242153  0.38812732  0.09486759      1 0.6652531
## 4  0.24560448 Finsterwolde Gn benen          PV  7.098723 53.19468 -0.3102058  1.65541113 -0.20315301      1 0.6652531
## 5  0.30629504   Zwartsluis Ov benen          EH  6.070411 52.64288  0.1881099 -0.04914316 -0.61641849      1 0.6652531
## 6  0.20705193       Kuinre Ov benen          HV  5.837851 52.79604 -0.9337052 -0.34259474 -0.50026404      1 0.6652531
##    VCratio WordLength
## 1 1.146317  0.4229065
## 2 1.146317  0.4229065
## 3 1.146317  0.4229065
## 4 1.146317  0.4229065
## 5 1.146317  0.4229065
## 6 1.146317  0.4229065

nrow(dialectdata)

## [1] 19446

summary(dialectdata)

##         LD.V1                  Location          Word        Transcriber     Longitude        Latitude    
##  Min.   :-0.4988782   Almen Gl     :   48   rijst  :  422   PV     :5634   Min.   :3.438   Min.   :50.77  
##  1st Qu.:-0.2016394   Alphen NB    :   48   schade :  421   EH     :3767   1st Qu.:5.024   1st Qu.:51.70  
##  Median : 0.0121314   Amersfoort Ut:   48   veel   :  420   AO     :1262   Median :5.735   Median :52.12  
##  Mean   : 0.0000000   Angerlo Gl   :   48   kammen :  419   HV     :1026   Mean   :5.590   Mean   :52.21  
##  3rd Qu.: 0.2372087   Anloo Dr     :   48   wonen  :  419   AV     : 931   3rd Qu.:6.175   3rd Qu.:52.73  
##  Max.   : 0.8705929   Austerlitz Ut:   48   noten  :  418   DV     : 904   Max.   :7.168   Max.   :53.48  
##                       (Other)      :19158   (Other):16927   (Other):5922                                  
##       PopCnt.V1            PopAge.V1         PopIncome.V1         IsNoun          WordFreq.V1          VCratio.V1     
##  Min.   :-2.6608800   Min.   :-3.177204   Min.   :-3.575051   Min.   :0.000   Min.   :-2.070331   Min.   :-2.2202930  
##  1st Qu.:-0.7225091   1st Qu.:-0.601558   1st Qu.:-0.586729   1st Qu.:0.000   1st Qu.:-0.503609   1st Qu.:-0.9179104  
##  Median :-0.0915865   Median :-0.044225   Median :-0.012116   Median :0.000   Median :-0.044536   Median : 0.3844722  
##  Mean   : 0.0003773   Mean   :-0.003588   Mean   : 0.004545   Mean   :0.436   Mean   :-0.009755   Mean   :-0.0051614  
##  3rd Qu.: 0.6421021   3rd Qu.: 0.630994   3rd Qu.: 0.522902   3rd Qu.:1.000   3rd Qu.: 0.496171   3rd Qu.: 0.3844722  
##  Max.   : 3.1117973   Max.   : 3.347749   Max.   : 4.555094   Max.   :1.000   Max.   : 3.164715   Max.   : 2.4486998  
##                                                                                                                       
##     WordLength.V1    
##  Min.   :-1.6859319  
##  1st Qu.:-0.4982957  
##  Median : 0.4229065  
##  Mean   : 0.0058583  
##  3rd Qu.: 0.4229065  
##  Max.   : 2.3632196  
##

##### GENERALIZED ADDITIVE MODELING
# B1. our first generalized additive model, 
# Longitude and Latitude have the same scale, so we use s()
# LD: Levenshtein distance (pronunciation distance) from standard Dutch
model1 = gam(LD ~ s(Longitude,Latitude), data=dialectdata, method='ML')
summary(model1)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ s(Longitude, Latitude)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.258e-15  2.135e-03       0        1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value    
## s(Longitude,Latitude) 25.65  28.32 77.93  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.102   Deviance explained = 10.3%
## -ML = 4073.7  Scale est. = 0.088672  n = 19446

# B2. we can visualize the partial effect of geography using the function plot
plot.gam(model1) # shows confidence intervals

# nicer visualization including all effects (note that the intercept is 0, 
# so this will show the same as the partial effect plot)
vis.gam(model1, plot.type="contour", color="terrain", too.far=0.05, 
view=c("Longitude","Latitude"),main="General effect of Geography")

# B3. s() uses thin plate regression splines as basis functions and these 
#     are generally better (and slower!) than the cubic regression splines used as default by te()
#     also note that the default wigglyness (specified via 'k') of s(..) is different than that of te(). 
#     The default maximum degrees of freedom per dimension in te() is 5, whereas the default total maximum 
#     degrees of freedom of s() is 10 (1 dimension), 30 (2 dimensions) and 110 (3 dimensions). 
#     A te(..) can be made identical to an s(..) by changing the default basis function and wigglyness
#     (te(..) is more flexible though, since it also allows combinations of non-isotropic predictors)
equal1 = gam(LD ~ s(Longitude,Latitude,k=10), data=dialectdata, method='ML')
equal2 = gam(LD ~ te(Longitude,Latitude,d=c(2),bs="tp",k=10), data=dialectdata, method='ML')
summary(equal1)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ s(Longitude, Latitude, k = 10)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.244e-15  2.142e-03       0        1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value    
## s(Longitude,Latitude) 8.894  8.998 229.9  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.096   Deviance explained = 9.64%
## -ML = 4115.6  Scale est. = 0.089256  n = 19446

summary(equal2) # identical to that of equal1

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, d = c(2), bs = "tp", k = 10)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.424e-15  2.142e-03       0        1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## te(Longitude,Latitude) 8.894  8.998 229.9  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.096   Deviance explained = 9.64%
## -ML = 4115.6  Scale est. = 0.089256  n = 19446

AIC(equal1) - AIC(equal2) # 0

## [1] 0

# B4. We can investigate if the influence of geography differs for different values of word frequency
# d=c(2,1) indicates that Longitude and Latitude (the first two parameters) are on the same scale, 
# but the next one (,1) is not
system.time(model2 <- gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 30 seconds

##    user  system elapsed 
##   30.89    0.33   31.23

summary(model2)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.557e-15  2.077e-03       0        1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df     F p-value    
## te(Longitude,Latitude,WordFreq) 88.56  104.4 32.95  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.15   Deviance explained = 15.4%
## -ML = 3618.5  Scale est. = 0.083888  n = 19446

# B5. Models can be compared using AIC 
AIC(model1) - AIC(model2) # model2 is better as it has lower AIC (> 2 difference)

## [1] 1014.85

# B6. Note that bam is much faster here:
system.time(model2b <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 3 seconds

##    user  system elapsed 
##    3.79    0.05    3.84

summary(model2b)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001103   0.002081    0.53    0.596
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df     F p-value    
## te(Longitude,Latitude,WordFreq) 87.92  103.5 33.15  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/125
## R-sq.(adj) =   0.15   Deviance explained = 15.4%
## -ML = 3620.4  Scale est. = 0.083907  n = 19446

# However, the fit is slightly worse than that of gam. So were losing some precision at the benefit of 
# increased speed and less memory-intensive computations. As soon as you have a reasonably-sized dataset (i.e. > 10,000 rows)
# you will always use bam. 
AIC(model2) - AIC(model2b)

## [1] -3.874644

# B7. Note that AIC cannot always be used to compare gam/bam models. 
# When the rho-parameter is used in bam for correcting autocorrelation 
# (this will be discussed in lecture 4) comparing fixed effects is done via 
# ML score comparison and random effects via REML score comparison (lower is better).
# The custom function compareML will test these differences for you. It also shows
# the AIC difference when appropriate.
compareML(model2,model2b) # model2 is better

## model2: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
## 
## model2b: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
## 
## Chi-square test of ML scores
## -----
##     Model    Score Edf Chisq    Df  p.value Sig.
## 1 model2b 3620.444   8                          
## 2  model2 3618.478   8 1.966 0.000  < 2e-16  ***
## 
## AIC difference: -3.87, model model2 has lower AIC.

## Warning in compareML(model2, model2b): Only small difference in ML...

# B8. We can investigate the geographical pattern for word frequency. 
#     Note that WordFreq is z-transformed and -2 indicates low-frequency words 
#     (2 SD below the mean) and +2 indicates high-frequency words
# REMARK: these results seem very different from those reported in the PLOS ONE paper
#         but remember that this is only a subset of 48 words (out of > 550)
par(mfrow=c(1,3)) # show 3 plots next to each other
# in cond=list() the values you'd like to see can be specified 
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2),view=c("Longitude","Latitude"),main="Effect of Geography for LF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=0),view=c("Longitude","Latitude"),main="Effect of Geography for MF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2),view=c("Longitude","Latitude"),main="Effect of Geography for HF words")

# EXERCISE 2. Create a model to investigate the varying effect of geography only depending 
#             on population age (name the model: model3)

# ANSWER 2:
model3 = bam(LD ~ te(Longitude,Latitude,PopAge,d=c(2,1)), data=dialectdata, method='ML')
AIC(model1) - AIC(model3) # better than the simple model

## [1] 51.66321

summary(model3)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, PopAge, d = c(2, 1))
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0002126  0.0021333     0.1    0.921
## 
## Approximate significance of smooth terms:
##                                 edf Ref.df    F p-value    
## te(Longitude,Latitude,PopAge) 54.07  66.28 34.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/125
## R-sq.(adj) =  0.106   Deviance explained = 10.8%
## -ML = 4062.2  Scale est. = 0.088298  n = 19446

# EXERCISE 3. Visualize the results of the model

# ANSWER 3:
range(dialectdata$PopAge) # also z-transformed

## [1] -3.177204  3.347749

par(mfrow=c(1,3)) # show 3 plots next to each other
# in cond=list() the values you'd like to see can be specified 
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(PopAge=-2),view=c("Longitude","Latitude"),main="Effect of Geography for a young population")
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(PopAge=0),view=c("Longitude","Latitude"),main="Effect of Geography for a medium-aged population")
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(PopAge=+2),view=c("Longitude","Latitude"),main="Effect of Geography for an old population")

# EXERCISE 4. Create a model (don't forget to use bam!) to investigate the geographic variability in the influence 
#     of word frequency and population age. Name the model: model4.
#     Hint, make sure to specify that Longitude and Latitude are in the same scale,
#           but word frequency and population age each have their own scale
#           To reduce memory requirements use the option: gc.level=2 for bam
#     Depending on the speed of your laptop this might take a few minutes (on my desktop: 90 seconds)

# ANSWER 4:
system.time(model4 <- bam(LD ~ te(Longitude,Latitude,WordFreq,PopAge,d=c(2,1,1)), data=dialectdata, method='ML', gc.level=2))

##    user  system elapsed 
##  109.85    0.45  110.99

summary(model4)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, PopAge, d = c(2, 1, 1))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001043   0.002081   0.501    0.616
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## te(Longitude,Latitude,WordFreq,PopAge) 131.3  164.8 21.09  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 624/625
## R-sq.(adj) =  0.152   Deviance explained = 15.8%
## -ML = 3634.1  Scale est. = 0.083726  n = 19446

# B9. We can investigate this pattern by making a more complex series of plots
par(mfrow=c(2,2)) # show 2x2 plots 

# in cond=list() the values you'd like to see can be specified 
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: LF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: HF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=-2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: LF + Old ")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05, 
        cond=list(WordFreq=+2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: HF + Old")

# B10. Now we can test if other variables should be added (not interacting with geography).
#      (For efficiency purposes, we continue working with model2 in this lab session). We first test population count:
model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata, method='ML')
summary(model5)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.001051   0.002080   0.505    0.613    
## PopCnt      -0.011330   0.002223  -5.097 3.48e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df     F p-value    
## te(Longitude,Latitude,WordFreq) 88.11  103.7 33.15  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 125/126
## R-sq.(adj) =  0.151   Deviance explained = 15.5%
## -ML = 3607.5  Scale est. = 0.083796  n = 19446

compareML(model2,model5) # model5 has lower AIC and also significantly lower ML scores

## model2: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
## 
## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
## 
## Chi-square test of ML scores
## -----
##    Model    Score Edf  Chisq    Df   p.value Sig.
## 1 model2 3618.478   8                            
## 2 model5 3607.461   9 11.018 1.000 2.676e-06  ***
## 
## AIC difference: 20.81, model model5 has lower AIC.

# EXERCISE 5. What is the interpretation of the effect of population count?

# ANSWER 5: Locations with a higher population have a lower LD 
#           (i.e. pronunciation distance from standard Dutch)

# EXERCISE 6. Test some other potential variables for inclusion 
#             (make sure you have enough time for part C.)

# ANSWER 6: (example)
model5b = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + IsNoun, data=dialectdata, method='ML')
summary(model5b) # IsNoun is highly significant

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     IsNoun
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.049810   0.002875 -17.327  < 2e-16 ***
## PopCnt      -0.011515   0.002188  -5.264 1.43e-07 ***
## IsNoun       0.116347   0.004620  25.185  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df     F p-value    
## te(Longitude,Latitude,WordFreq) 88.68  104.1 36.51  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 126/127
## R-sq.(adj) =  0.178   Deviance explained = 18.2%
## -ML = 3295.3  Scale est. = 0.081138  n = 19446

compareML(model5,model5b) # which also is observable when comparing the simpler model

## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
## 
## model5b: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      IsNoun
## 
## Chi-square test of ML scores
## -----
##     Model    Score Edf   Chisq    Df  p.value Sig.
## 1  model5 3607.461   9                            
## 2 model5b 3295.305  10 312.155 1.000  < 2e-16  ***
## 
## AIC difference: 625.10, model model5b has lower AIC.

##### Generalized Additive Mixed-Effects Regression

# C1. We first test if we need a random intercept for word (we use bam for speed)
# Note that method='fREML' does the same as method='REML', it's (generally) just faster.
model6 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt + s(Word,bs="re"), data=dialectdata, method='fREML')

# C2. We can see immediately if the random intercept is necessary via the summary:
summary(model6)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     s(Word, bs = "re")
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0004235  0.0269223   0.016    0.987    
## PopCnt      -0.0115779  0.0017745  -6.525 6.99e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df      F p-value    
## te(Longitude,Latitude,WordFreq) 94.69  108.4  39.43  <2e-16 ***
## s(Word)                         44.64   46.0 237.08  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 173/174
## R-sq.(adj) =  0.462   Deviance explained = 46.6%
## fREML = -657.73  Scale est. = 0.053123  n = 19446

# C3. We can also compare the REML scores:
model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt, data=dialectdata, method='fREML')
compareML(model5,model6)

## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
## 
## model6: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      s(Word, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##    Model     Score Edf    Chisq    Df  p.value Sig.
## 1 model5 3628.9102   9                             
## 2 model6 -657.7291  10 4286.639 1.000  < 2e-16  ***
## 
## AIC difference: 8811.61, model model6 has lower AIC.

# C3. We can visualize the random intercepts:
par(mfrow=c(1,1)) # only one plot in a window
myCoefPlot.gam(model6,"Word","Intercept")

# C4. We include a random slope for population count per word and evaluate if it improves the model
model7 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re"), data=dialectdata, method='fREML')
summary(model7) # the random slope is significant

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     s(Word, bs = "re") + s(Word, PopCnt, bs = "re")
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0004346  0.0269236   0.016    0.987    
## PopCnt      -0.0115838  0.0026939  -4.300 1.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df      F  p-value    
## te(Longitude,Latitude,WordFreq) 94.68  108.3  39.54  < 2e-16 ***
## s(Word)                         44.64   46.0 238.84  < 2e-16 ***
## s(Word,PopCnt)                  28.14   47.0   1.52 8.11e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 221/222
## R-sq.(adj) =  0.464   Deviance explained = 46.9%
## fREML = -671.29  Scale est. = 0.052931  n = 19446

compareML(model6,model7) # shows the same pattern, but I generally follow the summary to decide

## model6: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      s(Word, bs = "re")
## 
## model7: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      s(Word, bs = "re") + s(Word, PopCnt, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##    Model     Score Edf  Chisq    Df   p.value Sig.
## 1 model6 -657.7291  10                            
## 2 model7 -671.2868  11 13.558 1.000 1.917e-07  ***
## 
## AIC difference: 40.92, model model7 has lower AIC.

                         # if a random slope/intercept needs to be retained

# C5. We can visualize the random slopes:
myCoefPlot.gam(model7,"Word","PopCnt")

# EXERCISE 7. Add a random intercept for Location and visualize the random intercepts

# ANSWER 7:
model8 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                  + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'), 
             data=dialectdata, method='fREML')
summary(model8)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location, 
##     bs = "re")
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.0006155  0.0269784   0.023  0.98180   
## PopCnt      -0.0112542  0.0037041  -3.038  0.00238 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                    edf Ref.df       F  p-value    
## te(Longitude,Latitude,WordFreq)  91.68  104.7  18.131  < 2e-16 ***
## s(Word)                          44.62   46.0 254.829  < 2e-16 ***
## s(Word,PopCnt)                   28.82   47.0   1.619 2.21e-08 ***
## s(Location)                     276.42  420.0   2.116  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 645/646
## R-sq.(adj) =  0.487   Deviance explained = 49.9%
## fREML = -866.12  Scale est. = 0.050647  n = 19446

myCoefPlot.gam(model8,"Location","Intercept")

# EXERCISE 8. Test if other variables need to be added as predictors to the model, 
#             and test them also as random slopes. Think carefully about if you 
#             need to test them as random slopes by word or location. 

# ANSWER 8: (example)
summary(model8a <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                            + PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'), 
                       data=dialectdata, method='ML')) # method='ML' to compare fixed effects

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location, 
##     bs = "re")
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.0006135  0.0265322   0.023   0.9816   
## PopCnt      -0.0112456  0.0036887  -3.049   0.0023 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                    edf Ref.df       F  p-value    
## te(Longitude,Latitude,WordFreq)  91.26  104.3  18.207  < 2e-16 ***
## s(Word)                          44.60   46.0 254.777  < 2e-16 ***
## s(Word,PopCnt)                   28.63   47.0   1.608 2.17e-08 ***
## s(Location)                     275.96  420.0   2.112  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 645/646
## R-sq.(adj) =  0.487   Deviance explained = 49.9%
## -ML = -884.23  Scale est. = 0.050651  n = 19446

summary(model9a <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) 
                            + PopCnt + IsNoun + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'), 
                       data=dialectdata, method='ML')) # IsNoun is significant

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##     IsNoun + s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + 
##     s(Location, bs = "re")
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.054252   0.035394  -1.533   0.1253   
## PopCnt      -0.011247   0.003689  -3.049   0.0023 **
## IsNoun       0.124536   0.056234   2.215   0.0268 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                    edf Ref.df       F  p-value    
## te(Longitude,Latitude,WordFreq)  91.27  104.4  18.209  < 2e-16 ***
## s(Word)                          43.55   45.0 234.801  < 2e-16 ***
## s(Word,PopCnt)                   28.63   47.0   1.608 2.01e-08 ***
## s(Location)                     275.95  420.0   2.110  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 646/647
## R-sq.(adj) =  0.487   Deviance explained = 49.9%
## -ML = -886.58  Scale est. = 0.050651  n = 19446

compareML(model8a,model9a)

## model8a: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location, 
##      bs = "re")
## 
## model9a: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt + 
##      IsNoun + s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + 
##      s(Location, bs = "re")
## 
## Chi-square test of ML scores
## -----
##     Model     Score Edf Chisq    Df p.value Sig.
## 1 model8a -884.2340  12                         
## 2 model9a -886.5752  13 2.341 1.000   0.030  *  
## 
## AIC difference: -0.08, model model8a has lower AIC.

## Warning in compareML(model8a, model9a): Only small difference in ML...

# The AIC does not decrease (i.e. the model becomes worse), and the reduction in ML is only limited.
# I would therefore exclude IsNoun.


# FINAL REMARKS: note that the (geographical) results are different than reported in the paper
# (http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0023613). This is due to 
# the relatively small dataset (containing less than 10% of the locations).
#
# The original dataset and all analyses steps can be found in the paper package: 
# http://openscience.uni-leipzig.de/index.php/mr2/article/view/44
#
# For the Tuscan data presented during the lecture, these can be found here:
# http://openscience.uni-leipzig.de/index.php/mr2/article/view/41

Replication of the analysis

To run all analyses yourself, you first have to install pandoc, and then you can just copy the following lines to the most recent version of R. You need the packages ‘mgcv’, ‘car’ and ‘rmarkdown’. If these are not installed (the library commands will throw an error), you can uncomment (i.e. remove the hashtag) the first three lines to install them.

#install.packages('mgcv',repos='http://cran.us.r-project.org')
#install.packages('car',repos='http://cran.us.r-project.org')
#install.packages('rmarkdown',repos='http://cran.us.r-project.org')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/answers/lab-including-answers.Rmd', 'lab-including-answers.Rmd')
library(rmarkdown)
render('lab-including-answers.Rmd') # generates html file with results
browseURL(paste('file://', file.path(getwd(),'lab-including-answers.html'), sep='')) # shows result