Martijn Wieling, http://www.martijnwieling.nl
## Generated on: December 25, 2014 - 11:04:59
# A0. Start R 3.1.2
# Start R and make sure the libraries mgcv and car are installed
# Menu: Packages -> Install package(s)... -> Select mirror -> Select packages
# or:
# install.packages(c("mgcv","car"),repos='http://cran.r-project.org')
# A1. These commands should work when the packages are installed
library(mgcv)
library(car)
# version information
R.version.string
## [1] "R version 3.1.2 (2014-10-31)"
packageVersion('mgcv')
## [1] '1.8.4'
packageVersion('car')
## [1] '2.0.22'
# A2. Now download the following files and load the custom functions and data
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/myFunctions.R', 'myFunctions.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/compareML.R', 'compareML.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/dialectdata.rda', 'dialectdata.rda')
source('myFunctions.R') # custom functions
source('compareML.R') # custom function to compare models
load('dialectdata.rda') # Dutch dialect data, subset of 48 words
# Note that predictors were standardized and log-transformed when necessary.
# The dependent (LD) was centered.
#
# Standardization: scaledA = scale(A)
# Centering: centeredA = scale(A,scale=F)
# Log-transform: logA = log(A)
# check if necessary with: plot(density(A))
# EXERCISE 1. Investigate the data yourself (use: head, str, nrow)
# e.g., str(dialectdata) will show you the structure of the dataset
# ANSWER 1:
str(dialectdata)
## 'data.frame': 19446 obs. of 13 variables:
## $ LD : num [1:19446, 1] 0.1134 -0.4989 -0.0759 0.2456 0.3063 ...
## $ Location : Factor w/ 424 levels "Aalsmeer NH",..: 117 38 230 107 422 195 7 378 280 165 ...
## $ Word : Factor w/ 48 levels "benen","best",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Transcriber: Factor w/ 30 levels "AC","AG","AJ",..: 15 11 30 30 15 19 19 15 30 24 ...
## $ Longitude : num 3.88 4.46 6.62 7.1 6.07 ...
## $ Latitude : num 51.5 52 52.1 53.2 52.6 ...
## $ PopCnt : num [1:19446, 1] 1.313 -0.707 0.624 -0.31 0.188 ...
## $ PopAge : num [1:19446, 1] 0.197 -2.3897 0.3881 1.6554 -0.0491 ...
## $ PopIncome : num [1:19446, 1] 0.6435 0.4112 0.0949 -0.2032 -0.6164 ...
## $ IsNoun : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WordFreq : num [1:19446, 1] 0.665 0.665 0.665 0.665 0.665 ...
## $ VCratio : num [1:19446, 1] 1.15 1.15 1.15 1.15 1.15 ...
## $ WordLength : num [1:19446, 1] 0.423 0.423 0.423 0.423 0.423 ...
head(dialectdata)
## LD Location Word Transcriber Longitude Latitude PopCnt PopAge PopIncome IsNoun WordFreq
## 1 0.11336890 Goes Ze benen EH 3.877739 51.50081 1.3132947 0.19696746 0.64348160 1 0.6652531
## 2 -0.49887820 Berkel-Oud ZH benen CA 4.464149 51.98904 -0.7069466 -2.38967785 0.41123039 1 0.6652531
## 3 -0.07588232 Neede Gl benen PV 6.622521 52.14262 0.6242153 0.38812732 0.09486759 1 0.6652531
## 4 0.24560448 Finsterwolde Gn benen PV 7.098723 53.19468 -0.3102058 1.65541113 -0.20315301 1 0.6652531
## 5 0.30629504 Zwartsluis Ov benen EH 6.070411 52.64288 0.1881099 -0.04914316 -0.61641849 1 0.6652531
## 6 0.20705193 Kuinre Ov benen HV 5.837851 52.79604 -0.9337052 -0.34259474 -0.50026404 1 0.6652531
## VCratio WordLength
## 1 1.146317 0.4229065
## 2 1.146317 0.4229065
## 3 1.146317 0.4229065
## 4 1.146317 0.4229065
## 5 1.146317 0.4229065
## 6 1.146317 0.4229065
nrow(dialectdata)
## [1] 19446
summary(dialectdata)
## LD.V1 Location Word Transcriber Longitude Latitude
## Min. :-0.4988782 Almen Gl : 48 rijst : 422 PV :5634 Min. :3.438 Min. :50.77
## 1st Qu.:-0.2016394 Alphen NB : 48 schade : 421 EH :3767 1st Qu.:5.024 1st Qu.:51.70
## Median : 0.0121314 Amersfoort Ut: 48 veel : 420 AO :1262 Median :5.735 Median :52.12
## Mean : 0.0000000 Angerlo Gl : 48 kammen : 419 HV :1026 Mean :5.590 Mean :52.21
## 3rd Qu.: 0.2372087 Anloo Dr : 48 wonen : 419 AV : 931 3rd Qu.:6.175 3rd Qu.:52.73
## Max. : 0.8705929 Austerlitz Ut: 48 noten : 418 DV : 904 Max. :7.168 Max. :53.48
## (Other) :19158 (Other):16927 (Other):5922
## PopCnt.V1 PopAge.V1 PopIncome.V1 IsNoun WordFreq.V1 VCratio.V1
## Min. :-2.6608800 Min. :-3.177204 Min. :-3.575051 Min. :0.000 Min. :-2.070331 Min. :-2.2202930
## 1st Qu.:-0.7225091 1st Qu.:-0.601558 1st Qu.:-0.586729 1st Qu.:0.000 1st Qu.:-0.503609 1st Qu.:-0.9179104
## Median :-0.0915865 Median :-0.044225 Median :-0.012116 Median :0.000 Median :-0.044536 Median : 0.3844722
## Mean : 0.0003773 Mean :-0.003588 Mean : 0.004545 Mean :0.436 Mean :-0.009755 Mean :-0.0051614
## 3rd Qu.: 0.6421021 3rd Qu.: 0.630994 3rd Qu.: 0.522902 3rd Qu.:1.000 3rd Qu.: 0.496171 3rd Qu.: 0.3844722
## Max. : 3.1117973 Max. : 3.347749 Max. : 4.555094 Max. :1.000 Max. : 3.164715 Max. : 2.4486998
##
## WordLength.V1
## Min. :-1.6859319
## 1st Qu.:-0.4982957
## Median : 0.4229065
## Mean : 0.0058583
## 3rd Qu.: 0.4229065
## Max. : 2.3632196
##
##### GENERALIZED ADDITIVE MODELING
# B1. our first generalized additive model,
# Longitude and Latitude have the same scale, so we use s()
# LD: Levenshtein distance (pronunciation distance) from standard Dutch
model1 = gam(LD ~ s(Longitude,Latitude), data=dialectdata, method='ML')
summary(model1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ s(Longitude, Latitude)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.258e-15 2.135e-03 0 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Longitude,Latitude) 25.65 28.32 77.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.102 Deviance explained = 10.3%
## -ML = 4073.7 Scale est. = 0.088672 n = 19446
# B2. we can visualize the partial effect of geography using the function plot
plot.gam(model1) # shows confidence intervals
# nicer visualization including all effects (note that the intercept is 0,
# so this will show the same as the partial effect plot)
vis.gam(model1, plot.type="contour", color="terrain", too.far=0.05,
view=c("Longitude","Latitude"),main="General effect of Geography")
# B3. s() uses thin plate regression splines as basis functions and these
# are generally better (and slower!) than the cubic regression splines used as default by te()
# also note that the default wigglyness (specified via 'k') of s(..) is different than that of te().
# The default maximum degrees of freedom per dimension in te() is 5, whereas the default total maximum
# degrees of freedom of s() is 10 (1 dimension), 30 (2 dimensions) and 110 (3 dimensions).
# A te(..) can be made identical to an s(..) by changing the default basis function and wigglyness
# (te(..) is more flexible though, since it also allows combinations of non-isotropic predictors)
equal1 = gam(LD ~ s(Longitude,Latitude,k=10), data=dialectdata, method='ML')
equal2 = gam(LD ~ te(Longitude,Latitude,d=c(2),bs="tp",k=10), data=dialectdata, method='ML')
summary(equal1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ s(Longitude, Latitude, k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.244e-15 2.142e-03 0 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Longitude,Latitude) 8.894 8.998 229.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.096 Deviance explained = 9.64%
## -ML = 4115.6 Scale est. = 0.089256 n = 19446
summary(equal2) # identical to that of equal1
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, d = c(2), bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.424e-15 2.142e-03 0 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude) 8.894 8.998 229.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.096 Deviance explained = 9.64%
## -ML = 4115.6 Scale est. = 0.089256 n = 19446
AIC(equal1) - AIC(equal2) # 0
## [1] 0
# B4. We can investigate if the influence of geography differs for different values of word frequency
# d=c(2,1) indicates that Longitude and Latitude (the first two parameters) are on the same scale,
# but the next one (,1) is not
system.time(model2 <- gam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 30 seconds
## user system elapsed
## 30.89 0.33 31.23
summary(model2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.557e-15 2.077e-03 0 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 88.56 104.4 32.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.15 Deviance explained = 15.4%
## -ML = 3618.5 Scale est. = 0.083888 n = 19446
# B5. Models can be compared using AIC
AIC(model1) - AIC(model2) # model2 is better as it has lower AIC (> 2 difference)
## [1] 1014.85
# B6. Note that bam is much faster here:
system.time(model2b <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)), data=dialectdata, method='ML')) # about 3 seconds
## user system elapsed
## 3.79 0.05 3.84
summary(model2b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001103 0.002081 0.53 0.596
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 87.92 103.5 33.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/125
## R-sq.(adj) = 0.15 Deviance explained = 15.4%
## -ML = 3620.4 Scale est. = 0.083907 n = 19446
# However, the fit is slightly worse than that of gam. So were losing some precision at the benefit of
# increased speed and less memory-intensive computations. As soon as you have a reasonably-sized dataset (i.e. > 10,000 rows)
# you will always use bam.
AIC(model2) - AIC(model2b)
## [1] -3.874644
# B7. Note that AIC cannot always be used to compare gam/bam models.
# When the rho-parameter is used in bam for correcting autocorrelation
# (this will be discussed in lecture 4) comparing fixed effects is done via
# ML score comparison and random effects via REML score comparison (lower is better).
# The custom function compareML will test these differences for you. It also shows
# the AIC difference when appropriate.
compareML(model2,model2b) # model2 is better
## model2: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
##
## model2b: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model2b 3620.444 8
## 2 model2 3618.478 8 1.966 0.000 < 2e-16 ***
##
## AIC difference: -3.87, model model2 has lower AIC.
## Warning in compareML(model2, model2b): Only small difference in ML...
# B8. We can investigate the geographical pattern for word frequency.
# Note that WordFreq is z-transformed and -2 indicates low-frequency words
# (2 SD below the mean) and +2 indicates high-frequency words
# REMARK: these results seem very different from those reported in the PLOS ONE paper
# but remember that this is only a subset of 48 words (out of > 550)
par(mfrow=c(1,3)) # show 3 plots next to each other
# in cond=list() the values you'd like to see can be specified
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=-2),view=c("Longitude","Latitude"),main="Effect of Geography for LF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=0),view=c("Longitude","Latitude"),main="Effect of Geography for MF words")
vis.gam(model2, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=+2),view=c("Longitude","Latitude"),main="Effect of Geography for HF words")
# EXERCISE 2. Create a model to investigate the varying effect of geography only depending
# on population age (name the model: model3)
# ANSWER 2:
model3 = bam(LD ~ te(Longitude,Latitude,PopAge,d=c(2,1)), data=dialectdata, method='ML')
AIC(model1) - AIC(model3) # better than the simple model
## [1] 51.66321
summary(model3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, PopAge, d = c(2, 1))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0002126 0.0021333 0.1 0.921
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,PopAge) 54.07 66.28 34.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/125
## R-sq.(adj) = 0.106 Deviance explained = 10.8%
## -ML = 4062.2 Scale est. = 0.088298 n = 19446
# EXERCISE 3. Visualize the results of the model
# ANSWER 3:
range(dialectdata$PopAge) # also z-transformed
## [1] -3.177204 3.347749
par(mfrow=c(1,3)) # show 3 plots next to each other
# in cond=list() the values you'd like to see can be specified
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05,
cond=list(PopAge=-2),view=c("Longitude","Latitude"),main="Effect of Geography for a young population")
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05,
cond=list(PopAge=0),view=c("Longitude","Latitude"),main="Effect of Geography for a medium-aged population")
vis.gam(model3, plot.type="contour", color="terrain", too.far=0.05,
cond=list(PopAge=+2),view=c("Longitude","Latitude"),main="Effect of Geography for an old population")
# EXERCISE 4. Create a model (don't forget to use bam!) to investigate the geographic variability in the influence
# of word frequency and population age. Name the model: model4.
# Hint, make sure to specify that Longitude and Latitude are in the same scale,
# but word frequency and population age each have their own scale
# To reduce memory requirements use the option: gc.level=2 for bam
# Depending on the speed of your laptop this might take a few minutes (on my desktop: 90 seconds)
# ANSWER 4:
system.time(model4 <- bam(LD ~ te(Longitude,Latitude,WordFreq,PopAge,d=c(2,1,1)), data=dialectdata, method='ML', gc.level=2))
## user system elapsed
## 109.85 0.45 110.99
summary(model4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, PopAge, d = c(2, 1, 1))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001043 0.002081 0.501 0.616
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq,PopAge) 131.3 164.8 21.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 624/625
## R-sq.(adj) = 0.152 Deviance explained = 15.8%
## -ML = 3634.1 Scale est. = 0.083726 n = 19446
# B9. We can investigate this pattern by making a more complex series of plots
par(mfrow=c(2,2)) # show 2x2 plots
# in cond=list() the values you'd like to see can be specified
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=-2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: LF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=+2,PopAge=-2),view=c("Longitude","Latitude"),main="Geography: HF + Young")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=-2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: LF + Old ")
vis.gam(model4, plot.type="contour", color="terrain", too.far=0.05,
cond=list(WordFreq=+2,PopAge=2),view=c("Longitude","Latitude"),main="Geography: HF + Old")
# B10. Now we can test if other variables should be added (not interacting with geography).
# (For efficiency purposes, we continue working with model2 in this lab session). We first test population count:
model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt, data=dialectdata, method='ML')
summary(model5)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001051 0.002080 0.505 0.613
## PopCnt -0.011330 0.002223 -5.097 3.48e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 88.11 103.7 33.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 125/126
## R-sq.(adj) = 0.151 Deviance explained = 15.5%
## -ML = 3607.5 Scale est. = 0.083796 n = 19446
compareML(model2,model5) # model5 has lower AIC and also significantly lower ML scores
## model2: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1))
##
## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model2 3618.478 8
## 2 model5 3607.461 9 11.018 1.000 2.676e-06 ***
##
## AIC difference: 20.81, model model5 has lower AIC.
# EXERCISE 5. What is the interpretation of the effect of population count?
# ANSWER 5: Locations with a higher population have a lower LD
# (i.e. pronunciation distance from standard Dutch)
# EXERCISE 6. Test some other potential variables for inclusion
# (make sure you have enough time for part C.)
# ANSWER 6: (example)
model5b = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1)) + PopCnt + IsNoun, data=dialectdata, method='ML')
summary(model5b) # IsNoun is highly significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## IsNoun
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.049810 0.002875 -17.327 < 2e-16 ***
## PopCnt -0.011515 0.002188 -5.264 1.43e-07 ***
## IsNoun 0.116347 0.004620 25.185 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 88.68 104.1 36.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 126/127
## R-sq.(adj) = 0.178 Deviance explained = 18.2%
## -ML = 3295.3 Scale est. = 0.081138 n = 19446
compareML(model5,model5b) # which also is observable when comparing the simpler model
## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
##
## model5b: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## IsNoun
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model5 3607.461 9
## 2 model5b 3295.305 10 312.155 1.000 < 2e-16 ***
##
## AIC difference: 625.10, model model5b has lower AIC.
##### Generalized Additive Mixed-Effects Regression
# C1. We first test if we need a random intercept for word (we use bam for speed)
# Note that method='fREML' does the same as method='REML', it's (generally) just faster.
model6 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt + s(Word,bs="re"), data=dialectdata, method='fREML')
# C2. We can see immediately if the random intercept is necessary via the summary:
summary(model6)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0004235 0.0269223 0.016 0.987
## PopCnt -0.0115779 0.0017745 -6.525 6.99e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 94.69 108.4 39.43 <2e-16 ***
## s(Word) 44.64 46.0 237.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 173/174
## R-sq.(adj) = 0.462 Deviance explained = 46.6%
## fREML = -657.73 Scale est. = 0.053123 n = 19446
# C3. We can also compare the REML scores:
model5 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt, data=dialectdata, method='fREML')
compareML(model5,model6)
## model5: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt
##
## model6: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re")
##
## Chi-square test of fREML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model5 3628.9102 9
## 2 model6 -657.7291 10 4286.639 1.000 < 2e-16 ***
##
## AIC difference: 8811.61, model model6 has lower AIC.
# C3. We can visualize the random intercepts:
par(mfrow=c(1,1)) # only one plot in a window
myCoefPlot.gam(model6,"Word","Intercept")
# C4. We include a random slope for population count per word and evaluate if it improves the model
model7 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re"), data=dialectdata, method='fREML')
summary(model7) # the random slope is significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re") + s(Word, PopCnt, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0004346 0.0269236 0.016 0.987
## PopCnt -0.0115838 0.0026939 -4.300 1.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 94.68 108.3 39.54 < 2e-16 ***
## s(Word) 44.64 46.0 238.84 < 2e-16 ***
## s(Word,PopCnt) 28.14 47.0 1.52 8.11e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 221/222
## R-sq.(adj) = 0.464 Deviance explained = 46.9%
## fREML = -671.29 Scale est. = 0.052931 n = 19446
compareML(model6,model7) # shows the same pattern, but I generally follow the summary to decide
## model6: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re")
##
## model7: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re") + s(Word, PopCnt, bs = "re")
##
## Chi-square test of fREML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model6 -657.7291 10
## 2 model7 -671.2868 11 13.558 1.000 1.917e-07 ***
##
## AIC difference: 40.92, model model7 has lower AIC.
# if a random slope/intercept needs to be retained
# C5. We can visualize the random slopes:
myCoefPlot.gam(model7,"Word","PopCnt")
# EXERCISE 7. Add a random intercept for Location and visualize the random intercepts
# ANSWER 7:
model8 = bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'),
data=dialectdata, method='fREML')
summary(model8)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location,
## bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0006155 0.0269784 0.023 0.98180
## PopCnt -0.0112542 0.0037041 -3.038 0.00238 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 91.68 104.7 18.131 < 2e-16 ***
## s(Word) 44.62 46.0 254.829 < 2e-16 ***
## s(Word,PopCnt) 28.82 47.0 1.619 2.21e-08 ***
## s(Location) 276.42 420.0 2.116 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 645/646
## R-sq.(adj) = 0.487 Deviance explained = 49.9%
## fREML = -866.12 Scale est. = 0.050647 n = 19446
myCoefPlot.gam(model8,"Location","Intercept")
# EXERCISE 8. Test if other variables need to be added as predictors to the model,
# and test them also as random slopes. Think carefully about if you
# need to test them as random slopes by word or location.
# ANSWER 8: (example)
summary(model8a <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'),
data=dialectdata, method='ML')) # method='ML' to compare fixed effects
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location,
## bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0006135 0.0265322 0.023 0.9816
## PopCnt -0.0112456 0.0036887 -3.049 0.0023 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 91.26 104.3 18.207 < 2e-16 ***
## s(Word) 44.60 46.0 254.777 < 2e-16 ***
## s(Word,PopCnt) 28.63 47.0 1.608 2.17e-08 ***
## s(Location) 275.96 420.0 2.112 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 645/646
## R-sq.(adj) = 0.487 Deviance explained = 49.9%
## -ML = -884.23 Scale est. = 0.050651 n = 19446
summary(model9a <- bam(LD ~ te(Longitude,Latitude,WordFreq,d=c(2,1))
+ PopCnt + IsNoun + s(Word,bs="re") + s(Word,PopCnt,bs="re") + s(Location,bs='re'),
data=dialectdata, method='ML')) # IsNoun is significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## IsNoun + s(Word, bs = "re") + s(Word, PopCnt, bs = "re") +
## s(Location, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.054252 0.035394 -1.533 0.1253
## PopCnt -0.011247 0.003689 -3.049 0.0023 **
## IsNoun 0.124536 0.056234 2.215 0.0268 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Longitude,Latitude,WordFreq) 91.27 104.4 18.209 < 2e-16 ***
## s(Word) 43.55 45.0 234.801 < 2e-16 ***
## s(Word,PopCnt) 28.63 47.0 1.608 2.01e-08 ***
## s(Location) 275.95 420.0 2.110 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 646/647
## R-sq.(adj) = 0.487 Deviance explained = 49.9%
## -ML = -886.58 Scale est. = 0.050651 n = 19446
compareML(model8a,model9a)
## model8a: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## s(Word, bs = "re") + s(Word, PopCnt, bs = "re") + s(Location,
## bs = "re")
##
## model9a: LD ~ te(Longitude, Latitude, WordFreq, d = c(2, 1)) + PopCnt +
## IsNoun + s(Word, bs = "re") + s(Word, PopCnt, bs = "re") +
## s(Location, bs = "re")
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 model8a -884.2340 12
## 2 model9a -886.5752 13 2.341 1.000 0.030 *
##
## AIC difference: -0.08, model model8a has lower AIC.
## Warning in compareML(model8a, model9a): Only small difference in ML...
# The AIC does not decrease (i.e. the model becomes worse), and the reduction in ML is only limited.
# I would therefore exclude IsNoun.
# FINAL REMARKS: note that the (geographical) results are different than reported in the paper
# (http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0023613). This is due to
# the relatively small dataset (containing less than 10% of the locations).
#
# The original dataset and all analyses steps can be found in the paper package:
# http://openscience.uni-leipzig.de/index.php/mr2/article/view/44
#
# For the Tuscan data presented during the lecture, these can be found here:
# http://openscience.uni-leipzig.de/index.php/mr2/article/view/41
Replication of the analysis
To run all analyses yourself, you first have to install pandoc, and then you can just copy the following lines to the most recent version of R. You need the packages ‘mgcv’, ‘car’ and ‘rmarkdown’. If these are not installed (the library commands will throw an error), you can uncomment (i.e. remove the hashtag) the first three lines to install them.
#install.packages('mgcv',repos='http://cran.us.r-project.org')
#install.packages('car',repos='http://cran.us.r-project.org')
#install.packages('rmarkdown',repos='http://cran.us.r-project.org')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture3/lab/answers/lab-including-answers.Rmd', 'lab-including-answers.Rmd')
library(rmarkdown)
render('lab-including-answers.Rmd') # generates html file with results
browseURL(paste('file://', file.path(getwd(),'lab-including-answers.html'), sep='')) # shows result