## During lecture 4 we have looked at the effect of Correctness and the effect of Age of
## Arrival. In this lab session we will look at another additional factor: Structure.
## Research question: Is there an effect of the distance between determiner and noun in the
## violation? We compare the DN (determiner-noun, e.g. der Garten/*das Garten) structure with
## the DAN (determiner-adjective-noun, e.g. das frische Gras/*der frische Gras) structure.
# A0. Start R 3.1.2
# Start R and make sure the libraries mgcv and car are installed
# Menu: Packages -> Install package(s)... -> Select mirror -> Select packages
# or:
# install.packages(c("mgcv","car"),repos='http://cran.r-project.org')
# A1. These commands should work when the packages are installed
library(mgcv)
library(car)
# version information
R.version.string
## [1] "R version 3.1.2 (2014-10-31)"
packageVersion('mgcv')
## [1] '1.8.4'
packageVersion('car')
## [1] '2.0.22'
# A2. Now download and load the following files containing the dataset and some functions:
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/dat.rda', 'dat.rda')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/plotting.R', 'plotting.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/compareML.R', 'compareML.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/bam.art.fit.R', 'bam.art.fit.R')
load('dat.rda') # This dataset is a subset of 15 subjects
source('plotting.R')
source('compareML.R')
# A3. Data investigation
head(dat)
## Time Subject Group Word TrialNr Roi Structure Correctness L1 AoArr LoR Age Edu SeqStart SubjectCor uV
## 44947 505 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 TRUE GL102.incor 8.94400
## 28121 515 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 15.56267
## 40909 525 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 21.30807
## 1294 535 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 13.31573
## 9446 545 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 19.10700
## 81324 555 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 17.95607
str(dat)
## 'data.frame': 102960 obs. of 16 variables:
## $ Time : num 505 515 525 535 545 555 565 575 585 595 ...
## $ Subject : Factor w/ 15 levels "GL102","GL103",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Group : Factor w/ 2 levels "GenEarly","GenLate": 1 1 1 1 1 1 1 1 1 1 ...
## $ Word : Factor w/ 96 levels "Auge","Bart",..: 56 56 56 56 56 56 56 56 56 56 ...
## $ TrialNr : num 2 2 2 2 2 2 2 2 2 2 ...
## $ Roi : Factor w/ 1 level "post.mid": 1 1 1 1 1 1 1 1 1 1 ...
## $ Structure : Factor w/ 2 levels "DAN","DN": 2 2 2 2 2 2 2 2 2 2 ...
## $ Correctness: Factor w/ 2 levels "cor","incor": 2 2 2 2 2 2 2 2 2 2 ...
## $ L1 : Factor w/ 2 levels "PL","RU": 1 1 1 1 1 1 1 1 1 1 ...
## $ AoArr : int 8 8 8 8 8 8 8 8 8 8 ...
## $ LoR : int 24 24 24 24 24 24 24 24 24 24 ...
## $ Age : int 32 32 32 32 32 32 32 32 32 32 ...
## $ Edu : int 3 3 3 3 3 3 3 3 3 3 ...
## $ SeqStart : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
## $ SubjectCor : Factor w/ 30 levels "GL102.cor","GL103.cor",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ uV : num 8.94 15.56 21.31 13.32 19.11 ...
summary(dat)
## Time Subject Group Word TrialNr Roi Structure Correctness L1 AoArr
## Min. : 505.0 GL153 : 7680 GenEarly:60160 Auge : 1200 Min. : 1.00 post.mid:102960 DAN:51600 cor :51360 PL:12400 Min. : 7.00
## 1st Qu.: 702.5 GL154 : 7680 GenLate :42800 Bart : 1200 1st Qu.: 34.00 DN :51360 incor:51600 RU:90560 1st Qu.:10.00
## Median : 900.0 GL205 : 7680 Beginn : 1200 Median : 68.00 Median :13.00
## Mean : 900.0 GL207 : 7680 Bein : 1200 Mean : 68.76 Mean :15.41
## 3rd Qu.:1097.5 GL323 : 7680 Beispiel: 1200 3rd Qu.:102.00 3rd Qu.:21.00
## Max. :1295.0 GL335 : 7680 Berg : 1200 Max. :144.00 Max. :26.00
## (Other):56880 (Other) :95760
## LoR Age Edu SeqStart SubjectCor uV
## Min. : 7.00 Min. :20.00 Min. :3.000 Mode :logical GL110.cor: 3840 Min. :-132.3745
## 1st Qu.:10.00 1st Qu.:23.00 1st Qu.:3.000 FALSE:101673 GL153.cor: 3840 1st Qu.: -8.0165
## Median :13.00 Median :28.00 Median :3.000 TRUE :1287 GL154.cor: 3840 Median : 0.4563
## Mean :13.17 Mean :28.58 Mean :3.981 NA's :0 GL205.cor: 3840 Mean : 0.5424
## 3rd Qu.:16.00 3rd Qu.:32.00 3rd Qu.:5.000 GL207.cor: 3840 3rd Qu.: 9.0951
## Max. :24.00 Max. :44.00 Max. :5.000 GL323.cor: 3840 Max. : 135.9779
## (Other) :79920
dim(dat)
## [1] 102960 16
# A4. Sort the data
# The data needs to be sorted to be able to check for autocorrelation problems later
dat = dat[order(dat$Subject, dat$TrialNr, dat$Roi, dat$Time, decreasing=F),]
head(dat)
## Time Subject Group Word TrialNr Roi Structure Correctness L1 AoArr LoR Age Edu SeqStart SubjectCor uV
## 44947 505 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 TRUE GL102.incor 8.94400
## 28121 515 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 15.56267
## 40909 525 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 21.30807
## 1294 535 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 13.31573
## 9446 545 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 19.10700
## 81324 555 GL102 GenEarly Wald 2 post.mid DN incor PL 8 24 32 3 FALSE GL102.incor 17.95607
# A5. Our first model for the general effect of time (we use bam as the dataset is large):
m0 = bam(uV ~ s(Time), data=dat, gc.level=2, method='ML')
# A6. Look at the results of the model and plot the general effect of time
# Note that this is not very interesting since we are not differentiating between correct vs.
# incorrect or DN vs. DAN
summary(m0)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.54572 0.05228 10.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 3.217 3.996 28.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00111 Deviance explained = 0.114%
## -ML = 4.3641e+05 Scale est. = 281.25 n = 102960
par(mfrow=c(1,2))
plot(m0,rug=F,shade=T,ylim=c(2,-2), main='m0'); abline(h=0)
# A7. Taking into account individual variation via factor smooths
m0b = bam(uV ~ s(Time) + s(Time,Subject,bs='fs',m=1), data=dat, gc.level=2, method='ML')
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
# (N.B. You can ignore the warning messages about repeated smooths in this lab)
summary(m0b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, Subject, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5204 0.6009 0.866 0.386
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 2.915 3.47 8.164 7.91e-06 ***
## s(Time,Subject) 49.468 134.00 16.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0216 Deviance explained = 2.21%
## -ML = 4.354e+05 Scale est. = 275.49 n = 102960
plot(m0b,select=1,rug=F,shade=T,ylim=c(2,-2), main='m0b'); abline(h=0) # wider confidence bands
# B1. To see whether there is an effect of correctness, we make a model where we look at correct
# versus incorrect. We also include factor smooths for subject-correctness combination.
dat$SubjectCor = interaction(dat$Subject,dat$Correctness)
m1 = bam(uV ~ s(Time,by=Correctness) + Correctness + s(Time,SubjectCor, bs='fs', m=1), data=dat,
gc.level=2, method='ML')
summary(m1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = Correctness) + Correctness + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.401 0.752 -1.863 0.062440 .
## Correctnessincor 3.898 1.070 3.642 0.000271 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):Correctnesscor 1.058 1.082 0.328 0.584
## s(Time):Correctnessincor 3.563 4.213 13.600 2.22e-11 ***
## s(Time,SubjectCor) 115.948 268.000 14.644 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0498 Deviance explained = 5.09%
## -ML = 4.3398e+05 Scale est. = 267.54 n = 102960
# B2. Plot factor smooths per subject-correctness combination
par(mfrow=c(1,1))
plot(m1,select=3)
# B3. We need to check whether this model suffers from autocorrelation:
m1acf = acf(resid(m1)) # plot autocorrelation
rhoval = as.vector(m1acf[1]$acf)
rhoval
## [1] 0.9269407
# Note that we use this value throughout this lab session, but in principle the lowest rho
# value which is still adequate is best. Very high rho values are somewhat problematic:
# i.e. if you have a rho value of 0.99, your model will only try to predict the 1% of the
# variation not explained by the previous observation. So I follow the following strategy:
# - start from the acf value at lag 1 (as above)
# - then find the optimal acf value (i.e. with the lowest (RE)ML score)
# - then decrease the acf value with as much as possible, without
# getting a significantly worse model (using compareML)
# Note that for each distinct model, you need to use the rho values specific
# (using the method above) for that model. I.e. when comparing two models
# differing in fixed/random effects, you might have two different rho values.
# B4. Autocorrelation correction is necessary, so we include the rho parameter in the model:
m1b = bam(uV ~ s(Time,by=Correctness) + Correctness + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
# B5. Let's see if adding the rho parameter significantly improves the model:
compareML(m1, m1b) # m1b preferred
## m1: uV ~ s(Time, by = Correctness) + Correctness + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## m1b: uV ~ s(Time, by = Correctness) + Correctness + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 m1 433979.4 8
## 2 m1b 325595.7 8 108383.780 0.000 < 2e-16 ***
## Warning in compareML(m1, m1b): AIC is not reliable, because an AR1 model is included (rho1 = 0.000000, rho2 = 0.926941).
# B6. In the summary of this new model, we see that the non-linear pattern for correct does not
# significantly deviate from zero, but the (non-linear: edf > 1) pattern for incorrect does:
summary(m1b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = Correctness) + Correctness + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4732 0.7761 -1.898 0.057677 .
## Correctnessincor 3.9165 1.1007 3.558 0.000374 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):Correctnesscor 1.003 1.006 0.001 0.973
## s(Time):Correctnessincor 2.842 3.714 7.770 7.98e-06 ***
## s(Time,SubjectCor) 24.484 268.000 0.569 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0443 Deviance explained = 4.46%
## -ML = 3.256e+05 Scale est. = 226.39 n = 102960
# B7. Let's plot the two patterns, and the difference between them:
x11() # opens new plot window, USE: quartz() in Mac OS X
plotSmooths(m1b,"Time","Correctness",colors=c('green','red')) # visualizing both patterns, custom function
x11()
plotDiff(m1b,"Time","Correctness") # visualizing the difference, custom function
# B8. We still have to test whether the difference between correct and incorrect is significant.
# For that we create a binary variable and test whether the one condition (incorrect) is
# significantly different from the reference level (correct).
dat$IsIncorrect = (dat$Correctness == 'incor')*1 # create a binary variable (0-1 coding).
m1c = bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
summary(m1c) # highly significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = IsIncorrect) + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4739 0.7837 -1.881 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.003 1.006 0.001 0.974
## s(Time):IsIncorrect 3.715 4.547 6.916 6.1e-06 ***
## s(Time,SubjectCor) 24.588 268.000 0.571 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0443 Deviance explained = 4.46%
## -ML = 3.256e+05 Scale est. = 226.39 n = 102960
# B9. We can also separate intercept difference and the non-linear difference via an ordered
# factor
dat$CorrectnessO = as.ordered(dat$Correctness)
contrasts(dat$CorrectnessO) = 'contr.treatment' # contrast treatment
m1d = bam(uV ~ s(Time) + s(Time,by=CorrectnessO) + CorrectnessO + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
summary(m1d) # both intercept and non-linear difference are highly significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = CorrectnessO) + CorrectnessO + s(Time,
## SubjectCor, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4730 0.7761 -1.898 0.057695 .
## CorrectnessOincor 3.9164 1.1007 3.558 0.000374 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.003 1.006 0.001 0.973738
## s(Time):CorrectnessOincor 2.841 3.714 5.705 0.000255 ***
## s(Time,SubjectCor) 24.496 268.000 0.569 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0443 Deviance explained = 4.46%
## -ML = 3.256e+05 Scale est. = 226.39 n = 102960
# EXERCISE 1. plot the factor smooths of model m1d. What has happened to them?
# ANSWER 1:
plot(m1d, select=3)
# The wiggly curves have (almost) lost their wigglyness, so they are essentially now factor smooths.
# Note that also in the fourth lecture, adding rho resulted in less wigglyness for the random
# wiggly curves (the number of edf decreased for the factor smooths). This does not
# happen always, however.
# B10. We can plot the difference smooths of m1c and m1d, and compare them to the difference
# we plotted before for model m1b using the plotDiff() function. Note that the difference curve
# for m1b will be similar to the difference smooth of m1c (slight differences may occur, as they
# are somewhat different models), but not m1d, as the latter does not show the (constant)
# intercept difference (so the smooth is centered)
par(mfrow=c(1,3))
plotDiff(m1b,"Time","Correctness") # m1b difference plotted with custom function
# m1c plotted with default plotting function of mgcv
plot(m1c,shade=T,rug=F,ylim=c(10,-1),select=2,main='Difference m1c', ylab='uV'); abline(h = 0)
# m1d plotted with default plotting function of mgcv
plot(m1d,shade=T,rug=F,ylim=c(2,-4),select=2,main='Smooth difference m1d', ylab='uV'); abline(h = 0)
# the smaller confidence bands are caused by not having to take the uncertainty
# about the intercept into account
# EXERCISE 2. What do you conclude about the effect of Correctness?
# ANSWER 2:
# The incorrect condition has significantly higher amplitudes than the
# correct condition. Whereas the correct condition shows a linear pattern,
# neither decreasing nor increasing over time, the incorrect condition shows
# a non-linear, U-shaped pattern, peaking around 1000 ms after stimulus onset.
# C1. Now let's investigate the effect of distance between determiner and noun.
# First, we create a model where we look at the effect of structure (DN versus DAN) in general.
m2 = bam(uV ~ s(Time,by=Structure) + Structure + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = Structure) + Structure + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8932 0.6796 1.314 0.1887
## StructureDN -0.9107 0.4275 -2.130 0.0332 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):StructureDAN 2.366 3.083 2.544 0.05278 .
## s(Time):StructureDN 1.031 1.061 8.727 0.00275 **
## s(Time,SubjectCor) 25.816 269.000 0.784 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0438 Deviance explained = 4.4%
## -ML = 3.256e+05 Scale est. = 226.42 n = 102960
x11()
plotSmooths(m2,"Time","Structure") # visualizing both patterns
x11()
plotDiff(m2,"Time","Structure") # visualizing the differences (comparing DN to DAN)
# C2. We would like to know whether the P600 effect between DN and DAN differs.
# Remember that the size of the P600 is determined by the DIFFERENCE between incorrect and correct.
# We want to see whether that difference varies between the levels of Structure.
# So let's make a model where we look at the interaction between Correctness and Structure:
dat$CorStruct = interaction(dat$Correctness,dat$Structure) # create a new variable with 4 levels
levels(dat$CorStruct) # show levels
## [1] "cor.DAN" "incor.DAN" "cor.DN" "incor.DN"
m3 = bam(uV ~ s(Time,by=CorStruct) + CorStruct + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = CorStruct) + CorStruct + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4642 0.8276 -0.561 0.574844
## CorStructincor.DAN 2.7920 1.1772 2.372 0.017708 *
## CorStructcor.DN -2.0302 0.5944 -3.416 0.000636 ***
## CorStructincor.DN 2.9043 1.1702 2.482 0.013072 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):CorStructcor.DAN 1.003 1.007 0.029 0.865332
## s(Time):CorStructincor.DAN 3.031 3.964 4.837 0.000719 ***
## s(Time):CorStructcor.DN 1.003 1.007 0.015 0.903413
## s(Time):CorStructincor.DN 1.007 1.013 16.637 4.29e-05 ***
## s(Time,SubjectCor) 24.449 268.000 0.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0467 Deviance explained = 4.7%
## -ML = 3.2559e+05 Scale est. = 226.36 n = 102960
x11()
plotSmooths(m3,"Time","CorStruct")
# C3. Again, we want to know about the significance of the difference curves:
dat$IsDN = (dat$Structure == 'DN')*1 # new binary variable
dat$IsIncorrectDN = dat$IsIncorrect * dat$IsDN # new binary variable
m4 = bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time, by=IsDN) + s(Time, by=IsIncorrectDN) +
s(Time,SubjectCor,bs='fs',m=1), data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
summary(m4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = IsIncorrect) + s(Time, by = IsDN) +
## s(Time, by = IsIncorrectDN) + s(Time, SubjectCor, bs = "fs",
## m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4649 0.8348 -0.557 0.578
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.004 1.007 0.028 0.86794
## s(Time):IsIncorrect 3.715 4.546 3.963 0.00218 **
## s(Time):IsDN 2.001 2.003 5.844 0.00289 **
## s(Time):IsIncorrectDN 2.001 2.003 5.648 0.00351 **
## s(Time,SubjectCor) 24.553 268.000 0.567 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0464 Deviance explained = 4.67%
## -ML = 3.2559e+05 Scale est. = 226.37 n = 102960
# C4. Now, try to interpret what you see in the following plots:
x11()
par(mfrow=c(2,2))
plotSmooths(m3,"Time","CorStruct", useLegend=F)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=2, ylab='uV', main='m4: IsIncorrect'); abline(h=0)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=3, ylab='uV', main='m4: IsDN'); abline(h=0)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=4, ylab='uV', main='m4: IsIncorrectDN'); abline(h=0)
# Note that cor.DAN is the reference level (here all the binary variables are set to 0).
# So the second plot is what you need to do to get from cor.DAN to incor.DAN.
# The third plot shows what you need to do to get from cor.DAN to cor.DN.
# To get to incor.DN, you need to add the second plot (because IsIncorrect == 1),
# the third plot (because IsDN == 1), but also the fourth plot (because IsIncorrectDN == 1).
# N.B. Ideally you also want to add factor smooths per item ('Word' in this case).
# These models take a lot more time to run though, so we excluded them from the lab session
# (but see home exercise below!).
# EXERCISE 3. What do you conclude about the effect of Structure and Correctness?
# ANSWER 3:
# The main effects of both Structure and Correctness are significant, as
# well as the interaction between the two factors. Generally, amplitudes
# for DN are lower than for DAN, and amplitudes for correct are lower than
# for incorrect. The difference between the correct and incorrect condition
# increases over time for the DN condition (linear effect), whereas this
# difference in the DAN condition shows a non-linear, U-shaped, pattern.
# D1. We saw that there are some differences between the DN and the DAN conditions.
# Now let's see whether these differences vary depending on the AoArr of the subject.
# (Note that we only have a subset of the data here (n=15), so there are not so many different
# AoArrs to base the model on.)
m5 = bam(uV ~ te(Time,AoArr,by=CorStruct) + CorStruct + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m5)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ te(Time, AoArr, by = CorStruct) + CorStruct + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4651 0.8149 -0.571 0.568215
## CorStructincor.DAN 2.7785 1.1593 2.397 0.016541 *
## CorStructcor.DN -2.0149 0.5947 -3.388 0.000704 ***
## CorStructincor.DN 2.9196 1.1526 2.533 0.011307 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Time,AoArr):CorStructcor.DAN 3.044 3.088 0.993 0.395066
## te(Time,AoArr):CorStructincor.DAN 5.948 7.066 2.938 0.004355 **
## te(Time,AoArr):CorStructcor.DN 3.039 3.077 0.270 0.851787
## te(Time,AoArr):CorStructincor.DN 3.105 3.206 5.432 0.000791 ***
## s(Time,SubjectCor) 22.536 266.000 0.553 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0474 Deviance explained = 4.77%
## -ML = 3.2558e+05 Scale est. = 226.36 n = 102960
# D2. We can check whether we need to increase k. We can set k to a single value, or a list of
# multiple values. If you select a single value for te(), every dimension is set to
# that value, i.e. for a two-dimensional tensor k=10 is equal to k=c(10,10)
# (an s() only uses a single k-value, even for multiple dimensions)
m5b = bam(uV ~ te(Time,AoArr,by=CorStruct,k=c(10,10)) + CorStruct + s(Time,SubjectCor,bs='fs',m=1),
data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m5b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ te(Time, AoArr, by = CorStruct, k = c(10, 10)) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4653 0.8150 -0.571 0.568060
## CorStructincor.DAN 2.7617 1.1590 2.383 0.017186 *
## CorStructcor.DN -2.0140 0.5951 -3.384 0.000714 ***
## CorStructincor.DN 2.9232 1.1528 2.536 0.011218 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Time,AoArr):CorStructcor.DAN 3.112 3.222 0.950 0.41635
## te(Time,AoArr):CorStructincor.DAN 6.419 8.091 2.511 0.00978 **
## te(Time,AoArr):CorStructcor.DN 3.079 3.157 0.263 0.86129
## te(Time,AoArr):CorStructincor.DN 3.209 3.410 5.040 0.00113 **
## s(Time,SubjectCor) 22.168 266.000 0.552 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0473 Deviance explained = 4.77%
## -ML = 3.2558e+05 Scale est. = 226.36 n = 102960
compareML(m5,m5b) # the additional complexity is not needed, the ML score of m5 is even lower than that of m5b
## m5: uV ~ te(Time, AoArr, by = CorStruct) + CorStruct + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## m5b: uV ~ te(Time, AoArr, by = CorStruct, k = c(10, 10)) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 m5b 325582.9 26
## 2 m5 325582.5 26 0.397 0.000 < 2e-16 ***
## Warning in compareML(m5, m5b): AIC is not reliable, because an AR1 model is included (rho1 = 0.926941, rho2 = 0.926941).
## Warning in compareML(m5, m5b): Only small difference in ML...
# D3. Plot the surfaces (partial effects, so these exclude the intercept differences)
# zlim is used to put every plot on the same scale
par(mfrow=c(2,2))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=1, color='topo', main='m5: cor.DAN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=2, color='topo', main='m5: incor.DAN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=3, color='topo', main='m5: cor.DN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=4, color='topo', main='m5: incor.DN', zlim=c(-4,6))
# D4. The partial effects are not so informative, as the intercept differences (in the parametric part of
# the summary) are not included. We can include these in vis.gam (shows complete effects) by using the cond-parameter.
par(mfrow=c(2,2))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='cor.DAN'), color='topo', main='m5: cor.DAN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='incor.DAN'), color='topo', main='m5: incor.DAN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='cor.DN'), color='topo', main='m5: cor.DN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='incor.DN'), color='topo', main='m5: incor.DN', zlim=c(-8,6))
# D5. But actually, this is also not what we need, we only need to look at the
# difference between incor vs. cor for both cases
par(mfrow=c(1,2))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DAN','incor.DAN'))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DN','incor.DN'))
# D6. Furthermore, we want to test if these difference surfaces are significant.
m6 = bam(uV ~ te(Time,AoArr) + te(Time,AoArr,by=IsIncorrect) + te(Time,AoArr,by=IsDN) +
te(Time,AoArr,by=IsIncorrectDN) + s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m6) # they are significant
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ te(Time, AoArr) + te(Time, AoArr, by = IsIncorrect) + te(Time,
## AoArr, by = IsDN) + te(Time, AoArr, by = IsIncorrectDN) +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4684 0.8230 -0.569 0.569
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(Time,AoArr) 1.029 1.055 0.589 0.4475
## te(Time,AoArr):IsIncorrect 6.736 7.822 2.415 0.0142 *
## te(Time,AoArr):IsDN 4.026 4.051 3.048 0.0156 *
## te(Time,AoArr):IsIncorrectDN 4.350 4.625 2.969 0.0138 *
## s(Time,SubjectCor) 23.196 267.000 0.554 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0468 Deviance explained = 4.72%
## -ML = 3.2559e+05 Scale est. = 226.37 n = 102960
# D7. We can now plot the difference surface between DAN correct and incorrect directly
# and compare this to the previous difference surface, as the models are *different*
# these surfaces may differ (somewhat).
par(mfrow=c(1,2))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DAN','incor.DAN'),main='m5: Calculated diff. incor.DAN vs. cor.DAN')
pvis.gam(m6, plot.type='contour', view=c('Time','AoArr'), select=2, color='topo', main='m6: Diff. surface incor.DAN vs. cor.DAN')
# D8. We can test whether the 2-dimensional interaction is necessary via decomposition:
m7 = bam(uV ~ s(Time,by=CorStruct) + s(AoArr,by=CorStruct) + ti(Time,AoArr,by=CorStruct,k=10) +
CorStruct + s(Time,SubjectCor,bs='fs',m=1), data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m7)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + ti(Time,
## AoArr, by = CorStruct, k = 10) + CorStruct + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4444 0.7951 -0.559 0.576223
## CorStructincor.DAN 2.7096 1.1319 2.394 0.016670 *
## CorStructcor.DN -2.0315 0.5945 -3.417 0.000632 ***
## CorStructincor.DN 2.7908 1.1250 2.481 0.013116 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):CorStructcor.DAN 1.008 1.017 0.036 0.8541
## s(Time):CorStructincor.DAN 3.032 3.966 4.850 0.0007 ***
## s(Time):CorStructcor.DN 1.009 1.018 0.016 0.9030
## s(Time):CorStructincor.DN 1.015 1.029 16.365 4.64e-05 ***
## s(AoArr):CorStructcor.DAN 1.005 1.009 0.358 0.5516
## s(AoArr):CorStructincor.DAN 1.004 1.005 0.095 0.7593
## s(AoArr):CorStructcor.DN 1.004 1.008 0.805 0.3698
## s(AoArr):CorStructincor.DN 5.594 6.554 2.570 0.0144 *
## ti(Time,AoArr):CorStructcor.DAN 1.119 1.235 2.024 0.1451
## ti(Time,AoArr):CorStructincor.DAN 1.070 1.139 1.338 0.2443
## ti(Time,AoArr):CorStructcor.DN 1.085 1.168 0.028 0.9002
## ti(Time,AoArr):CorStructincor.DN 1.100 1.198 0.006 0.9624
## s(Time,SubjectCor) 21.353 266.000 0.481 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0503 Deviance explained = 5.07%
## -ML = 3.2558e+05 Scale est. = 226.33 n = 102960
# EXERCISE 4. Is the interaction necessary? Test this by comparing with a simpler model (save as m8).
# ANSWER 4:
# It seems not, as the ti's are n.s., but we should test this by excluding the interaction:
m8 = bam(uV ~ s(Time,by=CorStruct) + s(AoArr,by=CorStruct) +
CorStruct + s(Time,SubjectCor,bs='fs',m=1), data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
compareML(m7,m8) # m7 is not significantly better than m8
## m7: uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + ti(Time,
## AoArr, by = CorStruct, k = 10) + CorStruct + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## m8: uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 m8 325584.9 22
## 2 m7 325583.7 34 1.231 12.000 0.998
## Warning in compareML(m7, m8): AIC is not reliable, because an AR1 model is included (rho1 = 0.926941, rho2 = 0.926941).
## Warning in compareML(m7, m8): Only small difference in ML...
summary(m8) # it appears there is also no effect of AoArr
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4472 0.8147 -0.549 0.583055
## CorStructincor.DAN 2.7847 1.1592 2.402 0.016297 *
## CorStructcor.DN -2.0309 0.5944 -3.417 0.000634 ***
## CorStructincor.DN 2.8953 1.1521 2.513 0.011971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):CorStructcor.DAN 1.003 1.005 0.029 0.865025
## s(Time):CorStructincor.DAN 3.031 3.965 4.837 0.000717 ***
## s(Time):CorStructcor.DN 1.003 1.005 0.015 0.904782
## s(Time):CorStructincor.DN 1.006 1.011 16.688 4.21e-05 ***
## s(AoArr):CorStructcor.DAN 1.002 1.005 0.347 0.556957
## s(AoArr):CorStructincor.DAN 1.001 1.002 0.110 0.740353
## s(AoArr):CorStructcor.DN 1.002 1.004 0.770 0.380336
## s(AoArr):CorStructincor.DN 1.001 1.002 1.051 0.305221
## s(Time,SubjectCor) 22.558 266.000 0.553 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0468 Deviance explained = 4.71%
## -ML = 3.2558e+05 Scale est. = 226.36 n = 102960
# EXERCISE 5. Plot the difference patterns across age of arrival for both structures for model m8.
# ANSWER 5:
plotDiff(m8,'AoArr','CorStruct',c('cor.DAN','incor.DAN'))
plotDiff(m8,'AoArr','CorStruct',c('cor.DN','incor.DN'))
# this also suggests there is no significant difference for AoArr
# EXERCISE 6. Test if the difference patterns across time and AoArr are significant by using
# an ordered factor: CorStructO
# Hint: you need to make two models (m9a and m9b), one where cor.DAN is the
# reference level, and another where cor.DN is.
# After having run the first model, use:
# dat$CorStruct = relevel(dat$CorStruct,'cor.DN')
# to relevel and then create the ordered factor.
# ANSWER 6
dat$CorStructO = as.ordered(dat$CorStruct)
contrasts(dat$CorStructO) = 'contr.treatment'
m9a = bam(uV ~ s(Time) + s(Time,by=CorStructO) + s(AoArr) + s(AoArr,by=CorStructO) + CorStructO
+ s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
summary(m9a) # shows difference between cor.DAN vs. rest, only different across Time, not AoArr
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = CorStructO) + s(AoArr) + s(AoArr,
## by = CorStructO) + CorStructO + s(Time, SubjectCor, bs = "fs",
## m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4470 0.8147 -0.549 0.583243
## CorStructOincor.DAN 2.7847 1.1592 2.402 0.016292 *
## CorStructOcor.DN -2.0309 0.5943 -3.417 0.000633 ***
## CorStructOincor.DN 2.8953 1.1521 2.513 0.011966 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.004 1.007 0.030 0.86295
## s(Time):CorStructOincor.DAN 3.031 3.965 4.617 0.00106 **
## s(Time):CorStructOcor.DN 1.002 1.005 0.044 0.83589
## s(Time):CorStructOincor.DN 1.004 1.009 9.079 0.00254 **
## s(AoArr) 1.002 1.002 0.351 0.55403
## s(AoArr):CorStructOincor.DAN 1.001 1.003 0.034 0.85437
## s(AoArr):CorStructOcor.DN 1.002 1.004 0.152 0.69734
## s(AoArr):CorStructOincor.DN 1.002 1.004 0.093 0.76206
## s(Time,SubjectCor) 22.431 266.000 0.553 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0468 Deviance explained = 4.71%
## -ML = 3.2558e+05 Scale est. = 226.36 n = 102960
dat$CorStruct = relevel(dat$CorStruct,'cor.DN') # releveling to show difference wrt cor.DN
dat$CorStructO = as.ordered(dat$CorStruct)
contrasts(dat$CorStructO) = 'contr.treatment'
system.time(m9b <- bam(uV ~ s(Time) + s(Time,by=CorStructO) + s(AoArr) + s(AoArr,by=CorStructO) + CorStructO
+ s(Time,SubjectCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart))
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
## user system elapsed
## 81.43 3.53 85.03
# multiple comparisons (two), so use lower p-value threshold (e.g., 0.025)
summary(m9b) # shows difference between cor.DN vs. rest, only different across Time, not AoArr
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = CorStructO) + s(AoArr) + s(AoArr,
## by = CorStructO) + CorStructO + s(Time, SubjectCor, bs = "fs",
## m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.4779 0.8162 -3.036 0.002398 **
## CorStructOcor.DAN 2.0310 0.5944 3.417 0.000633 ***
## CorStructOincor.DAN 4.8150 1.1602 4.150 3.33e-05 ***
## CorStructOincor.DN 4.9262 1.1531 4.272 1.94e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.005 1.011 0.013 0.91020
## s(Time):CorStructOcor.DAN 1.004 1.007 0.043 0.83756
## s(Time):CorStructOincor.DAN 3.030 3.964 4.506 0.00130 **
## s(Time):CorStructOincor.DN 1.007 1.014 7.757 0.00522 **
## s(AoArr) 1.001 1.001 0.774 0.37915
## s(AoArr):CorStructOcor.DAN 1.003 1.005 0.151 0.69903
## s(AoArr):CorStructOincor.DAN 1.002 1.003 0.149 0.70013
## s(AoArr):CorStructOincor.DN 1.002 1.004 0.010 0.92080
## s(Time,SubjectCor) 22.736 266.000 0.553 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0468 Deviance explained = 4.71%
## -ML = 3.2558e+05 Scale est. = 226.36 n = 102960
# To conclude: it appears there is only a Time effect, but not an AoArr effect (in this SUBSET)
# D9. Model criticism
# The residuals of m8 unfortunately look familiar (compared to the lecture)...
par(mfrow=c(1,2))
qqplot.rho(m8)
hist.rho(m8)
# You can do these exercises at home, since running the model takes quite long.
# EXERCISE 7. Add factor smooths per word-correctness combination to m8
# and rerun the model (name it m10).
# Don't forget to make the new factorial predictor combining Word and Correctness.
# ANSWER 7:
dat$WordCor = interaction(dat$Word,dat$Correctness)
system.time(m10 <- bam(uV ~ s(Time,by=CorStruct) + s(AoArr,by=CorStruct) +
CorStruct + s(Time,SubjectCor,bs='fs',m=1) + s(Time,WordCor,bs='fs',m=1), data=dat,
gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart))
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
## user system elapsed
## 15259.24 108.35 15372.45
summary(m10)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1) + s(Time, WordCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.639 1.097 -2.406 0.01612 *
## CorStructcor.DAN 2.162 1.204 1.796 0.07247 .
## CorStructincor.DAN 5.008 1.558 3.214 0.00131 **
## CorStructincor.DN 4.762 1.554 3.064 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time):CorStructcor.DN 1.002 1.004 0.015 0.902527
## s(Time):CorStructcor.DAN 1.002 1.004 0.029 0.866162
## s(Time):CorStructincor.DAN 3.038 3.974 4.853 0.000686 ***
## s(Time):CorStructincor.DN 1.004 1.008 16.869 3.87e-05 ***
## s(AoArr):CorStructcor.DN 1.002 1.004 0.653 0.419376
## s(AoArr):CorStructcor.DAN 1.002 1.003 0.240 0.625142
## s(AoArr):CorStructincor.DAN 1.001 1.002 0.154 0.695259
## s(AoArr):CorStructincor.DN 1.001 1.003 1.350 0.245125
## s(Time,SubjectCor) 21.602 266.000 0.515 < 2e-16 ***
## s(Time,WordCor) 133.655 1724.000 0.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.139 Deviance explained = 14%
## -ML = 3.2548e+05 Scale est. = 225.36 n = 102960
# EXERCISE 8. Assess which model is best, m8 or m10.
# ANSWER 8:
compareML(m8,m10) # m10 is better
## m8: uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1)
##
## m10: uV ~ s(Time, by = CorStruct) + s(AoArr, by = CorStruct) + CorStruct +
## s(Time, SubjectCor, bs = "fs", m = 1) + s(Time, WordCor,
## bs = "fs", m = 1)
##
## Chi-square test of ML scores
## -----
## Model Score Edf Chisq Df p.value Sig.
## 1 m8 325584.9 22
## 2 m10 325480.2 24 104.701 2.000 < 2e-16 ***
## Warning in compareML(m8, m10): AIC is not reliable, because an AR1 model is included (rho1 = 0.926941, rho2 = 0.926941).
# CAVEAT: This approach is still in development, i.e. there may be errors in the approach outlined here
# E1. The scaled-t distribution is implemented in the gam function (family="scat"), however
# in gam it is per default not possible to account for autocorrelation. By using a
# custom scat function (stored in bam.art.fit.R) created by Natalya Pya (University of Bath),
# it is possible. For example, to fit an intercept-only model while taking into account rho:
source('bam.art.fit.R')
g0 <- gam(uV ~ 1, data=dat, method='REML', family=scat(rho=rhoval,AR.start=dat$SeqStart))
qqp(resid(g0)) # qqplot.rho is not necessary here as the residuals are already corrected
# E2. Unfortunately, gam is too slow to fit the complex models necessary for
# EEG data including the subject- and word-specific factor smooths, and we therefore
# turn to bam. However, bam does not (yet) support a scaled-t distribution.
# We therefore again use customized functions created by Nataly Pya ('bam.art.fit.R').
# The current drawback of these adapted functions is that they only work with the fREML method.
# In addition, the scale parameters of the t-distributions (theta) need to be pre-specified when
# running the function. This implies that we need to use the function gam for this
# (since this function estimates the scale parameters). Simulations suggest that we could run
# gam on a subset of the original data (e.g., 50% or perhaps less) to get a reasonable approximation
# of the scaled-t parameters. Unfortunately, this still is likely too computationally expensive
# in the presence of a complex random-effects structure. We therefore use another approximation:
#
# 1. We first fit a very simple gam model (g0, above) and extract the scaled-t parameters from that
# 2. We fit a more complex model (including the full fixed effects structure, but no random effects)
# and extract the scaled-t parameters of that model
# 3. We compare the two sets of scaled-t parameters
# - if the difference is relatively small (a few tenths), we use the scaled-t parameters
# of the more complex model in step 4.
# - if the difference is large, we unfortunately need to determine the scaled-t parameters
# by running the complete model with gam for a subset of the data (e.g., 10% of the trials)
# [in that case it makes sense to do this 2+ times to assess if the variability is not too great]
# 4. We fit the bam model using the selected scaled-t parameters
theta0 <- g0$family$getTheta(TRUE) # get scaled-t parameters of simple model
# model without random effects (on the basis of m1c, for simplicity)
g1 <- gam(uV ~ s(Time) + s(Time,by=IsIncorrect), data=dat, method='REML', family=scat(rho=rhoval,AR.start=dat$SeqStart))
theta1 <- g1$family$getTheta(TRUE) # get scaled-t parameters of simple model
# compare thetas (small difference: OK)
theta1 - theta0
## [1] -0.005344687 -0.002927001
# using bam with theta and rho for the full model
system.time(m10 <- bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1),
data=dat, method='fREML', gc.level=2, AR.start=SeqStart,
family=art(theta=theta1,rho=rhoval)))
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
## user system elapsed
## 574.79 50.09 625.19
# comparison with the duration for a normal Gaussian model
system.time(m10g <- bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1),
data=dat, method='fREML', gc.level=2, AR.start=SeqStart, rho=rhoval))
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-d smooths of same variable.
## user system elapsed
## 26.43 2.74 29.22
# clear difference in residuals
par(mfrow=c(1,2))
qqplot.rho(m10g,main='Residuals: gaussian fit')
qqp(resid(m10),main='Residuals: scaled-t fit')
# summaries are reasonably similar (although p-values differ)
summary(m10g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = IsIncorrect) + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4760 0.8046 -1.835 0.0666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.002 1.003 0.001 0.975
## s(Time):IsIncorrect 4.079 5.029 6.362 6.27e-06 ***
## s(Time,SubjectCor) 24.038 268.000 0.575 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0444 Deviance explained = 4.47%
## fREML = 3.2559e+05 Scale est. = 226.39 n = 102960
summary(m10)
##
## Family: Scaled t(5.807,4.6), rho=0.926940720442651
## Link function: identity
##
## Formula:
## uV ~ s(Time) + s(Time, by = IsIncorrect) + s(Time, SubjectCor,
## bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9830 0.7133 -1.378 0.168
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 1.808 2.308 1.271 0.276
## s(Time):IsIncorrect 4.752 5.885 9.483 3.5e-10 ***
## s(Time,SubjectCor) 26.366 268.000 1.470 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0426 Deviance explained = 2.71%
## fREML = 14237 Scale est. = 1 n = 102960
# graphs are also reasonably similar
par(mfrow=c(2,2))
plot(m10,shade=T,rug=F,select=1,main='s(Time): scaled-t', ylab='uV'); abline(h = 0)
plot(m10g,shade=T,rug=F,select=1,main='s(Time): gaussian', ylab='uV'); abline(h = 0)
plot(m10,shade=T,rug=F,select=2,main='s(Time,by=IsIncorrect): scaled-t', ylab='uV'); abline(h = 0)
plot(m10g,shade=T,rug=F,select=2,main='s(Time,by=IsIncorrect): gaussian', ylab='uV'); abline(h = 0)
# REMARK about normal model building: always include the factor smooths and rho, then make
# the model which tests your hypothesis. If computation takes too long, start with factor smooths
# per subject only, but always include rho if there is autocorrelation. Since using the scaled-t
# distribution takes much more time, I generally only fit the final model using this approach
# to validate the results.
To run all analyses yourself, you first have to install pandoc, and then you can just copy the following lines to the most recent version of R. You need the packages ‘mgcv’, ‘car’ and ‘rmarkdown’. If these are not installed (the library commands will throw an error), you can uncomment (i.e. remove the hashtag) the first three lines to install them. Note that running all these analyses will take many hours of computational time!
#install.packages('mgcv',repos='http://cran.us.r-project.org')
#install.packages('car',repos='http://cran.us.r-project.org')
#install.packages('rmarkdown',repos='http://cran.us.r-project.org')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/answers/lab-including-answers.Rmd', 'lab-including-answers.Rmd')
library(rmarkdown)
render('lab-including-answers.Rmd') # generates html file with results
browseURL(paste('file://', file.path(getwd(),'lab-including-answers.html'), sep='')) # shows result