## STATISTICS COURSE - LAB SESSION 4
## Analyzing EEG data using GAMs
## Martijn Wieling (with help of Nienke Meulman)

## In lecture 4 we looked at the effect of Correctness and the effect of Age of
## Arrival. In this lab session we will look at an additional factor: Structure.
## Research question: Is there an effect of the distance between determiner and noun in the
## violation? We compare the DN (determiner-noun, e.g. der Garten/*das Garten) structure with
## the DAN (determiner-adjective-noun, e.g. das frische Gras/*der frische Gras) structure.

# A0. Start R 3.1.2
# Start R and make sure the libraries mgcv and car are installed
# Menu: Packages -> Install package(s)... -> Select mirror -> Select packages
# or:
# install.packages(c("mgcv","car"),repos='http://cran.r-project.org')

# A1. These commands should work when the packages are installed
library(mgcv) # tested with version 1.8.4
library(car)

# version information
R.version.string
packageVersion('mgcv')
packageVersion('car')

# A2. Now download and load the following files containing the dataset and some functions:
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/dat.rda', 'dat.rda')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/plotting.R', 'plotting.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/compareML.R', 'compareML.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture4/lab/bam.art.fit.R', 'bam.art.fit.R')
load('dat.rda') # This dataset is a subset of 15 subjects
source('plotting.R')
source('compareML.R')

# A3. Data investigation
head(dat)
str(dat)
summary(dat)
dim(dat)

# A4. Sort the data
# The data needs to be sorted so that we can check for autocorrelation problems later
dat = dat[order(dat$Subject, dat$TrialNr, dat$Roi, dat$Time, decreasing=F),]
head(dat)

# A5. Our first model for the general effect of time (we use bam as the dataset is large):
m0 = bam(uV ~ s(Time), data=dat, gc.level=2, method='ML')

# A6. Look at the results of the model and plot the general effect of time
# Note that this is not very interesting, since we are not differentiating between correct and
# incorrect or between DN and DAN
summary(m0)
par(mfrow=c(1,2))
plot(m0,rug=F,shade=T,ylim=c(2,-2), main='m0'); abline(h=0)

# A7. Taking into account individual variation via factor smooths
m0b = bam(uV ~ s(Time) + s(Time,Subject,bs='fs',m=1), data=dat, gc.level=2, method='ML')
# (N.B. You can ignore the warning messages about repeated smooths in this lab)
summary(m0b)
plot(m0b,select=1,rug=F,shade=T,ylim=c(2,-2), main='m0b'); abline(h=0) # wider confidence bands

#----------------------------------------------
# Effect of Correctness (correct vs. incorrect)
#----------------------------------------------
# B1. To see whether there is an effect of correctness, we make a model where we look at correct
# versus incorrect. We also include factor smooths for the subject-correctness combination.
dat$SubjectCor = interaction(dat$Subject,dat$Correctness)
m1 = bam(uV ~ s(Time,by=Correctness) + Correctness + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML')
summary(m1)

# B2. Plot factor smooths per subject-correctness combination
par(mfrow=c(1,1))
plot(m1,select=3)

# B3. We need to check whether this model suffers from autocorrelation:
m1acf = acf(resid(m1)) # plot autocorrelation
rhoval = as.vector(m1acf[1]$acf)
rhoval
# Note that we use this value throughout this lab session, but in principle the lowest rho
# value which is still adequate is best. Very high rho values are somewhat problematic:
# if you have a rho value of 0.99, your model will only try to predict the 1% of the
# variation not explained by the previous observation. So I follow this strategy:
# - start from the acf value at lag 1 (as above)
# - then find the optimal acf value (i.e. the one with the lowest (RE)ML score)
# - then decrease the acf value as much as possible without getting a significantly
#   worse model (using compareML)
# Note that for each distinct model you need to use the rho value specific to that model
# (determined using the method above), i.e. when comparing two models differing in
# fixed/random effects, you might have two different rho values.
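
# As an illustration of this strategy (a sketch: the lower rho value is arbitrary and the
# name m1b.lower is purely illustrative), one could refit the model of B4 below with a
# smaller rho and compare the ML scores, keeping the lower value only if the fit is not
# significantly worse:
# m1b.lower = bam(uV ~ s(Time,by=Correctness) + Correctness + s(Time,SubjectCor,bs='fs',m=1),
#                 data=dat, gc.level=2, method='ML', rho=0.9*rhoval, AR.start=SeqStart)
# compareML(m1b, m1b.lower)
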
# B4. Autocorrelation correction is necessary, so we include the rho parameter in the model:
m1b = bam(uV ~ s(Time,by=Correctness) + Correctness + s(Time,SubjectCor,bs='fs',m=1),
          data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)

# B5. Let's see if adding the rho parameter significantly improves the model:
compareML(m1, m1b) # m1b preferred

# B6. In the summary of this new model, we see that the non-linear pattern for correct does not
# significantly deviate from zero, but the (non-linear: edf > 1) pattern for incorrect does:
summary(m1b)

# B7. Let's plot the two patterns, and the difference between them:
x11() # opens a new plot window; use quartz() on Mac OS X
plotSmooths(m1b,"Time","Correctness",colors=c('green','red')) # visualizing both patterns, custom function
x11()
plotDiff(m1b,"Time","Correctness") # visualizing the difference, custom function

# B8. We still have to test whether the difference between correct and incorrect is significant.
# For that we create a binary variable and test whether one condition (incorrect) is
# significantly different from the reference level (correct).
dat$IsIncorrect = (dat$Correctness == 'incor')*1 # create a binary variable (0-1 coding)
m1c = bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1),
          data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m1c) # highly significant

# B9. We can also separate the intercept difference and the non-linear difference via an
# ordered factor
dat$CorrectnessO = as.ordered(dat$Correctness)
contrasts(dat$CorrectnessO) = 'contr.treatment' # treatment contrasts
m1d = bam(uV ~ s(Time) + s(Time,by=CorrectnessO) + CorrectnessO + s(Time,SubjectCor,bs='fs',m=1),
          data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m1d) # both the intercept and the non-linear difference are highly significant

# EXERCISE 1. Plot the random wiggly curves of model m1d. What has happened to them?
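
# A quick sanity check (sketch) that the two ways of coding the difference line up: the binary
# variable from B8 and the treatment contrast of the ordered factor from B9 both code 'incor'
# as 1, relative to the reference level (assumed here to be 'cor'; verify with
# levels(dat$Correctness)).
head(unique(data.frame(Correctness=dat$Correctness, IsIncorrect=dat$IsIncorrect,
                       CorrectnessO=dat$CorrectnessO)))
contrasts(dat$CorrectnessO)
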
# B10. We can plot the difference smooths of m1c and m1d, and compare them to the difference
# we plotted before for model m1b using the plotDiff() function. Note that the difference curve
# for m1b will be similar to the difference smooth of m1c (slight differences may occur, as they
# are somewhat different models), but not to that of m1d, as the latter does not show the
# (constant) intercept difference (so the smooth is centered).
par(mfrow=c(1,3))
plotDiff(m1b,"Time","Correctness") # m1b difference plotted with custom function
# m1c plotted with default plotting function of mgcv
plot(m1c,shade=T,rug=F,ylim=c(10,-1),select=2,main='Difference m1c', ylab='uV'); abline(h = 0)
# m1d plotted with default plotting function of mgcv
plot(m1d,shade=T,rug=F,ylim=c(2,-4),select=2,main='Smooth difference m1d', ylab='uV'); abline(h = 0)
# the smaller confidence bands are caused by not having to take the uncertainty
# about the intercept into account

# EXERCISE 2. What do you conclude about the effect of Correctness?

#---------------------------------
# Effect of Structure (DN vs. DAN)
#---------------------------------
# C1. Now let's investigate the effect of the distance between determiner and noun.
# First, we create a model where we look at the effect of structure (DN versus DAN) in general.
m2 = bam(uV ~ s(Time,by=Structure) + Structure + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m2)
x11()
plotSmooths(m2,"Time","Structure") # visualizing both patterns
x11()
plotDiff(m2,"Time","Structure") # visualizing the difference (comparing DN to DAN)

# C2. We would like to know whether the P600 effect differs between DN and DAN.
# Remember that the size of the P600 is determined by the DIFFERENCE between incorrect and
# correct. We want to see whether that difference varies between the levels of Structure.
# So let's make a model where we look at the interaction between Correctness and Structure:
dat$CorStruct = interaction(dat$Correctness,dat$Structure) # create a new variable with 4 levels
levels(dat$CorStruct) # show levels
m3 = bam(uV ~ s(Time,by=CorStruct) + CorStruct + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m3)
x11()
plotSmooths(m3,"Time","CorStruct")

# C3. Again, we want to know about the significance of the difference curves:
dat$IsDN = (dat$Structure == 'DN')*1 # new binary variable
dat$IsIncorrectDN = dat$IsIncorrect * dat$IsDN # new binary variable
m4 = bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,by=IsDN) + s(Time,by=IsIncorrectDN) +
              s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m4)

# C4. Now, try to interpret what you see in the following plots:
x11()
par(mfrow=c(2,2))
plotSmooths(m3,"Time","CorStruct", useLegend=F)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=2, ylab='uV', main='m4: IsIncorrect'); abline(h=0)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=3, ylab='uV', main='m4: IsDN'); abline(h=0)
plot(m4,shade=T,rug=F,ylim=c(7,-4),select=4, ylab='uV', main='m4: IsIncorrectDN'); abline(h=0)
# Note that cor.DAN is the reference level (here all the binary variables are set to 0).
# So the second plot shows what you need to add to get from cor.DAN to incor.DAN.
# The third plot shows what you need to add to get from cor.DAN to cor.DN.
# To get to incor.DN, you need to add the second plot (because IsIncorrect == 1),
# the third plot (because IsDN == 1), but also the fourth plot (because IsIncorrectDN == 1).
# (The prediction sketch below illustrates this decomposition.)
# N.B. Ideally you also want to add factor smooths per item ('Word' in this case).
# These models take a lot more time to run, though, so we excluded them from the lab session
# (but see the home exercise below!).
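
# Sketch illustrating the decomposition described in C4: the per-term contributions can be
# obtained with predict(..., type='terms'). The data frame below is constructed purely for
# illustration; any existing SubjectCor level can serve as a placeholder, and the
# subject-specific column of the result can simply be ignored.
newd = data.frame(Time = seq(min(dat$Time), max(dat$Time), length=100),
                  IsIncorrect = 1, IsDN = 1, IsIncorrectDN = 1,
                  SubjectCor = dat$SubjectCor[1])
pterms = predict(m4, newd, type='terms')
colnames(pterms) # one column per model term
# For incor.DN, the fitted curve (apart from the intercept and the subject-specific term) is
# the sum of the s(Time) column and the three by-smooth columns; setting the three binary
# variables to 0 instead leaves only s(Time), i.e. the cor.DAN reference curve.
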
# EXERCISE 3. What do you conclude about the effect of Structure and Correctness?

#----------------------------------------
# Effect of Age of Arrival for DN vs. DAN
#----------------------------------------
# D1. We saw that there are some differences between the DN and the DAN conditions.
# Now let's see whether these differences vary depending on the AoArr of the subject.
# (Note that we only have a subset of the data here (n=15), so there are not so many different
# AoArrs to base the model on.)
m5 = bam(uV ~ te(Time,AoArr,by=CorStruct) + CorStruct + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m5)

# D2. We can check whether we need to increase k. We can set k to a single value or to a vector
# of values. If you supply a single value to te(), every dimension is set to that value,
# i.e. for a two-dimensional tensor k=10 is equal to k=c(10,10)
# (an s() only uses a single k-value, even for multiple dimensions).
m5b = bam(uV ~ te(Time,AoArr,by=CorStruct,k=c(10,10)) + CorStruct + s(Time,SubjectCor,bs='fs',m=1),
          data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m5b)
compareML(m5,m5b)
# the additional complexity is not needed; the ML score of m5 is even lower than that of m5b

# D3. Plot the surfaces (partial effects, so these exclude the intercept differences)
# zlim is used to put every plot on the same scale
par(mfrow=c(2,2))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=1, color='topo',
         main='m5: cor.DAN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=2, color='topo',
         main='m5: incor.DAN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=3, color='topo',
         main='m5: cor.DN', zlim=c(-4,6))
pvis.gam(m5, plot.type='contour', view=c('Time','AoArr'), select=4, color='topo',
         main='m5: incor.DN', zlim=c(-4,6))

# D4. The partial effects are not so informative, as the intercept differences (in the parametric
# part of the summary) are not included. We can include these by using vis.gam (which shows the
# complete effects) with the cond parameter.
par(mfrow=c(2,2))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='cor.DAN'),
        color='topo', main='m5: cor.DAN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='incor.DAN'),
        color='topo', main='m5: incor.DAN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='cor.DN'),
        color='topo', main='m5: cor.DN', zlim=c(-8,6))
vis.gam(m5, plot.type='contour', view=c('Time','AoArr'), cond=list(CorStruct='incor.DN'),
        color='topo', main='m5: incor.DN', zlim=c(-8,6))

# D5. But this is still not quite what we need: we only need to look at the
# difference between incorrect and correct for both structures.
par(mfrow=c(1,2))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DAN','incor.DAN'))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DN','incor.DN'))

# D6. Furthermore, we want to test if these difference surfaces are significant.
m6 = bam(uV ~ te(Time,AoArr) + te(Time,AoArr,by=IsIncorrect) + te(Time,AoArr,by=IsDN) +
              te(Time,AoArr,by=IsIncorrectDN) + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m6) # they are significant
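
# Aside (sketch): mgcv's gam.check() reports, among other diagnostics, a k-index per smooth,
# which gives an additional indication of whether the basis dimensions (see the discussion of
# k in D2) are adequate. Note that its residual plots are based on the uncorrected residuals
# (compare the rho-corrected residual plots in D9 below).
gam.check(m6)
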
# D7. We can now plot the difference surface between DAN correct and incorrect directly
# and compare it to the previous difference surface; as the models are *different*,
# these surfaces may differ (somewhat).
par(mfrow=c(1,2))
plotDiff2D(m5,'Time','AoArr','CorStruct',c('cor.DAN','incor.DAN'),
           main='m5: Calculated diff. incor.DAN vs. cor.DAN')
pvis.gam(m6, plot.type='contour', view=c('Time','AoArr'), select=2, color='topo',
         main='m6: Diff. surface incor.DAN vs. cor.DAN')

# D8. We can test whether the 2-dimensional interaction is necessary via decomposition:
m7 = bam(uV ~ s(Time,by=CorStruct) + s(AoArr,by=CorStruct) + ti(Time,AoArr,by=CorStruct,k=10) +
              CorStruct + s(Time,SubjectCor,bs='fs',m=1),
         data=dat, gc.level=2, method='ML', rho=rhoval, AR.start=SeqStart)
summary(m7)

# EXERCISE 4. Is the interaction necessary? Test this by comparing with a simpler model (save as m8).

# EXERCISE 5. Plot the difference patterns across age of arrival for both structures for model m8.

# EXERCISE 6. Test if the difference patterns across time and AoArr are significant by using
#             an ordered factor: CorStructO
#             Hint: you need to make two models (m9a and m9b), one where cor.DAN is the
#             reference level, and another where cor.DN is.
#             After having run the first model, use:
#             dat$CorStruct = relevel(dat$CorStruct,'cor.DN')
#             to relevel and then create the ordered factor.

# D9. Model criticism
# The residuals of m8 unfortunately look familiar (compared to the lecture)...
par(mfrow=c(1,2))
qqplot.rho(m8)
hist.rho(m8)

#--------------------------------------------------------------
# Computationally intensive: taking into account effect of Word
#--------------------------------------------------------------
# You can do these exercises at home, since running the model takes quite a long time.

# EXERCISE 7. Add factor smooths per word-correctness combination to m8
#             and rerun the model (name it m10).
#             Don't forget to make the new factorial predictor combining Word and Correctness.

# EXERCISE 8. Assess which model is best, m8 or m10.

#--------------------------------
# Using the scaled-t distribution
#--------------------------------
# CAVEAT: This approach is still in development, i.e. there may be errors in the approach
# outlined here.

# E1. The scaled-t distribution is implemented in the gam function (family="scat"); however,
# by default gam cannot account for autocorrelation. By using a custom scat function
# (stored in bam.art.fit.R) created by Natalya Pya (University of Bath), this becomes possible.
# For example, to fit an intercept-only model while taking into account rho:
source('bam.art.fit.R')
g0 <- gam(uV ~ 1, data=dat, method='REML', family=scat(rho=rhoval,AR.start=dat$SeqStart))
qqp(resid(g0)) # qqplot.rho is not necessary here as the residuals are already corrected
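
# For a quick comparison (a sketch: the name g0.gauss is illustrative, and the model contains
# an s(Time) smooth, so it is not an exact counterpart of g0, but it suffices for a rough
# comparison of the residual distributions under a Gaussian vs. a scaled-t family):
g0.gauss = bam(uV ~ s(Time), data=dat, gc.level=2, method='fREML', rho=rhoval, AR.start=SeqStart)
par(mfrow=c(1,2))
qqplot.rho(g0.gauss, main='Gaussian fit') # rho-corrected residuals (custom function)
qqp(resid(g0), main='Scaled-t fit')       # residuals of the scaled-t fit above
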
# E2. Unfortunately, gam is too slow to fit the complex models necessary for
# EEG data, including the subject- and word-specific factor smooths, and we therefore
# turn to bam. However, bam does not (yet) support a scaled-t distribution.
# We therefore again use customized functions created by Natalya Pya ('bam.art.fit.R').
# The current drawback of these adapted functions is that they only work with the fREML method.
# In addition, the scale parameters of the t-distribution (theta) need to be pre-specified when
# running the function. This implies that we need to use the function gam for this
# (since gam estimates the scale parameters). Simulations suggest that we could run
# gam on a subset of the original data (e.g., 50% or perhaps less) to get a reasonable
# approximation of the scaled-t parameters. Unfortunately, this is still likely too
# computationally expensive in the presence of a complex random-effects structure.
# We therefore use another approximation:
#
# 1. We first fit a very simple gam model (g0, above) and extract the scaled-t parameters
#    from that
# 2. We fit a more complex model (including the full fixed-effects structure, but no random
#    effects) and extract the scaled-t parameters of that model
# 3. We compare the two sets of scaled-t parameters
#    - if the difference is relatively small (a few tenths), we use the scaled-t parameters
#      of the more complex model in step 4
#    - if the difference is large, we unfortunately need to determine the scaled-t parameters
#      by running the complete model with gam for a subset of the data (e.g., 10% of the trials)
#      [in that case it makes sense to do this 2+ times to assess whether the variability is
#      not too great]
# 4. We fit the bam model using the selected scaled-t parameters

theta0 <- g0$family$getTheta(TRUE) # get scaled-t parameters of the simple model

# model without random effects (on the basis of m1c, for simplicity)
g1 <- gam(uV ~ s(Time) + s(Time,by=IsIncorrect), data=dat, method='REML',
          family=scat(rho=rhoval,AR.start=dat$SeqStart))
theta1 <- g1$family$getTheta(TRUE) # get scaled-t parameters of the more complex model

# compare thetas (small difference: OK)
theta1 - theta0

# using bam with theta and rho for the full model (named m11, as m10 is used in EXERCISE 7)
system.time(m11 <- bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1),
                       data=dat, method='fREML', gc.level=2, AR.start=SeqStart,
                       family=art(theta=theta1,rho=rhoval)))

# comparison with the duration of a normal Gaussian model
system.time(m11g <- bam(uV ~ s(Time) + s(Time,by=IsIncorrect) + s(Time,SubjectCor,bs='fs',m=1),
                        data=dat, method='fREML', gc.level=2, AR.start=SeqStart, rho=rhoval))

# clear difference in residuals
par(mfrow=c(1,2))
qqplot.rho(m11g,main='Residuals: gaussian fit')
qqp(resid(m11),main='Residuals: scaled-t fit')

# summaries are reasonably similar (although p-values differ)
summary(m11g)
summary(m11)

# graphs are also reasonably similar
par(mfrow=c(2,2))
plot(m11,shade=T,rug=F,select=1,main='s(Time): scaled-t', ylab='uV'); abline(h = 0)
plot(m11g,shade=T,rug=F,select=1,main='s(Time): gaussian', ylab='uV'); abline(h = 0)
plot(m11,shade=T,rug=F,select=2,main='s(Time,by=IsIncorrect): scaled-t', ylab='uV'); abline(h = 0)
plot(m11g,shade=T,rug=F,select=2,main='s(Time,by=IsIncorrect): gaussian', ylab='uV'); abline(h = 0)

# REMARK about normal model building: always include the factor smooths and rho, then build
# the model which tests your hypothesis. If computation takes too long, start with factor
# smooths per subject only, but always include rho if there is autocorrelation. Since using
# the scaled-t distribution takes much more time, I generally only fit the final model using
# this approach, to validate the results.