## STATISTICS COURSE - LAB SESSION 2
## Martijn Wieling, http://www.martijnwieling.nl

# A0. Start R 3.1.2
# Start R and make sure the libraries lmer and car are installed
# Menu: Packages -> Install package(s)...  -> Select mirror -> Select packages
# or: 
# install.packages(c("lme4","car","gplots"),repos='http://cran.r-project.org')

# A1. These commands should work when the packages are installed
library(lme4)
library(car)
library(gplots)
packageVersion('lme4') # tested with version 1.1.7

# A2. Now download the following files and load the custom functions and data
download.file('http://www.let.rug.nl/wieling/statscourse/lecture2/lab/myFunctions.R', 'myFunctions.R')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture2/lab/eye.rda', 'eye.rda')
download.file('http://www.let.rug.nl/wieling/statscourse/lecture2/lab/eyeAll.rda', 'eyeAll.rda')

source('myFunctions.R') # custom functions
load('eye.rda') # data for mixed-effects regression
load('eyeAll.rda') # data for making focus plots

# A3. data investigation
head(eye)
str(eye)
nrow(eye) # 1102 Subject-Item combinations (only when SameColor==True)
plot(density(eye$Age)) # e.g., to investigate distributions


# EXERCISE 1. Investigate eyeAll yourself


# A4. Investigating focus plots
myFocusPlot(eyeAll) 
# open the file 'eyeplots.pdf' to investigate the results, note the multiple pages!
# use: getwd() to get the folder where the PDF is saved

# A5. Close the file 'eyeplots.pdf' before running myFocusPlot again
# we will now investigate a split based on definiteness as well
myFocusPlot(eyeAll, splitDefiniteness=T)

# A6. Close the pdf. It is possible to visualize a certain time window
myFocusPlot(eyeAll, startTime="TimeOnsetNoun", endTime="TimeOnsetNoun+200")
myFocusPlot(eyeAll, startTime=500, endTime=2000)

# A7. We can also focus on the Competitor vs. Distractors
myFocusPlot(eyeAll, startTime=500, endTime=2000, plotTarget=F)


###### MIXED-EFFECTS REGRESSION: RANDOM INTERCEPTS AND SLOPES ######

# B1. We create our first multiple regression model (no fixed-effects yet)
model  = lmer( FocusDiff ~ (1|Subject) + (1|Item) , data = eye )

# B2. Look at the results of the model, note the 0 variance of the Items
summary(model)

# B3. Plot the random intercepts
myInterceptPlot(eye,"FocusDiff","TrialID","Subject",model) # for subjects
x11() # use quartz() when on Mac OS
myInterceptPlot(eye,"FocusDiff","TrialID","Item",model) # for items

# B4. Is a by-item analysis necessary?
model1 = lmer(FocusDiff ~ (1|Subject),  data=eye)
model2 = lmer(FocusDiff ~ (1|Subject) + (1|Item),  data=eye)
AIC(model1) - AIC(model2) # < 2, so no

# B5. Trying to add a fixed effect factor for Color
# Note this will return a warning as the column is constant in this subset
model3 = lmer(FocusDiff ~ SameColor + (1|Subject), data=eye)

# B6. Adding the contrast between men and women: IsMale
model4 = lmer(FocusDiff ~ IsMale + (1|Subject), data=eye)
summary(model4)
# It is close to absolute value of 2, so we keep it in for now,
# if it is n.s. in our final model, we drop it.


# EXERCISE 2: Why does it not make sense to vary the effect of IsMale 
#     per Subject as a random slope? 
#     Would it make sense to vary it per item instead?
#     (If Item would have been required as a random-effect factor.) 


# B7. We investigate the effect of gender competitor and target gender:
model5 = lmer(FocusDiff ~ IsMale + TargetNeuter + SameGender + (1|Subject), 
              data=eye)
summary(model5) # both not significant

# B8. We investigate the interaction between gender competitor 
#     and target gender:
model6 = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + (1|Subject), 
              data=eye)
summary(model6) # both significant, as well as their interaction

# B9. Preparing for the test if the interaction is necessary 
#     (note the REML=F as we are comparing fixed effects) 
model4b = lmer(FocusDiff ~ IsMale + (1|Subject), data=eye, REML=F)
model6b = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + (1|Subject), 
               data=eye, REML=F)


# EXERCISE 3. Do we need the interaction (i.e. compare model4b and model6b)?


# B10. We need to test if their inclusion as a random slope is necessary 
# and influences the significance as fixed effects (in the interaction or 
# separately). 
model6 = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + (1|Subject), 
              data=eye)
model7 = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + 
              (1 + TargetNeuter * SameGender|Subject), data=eye)

summary(model7)
AIC(model6) - AIC(model7) # not better

# B11. Add the factor TargetColor to the best previous model (model6)
model8 = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + TargetColor + 
               (1|Subject), data=eye)
summary(model8) # it appears we need a contrast for brown
eye$TargetBrown = (eye$TargetColor == "brown")*1 # 1 if brown, 0 otherwise


# EXERCISE 4. Add the contrast TargetBrown to the best previous model (model6)
#             and test if the model is better than the more complex model (model8).
#             Store the model as model9.

# B12. To get an estimate of the performance of your model use:
#      (including the variation explained by the random structure)
cor(eye$FocusDiff, fitted(model9))^2


# EXERCISE 5. Investigate if other parameters (e.g., age) included in the 
#             dataset (colnames(eye) shows the columnnames) should be included
#             as fixed effects. When including a variable as fixed effect
#             always test if it also has to be included as a random slope
#             (if it varies per subject). 


# B13. Model criticism (on the basis of model9)
qqp(resid(model9)) # not normal

# no heteroskedasticity, but this is due to the bounded nature of the 
# dependent variable (always between -100 and 100)
plot(scale(resid(model9)) ~ fitted(model9), pch='.',cex=2, ylim=c(-5,5))
abline(h = c(-2.5, 2.5))

eye2 = eye[ abs(scale(resid(model9))) < 2 , ]
(1-(nrow(eye2))/nrow(eye))*100 # 0.36% removed
model9.v2 = lmer(FocusDiff ~ IsMale + TargetNeuter * SameGender + TargetBrown + 
               (1|Subject), data=eye2)
cor(eye$FocusDiff, fitted(model9))^2 
cor(eye2$FocusDiff, fitted(model9.v2))^2 # improved prediction
qqp(resid(model9.v2)) # unfortunately, residuals not much better 

# The problem with the residuals might be caused by by our choice of 
# dependent variable here (which is essentially bounded by the 
# time span we are looking at), so a better approach would 
# be to predict for every point in time if someone is looking at
# the target or not (or a certain distance from the target), 
# and then take into account time in the 
# analysis. We will illustrate an analysis which takes into
# account a time-signal in the fourth lecture of this course.