## MIXED-EFFECTS REGRESSION LAB SESSION, MARCH 31, 2011, 9:15 - 11:00 ## Martijn Wieling, http://www.martijnwieling.nl # A0. Boot in Windows (not Linux) # Start R (Start Menu -> RUG Menu -> Mathematics & Statistics -> R for Windows) # Make sure the libraries lme, gamm4, languageR and Design are installed # Menu: Packages -> Install package(s)... -> Select mirror -> Select packages # A1. These commands should work when the packages are installed library(lme4) library(gamm4) library(languageR) library(Design) # A2. Now download the two files: # http://www.let.rug.nl/~wieling/R/myFunctions.R # http://www.let.rug.nl/~wieling/R/dialectdata.rda # And make sure that your R directory points to where these files are # located (File -> Change dir...) # A3. load several custom functions which make the analysis easier source('myFunctions.R') # A4. load cleaned data set (subset of 48 words) load('dialectdata.rda') # Note that predictors were standardized and log-transformed when necessary. # The dependent (LD) was centered. When predictors correlated with each other, # we decorrelated such predictors (e.g. population age and income). # # Standardization: scaledA = scale(A) # Centering: centeredA = scale(A,scale=F) # Log-transform: logA = log(A) # check if necessary with: plot(density(A)) # Decorrelation: newPredA = resid(lm(predA ~ predB,data=dialectdata)) # must be positive: cor(dialectdata$predA,dialectdata$newPredA) # ** if you need more information about the above procedure, you can ask me ** # A5. data investigation head(dialectdata) str(dialectdata) nrow(dialectdata) # 19446 word-place combinations plot(density(dialectdata$PopIncome)) # e.g., to investigate distributions # A6. we model geography as the fitted values of a generalized additive model geographyModel = gam(LD ~ s(Longitude,Latitude), data=dialectdata) dialectdata$GeoGAM = fitted(geographyModel) # A7. we can visualize the effect of geography using the function vis.gam vis.gam(geographyModel, plot.type="contour", color="terrain", too.far=0.05, view=c("Longitude","Latitude"),main="General effect of Geography") # A8. We create our first multiple regression model (no mixed-effects yet) linearModel = lm(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength, data=dialectdata) # A9. Look at the results of the model and note the significant variables summary(linearModel) ###### MIXED-EFFECTS REGRESSION: RANDOM INTERCEPTS ###### # B1. Since we want our results to be generalizable across words, locations # and transcribers, we add random intercepts for these factors. # We start with a random intercept for Location: (1|Location) mixedModel1 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location), data=dialectdata) # B2. look at the results, note that p-values are not present anymore, # but |t| > 1.65 equals p < 0.05 (one-tailed) and |t| > 2 equals p < 0.05 # (for a large data set) print(mixedModel1,cor=F) # suppresses the correlation table (cor=F) # B3. Test if the random intercept is necessary (significant improvement) # note that simply 'anova' can be used when both models contain random effects myAnova(linearModel,mixedModel1) # B4. We now add random intercepts for Word and display the results mixedModel2 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location) + (1|Word), data=dialectdata) print(mixedModel2,cor=F) # B5. To see how the intercepts for word vary, we can plot them # Since there are 424 places, this was not sensible for Location myInterceptPlot(dialectdata,"LD","PopAge","Word",mixedModel2) # B6. We test if the inclusion of this random intercept is an improvement anova(mixedModel1,mixedModel2) # B7. Look a the t-values of the fixed effects of mixedModel2 # We clearly see that WordLength is not significant anymore, # which also becomes clear from the following test # (investing 1 df does not give a better fit) mixedModel2a = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + WordLength + (1|Location) + (1|Word), data=dialectdata, REML=F) mixedModel2b = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word), data=dialectdata, REML=F) anova(mixedModel2a,mixedModel2b) # B8. we therefore exclude WordLength from the remaining analysis. # We keep PopIncome, as we will include this factor in the random effects mixedModel2 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word), data=dialectdata) print(mixedModel2,cor=F) # EXERCISE 1: add the random intercept for Transcriber and store the model in # the variable mixedModel3. Display the results of the model and test # if the inclusion of this intercept is necessary. ###### MIXED-EFFECTS REGRESSION: RANDOM SLOPES ###### # C1. We will test if the effect of population size varies per word # We therefore add: (0+PopCnt|Word) mixedModel4 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt|Word) + (1|Transcriber), data=dialectdata) print(mixedModel4,cor=F) anova(mixedModel3,mixedModel4) # C2. To see how the slopes vary compared to the model slope (dashed lines) # we can plot the random slopes (solid lines) myXYPlot(dialectdata,"LD","PopCnt","Word",mixedModel3,mixedModel4) # C3. Now we test if the correlation parameter between the by-word adjustment # of the intercept and the by-word adjustments to the slope # of population size is necessary by replacing (1|Word) + (0+PopCnt|Word) with # (1+PopCnt|Word). Note that adding variables and intercepts # in one group (between parentheses) will include their correlation. mixedModel4b = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1+PopCnt|Word) + (1|Transcriber), data=dialectdata) print(mixedModel4b,cor=F) anova(mixedModel4,mixedModel4b) # not significant # C4. We now add by-word adjustments to the slope of population income (separate), # test the necessity of this addition and plot the slopes mixedModel5 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt|Word) + (0+PopIncome|Word) + (1|Transcriber), data=dialectdata) print(mixedModel5,cor=F) anova(mixedModel4,mixedModel5) myXYPlot(dialectdata,"LD","PopIncome","Word",mixedModel4,mixedModel5) # C5. Now we test if the correlation parameter between the two by-word slope # adjustments is necessary mixedModel6 = lmer(LD ~ GeoGAM + PopCnt + PopAge + PopIncome + IsNoun + WordFreq + VCratio + (1|Location) + (1|Word) + (0+PopCnt+PopIncome|Word) + (1|Transcriber), data=dialectdata) print(mixedModel6,cor=F) anova(mixedModel5,mixedModel6) # EXERCISE 2: Check if an uncorrelated random slope for PopAge is necessary # visualize the random slopes. # EXERCISE 3: Check if the random slope for PopAge should be correlated with # the other random slopes (this will take a few minutes to run) # EXERCISE 4: Use your last model to see if a random slope # for IsNoun is necessary to include. Also check if it has to be correlated # with the random intercept. First think about if the slope of IsNoun varies # per Word or Location? # Of course there is a more complex random-effect structure possible, # but we will not investigate it here further (by now, you should know how # to do this though). ###### INVESTIGATING BY-WORD AND BY-LOCATION RANDOM SLOPES ###### # D1. We can visualize the by-word random slopes of a variable as follows: myCoefPlot(mixedModel10,"Word","PopAge") # D2. We can visualize the by-word random intercept of a variable as follows: myCoefPlot(mixedModel10,"Word","Intercept") # D3. To visualize the correlational structure between the by-word random # slopes for 2 variables, we proceed as follows (note the different function!): myCoefPlot2(mixedModel10,"Word","PopAge","PopIncome") # D4. To see the relationship between the random by-word intercepts and random # by-word slopes for PopCnt: myCoefPlot2(mixedModel10,"Word","Intercept","PopCnt") # D5. We can visualize the by-location random slopes in similar fashion myCoefPlot(mixedModel10,"Location","IsNoun") # D6. Of course, it is more informative to show this geographically myCoefPlotGeo(mixedModel10,dialectdata,"IsNoun") # EXERCISE 5: visualize separately the by-word random slopes for # PopCnt and PopIncome # EXERCISE 6: visualize all remaining pairs of by-word random slopes # EXERCISE 7: If interested, take a look at the files # datacreation.R and myFunctions.R located in # http://www.let.rug.nl/~wieling/R/ ## FINAL REMARK: lmer can decide by itself if your design is nested or crossed, # but you do need to use correct factor labels, i.e. having 8 schools with 4 # groups in each school, you have schoollabels S1 to S8, but need # group-school labels S1G1, S1G2, ..., S8G4 as opposed to G1, ..., G4 # (groups G1 in two schools are not the same) # See also http://lme4.r-forge.r-project.org/book/ (Ch2.pdf). # See Chapter 3 for a longitudinal design (repeated measures) and # Chapter 1 for a more general introduction to mixed models