
Martijn Wieling
University of Groningen
f112300
and ShinyDem0
)
head(iris)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Area
# 1 5.1 3.5 1.4 0.2 setosa 0.14
# 2 4.9 3.0 1.4 0.2 setosa 0.14
# 3 4.7 3.2 1.3 0.2 setosa 0.13
# 4 4.6 3.1 1.5 0.2 setosa 0.15
# 5 5.0 3.6 1.4 0.2 setosa 0.14
# 6 5.4 3.9 1.7 0.4 setosa 0.34
m0 <- lm(Petal.Area ~ Sepal.Length, data = iris)
summary(m0)
#
# Call:
# lm(formula = Petal.Area ~ Sepal.Length, data = iris)
#
# Residuals:
# Min 1Q Median 3Q Max
# -2.671 -0.794 -0.099 0.730 3.489
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -11.357 0.711 -16.0 <2e-16 ***
# Sepal.Length 2.439 0.120 20.3 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 1.22 on 148 degrees of freedom
# Multiple R-squared: 0.735, Adjusted R-squared: 0.733
# F-statistic: 410 on 1 and 148 DF, p-value: <2e-16
library(visreg) # package containing visualization function visreg
visreg(m0) # visualize regression line together with data points
round(m0$coefficients, 2)
# (Intercept) Sepal.Length
# -11.36 2.44
iris$FittedPA <- fitted(m0)
head(iris, 2)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Area FittedPA
# 1 5.1 3.5 1.4 0.2 setosa 0.14 1.08376
# 2 4.9 3.0 1.4 0.2 setosa 0.14 0.59589
# (Intercept) Sepal.Length
# -11.36 2.44
iris$Sepal.Length.c <- iris$Sepal.Length - mean(iris$Sepal.Length) # centered Sepal.Length
m0.c <- lm(Petal.Area ~ Sepal.Length.c, data = iris)
library(mosaic) # contains msummary function
msummary(m0.c) # more concise summary (slide friendly!)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.8970 0.0994 29.1 <2e-16 ***
# Sepal.Length.c 2.4394 0.1204 20.3 <2e-16 ***
#
# Residual standard error: 1.22 on 148 degrees of freedom
# Multiple R-squared: 0.735, Adjusted R-squared: 0.733
# F-statistic: 410 on 1 and 148 DF, p-value: <2e-16
iris$FittedPA.c <- fitted(m0.c)
head(iris[, c("Sepal.Length", "Sepal.Length.c", "FittedPA", "FittedPA.c")])
# Sepal.Length Sepal.Length.c FittedPA FittedPA.c
# 1 5.1 -0.74333 1.08376 1.08376
# 2 4.9 -0.94333 0.59589 0.59589
# 3 4.7 -1.14333 0.10801 0.10801
# 4 4.6 -1.24333 -0.13593 -0.13593
# 5 5.0 -0.84333 0.83982 0.83982
# 6 5.4 -0.44333 1.81558 1.81558
# (Intercept) Sepal.Length.c
# 2.90 2.44
m1 <- lm(Petal.Area ~ Sepal.Length.c + Sepal.Width.c, data = iris)
msummary(m1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.8970 0.0928 31.23 < 2e-16 ***
# Sepal.Length.c 2.3757 0.1132 20.99 < 2e-16 ***
# Sepal.Width.c -1.0285 0.2150 -4.78 4.1e-06 ***
#
# Residual standard error: 1.14 on 147 degrees of freedom
# Multiple R-squared: 0.771, Adjusted R-squared: 0.768
# F-statistic: 247 on 2 and 147 DF, p-value: <2e-16
visreg(m1)
plot.type = 'rgl'
in visreg2d
to obtain interactive plot)visreg2d(m1, 'Sepal.Length.c', 'Sepal.Width.c')
You must enable Javascript to view this page properly.
round(m1$coefficients, 2)
# (Intercept) Sepal.Length.c Sepal.Width.c
# 2.90 2.38 -1.03
Petal.Area = 2.38 \(\times\) Sepal.Length.c + -1.03 \(\times\) Sepal.Width.c + 2.9
For a centered sepal length of about -0.7 (sepal length of 5.1) and a centered sepal width of about 0.4 (sepal width of 3.5), the predicted (fitted) petal area is:
2.38 \(\times\) -0.7 + -1.03 \(\times\) 0.4 + 2.9 = 0.68
iris$FittedPA <- fitted(m1)
head(iris[,c("Sepal.Length","Sepal.Length.c","Sepal.Width","Sepal.Width.c","FittedPA")], 1)
# Sepal.Length Sepal.Length.c Sepal.Width Sepal.Width.c FittedPA
# 1 5.1 -0.74333 3.5 0.44267 0.67577
anova(m0, m1) # model comparison
# Analysis of Variance Table
#
# Model 1: Petal.Area ~ Sepal.Length
# Model 2: Petal.Area ~ Sepal.Length.c + Sepal.Width.c
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 148 219
# 2 147 190 1 29.5 22.9 4.1e-06 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(m0) - AIC(m1) # AIC decrease >= 2: more complex model should be used
# [1] 19.703
m2 <- lm(Petal.Area ~ Sepal.Length.c * Sepal.Width.c, data=iris)
# alternatively: Petal.Area ~ Sepal.Length.c + Sepal.Width.c + Sepal.Length.c:Sepal.Width.c
msummary(m2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.9326 0.0915 32.05 < 2e-16 ***
# Sepal.Length.c 2.3334 0.1116 20.91 < 2e-16 ***
# Sepal.Width.c -0.8716 0.2173 -4.01 9.6e-05 ***
# Sepal.Length.c:Sepal.Width.c 0.8431 0.2987 2.82 0.0054 **
#
# Residual standard error: 1.11 on 146 degrees of freedom
# Multiple R-squared: 0.783, Adjusted R-squared: 0.778
# F-statistic: 175 on 3 and 146 DF, p-value: <2e-16
# (Intercept) Sepal.Length.c Sepal.Width.c Sepal.Length.c:Sepal.Width.c
# [1,] 2.93 2.33 -0.87 0.84
Petal.Area = 2.33 \(\times\) SL + -0.87 \(\times\) SW + 0.84 \(\times\) SL \(\times\) SW + 2.93
SL set to 0: 2.33 \(\times\) 0 + -0.87 \(\times\) SW + 0.84 \(\times\) 0 \(\times\) SW + 2.93 = -0.87 \(\times\) SW + 2.93
SL set to 1: 2.33 \(\times\) 1 + -0.87 \(\times\) SW + 0.84 \(\times\) 1 \(\times\) SW + 2.93 = -0.03 \(\times\) SW + 5.27
SL set to 2: 2.33 \(\times\) 2 + -0.87 \(\times\) SW + 0.84 \(\times\) 2 \(\times\) SW + 2.93 = 0.81 \(\times\) SW + 7.6
SW set to 0: 2.33 \(\times\) SL + -0.87 \(\times\) 0 + 0.84 \(\times\) SL \(\times\) 0 + 2.93 = 2.33 \(\times\) SL + 2.93
SW set to 1: 2.33 \(\times\) SL + -0.87 \(\times\) 1 + 0.84 \(\times\) SL \(\times\) 1 + 2.93 = 3.18 \(\times\) SL + 2.06
SW set to 2: 2.33 \(\times\) SL + -0.87 \(\times\) 2 + 0.84 \(\times\) SL \(\times\) 2 + 2.93 = 4.02 \(\times\) SL + 1.19
par(mfrow = c(1, 2))
visreg(m2, "Sepal.Width.c", by = "Sepal.Length.c", overlay = T) # lines at 10/50/90th percentile
visreg(m2, "Sepal.Length.c", by = "Sepal.Width.c", overlay = T)
visreg2d(m2, 'Sepal.Length.c', 'Sepal.Width.c')
You must enable Javascript to view this page properly.
anova(m1, m2) # model comparison
# Analysis of Variance Table
#
# Model 1: Petal.Area ~ Sepal.Length.c + Sepal.Width.c
# Model 2: Petal.Area ~ Sepal.Length.c * Sepal.Width.c
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 147 190
# 2 146 180 1 9.82 7.97 0.0054 **
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(m1) - AIC(m2) # AIC decrease >= 2: more complex model should be used
# [1] 5.9696
# (Intercept) Sepal.Length.c Sepal.Width.c Sepal.Length.c:Sepal.Width.c
# [1,] 2.93 2.33 -0.87 0.84
Petal.Area = 2.33 \(\times\) Sepal.Length.c + -0.87 \(\times\) Sepal.Width.c + 0.84 \(\times\) Sepal.Length.c \(\times\) Sepal.Width.c + 2.93
For a centered sepal length of about -0.7 (sepal length of 5.1) and a centered sepal width of about 0.4 (sepal width of 3.5), the predicted (fitted) petal area is:
2.33 \(\times\) -0.7 + -0.87 \(\times\) 0.4 + 0.84 \(\times\) -0.7 \(\times\) 0.4 + 2.93 = 0.53
iris$FittedPA <- fitted(m2)
head(iris[,c("Sepal.Length","Sepal.Length.c","Sepal.Width","Sepal.Width.c","FittedPA")], 1)
# Sepal.Length Sepal.Length.c Sepal.Width Sepal.Width.c FittedPA
# 1 5.1 -0.74333 3.5 0.44267 0.53482
levels(iris$Species) # factor levels of Species variable (by default ordered alphabetically)
# [1] "setosa" "versicolor" "virginica"
contrasts(iris$Species) # show internal dummy coding
# versicolor virginica
# setosa 0 0
# versicolor 1 0
# virginica 0 1
iris$Species <- relevel(iris$Species, "versicolor") # change reference level
contrasts(iris$Species) # dummy coding
# [,1] [,2]
# versicolor 1 0
# setosa 0 1
# virginica -1 -1
m3 <- lm(Petal.Area ~ Species, data = iris)
msummary(m3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.8970 0.0604 47.99 <2e-16 ***
# Species1 -0.0368 0.0854 -0.43 0.67
# Species2 -2.7142 0.0854 -31.79 <2e-16 ***
#
# Residual standard error: 0.739 on 147 degrees of freedom
# Multiple R-squared: 0.903, Adjusted R-squared: 0.902
# F-statistic: 683 on 2 and 147 DF, p-value: <2e-16
iris$SpeciesSetosa <- (iris$Species == "setosa") * 1
iris$SpeciesVirginica <- (iris$Species == "virginica") * 1
msummary(lm(Petal.Area ~ SpeciesSetosa + SpeciesVirginica, data = iris))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.860 0.105 27.4 <2e-16 ***
# SpeciesSetosa -2.677 0.148 -18.1 <2e-16 ***
# SpeciesVirginica 2.788 0.148 18.9 <2e-16 ***
#
# Residual standard error: 0.739 on 147 degrees of freedom
# Multiple R-squared: 0.903, Adjusted R-squared: 0.902
# F-statistic: 683 on 2 and 147 DF, p-value: <2e-16
visreg(m3)
# (Intercept) Species1 Species2
# 2.90 -0.04 -2.71
iris$FittedPA <- round(fitted(m3),2)
unique(iris[,c("Species","FittedPA")])
# Species FittedPA
# 1 setosa 0.18
# 51 versicolor 2.86
# 101 virginica 5.65
library(multcomp)
summary(glht(m3, linfct = mcp(Species = "Tukey")))
#
# Simultaneous Tests for General Linear Hypotheses
#
# Multiple Comparisons of Means: Tukey Contrasts
#
#
# Fit: lm(formula = Petal.Area ~ Species, data = iris)
#
# Linear Hypotheses:
# Estimate Std. Error t value Pr(>|t|)
# setosa - versicolor == 0 -2.677 0.148 -18.1 <2e-16 ***
# virginica - versicolor == 0 2.788 0.148 18.9 <2e-16 ***
# virginica - setosa == 0 5.465 0.148 37.0 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Adjusted p values reported -- single-step method)
msummary(m4 <- lm(Petal.Area ~ Sepal.Length.c + Sepal.Width.c + Species, data = iris)) # w/o interactions
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.8970 0.0477 60.78 < 2e-16 ***
# Sepal.Length.c 0.6995 0.1103 6.34 2.7e-09 ***
# Sepal.Width.c 0.4461 0.1672 2.67 0.0085 **
# Species1 0.0265 0.0865 0.31 0.7594
# Species2 -2.2939 0.1516 -15.13 < 2e-16 ***
#
# Residual standard error: 0.584 on 145 degrees of freedom
# Multiple R-squared: 0.94, Adjusted R-squared: 0.939
# F-statistic: 571 on 4 and 145 DF, p-value: <2e-16
library(car)
car::vif(m4) # Preferably (G)VIF < 5 (rule of thumb, but see: https://goo.gl/a6aKdU)
# GVIF Df GVIF^(1/(2*Df))
# Sepal.Length.c 3.6484 1 1.9101
# Sepal.Width.c 2.3215 1 1.5237
# Species 6.0044 2 1.5654
msummary(lm(Sepal.Width.c ~ Sepal.Length.c, data = iris))$r.squared
# [1] 0.013823
msummary(lm(Sepal.Width.c ~ Species, data = iris))$r.squared
# [1] 0.40078
msummary(lm(Sepal.Length.c ~ Species, data = iris))$r.squared # R^2 > 0.5: potentially problematic
# [1] 0.61871
# Exclude length (best choice: lowest AIC)
AIC(m5 <- lm(Petal.Area ~ Sepal.Width.c + Species, data = iris))
# [1] 305.83
# Or exclude species (worst choice: highest AIC)
AIC(m5a <- lm(Petal.Area ~ Sepal.Width.c + Sepal.Length.c, data = iris))
# [1] 468.91
car::vif(m5) # OK
# GVIF Df GVIF^(1/(2*Df))
# Sepal.Width.c 1.6688 1 1.2918
# Species 1.6688 2 1.1366
msummary(m5)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.8970 0.0537 53.96 < 2e-16 ***
# Sepal.Width.c 1.0082 0.1596 6.32 3.1e-09 ***
# Species1 0.2529 0.0887 2.85 0.005 **
# Species2 -3.0879 0.0963 -32.08 < 2e-16 ***
#
# Residual standard error: 0.657 on 146 degrees of freedom
# Multiple R-squared: 0.924, Adjusted R-squared: 0.922
# F-statistic: 589 on 3 and 146 DF, p-value: <2e-16
Sepal.Width.c
is now positive as including Species
controls for Setosa irises having the highest sepal width and the lowest sepal areaplot(fitted(m5), resid(m5))
ncvTest(m5) # from library(car)
# Non-constant Variance Score Test
# Variance formula: ~ fitted.values
# Chisquare = 32.18, Df = 1, p = 1.41e-08
library(MASS)
boxcox(m5) # shows optimal transformation at around 0.3
iris$Petal.Area.0.3 <- iris$Petal.Area^0.3
m6 <- lm(Petal.Area.0.3 ~ Sepal.Width.c + Species, data = iris)
msummary(m6)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.20802 0.00672 179.85 < 2e-16 ***
# Sepal.Width.c 0.13604 0.01997 6.81 2.4e-10 ***
# Species1 0.19316 0.01110 17.40 < 2e-16 ***
# Species2 -0.67108 0.01204 -55.72 < 2e-16 ***
#
# Residual standard error: 0.0823 on 146 degrees of freedom
# Multiple R-squared: 0.97, Adjusted R-squared: 0.969
# F-statistic: 1.56e+03 on 3 and 146 DF, p-value: <2e-16
ncvTest(m6)
# Non-constant Variance Score Test
# Variance formula: ~ fitted.values
# Chisquare = 0.48356, Df = 1, p = 0.487
ncvTest(m6, ~Sepal.Width.c)
# Non-constant Variance Score Test
# Variance formula: ~ Sepal.Width.c
# Chisquare = 0.63821, Df = 1, p = 0.424
ncvTest(m6, ~Species)
# Non-constant Variance Score Test
# Variance formula: ~ Species
# Chisquare = 0.31587, Df = 2, p = 0.854
shapiro.test(resid(m6))$p.value
# [1] 0.65129
qqnorm(resid(m6))
qqline(resid(m6))
bestmodel <- lm(Petal.Area.0.3 ~ Sepal.Width.c * Species, data = iris)
msummary(bestmodel)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.22682 0.00826 148.56 < 2e-16 ***
# Sepal.Width.c 0.14796 0.01948 7.59 3.6e-12 ***
# Species1 0.19654 0.01207 16.28 < 2e-16 ***
# Species2 -0.65912 0.01228 -53.69 < 2e-16 ***
# Sepal.Width.c:Species1 0.06526 0.02850 2.29 0.02349 *
# Sepal.Width.c:Species2 -0.09492 0.02600 -3.65 0.00037 ***
#
# Residual standard error: 0.0791 on 144 degrees of freedom
# Multiple R-squared: 0.972, Adjusted R-squared: 0.971
# F-statistic: 1.01e+03 on 5 and 144 DF, p-value: <2e-16
visreg(bestmodel, "Sepal.Width.c", by = "Species", overlay = T)
# (Intercept) Sepal.Width.c Species1 Species2 Sepal.Width.c:Species1 Sepal.Width.c:Species2
# [1,] 1.23 0.15 0.2 -0.66 0.07 -0.09
iris$FittedPA <- fitted(bestmodel)
head(iris[, c("Sepal.Width", "Sepal.Width.c", "Species", "FittedPA")], 1)
# Sepal.Width Sepal.Width.c Species FittedPA
# 1 3.5 0.44267 setosa 0.59118
msummary(similarmodel <- lm(Petal.Area.0.3 ~ Species + Sepal.Width.c:Species, data = iris))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.22682 0.00826 148.56 < 2e-16 ***
# Species1 0.19654 0.01207 16.28 < 2e-16 ***
# Species2 -0.65912 0.01228 -53.69 < 2e-16 ***
# Speciesversicolor:Sepal.Width.c 0.21323 0.03603 5.92 2.3e-08 ***
# Speciessetosa:Sepal.Width.c 0.05305 0.02983 1.78 0.077 .
# Speciesvirginica:Sepal.Width.c 0.17762 0.03506 5.07 1.2e-06 ***
#
# Residual standard error: 0.0791 on 144 degrees of freedom
# Multiple R-squared: 0.972, Adjusted R-squared: 0.971
# F-statistic: 1.01e+03 on 5 and 144 DF, p-value: <2e-16
AIC(similarmodel) - AIC(bestmodel)
# [1] 0
coef(msummary(bestmodel))[c(1, 3, 4, 2, 5, 6), c(1, 4)]
# Estimate Pr(>|t|)
# (Intercept) 1.226819 1.8562e-159
# Species1 0.196538 1.8250e-34
# Species2 -0.659119 4.0120e-97
# Sepal.Width.c 0.147964 3.5913e-12
# Sepal.Width.c:Species1 0.065262 2.3492e-02
# Sepal.Width.c:Species2 -0.094916 3.6597e-04
coef(summary(similarmodel))[, c(1, 4)]
# Estimate Pr(>|t|)
# (Intercept) 1.226819 1.8562e-159
# Species1 0.196538 1.8250e-34
# Species2 -0.659119 4.0120e-97
# Speciesversicolor:Sepal.Width.c 0.213226 2.2782e-08
# Speciessetosa:Sepal.Width.c 0.053048 7.7425e-02
# Speciesvirginica:Sepal.Width.c 0.177618 1.2238e-06
library(lmSupport)
modelEffectSizes(bestmodel)
# lm(formula = Petal.Area.0.3 ~ Sepal.Width.c * Species, data = iris)
#
# Coefficients
# SSR df pEta-sqr dR-sqr
# (Intercept) 138.2396 1 0.9935 NA
# Sepal.Width.c 0.3612 1 0.2859 0.0111
# Species 20.9875 2 0.9588 0.6431
# Sepal.Width.c:Species 0.0861 2 0.0871 0.0026
#
# Sum of squared errors (SSE): 0.9
# Sum of squared total (SST): 32.6
iris2 <- iris[abs(scale(resid(bestmodel))) < 2, ] # exclude outliers
(noutliers <- sum(abs(scale(resid(bestmodel))) >= 2)) # 7 outliers
# [1] 7
bestmodel2 <- lm(Petal.Area.0.3 ~ Sepal.Width.c * Species, data = iris2)
msummary(bestmodel2) # only small changes in p-values: same pattern as before
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.22159 0.00701 174.15 <2e-16 ***
# Sepal.Width.c 0.17427 0.01692 10.30 <2e-16 ***
# Species1 0.20123 0.01023 19.66 <2e-16 ***
# Species2 -0.66534 0.01046 -63.60 <2e-16 ***
# Sepal.Width.c:Species1 0.08067 0.02493 3.24 0.0015 **
# Sepal.Width.c:Species2 -0.10353 0.02262 -4.58 1e-05 ***
#
# Residual standard error: 0.0666 on 137 degrees of freedom
# Multiple R-squared: 0.98, Adjusted R-squared: 0.98
# F-statistic: 1.37e+03 on 5 and 137 DF, p-value: <2e-16
summary(bestmodel.boot <- Boot(bestmodel2, R = 1000)) # from library(car), 1000 bootstrap samples
#
# Number of bootstrap replications R = 1000
# original bootBias bootSE bootMed
# (Intercept) 1.2216 -4.35e-05 0.00682 1.2217
# Sepal.Width.c 0.1743 2.18e-03 0.01837 0.1758
# Species1 0.2012 7.25e-04 0.00934 0.2017
# Species2 -0.6653 -1.39e-03 0.01099 -0.6664
# Sepal.Width.c:Species1 0.0807 -8.84e-04 0.02547 0.0791
# Sepal.Width.c:Species2 -0.1035 1.20e-03 0.02573 -0.1036
confint(bestmodel.boot, type = "norm") # confidence intervals based on bootstrap sampling
# Bootstrap normal confidence intervals
#
# 2.5 % 97.5 %
# (Intercept) 1.208267 1.235004
# Sepal.Width.c 0.136090 0.208102
# Species1 0.182190 0.218818
# Species2 -0.685494 -0.642418
# Sepal.Width.c:Species1 0.031639 0.131468
# Sepal.Width.c:Species2 -0.155172 -0.054301
glm
)family='binomial'
)family='poisson'
)library(swirl); install_from_swirl('Regression_Models')
Thank you for your attention!