1 Introduction

This file contains the answers of the lab session: https://www.let.rug.nl/wieling/Statistics/Basic-Tests/lab.

2 Importing the data

We will first download a csv file generated in Excel and import this data into R. We also add the two columns we added during the lab session associated with the lecture “Introduction to R and data exploration”.

download.file("http://www.let.rug.nl/wieling/Statistics/Basic-Tests/lab/mtcars.csv", "mtcars.csv")
dat <- read.csv2("mtcars.csv")
dat$relHP <- dat$hp/dat$wt
dat$sportscar <- FALSE
dat[dat$relHP > 42, ]$sportscar <- TRUE

3 The structure of the data

Note that this dataset is similar to the mtcars dataset standard available in R, so the description of the columns can be obtained with ?mtcars. There is one addition column ‘region’ which contains the region which the car maker originated from.

4 Running basic statistical analyses

In this section, we will conduct several basic statistical analyses using the data.

4.1 Comparing one or two means

# We will investigate if the weight in the sample significantly differs from 3 (x1000) lbs.

# First look at the distribution of the weight: is wt normally distributed?
qqnorm(dat$wt)
qqline(dat$wt)

shapiro.test(dat$wt)
# 
#   Shapiro-Wilk normality test
# 
# data:  dat$wt
# W = 0.9, p-value = 0.09
# Visualize the weight using a box-plot, and add a horizontal line at height 3
boxplot(dat$wt)
abline(h = 3, lty = 2, col = "red")

# Run the one-sample t-test and obtain the effect size (Cohen's D)
t.test(dat$wt, mu = 3)
# 
#   One Sample t-test
# 
# data:  dat$wt
# t = 1, df = 31, p-value = 0.2
# alternative hypothesis: true mean is not equal to 3
# 95 percent confidence interval:
#  2.86 3.57
# sample estimates:
# mean of x 
#      3.22
library(lsr)
cohensD(dat$wt, mu = 3)
# [1] 0.222
# Assess if the weight of sportscars differs significantly from non-sportscars.  First assess if
# the distribution of wt of both groups is approximately normal.  If not, use an appropriate
# non-parametric alternative. Also calculate the effect size.
par(mfrow = c(1, 2))
qqnorm(dat[dat$sportscar, ]$wt)
qqline(dat[dat$sportscar, ]$wt)
qqnorm(dat[!dat$sportscar, ]$wt)
qqline(dat[!dat$sportscar, ]$wt)

shapiro.test(dat[dat$sportscar, ]$wt)  # n.s., but I would judge this as non-normal
# 
#   Shapiro-Wilk normality test
# 
# data:  dat[dat$sportscar, ]$wt
# W = 0.9, p-value = 0.1
shapiro.test(dat[!dat$sportscar, ]$wt)
# 
#   Shapiro-Wilk normality test
# 
# data:  dat[!dat$sportscar, ]$wt
# W = 0.9, p-value = 0.04
(model <- wilcox.test(wt ~ sportscar, data = dat, alternative = "two.sided"))
# Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot compute exact p-value
# with ties
# 
#   Wilcoxon rank sum test with continuity correction
# 
# data:  wt by sportscar
# W = 74, p-value = 0.05
# alternative hypothesis: true location shift is not equal to 0
# look at help: ?wilcox.test

wilcox.effsize <- function(pval2tailed, N) {
    (r <- abs(qnorm(pval2tailed/2)/sqrt(N)))  # r = z / sqrt(N)
}

wilcox.effsize(model$p.value, nrow(dat))
# [1] 0.354
# N.B. to get exact p-values in the presence of ties, the help indicates wilcox_ties of the package
# coin can be used: install.packages('coin')
library(coin)
wilcox_test(wt ~ sportscar, data = dat)
# Error in check(object): 'object' does not represent a two-sample problem (maybe the grouping variable is not a factor?)
# As sportscar is not a factor (it is a logical variable), it gives an error.  The solution is to
# convert it to a factor
dat$sportscar <- as.factor(dat$sportscar)
wilcox_test(wt ~ sportscar, data = dat)
# 
#   Asymptotic Wilcoxon-Mann-Whitney Test
# 
# data:  wt by sportscar (FALSE, TRUE)
# Z = -2, p-value = 0.04
# alternative hypothesis: true mu is not equal to 0
wilcox.effsize(0.04, nrow(dat))  # manual entering p-value to get effect size
# [1] 0.363

4.2 Assessing categorical dependence

# Assess if there is a dependency between transmission and sportscar.  Also obtain the effect size.
(tab <- table(dat$am, dat$sportscar))
#    
#     FALSE TRUE
#   0     9   10
#   1     8    5
chisq.test(tab)
# 
#   Pearson's Chi-squared test with Yates' continuity correction
# 
# data:  tab
# X-squared = 0.2, df = 1, p-value = 0.7
cramersV(tab)
# [1] 0.0757

4.3 Comparing three or more means

# Assess if the region of the car maker influences the weight. Start with visualizing the data If
# region influences weight also assess which regions differ. Also obtain the effect size.  For
# simplicity, you may assume that the data is normally distributed (even though it is not).
boxplot(wt ~ region, data = dat)

summary(result <- aov(wt ~ region, data = dat))
#             Df Sum Sq Mean Sq F value Pr(>F)   
# region       2   8.81    4.41    6.13  0.006 **
# Residuals   29  20.86    0.72                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(lsr)
posthocPairwiseT(result)
# 
#   Pairwise comparisons using t tests with pooled SD 
# 
# data:  wt and region 
# 
#        Asia Europe
# Europe 0.55 -     
# USA    0.01 0.01  
# 
# P value adjustment method: holm
etaSquared(result)
#        eta.sq eta.sq.part
# region  0.297       0.297
# Investigate if the region of the car maker and the type of car (sportscar or not) influence the
# weight.
with(dat, interaction.plot(sportscar, region, wt, col = c("blue", "red"), type = "b"))

table(dat$region, dat$sportscar)  # not balanced
#         
#          FALSE TRUE
#   Asia       6    3
#   Europe     7    7
#   USA        4    5
library(car)
op <- options(contrasts = c("contr.sum", "contr.poly"))  # orthogonal contrasts
Anova(aov(wt ~ sportscar * region, data = dat), type = 3)
# Anova Table (Type III tests)
# 
# Response: wt
#                  Sum Sq Df F value Pr(>F)    
# (Intercept)         327  1  528.85 <2e-16 ***
# sportscar             1  1    1.57 0.2212    
# region                7  2    6.06 0.0069 ** 
# sportscar:region      4  2    3.28 0.0536 .  
# Residuals            16 26                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# interaction not significant, so exclude interaction (and use type 2; type 3 is also OK)
Anova(aov(wt ~ sportscar + region, data = dat), type = 2)
# Anova Table (Type II tests)
# 
# Response: wt
#           Sum Sq Df F value Pr(>F)   
# sportscar   0.73  1    1.02 0.3207   
# region      8.03  2    5.58 0.0091 **
# Residuals  20.13 28                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# sportscar not significant, so the best model, is the one above (result)

# Finally assess if adding the number of carburators (carb) is a signiciant covariate and obtain
# the effect sizes.
Anova(result <- aov(wt ~ carb + region, data = dat), type = 2)  # yes
# Anova Table (Type II tests)
# 
# Response: wt
#           Sum Sq Df F value Pr(>F)   
# carb        4.78  1    8.31 0.0075 **
# region      8.16  2    7.10 0.0032 **
# Residuals  16.09 28                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etaSquared(result)
#        eta.sq eta.sq.part
# carb    0.161       0.229
# region  0.275       0.337

5 Replication

From within RStudio, you can simply download this file using the commands:

# download original file if not already exists (to prevent overwriting)
if (!file.exists("answers.Rmd")) {
    download.file("http://www.let.rug.nl/wieling/Statistics/Basic-Tests/lab/answers/answers.Rmd", "answers.Rmd")
}

Subsequently, open it in the editor and use the Knit HMTL button to generate the html file.

If you use plain R, you first have to install Pandoc. Then copy the following lines to the most recent version of R.

# install rmarkdown package if not installed
if (!"rmarkdown" %in% rownames(installed.packages())) {
    install.packages("rmarkdown")
}
library(rmarkdown)  # load rmarkdown package

# download original file if not already exists (to prevent overwriting)
if (!file.exists("answers.Rmd")) {
    download.file("http://www.let.rug.nl/wieling/Statistics/Basic-Tests/lab/answers/answers.Rmd", "answers.Rmd")
}

# generate output
render("answers.Rmd")  # generates html file with results

# view output in browser
browseURL(paste("file://", file.path(getwd(), "answers.html"), sep = ""))  # shows result