
Martijn Wieling
Computational Linguistics Research Group
# Addition (this is a comment: preceded by '#')
5 + 5
# [1] 10
# Multiplication
5 * 3
# [1] 15
# Division
5/3
# [1] 1.6667
a <- 5 # store a single value; instead of '<-' you can also use '='
a # display the value
# [1] 5
b <- c(2, 4, 6, 7, 8) # store a series of values in a vector
b
# [1] 2 4 6 7 8
b[4] <- a # assign value 5 (stored in 'a') to the 4th element of vector b
b[1] <- NA # assign NA (missing) to the first element of vector b
b <- b * 10 # multiply all values in vector b with 10
b
# [1] NA 40 60 50 80
mn <- mean(b) # calculating the mean and storing in variable mn
mn
# [1] NA
# mn is NA (missing) as one of the values is missing
mean(b, na.rm = TRUE) # we can use the function parameter na.rm to ignore NAs
# [1] 57.5
# But which parameters does a function have: use help!
help(mean) # alternatively: ?mean
install.packages("swirl", repos = "http://cran.rstudio.com/")
library(swirl)
swirl()
# first set directory to where file is located, e.g., using function setwd()
dat <- read.csv2("thnl.csv") # read.csv2 reads Excel csv file from work dir
str(dat) # shows structure of the data frame dat (note: wide format)
# 'data.frame': 19 obs. of 4 variables:
# $ Participant : Factor w/ 19 levels "VENI-NL_1","VENI-NL_10",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 1 2 2 1 ...
# $ Frontness.T : num 0.781 0.766 0.884 0.748 0.748 ...
# $ Frontness.TH: num 0.738 0.767 0.879 0.761 0.774 ...
dim(dat) # number of rows and columns of data set
# [1] 19 4
head
head(dat) # show first few rows of dat
# Participant Gender Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# 2 VENI-NL_10 M 0.76621 0.76685
# 3 VENI-NL_11 M 0.88366 0.87871
# 4 VENI-NL_12 M 0.74757 0.76094
# 5 VENI-NL_13 M 0.74761 0.77420
# 6 VENI-NL_14 M 0.75186 0.74913
dat[1, ] # values in first row
# Participant Gender Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
dat[1:2, c(2, 3)] # values of first two rows for second and third column
# Gender Frontness.T
# 1 M 0.78052
# 2 M 0.76621
dat[c(1, 2, 3), "Participant"] # values of first three rows for column 'Participant'
# [1] VENI-NL_1 VENI-NL_10 VENI-NL_11
# 19 Levels: VENI-NL_1 VENI-NL_10 VENI-NL_11 VENI-NL_12 VENI-NL_13 VENI-NL_14 ... VENI-NL_9
tmp <- dat[5:8, c(1, 3)] # store columns 1 and 3 for rows 5 to 8 in tmp
tmp <- dat[dat$Gender == "M", ] # only observations for male participants
head(tmp, n = 2) # show first two rows
# Participant Gender Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# 2 VENI-NL_10 M 0.76621 0.76685
# more advanced subsetting: include rows for which frontness for the T sound is
# higher than 0.74 AND participant is either 1 or 2 N.B. use '|' instead of '&'
# for logical OR
dat[dat$Frontness.T > 0.74 & dat$Participant %in% c("VENI-NL_1", "VENI-NL_2"), ]
# Participant Gender Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# new column Diff containing difference between TH and T positions
dat$Diff <- dat$Frontness.TH - dat$Frontness.T
# new column DiffClass, initially all observations set to TH0
dat$DiffClass <- "TH0"
# observations with Diff larger than 0.02 are categorized as TH1, negative as TH-
dat[dat$Diff > 0.02, ]$DiffClass <- "TH1"
dat[dat$Diff < 0, ]$DiffClass <- "TH-"
dat$DiffClass <- factor(dat$DiffClass) # convert string variable to factor
head(dat, 2)
# Participant Gender Frontness.T Frontness.TH Diff DiffClass
# 1 VENI-NL_1 M 0.78052 0.73801 -0.04250668 TH-
# 2 VENI-NL_10 M 0.76621 0.76685 0.00064245 TH0
swirl()
in RStudio and finish the following lessons of the R Programming course:
mean(dat$Diff) # mean
# [1] 0.016263
median(dat$Diff) # median
# [1] 0.01093
min(dat$Diff) # minimum value
# [1] -0.042507
max(dat$Diff) # maximum value
# [1] 0.10346
sd(dat$Diff) # standard deviation
# [1] 0.038213
var(dat$Diff) # variance
# [1] 0.0014603
quantile(dat$Diff) # quantiles
# 0% 25% 50% 75% 100%
# -0.0425067 -0.0038419 0.0109299 0.0248903 0.1034607
table(dat$Gender)
#
# F M
# 9 10
table(dat$DiffClass)
#
# TH- TH0 TH1
# 6 7 6
# correlation: two numerical variables
cor(dat$Frontness.T, dat$Frontness.TH)
# [1] 0.71054
# crosstable: two categorical variables
table(dat$Gender, dat$DiffClass)
#
# TH- TH0 TH1
# F 1 3 5
# M 5 4 1
# means per category: numerical and categorical variable
c(mean(dat[dat$Gender == "M", ]$Diff), mean(dat[dat$Gender == "F", ]$Diff))
# [1] -0.0034299 0.0381446
R
boxplot()
for a boxplothist()
for a histogramqqnorm()
and qqline()
for a quantile-quantile plotplot()
for many types of plots (scatter, line, etc.)barplot()
for a barplot (plotting frequencies)par(mfrow = c(1, 2)) # set graphics option: 2 graphs side-by-side
boxplot(dat$Diff, main = "Difference") # boxplot of difference values
boxplot(dat[, c("Frontness.T", "Frontness.TH")]) # frontness per group
hist(dat$Diff, main = "Difference histogram")
qqnorm(dat$Diff) # plot actual values vs. theoretical quantiles
qqline(dat$Diff) # plot reference line of normal distribution
plot(dat$Frontness.T, dat$Frontness.TH, col = "blue")
counts <- table(dat$Gender) # frequency table for gender
barplot(counts, ylim = c(0, 15))
counts <- table(dat$Gender, dat$DiffClass)
barplot(counts, col = c("pink", "lightblue"), legend = rownames(counts), ylim = c(0,
10))
swirl()
in RStudio and finish the following lesson of the R Programming course:
R
t.test()
for a \(t\)-test (single sample, paired, independent)wilcox.test()
for non-parametric alternatives to the \(t\)-testchisq.test()
for the chi-square testaov()
for one-way ANOVAlm()
for regressionqqnorm(dat$Diff) # plot actual values vs. theoretical quantiles
qqline(dat$Diff) # plot reference line of normal distribution
shapiro.test(dat$Diff)
#
# Shapiro-Wilk normality test
#
# data: dat$Diff
# W = 0.92, p-value = 0.11
R
car
, which contains other useful functions and will be discussed laterbartlett.test(list(dat[dat$Gender == "M", ]$Diff, dat[dat$Gender == "F", ]$Diff))
#
# Bartlett test of homogeneity of variances
#
# data: list(dat[dat$Gender == "M", ]$Diff, dat[dat$Gender == "F", ]$Diff)
# Bartlett's K-squared = 3.42, df = 1, p-value = 0.064
# simpler way to write this: bartlett.test( Diff ~ Gender, data=dat )
fligner.test(Diff ~ Gender, data = dat)
#
# Fligner-Killeen test of homogeneity of variances
#
# data: Diff by Gender
# Fligner-Killeen:med chi-squared = 2.79, df = 1, p-value = 0.095
# start with visualization
boxplot(dat$Diff)
abline(h = 0, col = "red")
t.test(dat$Diff) # 2-tailed test is default
#
# One Sample t-test
#
# data: dat$Diff
# t = 1.86, df = 18, p-value = 0.08
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
# -0.002155 0.034682
# sample estimates:
# mean of x
# 0.016263
t.test(dat$Diff, alternative = "greater")$p.value # 1-tailed p-value
# [1] 0.040018
library(swirl)
install_from_swirl("Mathematical_Biostatistics_Boot_Camp")
swirl()
in RStudio and finish the following lesson of the Mathematical Biostatistics Boot Camp course:
# start with visualization
boxplot(dat[, c("Frontness.T", "Frontness.TH")])
t.test(dat$Frontness.T, dat$Frontness.TH, paired = TRUE)
#
# Paired t-test
#
# data: dat$Frontness.T and dat$Frontness.TH
# t = -1.86, df = 18, p-value = 0.08
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
# -0.034682 0.002155
# sample estimates:
# mean of the differences
# -0.016263
t.test(dat$Frontness.T, dat$Frontness.TH, paired = T, alt = "greater")$p.value # wrong tail!
# [1] 0.95998
# start with visualization
boxplot(Diff ~ Gender, data = dat)
t.test(dat[dat$Gender == "M", ]$Diff, dat[dat$Gender == "F", ]$Diff) # default: unequal var.
#
# Welch Two Sample t-test
#
# data: dat[dat$Gender == "M", ]$Diff and dat[dat$Gender == "F", ]$Diff
# t = -2.68, df = 11.6, p-value = 0.02
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
# -0.0754351 -0.0077139
# sample estimates:
# mean of x mean of y
# -0.0034299 0.0381446
t.test(dat[dat$Gender == "M", ]$Diff, dat[dat$Gender == "F", ]$Diff, var.equal = TRUE)$p.value
# [1] 0.013018
wilcox.test
# start with visualization
qqnorm(dat$Diff)
qqline(dat$Diff)
wilcox.test(dat$Diff) # 2-tailed test is default
#
# Wilcoxon signed rank test
#
# data: dat$Diff
# V = 133, p-value = 0.13
# alternative hypothesis: true location is not equal to 0
wilcox.test(dat$Diff, alternative = "greater")$p.value # 1-tailed p-value
# [1] 0.066811
wilcox.test(dat$Frontness.T, dat$Frontness.TH, paired = TRUE)
#
# Wilcoxon signed rank test
#
# data: dat$Frontness.T and dat$Frontness.TH
# V = 57, p-value = 0.13
# alternative hypothesis: true location shift is not equal to 0
wilcox.test(dat$Frontness.T, dat$Frontness.TH, paired = T, alternative = "less")$p.value
# [1] 0.066811
par(mfrow = c(1, 2)) # start with visualization
qqnorm(dat[dat$Gender == "M", ]$Diff, main = "M")
qqline(dat[dat$Gender == "M", ]$Diff)
qqnorm(dat[dat$Gender == "F", ]$Diff, main = "F")
qqline(dat[dat$Gender == "F", ]$Diff)
wilcox.test(dat[dat$Gender == "F", ]$Diff, dat[dat$Gender == "M", ]$Diff)
#
# Wilcoxon rank sum test
#
# data: dat[dat$Gender == "F", ]$Diff and dat[dat$Gender == "M", ]$Diff
# W = 73, p-value = 0.022
# alternative hypothesis: true location shift is not equal to 0
chisq.test(table(dat$Gender, dat$DiffClass)) # warning due to low expected values
# Warning in chisq.test(table(dat$Gender, dat$DiffClass)): Chi-squared approximation may be incorrect
#
# Pearson's Chi-squared test
#
# data: table(dat$Gender, dat$DiffClass)
# X-squared = 5.44, df = 2, p-value = 0.066
table(dat$Gender, dat$DiffClass)
#
# TH- TH0 TH1
# F 1 3 5
# M 5 4 1
chisq.test(table(dat$Gender, dat$DiffClass), simulate.p.value = TRUE)
#
# Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
#
# data: table(dat$Gender, dat$DiffClass)
# X-squared = 5.44, df = NA, p-value = 0.086
# start with visualization
boxplot(Diff ~ DiffClass, data = dat)
result <- aov(Diff ~ DiffClass, data = dat)
# non-parametric alternative: kruskal.test(Diff ~ DiffClass, dat=dat)
summary(result) # is the ANOVA significant?
# Df Sum Sq Mean Sq F value Pr(>F)
# DiffClass 2 0.01957 0.00978 23.3 1.8e-05 ***
# Residuals 16 0.00672 0.00042
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(result) # post-hoc tests
# Tukey multiple comparisons of means
# 95% family-wise confidence level
#
# Fit: aov(formula = Diff ~ DiffClass, data = dat)
#
# $DiffClass
# diff lwr upr p adj
# TH0-TH- 0.027155 -0.0022602 0.056571 0.07281
# TH1-TH- 0.079321 0.0487957 0.109847 0.00001
# TH1-TH0 0.052166 0.0227509 0.081582 0.00086
result <- lm(Frontness.T ~ Frontness.TH, data = dat)
summary(result) # predictor significant
#
# Call:
# lm(formula = Frontness.T ~ Frontness.TH, data = dat)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.06674 -0.01274 -0.00056 0.01169 0.05867
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.299 0.113 2.65 0.01692 *
# Frontness.TH 0.598 0.144 4.16 0.00065 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.0325 on 17 degrees of freedom
# Multiple R-squared: 0.505, Adjusted R-squared: 0.476
# F-statistic: 17.3 on 1 and 17 DF, p-value: 0.000651
modelResiduals <- resid(result)
qqnorm(modelResiduals)
qqline(modelResiduals) # not really
result <- lm(Diff ~ DiffClass + Frontness.TH + Gender, data = dat)
summary(result) # not all predictors significant
#
# Call:
# lm(formula = Diff ~ DiffClass + Frontness.TH + Gender, data = dat)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.03746 -0.00878 -0.00120 0.00813 0.04020
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.11795 0.08439 -1.40 0.18398
# DiffClassTH0 0.02612 0.01165 2.24 0.04165 *
# DiffClassTH1 0.06575 0.01473 4.46 0.00053 ***
# Frontness.TH 0.13805 0.10710 1.29 0.21828
# GenderM -0.00876 0.01102 -0.80 0.43971
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.0202 on 14 degrees of freedom
# Multiple R-squared: 0.783, Adjusted R-squared: 0.722
# F-statistic: 12.7 on 4 and 14 DF, p-value: 0.000145
m0 <- lm(Diff ~ 1, data = dat)
m1 <- lm(Diff ~ DiffClass, data = dat)
m2 <- lm(Diff ~ DiffClass + Frontness.TH, data = dat)
m3 <- lm(Diff ~ DiffClass + Frontness.TH + Gender, data = dat)
anova(m0, m1, m2, m3) # model comparison: m1 best model
# Analysis of Variance Table
#
# Model 1: Diff ~ 1
# Model 2: Diff ~ DiffClass
# Model 3: Diff ~ DiffClass + Frontness.TH
# Model 4: Diff ~ DiffClass + Frontness.TH + Gender
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 18 0.02628
# 2 16 0.00672 2 0.01957 24.06 3e-05 ***
# 3 15 0.00595 1 0.00077 1.89 0.19
# 4 14 0.00569 1 0.00026 0.63 0.44
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R
multcomp
)car
)lme4
)
reshape
)ggplot2
for advanced plotting functions)install.packages("lme4", repos = "http://cran.rstudio.com/") # installation only once
library(lme4) # then load package
library(multcomp) # install via: install.packages('multcomp')
summary(glht(m1, linfct = mcp(DiffClass = "Tukey")))
#
# Simultaneous Tests for General Linear Hypotheses
#
# Multiple Comparisons of Means: Tukey Contrasts
#
#
# Fit: lm(formula = Diff ~ DiffClass, data = dat)
#
# Linear Hypotheses:
# Estimate Std. Error t value Pr(>|t|)
# TH0 - TH- == 0 0.0272 0.0114 2.38 0.073 .
# TH1 - TH- == 0 0.0793 0.0118 6.71 <0.001 ***
# TH1 - TH0 == 0 0.0522 0.0114 4.58 <0.001 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Adjusted p values reported -- single-step method)
table(dat$Gender, dat$DiffClass)
#
# TH- TH0 TH1
# F 1 3 5
# M 5 4 1
with(dat, interaction.plot(DiffClass, Gender, Diff, col = c("blue", "red"), type = "b"))
library(car)
# set orthogonal contrasts contrasts for unordered and ordered factors
op <- options(contrasts = c("contr.sum", "contr.poly"))
Anova(aov(Diff ~ DiffClass * Gender, data = dat), type = 3) # run Anova SS type 3
# Anova Table (Type III tests)
#
# Response: Diff
# Sum Sq Df F value Pr(>F)
# (Intercept) 0.00194 1 4.75 0.0484 *
# DiffClass 0.00700 2 8.57 0.0042 **
# Gender 0.00063 1 1.55 0.2346
# DiffClass:Gender 0.00106 2 1.30 0.3054
# Residuals 0.00531 13
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
options(op) # reset to previous setting (treatment contrasts for unordered factors)
library(car)
op <- options(contrasts = c("contr.sum", "contr.poly"))
Anova(aov(Diff ~ DiffClass * Gender + Frontness.T, data = dat), type = 3)
# Anova Table (Type III tests)
#
# Response: Diff
# Sum Sq Df F value Pr(>F)
# (Intercept) 0.00028 1 0.65 0.4370
# DiffClass 0.00623 2 7.30 0.0084 **
# Gender 0.00068 1 1.60 0.2304
# Frontness.T 0.00019 1 0.44 0.5178
# DiffClass:Gender 0.00118 2 1.39 0.2869
# Residuals 0.00512 12
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
options(op)
library(reshape)
datlong <- melt(dat, id.vars=c("Participant","Gender","Diff","DiffClass"),
measure.vars= c("Frontness.T","Frontness.TH"), variable_name="Type")
head(datlong,3)
# Participant Gender Diff DiffClass Type value
# 1 VENI-NL_1 M -0.04250668 TH- Frontness.T 0.78052
# 2 VENI-NL_10 M 0.00064245 TH0 Frontness.T 0.76621
# 3 VENI-NL_11 M -0.00494710 TH- Frontness.T 0.88366
dim(dat)
# [1] 19 6
dim(datlong)
# [1] 38 6
library(lme4)
model <- lmer(value ~ Gender + DiffClass + (1 | Participant), data = datlong)
summary(model, cor = F)
# Linear mixed model fit by REML ['lmerMod']
# Formula: value ~ Gender + DiffClass + (1 | Participant)
# Data: datlong
#
# REML criterion at convergence: -110.7
#
# Scaled residuals:
# Min 1Q Median 3Q Max
# -1.8897 -0.2849 -0.0311 0.4311 1.7146
#
# Random effects:
# Groups Name Variance Std.Dev.
# Participant (Intercept) 0.001732 0.0416
# Residual 0.000824 0.0287
# Number of obs: 38, groups: Participant, 19
#
# Fixed effects:
# Estimate Std. Error t value
# (Intercept) 0.78476 0.02824 27.79
# GenderM -0.00501 0.02518 -0.20
# DiffClassTH0 -0.02403 0.02659 -0.90
# DiffClassTH1 0.01299 0.03156 0.41
R
R
for:
Thank you for your attention!