Martijn Wieling
University of Groningen
R
R
as calculatorR
R
R
R
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
1 | 2019 | M | N | LING | 5 | 6.10 |
2 | 2019 | F | N | CIS | 6 | 6.67 |
3 | 2019 | F | N | CIS | 7 | 7.42 |
4 | 2019 | F | N | LING | 8 | 9.10 |
5 | 2019 | F | N | CIS | 7 | 7.47 |
6 | 2019 | M | N | LING | 8 | 8.14 |
7 | 2019 | F | N | LING | 8 | 7.65 |
8 | 2019 | F | N | CIS | 6 | 7.35 |
9 | 2019 | F | N | LING | 8 | 8.54 |
10 | 2019 | M | N | IS | 8 | 8.39 |
11 | 2019 | F | N | LING | 7 | 7.98 |
12 | 2019 | M | N | OTHER | 7 | 6.15 |
13 | 2019 | F | N | LING | 7 | 5.60 |
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
103 | 2020 | F | N | LING | 6 | 5.19 |
104 | 2020 | M | N | LING | 7 | 6.82 |
105 | 2020 | M | N | LING | 8 | 8.21 |
106 | 2020 | F | N | CIS | 7 | 7.34 |
107 | 2020 | F | N | LING | 7 | 6.59 |
108 | 2020 | F | N | LING | 8 | 7.55 |
109 | 2020 | F | N | LING | 7 | 7.19 |
110 | 2020 | F | Y | LING | 8 | 7.63 |
111 | 2020 | F | N | LING | 6 | 6.58 |
112 | 2020 | M | N | IS | 8 | 8.89 |
113 | 2020 | M | N | CIS | 7 | 6.76 |
114 | 2020 | F | N | OTHER | 8 | 8.18 |
115 | 2020 | M | N | CIS | 6 | 6.33 |
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
225 | 2021 | M | N | LING | 8 | 9.02 |
226 | 2021 | F | N | LING | 8 | 7.44 |
227 | 2021 | F | N | CIS | 9 | 9.74 |
228 | 2021 | F | N | CIS | 7 | 9.06 |
229 | 2021 | F | N | CIS | 8 | 8.35 |
230 | 2021 | F | N | LING | 7 | 8.55 |
231 | 2021 | F | N | CIS | 6 | 6.51 |
232 | 2021 | F | N | LING | 7 | 7.87 |
233 | 2021 | M | N | CIS | 6 | 7.22 |
234 | 2021 | F | N | LING | 7 | 7.08 |
235 | 2021 | F | N | OTHER | 8 | 8.69 |
236 | 2021 | M | N | IS | 6 | 6.89 |
237 | 2021 | F | N | LING | 7 | 8.72 |
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
316 | 2022 | M | N | OTHER | 6 | 6.86 |
317 | 2022 | M | N | LING | 8 | 8.07 |
318 | 2022 | M | N | IS | 8 | 7.72 |
319 | 2022 | F | N | LING | 7 | 7.53 |
320 | 2022 | F | Y | CIS | 8 | 9.23 |
321 | 2022 | M | N | IS | 7 | 7.64 |
322 | 2022 | M | N | IS | 7 | 7.82 |
323 | 2022 | F | N | LING | 8 | 8.65 |
324 | 2022 | M | N | LING | 9 | 9.09 |
325 | 2022 | M | N | IS | 6 | 7.61 |
326 | 2022 | F | N | LING | 8 | 8.26 |
327 | 2022 | F | N | LING | 7 | 6.67 |
328 | 2022 | F | N | LING | 8 | 8.38 |
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
419 | 2023 | M | N | IS | 8 | 7.42 |
420 | 2023 | F | N | OTHER | 9 | 8.63 |
421 | 2023 | F | N | OTHER | 9 | 9.52 |
422 | 2023 | F | N | CIS | 7 | 7.26 |
423 | 2023 | F | N | LING | 7 | 7.62 |
424 | 2023 | F | N | OTHER | 7 | 8.40 |
425 | 2023 | M | N | LING | 7 | 8.00 |
426 | 2023 | F | N | CIS | 8 | 8.77 |
427 | 2023 | F | N | CIS | 7 | 7.78 |
428 | 2023 | M | Y | IS | 9 | 9.50 |
429 | 2023 | F | N | CIS | 7 | 7.55 |
430 | 2023 | F | N | CIS | 6 | 6.81 |
431 | 2023 | F | N | LING | 7 | 7.43 |
R
(this lecture)
R
R
?R
compared to (e.g.,) SPSS
R
)R
as calculator
# Addition (this is a comment: preceded by '#')
# Basic arithmetic, variables and vectors.
# A comment line starting with '# [1]' shows what R prints for the expression above it.
5 + 5
# [1] 10
# Multiplication
5 * 3
# [1] 15
# Division
5/3
# [1] 1.6667
a <- 5 # store a single value in variable a; instead of '<-' you can also use '='
a # typing a variable name displays its value
# [1] 5
b <- a * a # b contains the value of multiplying a with itself
b
# [1] 25
(d <- NA) # set d to missing (NA); the surrounding parentheses also show the value
# [1] NA
b <- c(2, 4, 6, 7, 8) # c() combines values into a vector (reusing variable b)
b
# [1] 2 4 6 7 8
b[4] <- a # assign value 5 (stored in 'a') to the 4th element of vector b
b
# [1] 2 4 6 5 8
b <- c(b, NA) # append NA as a new last element of b
b
# [1] 2 4 6 5 8 NA
b # show values in variable b (b contains a vector: a series of values)
# [1] 2 4 6 5 8 NA
mn <- mean(b) # calculate the mean and store it in variable mn
mn
# [1] NA
# mn is NA (missing) because one of the values in b is missing
mean(b, na.rm = TRUE) # the function parameter na.rm = TRUE ignores NAs
# [1] 5
# But which parameters does a function have: use help!
help(mean) # opens the documentation of mean(); alternatively: ?mean
R: exporting a data set
R: importing a data set
setwd("C:/Users/Martijn/Desktop/Statistiek-I/HC1") # set working directory
dat <- read.csv("survey.csv", sep = ",", dec = ".") # reads csv file from work dir
str(dat) # shows structure of the data frame (i.e. table is 2-dimensional)
# 'data.frame': 500 obs. of 7 variables:
# $ participant : int 1 2 3 4 5 6 7 8 9 10 ...
# $ year : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
# $ sex : chr "M" "F" "F" "F" ...
# $ bl_edu : chr "N" "N" "N" "N" ...
# $ study : chr "LING" "CIS" "CIS" "LING" ...
# $ english_grade: num 5 6 7 8 7 8.4 8 6 8 8 ...
# $ english_score: num 6.1 6.67 7.42 9.1 7.47 ...
dim(dat) # number of rows and columns of data set
# [1] 500 7
head
head(dat) # show first few rows of dat
# participant year sex bl_edu study english_grade english_score
# 1 1 2019 M N LING 5.0 6.1000
# 2 2 2019 F N CIS 6.0 6.6736
# 3 3 2019 F N CIS 7.0 7.4229
# 4 4 2019 F N LING 8.0 9.0964
# 5 5 2019 F N CIS 7.0 7.4698
# 6 6 2019 M N LING 8.4 8.1369
dat[a,b]: a indicates the selected rows of dat; b indicates the selected columns of dat
dat[1, ] # values in first row (dat[,1]: values in first column)
# participant year sex bl_edu study english_grade english_score
# 1 1 2019 M N LING 5 6.1
dat[c(1, 5), c(1, 2, 3)] # values in rows 1 and 5 and columns 1, 2 and 3
# participant year sex
# 1 1 2019 M
# 5 5 2019 F
dat[c(1, 3, 5), c("participant", "study")] # rows 1, 3 and 5, and two named columns
# participant study
# 1 1 LING
# 3 3 CIS
# 5 5 CIS
$ operator: dat$sex accesses the column sex of dat
head(dat$sex, 200) # show sex of first 200 students
# [1] "M" "F" "F" "F" "F" "M" "F" "F" "F" "M" "F" "M" "F" "M" "M" "F" "M" "F" "F" "M"
# [21] "M" "F" "F" "M" "F" "F" "F" "F" "F" "F" "F" "M" "F" "F" "F" "F" "M" "M" "F" "F"
# [41] "F" "F" "F" "F" "F" "M" "M" "F" "F" "F" "M" "F" "F" "M" "F" "F" "F" "F" "F" "F"
# [61] "F" "F" "F" "F" "F" "M" "F" "F" "M" "F" "F" "F" "F" "F" "F" "F" "F" "F" "F" "F"
# [81] "F" "F" "F" "F" "M" "F" "F" "F" "F" "F" "F" "F" "M" "F" "F" "F" "M" "M" "F" "F"
# [101] "F" "F" "F" "M" "M" "F" "F" "F" "F" "F" "F" "M" "M" "F" "M" "F" "F" "F" "F" "F"
# [121] "F" "F" "F" "F" "F" "F" "M" "F" "M" "M" "F" "F" "F" "F" "F" "F" "F" "F" "M" "M"
# [141] "F" "F" "F" "M" "M" "F" "F" "F" "M" "M" "M" "F" "F" "F" "M" "M" "F" "F" "F" "F"
# [161] "F" "F" "F" "F" "M" "F" "M" "F" "F" "F" "M" "F" "F" "M" "M" "M" "M" "F" "F" "F"
# [181] "F" "F" "F" "F" "F" "F" "F" "F" "M" "F" "F" "M" "M" "F" "M" "F" "M" "M" "F" "F"
tmp <- dat[5:8, c(1, 3)] # store columns 1 and 3 for rows 5 to 8 in variable tmp
tmp # show what is stored in variable tmp
# participant sex
# 5 5 F
# 6 6 M
# 7 7 F
# 8 8 F
tmp <- dat[dat$sex == "M", ] # only observations for male participants
head(tmp)
# participant year sex bl_edu study english_grade english_score
# 1 1 2019 M N LING 5.0 6.1000
# 6 6 2019 M N LING 8.4 8.1369
# 10 10 2019 M N IS 8.0 8.3883
# 12 12 2019 M N OTHER 7.0 6.1467
# 14 14 2019 M N IS 9.0 9.1297
# 15 15 2019 M N CIS 7.0 8.2633
&
|
# only participants who study IS *and* are male
tmp <- dat[dat$sex == "M" & dat$study == "IS", ]
head(tmp)
# participant year sex bl_edu study english_grade english_score
# 10 10 2019 M N IS 8 8.3883
# 14 14 2019 M N IS 9 9.1297
# 17 17 2019 M N IS 8 7.1360
# 20 20 2019 M N IS 8 8.4594
# 21 21 2019 M N IS 7 8.2508
# 32 32 2019 M N IS 7 7.5851
!
(not)
!=
# only females (i.e. not males) *or* everybody with an English grade over 7
tmp <- dat[dat$sex != "M" | dat$english_grade > 7, ]
tail(tmp) # tail shows final 6 rows
# participant year sex bl_edu study english_grade english_score
# 493 493 2023 M N IS 8.0 8.1840
# 494 494 2023 M N IS 8.0 8.0375
# 495 495 2023 F N OTHER 8.0 8.4751
# 497 497 2023 F N OTHER 7.0 7.1963
# 498 498 2023 F N LING 6.0 7.2741
# 500 500 2023 M N IS 7.5 6.2609
$
helps us to do that
# new column 'diff': English grade - English proficiency score
dat$diff <- dat$english_grade - dat$english_score
head(dat)
# participant year sex bl_edu study english_grade english_score diff
# 1 1 2019 M N LING 5.0 6.1000 -1.09996
# 2 2 2019 F N CIS 6.0 6.6736 -0.67357
# 3 3 2019 F N CIS 7.0 7.4229 -0.42291
# 4 4 2019 F N LING 8.0 9.0964 -1.09636
# 5 5 2019 F N CIS 7.0 7.4698 -0.46977
# 6 6 2019 M N LING 8.4 8.1369 0.26309
# Add a pass/fail label derived from the English grade.
dat$pass_fail <- "PASS" # new column, initially PASS for everybody
# Index the new column directly instead of assigning through a row subset of
# the data frame (dat[cond, ]$pass_fail <- ...): the row-subset form stops with
# an error when no grade is below 5.5, and it copies every selected row.
dat$pass_fail[dat$english_grade < 5.5] <- "FAIL" # if grade too low, then FAIL
tail(dat[dat$english_grade > 4 & dat$english_grade < 6, 2:9]) # show subset of data
# year sex bl_edu study english_grade english_score diff pass_fail
# 341 2022 F N IS 5.8 5.7252 0.074803 PASS
# 359 2022 F N LING 5.0 6.1166 -1.116598 FAIL
# 373 2022 F Y CIS 5.0 4.3000 0.700000 FAIL
# 395 2022 F N LING 5.8 6.0576 -0.257642 PASS
# 399 2022 F N LING 5.8 5.1720 0.627971 PASS
# 469 2023 F N LING 5.0 5.9713 -0.971288 FAIL
R
R
barplot() (illustrated in the following)
plot()
boxplot()
hist()
qqnorm() and qqline()
# Bar plots of frequency tables.
(counts <- table(dat$sex)) # first create frequency table
#
# F M
# 346 154
barplot(counts) # default bar plot: one bar per category in the table
# Same plot with bar colours, y-axis range, title and axis labels set explicitly
barplot(counts, col = c("pink", "lightblue"), ylim = c(0, 350), main = "My barplot",
        xlab = "Sex", ylab = "Frequency")
(counts <- table(dat$sex, dat$study)) # two-way table: sex (rows) by study (columns)
#
# CIS IS LING OTHER
# F 99 26 179 42
# M 24 98 16 16
# One bar per study, split by sex. The argument is spelled out as 'legend.text'
# (not the partially matched 'legend'), and the labels are taken from the row
# names of the table so that they always correspond to the plotted rows.
barplot(counts, col = c("pink", "lightblue"), legend.text = rownames(counts),
        ylim = c(0, 185))
R
R
is to conduct statistical analysesR
R
# Descriptive statistics of the English proficiency score, followed by frequency tables.
mean(dat$english_score) # mean of all people for English score
# [1] 7.6178
mean(dat[dat$sex == "F", ]$english_score) # mean of females for English score
# [1] 7.5342
median(dat$english_score) # median of all people for English score
# [1] 7.6377
min(dat$english_score) # minimum value
# [1] 4.3
max(dat$english_score) # maximum value
# [1] 9.7421
var(dat$english_score) # sample variance: sum of squared deviations from the mean, divided by n - 1
# [1] 0.85021
sd(dat$english_score) # standard deviation (square root of variance)
# [1] 0.92207
table(dat$sex) # frequency table: number of participants per sex
#
# F M
# 346 154
table(dat$study) # frequency table: number of participants per study
#
# CIS IS LING OTHER
# 123 124 195 58
table(dat$sex, dat$study) # cross-tabulation: sex (rows) by study (columns)
#
# CIS IS LING OTHER
# F 99 26 179 42
# M 24 98 16 16
table(dat$sex, dat$bl_edu) # cross-tabulation: sex (rows) by bl_edu (columns)
#
# N Y
# F 313 33
# M 140 14
R
t.test() for a \(t\)-test (single sample, paired, independent)
wilcox.test() for non-parametric alternatives to the \(t\)-test
binom.test() for the sign test
chisq.test() for the chi-square test
cor() for the correlation
alpha() (from package psych) for Cronbach's \(\alpha\)
t.test(english_grade ~ bl_edu, data = dat)
#
# Welch Two Sample t-test
#
# data: english_grade by bl_edu
# t = -3.26, df = 58.4, p-value = 0.0019
# alternative hypothesis: true difference in means between group N and group Y is not equal to 0
# 95 percent confidence interval:
# -0.63726 -0.15220
# sample estimates:
# mean in group N mean in group Y
# 7.2457 7.6404
cor(dat$english_score, dat$english_grade)
# [1] 0.74079
R
R
as calculatorR
R
R
Thank you for your attention!