Martijn Wieling
University of Groningen
R
R
R
Variable | Example values |
---|---|
Sex | male, female, … |
English grade | 4.6, 5.5, 6.3, 7.2, … |
Year of birth | 1990, 1991, 1993, … |
Native language | Dutch, German, English, … |
participant | year | sex | bl_edu | study | english_grade | english_score |
---|---|---|---|---|---|---|
495 | 2023 | F | N | OTHER | 8 | 8.48 |
496 | 2023 | M | N | IS | 7 | 7.50 |
497 | 2023 | F | N | OTHER | 7 | 7.20 |
498 | 2023 | F | N | LING | 6 | 7.27 |
499 | 2023 | M | N | IS | 7 | 6.20 |
500 | 2023 | M | N | IS | 8 | 6.26 |
table(dat$sex) # absolute frequencies
#
# F M
# 346 154
prop.table(table(dat$sex)) # relative frequencies
#
# F M
# 0.692 0.308
par(mfrow = c(1, 2))
barplot(table(dat$sex), col = c("pink", "lightblue"), main = "abs. frq.")
barplot(prop.table(table(dat$sex)), col = c("pink", "lightblue"), main = "rel. frq.")
pie(table(dat$study), col = c("red", "cyan", "blue", "yellow"))
table(dat$english_grade)
#
# 5 5.5 6 6.5 7 7.5 8 8.5 9 9.5
# 6 2 76 12 191 21 148 15 26 3
hist(dat$english_grade, xlab = "English grade", main = "")
plot(density(dat$english_grade), main = "", xlab = "English grade")
$$\bar{x} = \frac{x_1 + x_2 + \cdots + x_n}{n} = \frac{1}{n}\sum\limits_{i=1}^n x_i$$
mean(dat$english_grade) # arithmetic average
# [1] 7.2828
median(dat$english_grade)
# [1] 7
# no built-in function to get mode: new function
# my_mode() returns the most frequent value(s) of x, in sorted order.
# - Ties: every value sharing the highest count is returned.
# - Missing values are ignored (as table() does by default).
# - The result keeps the type of x, so it also works for character data
#   (converting the labels of table() with as.numeric() would give NA there).
# - An empty (or all-NA) input gives an empty result without a warning.
my_mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(x)
  }
  values <- sort(unique(x))
  # count occurrences per distinct value, working on the values themselves
  counts <- tabulate(match(x, values), nbins = length(values))
  values[counts == max(counts)]
}
my_mode(dat$english_grade)
# [1] 7
quantile(dat$english_grade) # default: quartiles
# 0% 25% 50% 75% 100%
# 5.0 7.0 7.0 8.0 9.5
boxplot(dat$english_grade, col = "red")
min(dat$english_grade) # minimum value
# [1] 5
max(dat$english_grade) # maximum value
# [1] 9.5
range(dat$english_grade) # returns minimum and maximum value
# [1] 5.0 9.5
diff(range(dat$english_grade)) # returns difference between min. and max.
# [1] 4.5
IQR(dat$english_grade) # interquartile range
# [1] 1
var(dat$english_grade) # sample variance
# [1] 0.75173
sd(dat$english_grade) # sample standard deviation
# [1] 0.86702
# compare floating-point results with a tolerance (all.equal) rather than ==
isTRUE(all.equal(sd(dat$english_grade), sqrt(var(dat$english_grade)))) # std. dev. = sqrt of var.?
# [1] TRUE
\(P(\mu - \sigma \leq x \leq \mu + \sigma) \approx 68\%\) (34 + 34)
\(P(\mu - 2\sigma \leq x \leq \mu + 2\sigma) \approx 95\%\) (34 + 34 + 13.5 + 13.5)
\(P(\mu - 3\sigma \leq x \leq \mu + 3\sigma) \approx 99.7\%\) (34 + 34 + 13.5 + 13.5 + 2.35 + 2.35)
\(P(85 \leq \mathrm{IQ} \leq 115) \approx 68\%\) (34 + 34)
\(P(70 \leq \mathrm{IQ} \leq 130) \approx 95\%\) (34 + 34 + 13.5 + 13.5)
\(P(55 \leq \mathrm{IQ} \leq 145) \approx 99.7\%\) (34 + 34 + 13.5 + 13.5 + 2.35 + 2.35)
# scale() returns a one-column matrix (with centring/scaling attributes);
# as.numeric() drops these so a plain numeric column of z-scores is stored
dat$english_grade.z <- as.numeric(scale(dat$english_grade)) # scale: calculates z-scores
mean(dat$english_grade.z) # should be 0
# [1] 0
sd(dat$english_grade.z) # should be 1
# [1] 1
\(P(-1 \leq z \leq 1) \approx 68\%\) (34 + 34)
\(P(-2 \leq z \leq 2) \approx 95\%\) (34 + 34 + 13.5 + 13.5)
\(P(-3 \leq z \leq 3) \approx 99.7\%\) (34 + 34 + 13.5 + 13.5 + 2.35 + 2.35)
\(P(\mu - \sigma \leq x \leq \mu + \sigma) \approx 68\%\) (34 + 34)
\(P(\mu - 2\sigma \leq x \leq \mu + 2\sigma) \approx 95\%\) (34 + 34 + 13.5 + 13.5)
\(P(\mu - 3\sigma \leq x \leq \mu + 3\sigma) \approx 99.7\%\) (34 + 34 + 13.5 + 13.5 + 2.35 + 2.35)
qnorm
returns the \(z\)-value for a certain proportion (percentile / 100)
qnorm(95/100) # z-value associated with 95th percentile
# [1] 1.6449
pnorm
returns the proportion of data < a specified \(z\)-value
100 * pnorm(1.6449)
# [1] 95
1 - pnorm(1.64)
# [1] 0.050503
pnorm(2) - pnorm(-2)
# [1] 0.9545
pnorm(2)
# [1] 0.97725
pnorm(-2)
# [1] 0.02275
qqnorm(dat$english_grade)
qqline(dat$english_grade)
qqnorm(dat$english_score)
qqline(dat$english_score)
Thank you for your attention!