
# Martijn Wieling
# University of Groningen
# --- R as a calculator (everything after '#' on a line is a comment) ---
5 + 5 # addition
# [1] 10
5 * 3 # multiplication
# [1] 15
5 / 3 # division
# [1] 1.6667

# --- Variables and vectors ---
a <- 5 # a single value; '=' also works as the assignment operator
a # typing a name displays its value
# [1] 5
b <- c(2, 4, 6, 7, 8) # c() combines a series of values into a vector
b
# [1] 2 4 6 7 8
# element 1 becomes NA (missing), element 4 gets the value stored in 'a' (5)
b[c(1, 4)] <- c(NA, a)
b <- 10 * b # arithmetic is applied to every element of the vector
b
# [1] NA 40 60 50 80

# --- Functions and missing values ---
mn <- mean(b) # compute the mean and keep it in variable mn
mn
# [1] NA
# a single missing value makes the mean NA (missing) as well ...
mean(b, na.rm = TRUE) # ... unless the parameter na.rm tells mean() to ignore NAs
# [1] 57.5
# To find out which parameters a function has, open its help page
?mean # same as: help(mean)
# Install the swirl package only when it is not available yet (so re-running
# the script does not reinstall it) and fetch it over HTTPS rather than plain
# HTTP. Then load the package and start it.
if (!requireNamespace("swirl", quietly = TRUE)) {
  install.packages("swirl", repos = "https://cran.rstudio.com/")
}
library(swirl)
swirl()
# Set the working directory to the folder that holds the data file
# (adjust this path to your own machine).
setwd("C:/Users/Martijn/Desktop/Statistics/Intro-R")
# read.csv2 reads an Excel csv file (semicolon-separated, decimal comma)
# from the working directory
dat <- read.csv2("thnl.csv")
str(dat) # shows structure of the data frame dat (note: wide format)
# 'data.frame': 19 obs. of 4 variables:
# $ Participant : chr "VENI-NL_1" "VENI-NL_10" "VENI-NL_11" "VENI-NL_12" ...
# $ Sex : chr "M" "M" "M" "M" ...
# $ Frontness.T : num 0.781 0.766 0.884 0.748 0.748 ...
# $ Frontness.TH: num 0.738 0.767 0.879 0.761 0.774 ...
dim(dat) # number of rows and columns of data set
# [1] 19 4
head(dat) # show first few rows of dat
# Participant Sex Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# 2 VENI-NL_10 M 0.76621 0.76685
# 3 VENI-NL_11 M 0.88366 0.87871
# 4 VENI-NL_12 M 0.74757 0.76094
# 5 VENI-NL_13 M 0.74761 0.77420
# 6 VENI-NL_14 M 0.75186 0.74913
# Indexing a data frame: dat[rows, columns]; an empty slot selects everything
dat[1, ] # the complete first row
# Participant Sex Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
dat[1:2, 2:3] # rows 1 to 2, columns 2 to 3
# Sex Frontness.T
# 1 M 0.78052
# 2 M 0.76621
dat[1:3, "Participant"] # rows 1 to 3 of the column named 'Participant'
# [1] "VENI-NL_1" "VENI-NL_10" "VENI-NL_11"
tmp <- dat[5:8, c(1, 3)] # keep rows 5 to 8 of columns 1 and 3 in tmp
tmp <- dat[dat$Sex == "M", ] # a logical condition as row index: males only
head(tmp, 2) # first two rows of the selection
# Participant Sex Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# 2 VENI-NL_10 M 0.76621 0.76685
# Conditions can be combined: '&' is logical AND, '|' is logical OR.
# Here: rows where the frontness for the T sound is higher than 0.74 AND the
# participant is either 1 or 2. with() lets the columns be named directly.
dat[with(
  dat,
  Frontness.T > 0.74 & Participant %in% c("VENI-NL_1", "VENI-NL_2")
), ]
# Participant Sex Frontness.T Frontness.TH
# 1 VENI-NL_1 M 0.78052 0.73801
# New column Diff containing the difference between TH and T positions
dat$Diff <- dat$Frontness.TH - dat$Frontness.T
# New column DiffClass, initially set to "TH0" for all observations
dat$DiffClass <- "TH0"
# Observations with Diff larger than 0.02 are categorized as TH1, negative
# ones as TH-. The column is indexed directly (dat$DiffClass[rows]) instead of
# via dat[rows, ]$DiffClass: the latter raises an error when no row matches
# the condition, whereas assigning into an empty selection of a vector simply
# changes nothing.
dat$DiffClass[dat$Diff > 0.02] <- "TH1"
dat$DiffClass[dat$Diff < 0] <- "TH-"
dat$DiffClass <- factor(dat$DiffClass) # convert string variable to factor
head(dat, 2)
# Participant Sex Frontness.T Frontness.TH Diff DiffClass
# 1 VENI-NL_1 M 0.78052 0.73801 -0.04250668 TH-
# 2 VENI-NL_10 M 0.76621 0.76685 0.00064245 TH0
# Exercise: start swirl and finish the following lessons of the R Programming
# course (the list of lessons is not included in this copy of the material).
swirl()
# --- Descriptive statistics for one numerical variable (Diff) ---
mean(dat$Diff) # mean
# [1] 0.016263
median(dat$Diff) # median
# [1] 0.01093
min(dat$Diff) # minimum value
# [1] -0.042507
max(dat$Diff) # maximum value
# [1] 0.10346
# standard deviation (the formula after 'or:' uses the n - 1 denominator)
sd(dat$Diff) # or: sqrt((1/(length(dat$Diff)-1)) * sum((dat$Diff - mean(dat$Diff))^2))
# [1] 0.038213
var(dat$Diff) # variance; or: sd(dat$Diff)^2
# [1] 0.0014603
quantile(dat$Diff) # quantiles: by default 0%, 25%, 50%, 75% and 100%
# 0% 25% 50% 75% 100%
# -0.0425067 -0.0038419 0.0109299 0.0248903 0.1034607
summary(dat$Diff) # summary: minimum, quartiles, median, mean and maximum
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# -0.04251 -0.00384 0.01093 0.01626 0.02489 0.10346

# --- Frequency tables for one categorical variable ---
table(dat$Sex)
#
# F M
# 9 10
with(dat, table(Sex)) # alternative; the output is labelled with the name 'Sex'
# Sex
# F M
# 9 10
table(dat$DiffClass)
#
# TH- TH0 TH1
# 6 7 6

# --- Relations between two variables ---
# correlation: relation between two numerical variables
cor(dat$Frontness.T, dat$Frontness.TH)
# [1] 0.71054
# crosstable: relation between two categorical variables
# (rows: Sex, columns: DiffClass)
table(dat$Sex, dat$DiffClass) # or: with(dat, table(Sex,DiffClass))
#
# TH- TH0 TH1
# F 1 3 5
# M 5 4 1
# means per category: relation between numerical and categorical variable
# (first value: mean Diff of the male participants, second value: female)
c(mean(dat[dat$Sex == "M", ]$Diff), mean(dat[dat$Sex == "F", ]$Diff))
# [1] -0.0034299 0.0381446
# Plotting functions in R:
#   boxplot()             for a boxplot
#   hist()                for a histogram
#   qqnorm() and qqline() for a quantile-quantile plot
#   plot()                for many types of plots (scatter, line, etc.)
#   barplot()             for a barplot (plotting frequencies)

# Set graphics option: 2 graphs side-by-side. par() returns the previous
# settings; they are kept so the layout can be restored once plotting is done.
old_par <- par(mfrow = c(1, 2))
boxplot(dat$Diff, main = "Difference") # boxplot of difference values
boxplot(dat[, c("Frontness.T", "Frontness.TH")]) # frontness per group
hist(dat$Diff, main = "Difference histogram")
qqnorm(dat$Diff) # plot actual values vs. theoretical quantiles
qqline(dat$Diff) # plot reference line of normal distribution
plot(dat$Frontness.T, dat$Frontness.TH, col = "blue")
counts <- table(dat$Sex) # frequency table for sex
barplot(counts, ylim = c(0, 15))
counts <- table(dat$Sex, dat$DiffClass) # sex (rows) by DiffClass (columns)
# legend.text is spelled out in full instead of relying on partial matching
# of 'legend'
barplot(
  counts,
  col = c("pink", "lightblue"),
  legend.text = rownames(counts),
  ylim = c(0, 10)
)
par(old_par) # restore the graphics settings that were active before
# Exercise: start swirl and finish the following lesson of the R Programming
# course (the lesson name is not included in this copy of the material).
swirl()
# NOTE(review): the text at this point is garbled in the source ("R", "R",
# "for:"); presumably it asks to install the swirl course "Exploratory Data
# Analysis" in R before continuing -- confirm against the original slides.
install_from_swirl("Exploratory_Data_Analysis")
# Thank you for your attention!