Warwick Data Science Society
The c() function
x <- c(1, 4, 9)
x
[1] 1 4 9
seq()
seq(-5, 5, length.out = 6)
[1] -5 -3 -1 1 3 5
sqrt(x)
[1] 1 2 3
read_csv()
function from the readr package
# import a csv file at ~/project_name/data/my_data.csv
setwd('~/project_name') # or use Session > Set Working Directory > ...
read_csv('data/my_data.csv')
skip = n
- skip the first n lines of the file
comment = '{char}'
- ignore lines which begin with {char}
col_names = FALSE
- the file has no column names and we don't know them
col_names = c('col1_name', ...)
- the file has no column names but we do know them
na = '{char}'
- import {char} as a missing value
col_types = cols(col1 = col_{type}(), ...)
- override types of columns
geom_line()
or geom_smooth()
linetype
and group
se = FALSE
method = 'lm'/'loess'/'gam'
x <- c(4, 10, 10, 12, 19)
mean(x)
[1] 11
median(x)
[1] 10
quantile(x)
0% 25% 50% 75% 100%
4 10 10 12 19
quantile(x)[2]
25%
10
x <- c(4, 10, 10, 12, 19)
# returns two values
range(x)
[1] 4 19
# use diff() to find difference
diff(range(x))
[1] 15
# returns one value
IQR(x)
[1] 2
x <- c(4, 10, 10, 12, 19)
# variance
var(x)
[1] 29
# standard deviation = sqrt(variance)
sd(x)
[1] 5.385165
If a vector containing a missing value (NA
) has a statistical transformation applied to it then the result will always be NA
x <- c(1, 5, 10, NA, 12)
mean(x)
[1] NA
na.rm = TRUE
mean(x, na.rm = TRUE)
[1] 7
4 < 6
[1] TRUE
3 <= 3
[1] TRUE
5 < 4
[1] FALSE
5 >= 4
[1] TRUE
# use double ='s to check for equality
# a single equals is already used for specifying parameters of functions
4 == 4
[1] TRUE
4 == 5
[1] FALSE
4 != 4
[1] FALSE
4 != 5
[1] TRUE
x <- c(2, 6, 10)
y <- c(3, 5, 10)
x <= y
[1] TRUE FALSE TRUE
x == y
[1] FALSE FALSE TRUE
all()
and any()
. See their corresponding help pages for more details
TRUE & TRUE
[1] TRUE
TRUE & FALSE
[1] FALSE
FALSE & FALSE
[1] FALSE
TRUE | TRUE
[1] TRUE
TRUE | FALSE
[1] TRUE
FALSE | FALSE
[1] FALSE
If either side of | is TRUE
then so is the output
!TRUE
[1] FALSE
!FALSE
[1] TRUE
(4 > 3) & (2 == 1)
[1] FALSE
(7 != 5) | (4 < 2)
[1] TRUE
(!(4 < 3)) & (2 == 2)
[1] TRUE
?Syntax
for more information
x <- c(1, 5, 9); y <- c(2, 5, 7); z <- c(1, 6, 8)
x == y
[1] FALSE TRUE FALSE
y < z
[1] FALSE TRUE TRUE
(x == y) & (y < z)
[1] FALSE TRUE FALSE
!(x == y)
[1] TRUE FALSE TRUE
dplyr
is the third tidyverse package that we will be looking at
dplyr verbs
dplyr functions are referred to as verbs. filter()
- pick observations by their valuesarrange()
- reorder observations based on their valuesselect()
- pick variables by their namesmutate()
- create new variables as functions of existing variablessummarise()
/summarize()
- collapse many values down to a single summarygroup_by()
which changes the scope of each function from operating on the entire dataset to operating on it group-by-group. More on this latergroup_by
verb work similarly:
filter(mpg, cty > 30, cyl == 4)
# A tibble: 2 x 11
manufacturer model displ year cyl trans drv cty hwy fl class
<chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
1 volkswagen jetta 1.9 1999 4 manual~ f 33 44 d compact
2 volkswagen new be~ 1.9 1999 4 manual~ f 35 44 d subcom~
Warning: Don't confuse ==
with =
else you'll get the error
Error: [...] must not be named, do you need '=='?
filter(mpg, model == 'land cruiser wagon 4wd' | displ > 6.8)
# A tibble: 3 x 11
manufacturer model displ year cyl trans drv cty hwy fl class
<chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
1 chevrolet corvette 7 2008 8 manua~ r 15 24 p 2sea~
2 toyota land crui~ 4.7 1999 8 auto(~ 4 11 15 r suv
3 toyota land crui~ 5.7 2008 8 auto(~ 4 13 18 r suv
%in%
which checks if a value is in a vector
filter(band_members, name %in% c('John', 'Paul'))
# A tibble: 2 x 2
name band
<chr> <chr>
1 John Beatles
2 Paul Beatles
sqrt(2) ^ 2 == 2
[1] FALSE
(1 / 49) * 49 == 1
[1] FALSE
near()
function to check for equivalence.
near(sqrt(2) ^ 2, 2)
[1] TRUE
near((1 / 49) * 49, 1)
[1] TRUE
is.na()
function can be used for this purpose
df <- tibble(x = c(1, NA, 3), y = c(4, 5, 6))
filter(df, is.na(x))
# A tibble: 1 x 2
x y
<dbl> <dbl>
1 NA 5
filter(df, !is.na(x))
# A tibble: 2 x 2
x y
<dbl> <dbl>
1 1 4
2 3 6
arrange()
orders observations by one or more variables in ascending order
# order by class, break ties with year
arrange(mpg, class, year)
# A tibble: 234 x 11
manufacturer model displ year cyl trans drv cty hwy fl class
<chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
1 chevrolet corvette 5.7 1999 8 manual~ r 16 26 p 2sea~
2 chevrolet corvette 5.7 1999 8 auto(l~ r 15 23 p 2sea~
3 chevrolet corvette 6.2 2008 8 manual~ r 16 26 p 2sea~
4 chevrolet corvette 6.2 2008 8 auto(s~ r 15 25 p 2sea~
5 chevrolet corvette 7 2008 8 manual~ r 15 24 p 2sea~
6 audi a4 1.8 1999 4 auto(l~ f 18 29 p comp~
7 audi a4 1.8 1999 4 manual~ f 21 29 p comp~
8 audi a4 2.8 1999 6 auto(l~ f 16 26 p comp~
9 audi a4 2.8 1999 6 manual~ f 18 26 p comp~
10 audi a4 quat~ 1.8 1999 4 manual~ 4 18 26 p comp~
# ... with 224 more rows
desc()
to use descending order for that column
arrange(mpg, class, desc(year))
# A tibble: 234 x 11
manufacturer model displ year cyl trans drv cty hwy fl class
<chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
1 chevrolet corvette 6.2 2008 8 manual~ r 16 26 p 2sea~
2 chevrolet corvette 6.2 2008 8 auto(s~ r 15 25 p 2sea~
3 chevrolet corvette 7 2008 8 manual~ r 15 24 p 2sea~
4 chevrolet corvette 5.7 1999 8 manual~ r 16 26 p 2sea~
5 chevrolet corvette 5.7 1999 8 auto(l~ r 15 23 p 2sea~
6 audi a4 2 2008 4 manual~ f 20 31 p comp~
7 audi a4 2 2008 4 auto(a~ f 21 30 p comp~
8 audi a4 3.1 2008 6 auto(a~ f 18 27 p comp~
9 audi a4 quat~ 2 2008 4 manual~ 4 20 28 p comp~
10 audi a4 quat~ 2 2008 4 auto(s~ 4 19 27 p comp~
# ... with 224 more rows
TRUE
is greater than FALSE
since the underlying representations of these objects are 1 and 0 respectively
df <- tibble(x = c(1, NA, 5))
arrange(df, x)
# A tibble: 3 x 1
x
<dbl>
1 1
2 5
3 NA
arrange(df, desc(x))
# A tibble: 3 x 1
x
<dbl>
1 5
2 1
3 NA
select(iris, Petal.Length, Petal.Width, Species)
# A tibble: 150 x 3
Petal.Length Petal.Width Species
<dbl> <dbl> <fct>
1 1.4 0.2 setosa
2 1.4 0.2 setosa
3 1.3 0.2 setosa
4 1.5 0.2 setosa
5 1.4 0.2 setosa
6 1.7 0.4 setosa
7 1.4 0.3 setosa
8 1.5 0.2 setosa
9 1.4 0.2 setosa
10 1.5 0.1 setosa
# ... with 140 more rows
- symbol
select(iris, -Species)
# A tibble: 150 x 4
Sepal.Length Sepal.Width Petal.Length Petal.Width
<dbl> <dbl> <dbl> <dbl>
1 5.1 3.5 1.4 0.2
2 4.9 3 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5 3.6 1.4 0.2
6 5.4 3.9 1.7 0.4
7 4.6 3.4 1.4 0.3
8 5 3.4 1.5 0.2
9 4.4 2.9 1.4 0.2
10 4.9 3.1 1.5 0.1
# ... with 140 more rows
:
operator to select or remove columns in a range (inclusive)
select(iris, Sepal.Length:Petal.Length)
# A tibble: 150 x 3
Sepal.Length Sepal.Width Petal.Length
<dbl> <dbl> <dbl>
1 5.1 3.5 1.4
2 4.9 3 1.4
3 4.7 3.2 1.3
4 4.6 3.1 1.5
5 5 3.6 1.4
6 5.4 3.9 1.7
7 4.6 3.4 1.4
8 5 3.4 1.5
9 4.4 2.9 1.4
10 4.9 3.1 1.5
# ... with 140 more rows
-
select(iris, -(Sepal.Length:Petal.Length))
# A tibble: 150 x 2
Petal.Width Species
<dbl> <fct>
1 0.2 setosa
2 0.2 setosa
3 0.2 setosa
4 0.2 setosa
5 0.2 setosa
6 0.4 setosa
7 0.3 setosa
8 0.2 setosa
9 0.2 setosa
10 0.1 setosa
# ... with 140 more rows
select()
:
starts_with('abc')
ends_with('xyz')
contains('ijk')
num_range('var', 2:4)
- matches var2
, var3
, var4
everything()
- matches all variables
See ?select_helpers / ?select for more information
select()
can be used to rename columns but this can be more easily achieved using rename()
rename(iris, length_of_sepal = Sepal.Length, width_of_sepal = Sepal.Width)
# A tibble: 150 x 5
length_of_sepal width_of_sepal Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ... with 140 more rows
mutate()
lets you transform one or more variables currently in your dataset to create a new one
temps <- tibble(day = c('Mon', 'Tues', 'Wed'), temp_c = c(22, 24, 19))
mutate(temps, temp_f = temp_c * 1.8 + 32)
# A tibble: 3 x 3
day temp_c temp_f
<chr> <dbl> <dbl>
1 Mon 22 71.6
2 Tues 24 75.2
3 Wed 19 66.2
health <- tibble(name = c('Ann', 'Bob', 'Charlie'),
weight = c(71, 87, 65),
height = c(1.68, 1.77, 1.72))
mutate(health, bmi = weight / height ^ 2)
# A tibble: 3 x 4
name weight height bmi
<chr> <dbl> <dbl> <dbl>
1 Ann 71 1.68 25.2
2 Bob 87 1.77 27.8
3 Charlie 65 1.72 22.0
sprint <- tibble(time_sec = c(1, 3, 5), dist_m = c(8, 26, 47))
mutate(sprint,
avg_speed_m_sec = dist_m / time_sec,
avg_speed_km_hr = avg_speed_m_sec * 3.6)
# A tibble: 3 x 4
time_sec dist_m avg_speed_m_sec avg_speed_km_hr
<dbl> <dbl> <dbl> <dbl>
1 1 8 8 28.8
2 3 26 8.67 31.2
3 5 47 9.4 33.8
+, -, *, /, ^
x / sum(x) (proportion), y - mean(y) (centring)
%/% (integer division), %% (remainder)
times <- tibble(time = c(0923, 1321, 1908))
mutate(times,
hour = time %/% 100,
min = time %% 100)
# A tibble: 3 x 3
time hour min
<dbl> <dbl> <dbl>
1 923 9 23
2 1321 13 21
3 1908 19 8
log()
(natural), log2()
, log10()
cumsum()
, cumprod()
, cummin()
, cummax()
, cummean()
mutate(tibble(x = c(1, 4, 0)), sum = cumsum(x), prod = cumprod(x),
min = cummin(x), max = cummax(x), mean = cummean(x))
# A tibble: 3 x 6
x sum prod min max mean
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1
2 4 5 4 1 4 2.5
3 0 5 0 0 4 1.67
<
, <=
, >
, >=
, ==
, !=
mutate(tibble(time = c(1147, 1252)), afternoon = time >= 1200)
# A tibble: 2 x 2
time afternoon
<dbl> <lgl>
1 1147 FALSE
2 1252 TRUE
summarise()
(or US spelling summarize()
) allows you to collapse a data frame to a single row based on an aggregation function
profits <- tibble(day = c('Mon', 'Tues', 'Wed', 'Thurs', 'Fri'),
profit = c(323, 432, 491, NA, 402))
summarise(profits, avg_profit = mean(profit, na.rm = TRUE))
# A tibble: 1 x 1
avg_profit
<dbl>
1 412
summarise()
by itself is not very useful. We could have done this with mean(profits$profit, na.rm = TRUE)
It becomes far more useful when combined with the group_by()
function
profits <- tibble(day = c('Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'),
wkdy = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE),
profit = c(323, 432, 491, NA, 402, 631, 583))
by_wkdy <- group_by(profits, wkdy)
summarise(by_wkdy, avg_profit = mean(profit, na.rm = TRUE))
# A tibble: 2 x 2
wkdy avg_profit
<lgl> <dbl>
1 FALSE 607
2 TRUE 412
by_species <- group_by(iris, Species)
summarise(by_species, mean_sepal_len = mean(Sepal.Length),
count = n(), # use function n() to count how many in each group
range_of_petal_width = diff(range(Petal.Width)))
# A tibble: 3 x 4
Species mean_sepal_len count range_of_petal_width
<fct> <dbl> <int> <dbl>
1 setosa 5.01 50 0.5
2 versicolor 5.94 50 0.8
3 virginica 6.59 50 1.1
mon_split <- mutate(airquality, mon_half = ifelse(Day <= 15, 'Start', 'End'))
airquality_grpd <- group_by(mon_split, Month, mon_half)
summarise(airquality_grpd, med_wind = median(Wind))
# A tibble: 10 x 3
# Groups: Month [5]
Month mon_half med_wind
<int> <chr> <dbl>
1 5 End 11.8
2 5 Start 10.9
3 6 End 9.2
4 6 Start 9.7
5 7 End 8.3
6 7 Start 9.2
7 8 End 8.85
8 8 Start 8.6
9 9 End 10.3
10 9 Start 10.3
dplyr
comes with an amazing tool called the pipe - %>%
airquality %>%
mutate(mon_half = ifelse(Day <= 15, 'Start', 'End')) %>%
group_by(Month, mon_half) %>%
summarise(med_wind = median(Wind))
For diamonds with color G
, what is the median volume (assume approximate ellipsoidality) for each cut?
diamonds %>%
filter(color == 'G') %>%
# volume of ellipsoid: https://en.wikipedia.org/wiki/Ellipsoid#Volume
mutate(vol = 4/3 * pi * x * y * z) %>%
group_by(cut) %>%
summarise(med_vol = median(vol)) %>%
ggplot(aes(x = cut, y = med_vol, fill = cut)) +
# we'll learn more about geom_col in session 5
geom_col()