Announcement

Reproduce lecture slides demo
HW2

Acknowledgement

Dr. Hua Zhou’s slides

rm(list = ls()) # clean-up workspace
library("tidyverse")

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library("nycflights13")

Add new variables with `mutate()`

Add variables gain and speed:

flights_sml <- 
  select(flights, year:day, ends_with("delay"), distance, air_time)
flights_sml

## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # … with 336,766 more rows

mutate(flights_sml,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227     9  370.
##  2  2013     1     1         4        20     1416      227    16  374.
##  3  2013     1     1         2        33     1089      160    31  408.
##  4  2013     1     1        -1       -18     1576      183   -17  517.
##  5  2013     1     1        -6       -25      762      116   -19  394.
##  6  2013     1     1        -4        12      719      150    16  288.
##  7  2013     1     1        -5        19     1065      158    24  404.
##  8  2013     1     1        -3       -14      229       53   -11  259.
##  9  2013     1     1        -3        -8      944      140    -5  405.
## 10  2013     1     1        -2         8      733      138    10  319.
## # … with 336,766 more rows

Refer to columns that you’ve just created:

mutate(flights_sml,
  gain = arr_delay - dep_delay,
  hours = air_time %/% 60,
  minute = air_time %% 60,
  gain_per_hour = gain / (hours + minute / 60)
)

## # A tibble: 336,776 x 11
##     year month   day dep_delay arr_delay distance air_time  gain hours minute
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1  2013     1     1         2        11     1400      227     9     3     47
##  2  2013     1     1         4        20     1416      227    16     3     47
##  3  2013     1     1         2        33     1089      160    31     2     40
##  4  2013     1     1        -1       -18     1576      183   -17     3      3
##  5  2013     1     1        -6       -25      762      116   -19     1     56
##  6  2013     1     1        -4        12      719      150    16     2     30
##  7  2013     1     1        -5        19     1065      158    24     2     38
##  8  2013     1     1        -3       -14      229       53   -11     0     53
##  9  2013     1     1        -3        -8      944      140    -5     2     20
## 10  2013     1     1        -2         8      733      138    10     2     18
## # … with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>

Modular arithmetic: %/% (integer division) and %% (remainder).

Only keep the new variables by transmute():

transmute(flights,
  gain = arr_delay - dep_delay,
  hours = air_time %/% 60,
  minute = air_time %% 60,
  gain_per_hour = gain / (hours + minute / 60)
)

## # A tibble: 336,776 x 4
##     gain hours minute gain_per_hour
##    <dbl> <dbl>  <dbl>         <dbl>
##  1     9     3     47          2.38
##  2    16     3     47          4.23
##  3    31     2     40         11.6 
##  4   -17     3      3         -5.57
##  5   -19     1     56         -9.83
##  6    16     2     30          6.4 
##  7    24     2     38          9.11
##  8   -11     0     53        -12.5 
##  9    -5     2     20         -2.14
## 10    10     2     18          4.35
## # … with 336,766 more rows

across(): apply a function (or a set of functions) to a set of columns

flights %>%
  transmute(across(1:4, list(log = log, log2 = log2)))

## # A tibble: 336,776 x 8
##    year_log year_log2 month_log month_log2 day_log day_log2 dep_time_log
##       <dbl>     <dbl>     <dbl>      <dbl>   <dbl>    <dbl>        <dbl>
##  1     7.61      11.0         0          0       0        0         6.25
##  2     7.61      11.0         0          0       0        0         6.28
##  3     7.61      11.0         0          0       0        0         6.30
##  4     7.61      11.0         0          0       0        0         6.30
##  5     7.61      11.0         0          0       0        0         6.32
##  6     7.61      11.0         0          0       0        0         6.32
##  7     7.61      11.0         0          0       0        0         6.32
##  8     7.61      11.0         0          0       0        0         6.32
##  9     7.61      11.0         0          0       0        0         6.32
## 10     7.61      11.0         0          0       0        0         6.32
## # … with 336,766 more rows, and 1 more variable: dep_time_log2 <dbl>

apply funs of one type

flights %>%
  filter(dep_delay > 0 & arr_delay > 0) %>%
  transmute(across(where(is.numeric), log))

## # A tibble: 92,303 x 14
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>          <dbl>
##  1  7.61     0     0     6.25           6.24     0.693     6.72           6.71
##  2  7.61     0     0     6.28           6.27     1.39      6.75           6.72
##  3  7.61     0     0     6.30           6.29     0.693     6.83           6.75
##  4  7.61     0     0     6.41           6.40     2.08      6.69           6.60
##  5  7.61     0     0     6.42           6.40     2.40      6.85           6.84
##  6  7.61     0     0     6.42           6.41     1.10      6.83           6.83
##  7  7.61     0     0     6.43           6.41     2.56      6.82           6.82
##  8  7.61     0     0     6.45           6.41     3.18      6.61           6.59
##  9  7.61     0     0     6.55           6.55     0.693     6.96           6.92
## 10  7.61     0     0     6.56           6.55     2.20      6.75           6.72
## # … with 92,293 more rows, and 6 more variables: arr_delay <dbl>, flight <dbl>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>

Summaries

summarise() collapses a data frame to a single row.

Not very useful unless paired with group_by().
group_by() changes the unit of the analysis from the complete dataset to individual groups.

Mean of a variable:

(by_day <- group_by(flights, year, month, day))

## # A tibble: 336,776 x 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # … with 355 more rows

Convert a tibble into a grouped tibble:

(by_day <- group_by(flights, year, month, day))

Grouped summaries:

summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))

Pipe

Consider following analysis:

by_dest <- group_by(flights, dest)
delay <- summarise(by_dest, count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
delay

## # A tibble: 96 x 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254 1826   4.38
##  2 ACK     265  199   4.85
##  3 ALB     439  143  14.4 
##  4 ATL   17215  757. 11.3 
##  5 AUS    2439 1514.  6.02
##  6 AVL     275  584.  8.00
##  7 BDL     443  116   7.05
##  8 BGR     375  378   8.03
##  9 BHM     297  866. 16.9 
## 10 BNA    6333  758. 11.8 
## # … with 86 more rows

n() count for each sub-group
sum(!is.na(x)) counts for all non-missing values

Cleaner code using pipe %>%:

delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL")

## `summarise()` ungrouping output (override with `.groups` argument)

delays

## # A tibble: 96 x 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254 1826   4.38
##  2 ACK     265  199   4.85
##  3 ALB     439  143  14.4 
##  4 ATL   17215  757. 11.3 
##  5 AUS    2439 1514.  6.02
##  6 AVL     275  584.  8.00
##  7 BDL     443  116   7.05
##  8 BGR     375  378   8.03
##  9 BHM     297  866. 16.9 
## 10 BNA    6333  758. 11.8 
## # … with 86 more rows

ggplot2 accepts pipe too.

delays %>%
  ggplot(mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) + 
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Other summary functions

Measures of Location: mean(x), median(x).

not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled

## # A tibble: 327,346 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 327,336 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    avg_delay = mean(arr_delay),
    avg_pos_delay = mean(arr_delay[arr_delay > 0]) # the average positive delay
  )

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day avg_delay avg_pos_delay
##    <int> <int> <int>     <dbl>         <dbl>
##  1  2013     1     1    12.7            32.5
##  2  2013     1     2    12.7            32.0
##  3  2013     1     3     5.73           27.7
##  4  2013     1     4    -1.93           28.3
##  5  2013     1     5    -1.53           22.6
##  6  2013     1     6     4.24           24.4
##  7  2013     1     7    -4.95           27.8
##  8  2013     1     8    -3.23           20.8
##  9  2013     1     9    -0.264          25.6
## 10  2013     1    10    -5.90           27.3
## # … with 355 more rows

Measures of Spread: sd(x), IQR(x), mad(x).

sd() standard deviation
IQR() interquartile range
mad() median absolute deviation

not_cancelled %>% 
  group_by(dest) %>% 
  summarise(distance_sd = sd(distance)) %>% 
  arrange(desc(distance_sd))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 104 x 2
##    dest  distance_sd
##    <chr>       <dbl>
##  1 EGE         10.5 
##  2 SAN         10.4 
##  3 SFO         10.2 
##  4 HNL         10.0 
##  5 SEA          9.98
##  6 LAS          9.91
##  7 PDX          9.87
##  8 PHX          9.86
##  9 LAX          9.66
## 10 IND          9.46
## # … with 94 more rows

Measures of Rank: min(x), quantile(x, 0.25), max(x).

# When do the first and last flights leave each day?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # … with 355 more rows

Measures of Position: first(x), nth(x, 2), last(x).

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day first_dep last_dep
##    <int> <int> <int>     <int>    <int>
##  1  2013     1     1       517     2356
##  2  2013     1     2        42     2354
##  3  2013     1     3        32     2349
##  4  2013     1     4        25     2358
##  5  2013     1     5        14     2357
##  6  2013     1     6        16     2355
##  7  2013     1     7        49     2359
##  8  2013     1     8       454     2351
##  9  2013     1     9         2     2252
## 10  2013     1    10         3     2320
## # … with 355 more rows

Count: n(x), sum(!is.na(x)), n_distinct(x).

# Which destinations have the most carriers?
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 104 x 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # … with 94 more rows

not_cancelled %>% 
  count(dest)

## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # … with 94 more rows

# total number of miles a plane flew
not_cancelled %>% 
  count(tailnum, wt = distance)

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # … with 4,027 more rows

# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(n_early = sum(dep_time < 500))

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day n_early
##    <int> <int> <int>   <int>
##  1  2013     1     1       0
##  2  2013     1     2       3
##  3  2013     1     3       4
##  4  2013     1     4       3
##  5  2013     1     5       3
##  6  2013     1     6       2
##  7  2013     1     7       2
##  8  2013     1     8       1
##  9  2013     1     9       3
## 10  2013     1    10       3
## # … with 355 more rows

# What proportion of flights are delayed by more than an hour?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(hour_perc = mean(arr_delay > 60))

## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day hour_perc
##    <int> <int> <int>     <dbl>
##  1  2013     1     1    0.0722
##  2  2013     1     2    0.0851
##  3  2013     1     3    0.0567
##  4  2013     1     4    0.0396
##  5  2013     1     5    0.0349
##  6  2013     1     6    0.0470
##  7  2013     1     7    0.0333
##  8  2013     1     8    0.0213
##  9  2013     1     9    0.0202
## 10  2013     1    10    0.0183
## # … with 355 more rows

Grouped mutates (and filters)

Recall the flights_sml tibble created earlier:

flights_sml

## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # … with 336,766 more rows

Find the worst members of each group:

flights_sml %>% 
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)

## # A tibble: 3,306 x 7
## # Groups:   year, month, day [365]
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1       853       851      184       41
##  2  2013     1     1       290       338     1134      213
##  3  2013     1     1       260       263      266       46
##  4  2013     1     1       157       174      213       60
##  5  2013     1     1       216       222      708      121
##  6  2013     1     1       255       250      589      115
##  7  2013     1     1       285       246     1085      146
##  8  2013     1     1       192       191      199       44
##  9  2013     1     1       379       456     1092      222
## 10  2013     1     2       224       207      550       94
## # … with 3,296 more rows

Find all groups bigger than a threshold:

(popular_dests <- flights %>% 
  group_by(dest) %>% 
  filter(n() > 365))

## # A tibble: 332,577 x 19
## # Groups:   dest [77]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 332,567 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Standardise to compute per group metrics:

popular_dests %>% 
  filter(arr_delay > 0) %>% 
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)

## # A tibble: 131,106 x 6
## # Groups:   dest [77]
##     year month   day dest  arr_delay prop_delay
##    <int> <int> <int> <chr>     <dbl>      <dbl>
##  1  2013     1     1 IAH          11  0.000111 
##  2  2013     1     1 IAH          20  0.000201 
##  3  2013     1     1 MIA          33  0.000235 
##  4  2013     1     1 ORD          12  0.0000424
##  5  2013     1     1 FLL          19  0.0000938
##  6  2013     1     1 ORD           8  0.0000283
##  7  2013     1     1 LAX           7  0.0000344
##  8  2013     1     1 DFW          31  0.000282 
##  9  2013     1     1 ATL          12  0.0000400
## 10  2013     1     1 DTW          16  0.000116 
## # … with 131,096 more rows

Combine tables | r4ds chapter 13

nycflights13 package has >1 tables:

We already know a lot about flights:

flights %>% print(width = Inf)

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
##  4       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
##  5       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
##  6        12 UA        1696 N39463  EWR    ORD        150      719     5     58
##  7        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
##  8       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
##  9        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
## 10         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 05:00:00
##  4 2013-01-01 05:00:00
##  5 2013-01-01 06:00:00
##  6 2013-01-01 05:00:00
##  7 2013-01-01 06:00:00
##  8 2013-01-01 06:00:00
##  9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # … with 336,766 more rows

airlines:

airlines

## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.

airports:

airports

## # A tibble: 1,458 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo…
##  2 06A   Moton Field Municipal A…  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo…
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo…
##  6 0A9   Elizabethton Municipal …  36.4  -82.2  1593    -5 A     America/New_Yo…
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo…
##  8 0G7   Finger Lakes Regional A…  42.9  -76.8   492    -5 A     America/New_Yo…
##  9 0P2   Shoestring Aviation Air…  39.8  -76.6  1000    -5 U     America/New_Yo…
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An…
## # … with 1,448 more rows

planes:

planes

## # A tibble: 3,322 x 9
##    tailnum  year type          manufacturer   model  engines seats speed engine 
##    <chr>   <int> <chr>         <chr>          <chr>    <int> <int> <int> <chr>  
##  1 N10156   2004 Fixed wing m… EMBRAER        EMB-1…       2    55    NA Turbo-…
##  2 N102UW   1998 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  3 N103US   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  4 N104UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  5 N10575   2002 Fixed wing m… EMBRAER        EMB-1…       2    55    NA Turbo-…
##  6 N105UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  7 N107US   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  8 N108UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  9 N109UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
## 10 N110UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
## # … with 3,312 more rows

Weather:

weather

## # A tibble: 26,115 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>

Relational data

Keys

A primary key uniquely identifies an observation in its own table.
- single variable, e.g., tailnum for each plane
- multiple variables, e.g., year, month, day, hour, and origin to identify an observation in weather
A foreign key uniquely identifies an observation in another table.

Good practice: verify the primary keys by using count()

planes %>% 
  count(tailnum) %>% 
  filter(n > 1)

## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>

weather %>% 
  count(year, month, day, hour, origin) %>% 
  filter(n > 1)

## # A tibble: 3 x 6
##    year month   day  hour origin     n
##   <int> <int> <int> <int> <chr>  <int>
## 1  2013    11     3     1 EWR        2
## 2  2013    11     3     1 JFK        2
## 3  2013    11     3     1 LGA        2

Combine variables (columns)

Demo tables

(x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
))

## # A tibble: 3 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     3 x3

(y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
))

## # A tibble: 3 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2   
## 3     4 y3

Inner join

An inner join matches pairs of observations whenever their keys are equal:

inner_join(x, y, by = "key")

## # A tibble: 2 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2

Same as

x %>% inner_join(y, by = "key")

Generally not appropriate for use because of loss of observations

Outer join

An outer join keeps observations that appear in at least one of the tables.

Three types of outer joins:

A left join keeps all observations in x.

left_join(x, y, by = "key")

## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>

A right join keeps all observations in y.

right_join(x, y, by = "key")

## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3

A full join keeps all observations in x or y.

full_join(x, y, by = "key")

## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3

Duplicate keys

One table has duplicate keys.

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2"
)
left_join(x, y, by = "key")

## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x3    y2   
## 4     1 x4    y1

Both tables have duplicate keys. You get all possible combinations, the Cartesian product:

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)
left_join(x, y, by = "key")

## # A tibble: 6 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x2    y3   
## 4     2 x3    y2   
## 5     2 x3    y3   
## 6     3 x4    y4

Let’s create a narrower table from the flights data:

flights2 <- flights %>% 
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2

## # A tibble: 336,776 x 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1     5 EWR    IAH   N14228  UA     
##  2  2013     1     1     5 LGA    IAH   N24211  UA     
##  3  2013     1     1     5 JFK    MIA   N619AA  AA     
##  4  2013     1     1     5 JFK    BQN   N804JB  B6     
##  5  2013     1     1     6 LGA    ATL   N668DN  DL     
##  6  2013     1     1     5 EWR    ORD   N39463  UA     
##  7  2013     1     1     6 EWR    FLL   N516JB  B6     
##  8  2013     1     1     6 LGA    IAD   N829AS  EV     
##  9  2013     1     1     6 JFK    MCO   N593JB  B6     
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA     
## # … with 336,766 more rows

We want to merge with the weather table:

weather

## # A tibble: 26,115 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>

Defining the key columns

by = NULL (default): use all variables that appear in both tables:

# same as: flights2 %>% left_join(weather)
left_join(flights2, weather)

## Joining, by = c("year", "month", "day", "hour", "origin")

## # A tibble: 336,776 x 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8
## # … with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

by = "x": use the common variable x:

# same as: flights2 %>% left_join(weather)
left_join(flights2, planes, by = "tailnum")

## # A tibble: 336,776 x 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type 
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixe…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixe…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixe…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixe…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixe…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixe…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixe…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixe…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixe…
## 10   2013     1     1     6 LGA    ORD   N3ALAA  AA          NA <NA> 
## # … with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>

by = c("a" = "b"): match variable a in table x to the variable b in table y.

# same as: flights2 %>% left_join(weather)
left_join(flights2, airports, by = c("dest" = "faa"))

## # A tibble: 336,776 x 15
##     year month   day  hour origin dest  tailnum carrier name    lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr> <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Geor…  30.0 -95.3    97
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Geor…  30.0 -95.3    97
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miam…  25.8 -80.3     8
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>   NA    NA      NA
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Hart…  33.6 -84.4  1026
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Chic…  42.0 -87.9   668
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Fort…  26.1 -80.2     9
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Wash…  38.9 -77.5   313
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      Orla…  28.4 -81.3    96
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chic…  42.0 -87.9   668
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

Combine cases (rows)

Top 10 most popular destinations:

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest

## # A tibble: 10 x 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705

How to filter the cases that fly to these destinations?

Semi-join

semi_join(x, y) keeps the rows in x that have a match in y.
Useful to see what will be joined.

semi_join(flights, top_dest)

## Joining, by = "dest"

## # A tibble: 141,145 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # … with 141,135 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Anti-join

anti_join(x, y) keeps the rows that don’t have a match.
Useful to see what will not be joined.

flights %>% 
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)

## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # … with 712 more rows

Set operations

All these operations work with a complete row, comparing the values of every variable.
These operations expect the x and y inputs to have the same variables.

Generate two tables:

(df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
))

## # A tibble: 2 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1

(df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
))

## # A tibble: 2 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     1     2

bind_rows(x, y) stacks table x on top of y.

bind_rows(df1, df2)

## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     1
## 4     1     2

intersect(x, y) returns rows that appear in both x and y.

intersect(df1, df2)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1

union(x, y) returns unique observations in x and y.

union(df1, df2)

## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     2

setdiff(x, y) returns rows that appear in x but not in y.

setdiff(df1, df2)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     2     1

setdiff(df2, df1)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     2

Cheat sheet

RStudio cheat sheet is extremely helpful.

Data transformation

MATH-7360 Data Analysis

Dr. Xiang Ji @ Tulane University

Sep 28, 2020

Announcement

Acknowledgement

Add new variables with `mutate()`

Summaries

Pipe

Other summary functions

Grouped mutates (and filters)

Combine tables | r4ds chapter 13

Relational data

Keys

Combine variables (columns)

Demo tables

Inner join

Outer join

Duplicate keys

Defining the key columns

Combine cases (rows)

Semi-join

Anti-join

Set operations

Cheat sheet

Data transformation

MATH-7360 Data Analysis

Dr. Xiang Ji @ Tulane University

Sep 28, 2020

Announcement

Acknowledgement

Add new variables with mutate()

Summaries

Pipe

Other summary functions

Grouped mutates (and filters)

Combine tables | r4ds chapter 13

Relational data

Keys

Combine variables (columns)

Demo tables

Inner join

Outer join

Duplicate keys

Defining the key columns

Combine cases (rows)

Semi-join

Anti-join

Set operations

Cheat sheet

Add new variables with `mutate()`