Reproduce lecture slides demo
HW2
Dr. Hua Zhou’s slides
rm(list = ls()) # clean-up workspace
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("nycflights13")
mutate()Add variables gain and speed:
flights_sml <-
select(flights, year:day, ends_with("delay"), distance, air_time)
flights_sml
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # … with 336,766 more rows
mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
## # A tibble: 336,776 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.
## 2 2013 1 1 4 20 1416 227 16 374.
## 3 2013 1 1 2 33 1089 160 31 408.
## 4 2013 1 1 -1 -18 1576 183 -17 517.
## 5 2013 1 1 -6 -25 762 116 -19 394.
## 6 2013 1 1 -4 12 719 150 16 288.
## 7 2013 1 1 -5 19 1065 158 24 404.
## 8 2013 1 1 -3 -14 229 53 -11 259.
## 9 2013 1 1 -3 -8 944 140 -5 405.
## 10 2013 1 1 -2 8 733 138 10 319.
## # … with 336,766 more rowsRefer to columns that you’ve just created:
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time %/% 60,
minute = air_time %% 60,
gain_per_hour = gain / (hours + minute / 60)
)
## # A tibble: 336,776 x 11
## year month day dep_delay arr_delay distance air_time gain hours minute
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 3 47
## 2 2013 1 1 4 20 1416 227 16 3 47
## 3 2013 1 1 2 33 1089 160 31 2 40
## 4 2013 1 1 -1 -18 1576 183 -17 3 3
## 5 2013 1 1 -6 -25 762 116 -19 1 56
## 6 2013 1 1 -4 12 719 150 16 2 30
## 7 2013 1 1 -5 19 1065 158 24 2 38
## 8 2013 1 1 -3 -14 229 53 -11 0 53
## 9 2013 1 1 -3 -8 944 140 -5 2 20
## 10 2013 1 1 -2 8 733 138 10 2 18
## # … with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>Modular arithmetic: %/% (integer division) and %% (remainder).
Only keep the new variables by transmute():
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time %/% 60,
minute = air_time %% 60,
gain_per_hour = gain / (hours + minute / 60)
)
## # A tibble: 336,776 x 4
## gain hours minute gain_per_hour
## <dbl> <dbl> <dbl> <dbl>
## 1 9 3 47 2.38
## 2 16 3 47 4.23
## 3 31 2 40 11.6
## 4 -17 3 3 -5.57
## 5 -19 1 56 -9.83
## 6 16 2 30 6.4
## 7 24 2 38 9.11
## 8 -11 0 53 -12.5
## 9 -5 2 20 -2.14
## 10 10 2 18 4.35
## # … with 336,766 more rowsacross(): apply a function (or a set of functions) to a set of columns
flights %>%
transmute(across(1:4, list(log = log, log2 = log2)))
## # A tibble: 336,776 x 8
## year_log year_log2 month_log month_log2 day_log day_log2 dep_time_log
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.61 11.0 0 0 0 0 6.25
## 2 7.61 11.0 0 0 0 0 6.28
## 3 7.61 11.0 0 0 0 0 6.30
## 4 7.61 11.0 0 0 0 0 6.30
## 5 7.61 11.0 0 0 0 0 6.32
## 6 7.61 11.0 0 0 0 0 6.32
## 7 7.61 11.0 0 0 0 0 6.32
## 8 7.61 11.0 0 0 0 0 6.32
## 9 7.61 11.0 0 0 0 0 6.32
## 10 7.61 11.0 0 0 0 0 6.32
## # … with 336,766 more rows, and 1 more variable: dep_time_log2 <dbl>apply funs of one type
flights %>%
filter(dep_delay > 0 & arr_delay > 0) %>%
transmute(across(where(is.numeric), log))
## # A tibble: 92,303 x 14
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.61 0 0 6.25 6.24 0.693 6.72 6.71
## 2 7.61 0 0 6.28 6.27 1.39 6.75 6.72
## 3 7.61 0 0 6.30 6.29 0.693 6.83 6.75
## 4 7.61 0 0 6.41 6.40 2.08 6.69 6.60
## 5 7.61 0 0 6.42 6.40 2.40 6.85 6.84
## 6 7.61 0 0 6.42 6.41 1.10 6.83 6.83
## 7 7.61 0 0 6.43 6.41 2.56 6.82 6.82
## 8 7.61 0 0 6.45 6.41 3.18 6.61 6.59
## 9 7.61 0 0 6.55 6.55 0.693 6.96 6.92
## 10 7.61 0 0 6.56 6.55 2.20 6.75 6.72
## # … with 92,293 more rows, and 6 more variables: arr_delay <dbl>, flight <dbl>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>summarise() collapses a data frame to a single row.
Not very useful unless paired with group_by().
group_by() changes the unit of the analysis from the complete dataset to individual groups.
Mean of a variable:
(by_day <- group_by(flights, year, month, day))
## # A tibble: 336,776 x 19
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rowsConvert a tibble into a grouped tibble:
(by_day <- group_by(flights, year, month, day))Grouped summaries:
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))Consider following analysis:
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest, count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
delay
## # A tibble: 96 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00
## 7 BDL 443 116 7.05
## 8 BGR 375 378 8.03
## 9 BHM 297 866. 16.9
## 10 BNA 6333 758. 11.8
## # … with 86 more rows
n() count for each sub-group
sum(!is.na(x)) counts for all non-missing values
Cleaner code using pipe %>%:
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
## `summarise()` ungrouping output (override with `.groups` argument)
delays
## # A tibble: 96 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00
## 7 BDL 443 116 7.05
## 8 BGR 375 378 8.03
## 9 BHM 297 866. 16.9
## 10 BNA 6333 758. 11.8
## # … with 86 more rowsggplot2 accepts pipe too.
delays %>%
ggplot(mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Measures of Location: mean(x), median(x).
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled
## # A tibble: 327,346 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 327,336 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay = mean(arr_delay),
avg_pos_delay = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day avg_delay avg_pos_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 12.7 32.5
## 2 2013 1 2 12.7 32.0
## 3 2013 1 3 5.73 27.7
## 4 2013 1 4 -1.93 28.3
## 5 2013 1 5 -1.53 22.6
## 6 2013 1 6 4.24 24.4
## 7 2013 1 7 -4.95 27.8
## 8 2013 1 8 -3.23 20.8
## 9 2013 1 9 -0.264 25.6
## 10 2013 1 10 -5.90 27.3
## # … with 355 more rowsMeasures of Spread: sd(x), IQR(x), mad(x).
sd() standard deviation
IQR() interquartile range
mad() median absolute deviation
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 104 x 2
## dest distance_sd
## <chr> <dbl>
## 1 EGE 10.5
## 2 SAN 10.4
## 3 SFO 10.2
## 4 HNL 10.0
## 5 SEA 9.98
## 6 LAS 9.91
## 7 PDX 9.87
## 8 PHX 9.86
## 9 LAX 9.66
## 10 IND 9.46
## # … with 94 more rowsMeasures of Rank: min(x), quantile(x, 0.25), max(x).
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # … with 355 more rowsMeasures of Position: first(x), nth(x, 2), last(x).
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day first_dep last_dep
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # … with 355 more rowsCount: n(x), sum(!is.na(x)), n_distinct(x).
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 104 x 2
## dest carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # … with 94 more rowsnot_cancelled %>%
count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # … with 94 more rows# total number of miles a plane flew
not_cancelled %>%
count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # … with 4,027 more rows# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day n_early
## <int> <int> <int> <int>
## 1 2013 1 1 0
## 2 2013 1 2 3
## 3 2013 1 3 4
## 4 2013 1 4 3
## 5 2013 1 5 3
## 6 2013 1 6 2
## 7 2013 1 7 2
## 8 2013 1 8 1
## 9 2013 1 9 3
## 10 2013 1 10 3
## # … with 355 more rows# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_perc = mean(arr_delay > 60))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day hour_perc
## <int> <int> <int> <dbl>
## 1 2013 1 1 0.0722
## 2 2013 1 2 0.0851
## 3 2013 1 3 0.0567
## 4 2013 1 4 0.0396
## 5 2013 1 5 0.0349
## 6 2013 1 6 0.0470
## 7 2013 1 7 0.0333
## 8 2013 1 8 0.0213
## 9 2013 1 9 0.0202
## 10 2013 1 10 0.0183
## # … with 355 more rowsRecall the flights_sml tibble created earlier:
flights_sml
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # … with 336,766 more rowsFind the worst members of each group:
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay)) < 10)
## # A tibble: 3,306 x 7
## # Groups: year, month, day [365]
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 853 851 184 41
## 2 2013 1 1 290 338 1134 213
## 3 2013 1 1 260 263 266 46
## 4 2013 1 1 157 174 213 60
## 5 2013 1 1 216 222 708 121
## 6 2013 1 1 255 250 589 115
## 7 2013 1 1 285 246 1085 146
## 8 2013 1 1 192 191 199 44
## 9 2013 1 1 379 456 1092 222
## 10 2013 1 2 224 207 550 94
## # … with 3,296 more rowsFind all groups bigger than a threshold:
(popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365))
## # A tibble: 332,577 x 19
## # Groups: dest [77]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 332,567 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>Standardise to compute per group metrics:
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 x 6
## # Groups: dest [77]
## year month day dest arr_delay prop_delay
## <int> <int> <int> <chr> <dbl> <dbl>
## 1 2013 1 1 IAH 11 0.000111
## 2 2013 1 1 IAH 20 0.000201
## 3 2013 1 1 MIA 33 0.000235
## 4 2013 1 1 ORD 12 0.0000424
## 5 2013 1 1 FLL 19 0.0000938
## 6 2013 1 1 ORD 8 0.0000283
## 7 2013 1 1 LAX 7 0.0000344
## 8 2013 1 1 DFW 31 0.000282
## 9 2013 1 1 ATL 12 0.0000400
## 10 2013 1 1 DTW 16 0.000116
## # … with 131,096 more rowsnycflights13 package has >1 tables:
We already know a lot about flights:
flights %>% print(width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # … with 336,766 more rowsairlines:
airlines
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.airports:
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo…
## 2 06A Moton Field Municipal A… 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo…
## 6 0A9 Elizabethton Municipal … 36.4 -82.2 1593 -5 A America/New_Yo…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo…
## 8 0G7 Finger Lakes Regional A… 42.9 -76.8 492 -5 A America/New_Yo…
## 9 0P2 Shoestring Aviation Air… 39.8 -76.6 1000 -5 U America/New_Yo…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An…
## # … with 1,448 more rowsplanes:
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 2 N102UW 1998 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 3 N103US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 4 N104UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 5 N10575 2002 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 6 N105UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 7 N107US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 8 N108UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 9 N109UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 10 N110UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## # … with 3,312 more rowsWeather:
weather
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
A primary key uniquely identifies an observation in its own table.
single variable, e.g., tailnum for each plane
multiple variables, e.g., year, month, day, hour, and origin to identify an observation in weather
A foreign key uniquely identifies an observation in another table.
Good practice: verify the primary keys by using count()
planes %>%
count(tailnum) %>%
filter(n > 1)
## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
## # A tibble: 3 x 6
## year month day hour origin n
## <int> <int> <int> <int> <chr> <int>
## 1 2013 11 3 1 EWR 2
## 2 2013 11 3 1 JFK 2
## 3 2013 11 3 1 LGA 2
(x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
))
## # A tibble: 3 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## 3 3 x3
(y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
))
## # A tibble: 3 x 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
## 3 4 y3
inner_join(x, y, by = "key")
## # A tibble: 2 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
Same as
x %>% inner_join(y, by = "key")Generally not appropriate for use because of loss of observations
An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
x.left_join(x, y, by = "key")
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
y.right_join(x, y, by = "key")
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
x or y.full_join(x, y, by = "key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
One table has duplicate keys.
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x3 y2
## 4 1 x4 y1Both tables have duplicate keys. You get all possible combinations, the Cartesian product:
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
## # A tibble: 6 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x2 y3
## 4 2 x3 y2
## 5 2 x3 y3
## 6 3 x4 y4Let’s create a narrower table from the flights data:
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rowsWe want to merge with the weather table:
weather
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>by = NULL (default): use all variables that appear in both tables:
# same as: flights2 %>% left_join(weather)
left_join(flights2, weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # … with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>by = "x": use the common variable x:
# same as: flights2 %>% left_join(weather)
left_join(flights2, planes, by = "tailnum")
## # A tibble: 336,776 x 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixe…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixe…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixe…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixe…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixe…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixe…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixe…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixe…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixe…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA>
## # … with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>by = c("a" = "b"): match variable a in table x to the variable b in table y.
# same as: flights2 %>% left_join(weather)
left_join(flights2, airports, by = c("dest" = "faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Geor… 30.0 -95.3 97
## 2 2013 1 1 5 LGA IAH N24211 UA Geor… 30.0 -95.3 97
## 3 2013 1 1 5 JFK MIA N619AA AA Miam… 25.8 -80.3 8
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL Hart… 33.6 -84.4 1026
## 6 2013 1 1 5 EWR ORD N39463 UA Chic… 42.0 -87.9 668
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort… 26.1 -80.2 9
## 8 2013 1 1 6 LGA IAD N829AS EV Wash… 38.9 -77.5 313
## 9 2013 1 1 6 JFK MCO N593JB B6 Orla… 28.4 -81.3 96
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chic… 42.0 -87.9 668
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>Top 10 most popular destinations:
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 x 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705How to filter the cases that fly to these destinations?
semi_join(x, y) keeps the rows in x that have a match in y.
Useful to see what will be joined.
semi_join(flights, top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 555 600 -5 913 854
## 5 2013 1 1 557 600 -3 838 846
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 559 0 702 706
## 10 2013 1 1 600 600 0 851 858
## # … with 141,135 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>anti_join(x, y) keeps the rows that don’t have a match.
Useful to see what will not be joined.
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # … with 712 more rowsAll these operations work with a complete row, comparing the values of every variable.
These operations expect the x and y inputs to have the same variables.
Generate two tables:
(df1 <- tribble(
~x, ~y,
1, 1,
2, 1
))
## # A tibble: 2 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
(df2 <- tribble(
~x, ~y,
1, 1,
1, 2
))
## # A tibble: 2 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 1 2bind_rows(x, y) stacks table x on top of y.
bind_rows(df1, df2)
## # A tibble: 4 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 1
## 4 1 2intersect(x, y) returns rows that appear in both x and y.
intersect(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 1union(x, y) returns unique observations in x and y.
union(df1, df2)
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 2setdiff(x, y) returns rows that appear in x but not in y.
setdiff(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 2RStudio cheat sheet is extremely helpful.