rm(list = ls()) # clean up workspace: removes every object listed by ls(); attached packages and options are left as they are
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Dr. Hua Zhou’s slides
readr
package implements functions that turn flat files into tibbles.
read_csv()
(comma delimited files), read_csv2()
(semicolon separated files), read_tsv()
(tab delimited files), read_delim()
(any delimiter).
read_fwf()
(fixed width files), read_table()
.
read_log()
(Apache style log files).
An example file heights.csv:
head heights.csv
## "earn","height","sex","ed","age","race"
## 50000,74.4244387818035,"male",16,45,"white"
## 60000,65.5375428255647,"female",16,58,"white"
## 30000,63.6291977374349,"female",16,29,"white"
## 50000,63.1085616752971,"female",16,91,"other"
## 51000,63.4024835710879,"female",17,39,"white"
## 9000,64.3995075440034,"female",15,26,"white"
## 29000,61.6563258264214,"female",12,49,"white"
## 32000,72.6985437364783,"male",17,46,"white"
## 2000,72.0394668497611,"male",15,21,"hispanic"
Read from a local file heights.csv:
# Parse the local CSV into a tibble, then print it.
heights <- read_csv("heights.csv")
heights
## Parsed with column specification:
## cols(
## earn = col_double(),
## height = col_double(),
## sex = col_character(),
## ed = col_double(),
## age = col_double(),
## race = col_character()
## )
## # A tibble: 1,192 x 6
## earn height sex ed age race
## <dbl> <dbl> <chr> <dbl> <dbl> <chr>
## 1 50000 74.4 male 16 45 white
## 2 60000 65.5 female 16 58 white
## 3 30000 63.6 female 16 29 white
## 4 50000 63.1 female 16 91 other
## 5 51000 63.4 female 17 39 white
## 6 9000 64.4 female 15 26 white
## 7 29000 61.7 female 12 49 white
## 8 32000 72.7 male 17 46 white
## 9 2000 72.0 male 15 21 hispanic
## 10 27000 72.2 male 12 26 white
## # … with 1,182 more rows
I’m curious about the relationship between earn
and height
and sex
# Scatter plot of earnings against height, with points colored by sex.
ggplot(heights, aes(x = height, y = earn, color = sex)) +
  geom_point()
Read from inline csv file:
# The string holds the CSV text itself (header line plus two data rows).
read_csv(file = "a,b,c
1,2,3
4,5,6")
## # A tibble: 2 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
Skip first n
lines:
# skip = 2 drops the two metadata lines, so "x,y,z" becomes the header.
read_csv(file = "The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
## # A tibble: 1 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
Skip comment lines:
# Lines starting with the comment marker "#" are ignored by the parser.
read_csv(file = "# A comment I want to skip
x,y,z
1,2,3", comment = "#")
## # A tibble: 1 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
No header line:
# No header row in the data: columns get the default names X1, X2, X3.
read_csv(file = "1,2,3\n4,5,6", col_names = FALSE)
## # A tibble: 2 x 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
No header line and specify colnames:
# No header row in the data: supply the column names directly.
read_csv(file = "1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## # A tibble: 2 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
Specify the symbol representing missing values:
# Treat a lone "." as the missing-value marker.
read_csv(file = "a,b,c\n1,2,.", na = ".")
## # A tibble: 1 x 3
## a b c
## <dbl> <dbl> <lgl>
## 1 1 2 NA
Write to csv:
# `challenge` is not created anywhere earlier in this script (the workspace is
# cleared at the top), so load readr's bundled example file of that name
# before writing it back out.
challenge <- read_csv(readr_example("challenge.csv"))
write_csv(challenge, "challenge.csv")
Write (and read) RDS files:
# Save `challenge` to an RDS file, then read the same file back in.
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
The readxl package (installed with the tidyverse, but not attached by library(tidyverse), so it must be loaded explicitly) reads both xls and xlsx files:
library(readxl)
# Read an .xls workbook; with no `sheet` given, the first sheet is returned.
read_excel(path = "datasets.xls")
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
# xlsx file
read_excel("datasets.xlsx")
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
List the sheet names:
# Names of all worksheets in the workbook, in order.
excel_sheets(path = "datasets.xlsx")
## [1] "iris" "mtcars" "chickwts" "quakes"
Read in a specific sheet by name or number:
# Select the worksheet by its name.
read_excel(path = "datasets.xlsx", sheet = "mtcars")
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # … with 22 more rows
# Select the worksheet by position; the 4th sheet listed above is "quakes".
read_excel(path = "datasets.xlsx", sheet = 4)
## # A tibble: 1,000 x 5
## lat long depth mag stations
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -20.4 182. 562 4.8 41
## 2 -20.6 181. 650 4.2 15
## 3 -26 184. 42 5.4 43
## 4 -18.0 182. 626 4.1 19
## 5 -20.4 182. 649 4 11
## 6 -19.7 184. 195 4 12
## 7 -11.7 166. 82 4.8 43
## 8 -28.1 182. 194 4.4 15
## 9 -28.7 182. 211 4.7 35
## 10 -17.5 180. 622 4.3 19
## # … with 990 more rows
Control subset of cells to read:
# Keep at most 3 data rows (the header row is not counted).
read_excel(path = "datasets.xlsx", n_max = 3)
## # A tibble: 3 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
Excel-style cell range:
# Rectangle C1:E4: the first row of the range supplies the column names.
read_excel(path = "datasets.xlsx", range = "C1:E4")
## # A tibble: 3 x 3
## Petal.Length Petal.Width Species
## <dbl> <dbl> <chr>
## 1 1.4 0.2 setosa
## 2 1.4 0.2 setosa
## 3 1.3 0.2 setosa
# first 4 rows of the sheet (the header row plus 3 data rows)
read_excel("datasets.xlsx", range = cell_rows(1:4))
## # A tibble: 3 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
# Restrict the read to spreadsheet columns B through D (all rows).
read_excel(path = "datasets.xlsx", range = cell_cols("B:D"))
## # A tibble: 150 x 3
## Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl>
## 1 3.5 1.4 0.2
## 2 3 1.4 0.2
## 3 3.2 1.3 0.2
## 4 3.1 1.5 0.2
## 5 3.6 1.4 0.2
## 6 3.9 1.7 0.4
## 7 3.4 1.4 0.3
## 8 3.4 1.5 0.2
## 9 2.9 1.4 0.2
## 10 3.1 1.5 0.1
## # … with 140 more rows
# sheet name and cell range together, in "sheet!range" form
read_excel("datasets.xlsx", range = "mtcars!B1:D5")
## # A tibble: 4 x 3
## cyl disp hp
## <dbl> <dbl> <dbl>
## 1 6 160 110
## 2 6 160 110
## 3 4 108 93
## 4 6 258 110
Specify NAs:
# Cells whose text is "setosa" are read as missing values.
read_excel(path = "datasets.xlsx", na = "setosa")
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 <NA>
## 2 4.9 3 1.4 0.2 <NA>
## 3 4.7 3.2 1.3 0.2 <NA>
## 4 4.6 3.1 1.5 0.2 <NA>
## 5 5 3.6 1.4 0.2 <NA>
## 6 5.4 3.9 1.7 0.4 <NA>
## 7 4.6 3.4 1.4 0.3 <NA>
## 8 5 3.4 1.5 0.2 <NA>
## 9 4.4 2.9 1.4 0.2 <NA>
## 10 4.9 3.1 1.5 0.1 <NA>
## # … with 140 more rows
Writing Excel files: openxlsx
and writexl
packages.
haven reads SPSS, Stata, and SAS files.
DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc) allows you to run SQL queries against a database and return a data frame.
jsonlite reads json files.
xml2 reads XML files.
tidyxl reads non-tabular data from Excel.
“Happy families are all alike; every unhappy family is unhappy in its own way.” –– Leo Tolstoy
“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham
There are three interrelated rules which make a dataset tidy:
Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
Example table1
# Print table1: one row per country-year, with cases and population as columns.
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
is tidy.
Example table2
# Print table2: cases and population are stacked in a type/count column pair.
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
is not tidy.
Example table3
# Print table3: cases and population share one character column, `rate`.
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
is not tidy.
Example table4a
# Print table4a: case counts, spread over one column per year.
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
is not tidy.
Example table4b
# Print table4b: population counts, spread over one column per year.
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
is not tidy.
gather
columns into a new pair of variables.
# pivot_longer() supersedes gather(): the `1999` and `2000` columns are stacked
# into a `year` (names) / `cases` (values) pair. Rows come out grouped by
# country rather than by year.
table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
We can gather table4b too and then join them
# Reshape both tables to long form (pivot_longer() supersedes gather()), then
# join on the shared keys. Passing `by` explicitly documents the join columns
# and silences the "Joining, by = ..." message.
tidy4a <- table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b, by = c("country", "year"))
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Spreading is the opposite of gathering.
# pivot_wider() supersedes spread(): each value of `type` becomes a column
# filled from `count`.
pivot_wider(table2, names_from = type, values_from = count)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
# Split `rate` into two character columns; no `sep` is given, so the default
# separator applies, which here splits at the "/".
separate(table3, col = rate, into = c("cases", "population"))
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Separate into numeric values:
# Same split as before, but convert = TRUE re-types the new columns (integers
# here) instead of leaving them as character.
separate(
  table3,
  col = rate,
  into = c("cases", "population"),
  convert = TRUE
)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Separate at a fixed position:
# An integer `sep` splits by position: the first two characters of `year` go
# to `century`, the rest stay in `year`.
separate(table3, col = year, into = c("century", "year"), sep = 2)
## # A tibble: 6 x 4
## country century year rate
## <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
# Print table5: like table3, but with the year held in separate `century` and
# `year` columns.
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
unite()
is the inverse of separate()
.
# Paste `century` and `year` together into one column `new`; sep = "" joins
# them with nothing in between.
unite(table5, col = new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583