Import and Tidy Data

Acknowledgement
Data import | r4ds chapter 11
Tidy data | r4ds chapter 12
- Tidy data
Gathering
Spreading
Separating
Unite

rm(list = ls()) # clean-up workspace
library("tidyverse")

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Acknowledgement

Dr. Hua Zhou’s slides

Data import | r4ds chapter 11

readr

readr package implements functions that turn flat files into tibbles.
- read_csv() (comma delimited files), read_csv2() (semicolon seperated files), read_tsv() (tab delimited files), read_delim() (any delimiter).
- read_fwf() (fixed width files), read_table().
- read_log() (Apache style log files).

An example file heights.csv:

head heights.csv

## "earn","height","sex","ed","age","race"
## 50000,74.4244387818035,"male",16,45,"white"
## 60000,65.5375428255647,"female",16,58,"white"
## 30000,63.6291977374349,"female",16,29,"white"
## 50000,63.1085616752971,"female",16,91,"other"
## 51000,63.4024835710879,"female",17,39,"white"
## 9000,64.3995075440034,"female",15,26,"white"
## 29000,61.6563258264214,"female",12,49,"white"
## 32000,72.6985437364783,"male",17,46,"white"
## 2000,72.0394668497611,"male",15,21,"hispanic"

Read from a local file heights.csv:

(heights <- read_csv("heights.csv"))

## Parsed with column specification:
## cols(
##   earn = col_double(),
##   height = col_double(),
##   sex = col_character(),
##   ed = col_double(),
##   age = col_double(),
##   race = col_character()
## )

## # A tibble: 1,192 x 6
##     earn height sex       ed   age race    
##    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
##  1 50000   74.4 male      16    45 white   
##  2 60000   65.5 female    16    58 white   
##  3 30000   63.6 female    16    29 white   
##  4 50000   63.1 female    16    91 other   
##  5 51000   63.4 female    17    39 white   
##  6  9000   64.4 female    15    26 white   
##  7 29000   61.7 female    12    49 white   
##  8 32000   72.7 male      17    46 white   
##  9  2000   72.0 male      15    21 hispanic
## 10 27000   72.2 male      12    26 white   
## # … with 1,182 more rows

I’m curious about relation between earn and height and sex

ggplot(data = heights) + 
  geom_point(mapping = aes(x = height, y = earn, color = sex))

Read from inline csv file:

read_csv("a,b,c
  1,2,3
  4,5,6")

## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

Skip first n lines:

read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)

## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3

Skip comment lines:

read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")

## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3

No header line:

read_csv("1,2,3\n4,5,6", col_names = FALSE)

## # A tibble: 2 x 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

No header line and specify colnames:

read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))

## # A tibble: 2 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

Specify the symbol representing missing values:

read_csv("a,b,c\n1,2,.", na = ".")

## # A tibble: 1 x 3
##       a     b c    
##   <dbl> <dbl> <lgl>
## 1     1     2 NA

Writing to a file

Write to csv:
```
write_csv(challenge, "challenge.csv")
```

Write (and read) RDS files:

write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")

Excel files

readxl package (part of tidyverse) reads both xls and xlsx files:

library(readxl)
# xls file
read_excel("datasets.xls")

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows

# xls file
read_excel("datasets.xlsx")

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows

List the sheet name:

excel_sheets("datasets.xlsx")

## [1] "iris"     "mtcars"   "chickwts" "quakes"

Read in a specific sheet by name or number:

read_excel("datasets.xlsx", sheet = "mtcars")

## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # … with 22 more rows

read_excel("datasets.xlsx", sheet = 4)

## # A tibble: 1,000 x 5
##      lat  long depth   mag stations
##    <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1 -20.4  182.   562   4.8       41
##  2 -20.6  181.   650   4.2       15
##  3 -26    184.    42   5.4       43
##  4 -18.0  182.   626   4.1       19
##  5 -20.4  182.   649   4         11
##  6 -19.7  184.   195   4         12
##  7 -11.7  166.    82   4.8       43
##  8 -28.1  182.   194   4.4       15
##  9 -28.7  182.   211   4.7       35
## 10 -17.5  180.   622   4.3       19
## # … with 990 more rows

Control subset of cells to read:

# first 3 rows
read_excel("datasets.xlsx", n_max = 3)

## # A tibble: 3 x 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa

Excel type range

read_excel("datasets.xlsx", range = "C1:E4")

## # A tibble: 3 x 3
##   Petal.Length Petal.Width Species
##          <dbl>       <dbl> <chr>  
## 1          1.4         0.2 setosa 
## 2          1.4         0.2 setosa 
## 3          1.3         0.2 setosa

# first 4 rows
read_excel("datasets.xlsx", range = cell_rows(1:4))

## # A tibble: 3 x 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa

# columns B-D
read_excel("datasets.xlsx", range = cell_cols("B:D"))

## # A tibble: 150 x 3
##    Sepal.Width Petal.Length Petal.Width
##          <dbl>        <dbl>       <dbl>
##  1         3.5          1.4         0.2
##  2         3            1.4         0.2
##  3         3.2          1.3         0.2
##  4         3.1          1.5         0.2
##  5         3.6          1.4         0.2
##  6         3.9          1.7         0.4
##  7         3.4          1.4         0.3
##  8         3.4          1.5         0.2
##  9         2.9          1.4         0.2
## 10         3.1          1.5         0.1
## # … with 140 more rows

# sheet
read_excel("datasets.xlsx", range = "mtcars!B1:D5")

## # A tibble: 4 x 3
##     cyl  disp    hp
##   <dbl> <dbl> <dbl>
## 1     6   160   110
## 2     6   160   110
## 3     4   108    93
## 4     6   258   110

Specify NAs:

read_excel("datasets.xlsx", na = "setosa")

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1          5.1         3.5          1.4         0.2 <NA>   
##  2          4.9         3            1.4         0.2 <NA>   
##  3          4.7         3.2          1.3         0.2 <NA>   
##  4          4.6         3.1          1.5         0.2 <NA>   
##  5          5           3.6          1.4         0.2 <NA>   
##  6          5.4         3.9          1.7         0.4 <NA>   
##  7          4.6         3.4          1.4         0.3 <NA>   
##  8          5           3.4          1.5         0.2 <NA>   
##  9          4.4         2.9          1.4         0.2 <NA>   
## 10          4.9         3.1          1.5         0.1 <NA>   
## # … with 140 more rows

Writing Excel files: openxlsx and writexl packages.

Other types of data

haven reads SPSS, Stata, and SAS files.
DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc) allows you to run SQL queries against a database and return a data frame.
jsonlite reads json files.
xml2 reads XML files.
tidyxl reads non-tabular data from Excel.

Tidy data | r4ds chapter 12

“Happy families are all alike; every unhappy family is unhappy in its own way.” –– Leo Tolstoy

“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham

Tidy data

There are three interrelated rules which make a dataset tidy:

Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.

Example table1

table1

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

is tidy.

Example table2

table2

## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

is not tidy.

Example table3

table3

## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

is not tidy.

Example table4a

table4a

## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766

is not tidy.

Example table4b

table4b

## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

is not tidy.

Gathering

gather columns into a new pair of variables.

table4a %>%
  gather(`1999`, `2000`, key = "year", value = "cases")

## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

We can gather table4b too and then join them

tidy4a <- table4a %>% 
  gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>% 
  gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)

## Joining, by = c("country", "year")

## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Brazil      1999   37737  172006362
## 3 China       1999  212258 1272915272
## 4 Afghanistan 2000    2666   20595360
## 5 Brazil      2000   80488  174504898
## 6 China       2000  213766 1280428583

Spreading

Spreading is the opposite of gathering.

spread(table2, key = type, value = count)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

Separating

table3 %>% 
  separate(rate, into = c("cases", "population"))

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

Seperate into numeric values:

table3 %>% 
  separate(rate, into = c("cases", "population"), convert = TRUE)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

Separate at a fixed position:

table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)

## # A tibble: 6 x 4
##   country     century year  rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

Unite

table5

## # A tibble: 6 x 4
##   country     century year  rate             
## * <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

unite() is the inverse of separate().

table5 %>% 
  unite(new, century, year, sep = "")

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

Import and Tidy Data

MATH-7360 Data Analysis

Dr. Xiang Ji @ Tulane University

Sep 14, 2020

Acknowledgement

Data import | r4ds chapter 11

readr

Writing to a file

Excel files

Other types of data

Tidy data | r4ds chapter 12

Tidy data

Gathering

Spreading

Separating

Unite