rm(list = ls()) # clean-up workspace
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Acknowledgement

Dr. Hua Zhou’s slides

Data import | r4ds chapter 11

readr

  • readr package implements functions that turn flat files into tibbles.

    • read_csv() (comma delimited files), read_csv2() (semicolon seperated files), read_tsv() (tab delimited files), read_delim() (any delimiter).

    • read_fwf() (fixed width files), read_table().

    • read_log() (Apache style log files).

  • An example file heights.csv:

    head heights.csv
    ## "earn","height","sex","ed","age","race"
    ## 50000,74.4244387818035,"male",16,45,"white"
    ## 60000,65.5375428255647,"female",16,58,"white"
    ## 30000,63.6291977374349,"female",16,29,"white"
    ## 50000,63.1085616752971,"female",16,91,"other"
    ## 51000,63.4024835710879,"female",17,39,"white"
    ## 9000,64.3995075440034,"female",15,26,"white"
    ## 29000,61.6563258264214,"female",12,49,"white"
    ## 32000,72.6985437364783,"male",17,46,"white"
    ## 2000,72.0394668497611,"male",15,21,"hispanic"

  • Read from a local file heights.csv:

    (heights <- read_csv("heights.csv"))
    ## Parsed with column specification:
    ## cols(
    ##   earn = col_double(),
    ##   height = col_double(),
    ##   sex = col_character(),
    ##   ed = col_double(),
    ##   age = col_double(),
    ##   race = col_character()
    ## )
    ## # A tibble: 1,192 x 6
    ##     earn height sex       ed   age race    
    ##    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
    ##  1 50000   74.4 male      16    45 white   
    ##  2 60000   65.5 female    16    58 white   
    ##  3 30000   63.6 female    16    29 white   
    ##  4 50000   63.1 female    16    91 other   
    ##  5 51000   63.4 female    17    39 white   
    ##  6  9000   64.4 female    15    26 white   
    ##  7 29000   61.7 female    12    49 white   
    ##  8 32000   72.7 male      17    46 white   
    ##  9  2000   72.0 male      15    21 hispanic
    ## 10 27000   72.2 male      12    26 white   
    ## # … with 1,182 more rows

  • I’m curious about relation between earn and height and sex

    ggplot(data = heights) + 
      geom_point(mapping = aes(x = height, y = earn, color = sex))


  • Read from inline csv file:

    read_csv("a,b,c
      1,2,3
      4,5,6")
    ## # A tibble: 2 x 3
    ##       a     b     c
    ##   <dbl> <dbl> <dbl>
    ## 1     1     2     3
    ## 2     4     5     6
  • Skip first n lines:

    read_csv("The first line of metadata
      The second line of metadata
      x,y,z
      1,2,3", skip = 2)
    ## # A tibble: 1 x 3
    ##       x     y     z
    ##   <dbl> <dbl> <dbl>
    ## 1     1     2     3

  • Skip comment lines:

    read_csv("# A comment I want to skip
      x,y,z
      1,2,3", comment = "#")
    ## # A tibble: 1 x 3
    ##       x     y     z
    ##   <dbl> <dbl> <dbl>
    ## 1     1     2     3
  • No header line:

    read_csv("1,2,3\n4,5,6", col_names = FALSE)
    ## # A tibble: 2 x 3
    ##      X1    X2    X3
    ##   <dbl> <dbl> <dbl>
    ## 1     1     2     3
    ## 2     4     5     6

  • No header line and specify colnames:

    read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
    ## # A tibble: 2 x 3
    ##       x     y     z
    ##   <dbl> <dbl> <dbl>
    ## 1     1     2     3
    ## 2     4     5     6
  • Specify the symbol representing missing values:

    read_csv("a,b,c\n1,2,.", na = ".")
    ## # A tibble: 1 x 3
    ##       a     b c    
    ##   <dbl> <dbl> <lgl>
    ## 1     1     2 NA

Writing to a file

  • Write to csv:

    write_csv(challenge, "challenge.csv")
  • Write (and read) RDS files:

    write_rds(challenge, "challenge.rds")
    read_rds("challenge.rds")

Excel files

  • readxl package (part of tidyverse) reads both xls and xlsx files:

    library(readxl)
    # xls file
    read_excel("datasets.xls")
    ## # A tibble: 150 x 5
    ##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
    ##  1          5.1         3.5          1.4         0.2 setosa 
    ##  2          4.9         3            1.4         0.2 setosa 
    ##  3          4.7         3.2          1.3         0.2 setosa 
    ##  4          4.6         3.1          1.5         0.2 setosa 
    ##  5          5           3.6          1.4         0.2 setosa 
    ##  6          5.4         3.9          1.7         0.4 setosa 
    ##  7          4.6         3.4          1.4         0.3 setosa 
    ##  8          5           3.4          1.5         0.2 setosa 
    ##  9          4.4         2.9          1.4         0.2 setosa 
    ## 10          4.9         3.1          1.5         0.1 setosa 
    ## # … with 140 more rows
    # xls file
    read_excel("datasets.xlsx")
    ## # A tibble: 150 x 5
    ##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
    ##  1          5.1         3.5          1.4         0.2 setosa 
    ##  2          4.9         3            1.4         0.2 setosa 
    ##  3          4.7         3.2          1.3         0.2 setosa 
    ##  4          4.6         3.1          1.5         0.2 setosa 
    ##  5          5           3.6          1.4         0.2 setosa 
    ##  6          5.4         3.9          1.7         0.4 setosa 
    ##  7          4.6         3.4          1.4         0.3 setosa 
    ##  8          5           3.4          1.5         0.2 setosa 
    ##  9          4.4         2.9          1.4         0.2 setosa 
    ## 10          4.9         3.1          1.5         0.1 setosa 
    ## # … with 140 more rows
  • List the sheet name:

    excel_sheets("datasets.xlsx")
    ## [1] "iris"     "mtcars"   "chickwts" "quakes"
  • Read in a specific sheet by name or number:

    read_excel("datasets.xlsx", sheet = "mtcars")
    ## # A tibble: 32 x 11
    ##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
    ##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    ##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
    ##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
    ##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
    ##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
    ##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
    ##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
    ##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
    ##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
    ##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
    ## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
    ## # … with 22 more rows
    read_excel("datasets.xlsx", sheet = 4)
    ## # A tibble: 1,000 x 5
    ##      lat  long depth   mag stations
    ##    <dbl> <dbl> <dbl> <dbl>    <dbl>
    ##  1 -20.4  182.   562   4.8       41
    ##  2 -20.6  181.   650   4.2       15
    ##  3 -26    184.    42   5.4       43
    ##  4 -18.0  182.   626   4.1       19
    ##  5 -20.4  182.   649   4         11
    ##  6 -19.7  184.   195   4         12
    ##  7 -11.7  166.    82   4.8       43
    ##  8 -28.1  182.   194   4.4       15
    ##  9 -28.7  182.   211   4.7       35
    ## 10 -17.5  180.   622   4.3       19
    ## # … with 990 more rows
  • Control subset of cells to read:

    # first 3 rows
    read_excel("datasets.xlsx", n_max = 3)
    ## # A tibble: 3 x 5
    ##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
    ## 1          5.1         3.5          1.4         0.2 setosa 
    ## 2          4.9         3            1.4         0.2 setosa 
    ## 3          4.7         3.2          1.3         0.2 setosa

    Excel type range

    read_excel("datasets.xlsx", range = "C1:E4")
    ## # A tibble: 3 x 3
    ##   Petal.Length Petal.Width Species
    ##          <dbl>       <dbl> <chr>  
    ## 1          1.4         0.2 setosa 
    ## 2          1.4         0.2 setosa 
    ## 3          1.3         0.2 setosa
    # first 4 rows
    read_excel("datasets.xlsx", range = cell_rows(1:4))
    ## # A tibble: 3 x 5
    ##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
    ## 1          5.1         3.5          1.4         0.2 setosa 
    ## 2          4.9         3            1.4         0.2 setosa 
    ## 3          4.7         3.2          1.3         0.2 setosa
    # columns B-D
    read_excel("datasets.xlsx", range = cell_cols("B:D"))
    ## # A tibble: 150 x 3
    ##    Sepal.Width Petal.Length Petal.Width
    ##          <dbl>        <dbl>       <dbl>
    ##  1         3.5          1.4         0.2
    ##  2         3            1.4         0.2
    ##  3         3.2          1.3         0.2
    ##  4         3.1          1.5         0.2
    ##  5         3.6          1.4         0.2
    ##  6         3.9          1.7         0.4
    ##  7         3.4          1.4         0.3
    ##  8         3.4          1.5         0.2
    ##  9         2.9          1.4         0.2
    ## 10         3.1          1.5         0.1
    ## # … with 140 more rows
    # sheet
    read_excel("datasets.xlsx", range = "mtcars!B1:D5")
    ## # A tibble: 4 x 3
    ##     cyl  disp    hp
    ##   <dbl> <dbl> <dbl>
    ## 1     6   160   110
    ## 2     6   160   110
    ## 3     4   108    93
    ## 4     6   258   110
  • Specify NAs:

    read_excel("datasets.xlsx", na = "setosa")
    ## # A tibble: 150 x 5
    ##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
    ##  1          5.1         3.5          1.4         0.2 <NA>   
    ##  2          4.9         3            1.4         0.2 <NA>   
    ##  3          4.7         3.2          1.3         0.2 <NA>   
    ##  4          4.6         3.1          1.5         0.2 <NA>   
    ##  5          5           3.6          1.4         0.2 <NA>   
    ##  6          5.4         3.9          1.7         0.4 <NA>   
    ##  7          4.6         3.4          1.4         0.3 <NA>   
    ##  8          5           3.4          1.5         0.2 <NA>   
    ##  9          4.4         2.9          1.4         0.2 <NA>   
    ## 10          4.9         3.1          1.5         0.1 <NA>   
    ## # … with 140 more rows
  • Writing Excel files: openxlsx and writexl packages.

Other types of data

  • haven reads SPSS, Stata, and SAS files.

  • DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc) allows you to run SQL queries against a database and return a data frame.

  • jsonlite reads json files.

  • xml2 reads XML files.

  • tidyxl reads non-tabular data from Excel.

Tidy data | r4ds chapter 12

“Happy families are all alike; every unhappy family is unhappy in its own way.” –– Leo Tolstoy

“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham

Tidy data

There are three interrelated rules which make a dataset tidy:

  • Each variable must have its own column.

  • Each observation must have its own row.

  • Each value must have its own cell.


  • Example table1

    table1
    ## # A tibble: 6 x 4
    ##   country      year  cases population
    ##   <chr>       <int>  <int>      <int>
    ## 1 Afghanistan  1999    745   19987071
    ## 2 Afghanistan  2000   2666   20595360
    ## 3 Brazil       1999  37737  172006362
    ## 4 Brazil       2000  80488  174504898
    ## 5 China        1999 212258 1272915272
    ## 6 China        2000 213766 1280428583

    is tidy.


  • Example table2

    table2
    ## # A tibble: 12 x 4
    ##    country      year type            count
    ##    <chr>       <int> <chr>           <int>
    ##  1 Afghanistan  1999 cases             745
    ##  2 Afghanistan  1999 population   19987071
    ##  3 Afghanistan  2000 cases            2666
    ##  4 Afghanistan  2000 population   20595360
    ##  5 Brazil       1999 cases           37737
    ##  6 Brazil       1999 population  172006362
    ##  7 Brazil       2000 cases           80488
    ##  8 Brazil       2000 population  174504898
    ##  9 China        1999 cases          212258
    ## 10 China        1999 population 1272915272
    ## 11 China        2000 cases          213766
    ## 12 China        2000 population 1280428583

    is not tidy.


  • Example table3

    table3
    ## # A tibble: 6 x 3
    ##   country      year rate             
    ## * <chr>       <int> <chr>            
    ## 1 Afghanistan  1999 745/19987071     
    ## 2 Afghanistan  2000 2666/20595360    
    ## 3 Brazil       1999 37737/172006362  
    ## 4 Brazil       2000 80488/174504898  
    ## 5 China        1999 212258/1272915272
    ## 6 China        2000 213766/1280428583

    is not tidy.


  • Example table4a

    table4a
    ## # A tibble: 3 x 3
    ##   country     `1999` `2000`
    ## * <chr>        <int>  <int>
    ## 1 Afghanistan    745   2666
    ## 2 Brazil       37737  80488
    ## 3 China       212258 213766

    is not tidy.

  • Example table4b

    table4b
    ## # A tibble: 3 x 3
    ##   country         `1999`     `2000`
    ## * <chr>            <int>      <int>
    ## 1 Afghanistan   19987071   20595360
    ## 2 Brazil       172006362  174504898
    ## 3 China       1272915272 1280428583

    is not tidy.

Gathering


Spreading

Separating



Unite