Announcement

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.0.2  magrittr_1.5    tools_4.0.2     htmltools_0.5.0
##  [5] yaml_2.2.1      stringi_1.5.3   rmarkdown_2.3   knitr_1.30     
##  [9] stringr_1.4.0   xfun_0.17       digest_0.6.25   rlang_0.4.7    
## [13] evaluate_0.14

Acknowledgement

Dr. Hua Zhou’s slides

Josh McCrain’s RSelenium tutorial

Load tidyverse and other packages for this lecture:

library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("rvest")
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding

Web scraping

There is a wealth of data on the internet. How can we scrape and analyze it?

rvest

rvest is an R package written by Hadley Wickham which makes web scraping easy.

Example: Scraping from webpage

Rank

  • Use SelectorGadget to highlight the element we want to scrape

  • Use the CSS selector to get the rankings

    # Select the ranking nodes with the CSS selector found via SelectorGadget
    (rank_data_html <- webpage %>% html_nodes(".text-primary"))
    ## {xml_nodeset (100)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [3] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [4] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [5] <span class="lister-item-index unbold text-primary">5.</span>
    ##  [6] <span class="lister-item-index unbold text-primary">6.</span>
    ##  [7] <span class="lister-item-index unbold text-primary">7.</span>
    ##  [8] <span class="lister-item-index unbold text-primary">8.</span>
    ##  [9] <span class="lister-item-index unbold text-primary">9.</span>
    ## [10] <span class="lister-item-index unbold text-primary">10.</span>
    ## [11] <span class="lister-item-index unbold text-primary">11.</span>
    ## [12] <span class="lister-item-index unbold text-primary">12.</span>
    ## [13] <span class="lister-item-index unbold text-primary">13.</span>
    ## [14] <span class="lister-item-index unbold text-primary">14.</span>
    ## [15] <span class="lister-item-index unbold text-primary">15.</span>
    ## [16] <span class="lister-item-index unbold text-primary">16.</span>
    ## [17] <span class="lister-item-index unbold text-primary">17.</span>
    ## [18] <span class="lister-item-index unbold text-primary">18.</span>
    ## [19] <span class="lister-item-index unbold text-primary">19.</span>
    ## [20] <span class="lister-item-index unbold text-primary">20.</span>
    ## ...
    # Alternative, more specific selector:
    # (rank_data_html <- html_nodes(webpage, '.lister-item-content .text-primary'))
    # Extract the text of each ranking node
    (rank_data <- rank_data_html %>% html_text())
    ##   [1] "1."   "2."   "3."   "4."   "5."   "6."   "7."   "8."   "9."   "10." 
    ##  [11] "11."  "12."  "13."  "14."  "15."  "16."  "17."  "18."  "19."  "20." 
    ##  [21] "21."  "22."  "23."  "24."  "25."  "26."  "27."  "28."  "29."  "30." 
    ##  [31] "31."  "32."  "33."  "34."  "35."  "36."  "37."  "38."  "39."  "40." 
    ##  [41] "41."  "42."  "43."  "44."  "45."  "46."  "47."  "48."  "49."  "50." 
    ##  [51] "51."  "52."  "53."  "54."  "55."  "56."  "57."  "58."  "59."  "60." 
    ##  [61] "61."  "62."  "63."  "64."  "65."  "66."  "67."  "68."  "69."  "70." 
    ##  [71] "71."  "72."  "73."  "74."  "75."  "76."  "77."  "78."  "79."  "80." 
    ##  [81] "81."  "82."  "83."  "84."  "85."  "86."  "87."  "88."  "89."  "90." 
    ##  [91] "91."  "92."  "93."  "94."  "95."  "96."  "97."  "98."  "99."  "100."
    # Convert the rank strings to integers, then print the result
    rank_data <- as.integer(rank_data)
    rank_data
    ##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
    ##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
    ##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
    ##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
    ##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
    ##  [91]  91  92  93  94  95  96  97  98  99 100

Title

  • Use SelectorGadget to find the CSS selector .lister-item-header a.

    # Select the title links with the CSS selector
    (title_data_html <- webpage %>% html_nodes(".lister-item-header a"))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
    ##  [2] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
    ##  [3] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
    ##  [4] <a href="/title/tt7549996/?ref_=adv_li_tt">Judy</a>
    ##  [5] <a href="/title/tt5363618/?ref_=adv_li_tt">Sound of Metal</a>
    ##  [6] <a href="/title/tt4154796/?ref_=adv_li_tt">Avengers: Endgame</a>
    ##  [7] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... in Hollyw ...
    ##  [8] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
    ##  [9] <a href="/title/tt1025100/?ref_=adv_li_tt">Gemini Man</a>
    ## [10] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
    ## [11] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
    ## [12] <a href="/title/tt4916630/?ref_=adv_li_tt">Just Mercy</a>
    ## [13] <a href="/title/tt4126476/?ref_=adv_li_tt">After</a>
    ## [14] <a href="/title/tt5606664/?ref_=adv_li_tt">Doctor Sleep</a>
    ## [15] <a href="/title/tt8688634/?ref_=adv_li_tt">21 Bridges</a>
    ## [16] <a href="/title/tt1620981/?ref_=adv_li_tt">The Addams Family</a>
    ## [17] <a href="/title/tt8772262/?ref_=adv_li_tt">Midsommar</a>
    ## [18] <a href="/title/tt6450804/?ref_=adv_li_tt">Terminator: Dark Fate</a>
    ## [19] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
    ## [20] <a href="/title/tt7984734/?ref_=adv_li_tt">The Lighthouse</a>
    ## ...
    # Extract the text of each title link
    (title_data <- title_data_html %>% html_text())
    ##   [1] "The Gentlemen"                                
    ##   [2] "Knives Out"                                   
    ##   [3] "Joker"                                        
    ##   [4] "Judy"                                         
    ##   [5] "Sound of Metal"                               
    ##   [6] "Avengers: Endgame"                            
    ##   [7] "Once Upon a Time... in Hollywood"             
    ##   [8] "Parasite"                                     
    ##   [9] "Gemini Man"                                   
    ##  [10] "Ford v Ferrari"                               
    ##  [11] "Jojo Rabbit"                                  
    ##  [12] "Just Mercy"                                   
    ##  [13] "After"                                        
    ##  [14] "Doctor Sleep"                                 
    ##  [15] "21 Bridges"                                   
    ##  [16] "The Addams Family"                            
    ##  [17] "Midsommar"                                    
    ##  [18] "Terminator: Dark Fate"                        
    ##  [19] "1917"                                         
    ##  [20] "The Lighthouse"                               
    ##  [21] "Motherless Brooklyn"                          
    ##  [22] "Bombshell"                                    
    ##  [23] "The Peanut Butter Falcon"                     
    ##  [24] "Jumanji: The Next Level"                      
    ##  [25] "Little Women"                                 
    ##  [26] "Star Wars: Episode IX - The Rise of Skywalker"
    ##  [27] "Ready or Not"                                 
    ##  [28] "Spiral"                                       
    ##  [29] "The Lion King"                                
    ##  [30] "It Chapter Two"                               
    ##  [31] "Charlie's Angels"                             
    ##  [32] "Uncut Gems"                                   
    ##  [33] "Infidel"                                      
    ##  [34] "The Irishman"                                 
    ##  [35] "The King"                                     
    ##  [36] "Midway"                                       
    ##  [37] "Spider-Man: Far from Home"                    
    ##  [38] "A Call to Spy"                                
    ##  [39] "Us"                                           
    ##  [40] "Alita: Battle Angel"                          
    ##  [41] "The Lodge"                                    
    ##  [42] "Captain Marvel"                               
    ##  [43] "Frozen II"                                    
    ##  [44] "Hustlers"                                     
    ##  [45] "Aladdin"                                      
    ##  [46] "I See You"                                    
    ##  [47] "Portrait of a Lady on Fire"                   
    ##  [48] "Richard Jewell"                               
    ##  [49] "The Platform"                                 
    ##  [50] "6 Underground"                                
    ##  [51] "Spies in Disguise"                            
    ##  [52] "Fast & Furious Presents: Hobbs & Shaw"        
    ##  [53] "Zombieland: Double Tap"                       
    ##  [54] "John Wick: Chapter 3 - Parabellum"            
    ##  [55] "Ad Astra"                                     
    ##  [56] "Godzilla: King of the Monsters"               
    ##  [57] "Marriage Story"                               
    ##  [58] "Do Not Reply"                                 
    ##  [59] "Rocketman"                                    
    ##  [60] "Toy Story 4"                                  
    ##  [61] "Vivarium"                                     
    ##  [62] "Iron Mask"                                    
    ##  [63] "X-Men: Dark Phoenix"                          
    ##  [64] "The Personal History of David Copperfield"    
    ##  [65] "Booksmart"                                    
    ##  [66] "Shazam!"                                      
    ##  [67] "Maleficent: Mistress of Evil"                 
    ##  [68] "Anna"                                         
    ##  [69] "Glass"                                        
    ##  [70] "A Beautiful Day in the Neighborhood"          
    ##  [71] "Color Out of Space"                           
    ##  [72] "Waiting for the Barbarians"                   
    ##  [73] "Pets United"                                  
    ##  [74] "Dark Waters"                                  
    ##  [75] "Guns Akimbo"                                  
    ##  [76] "Honey Boy"                                    
    ##  [77] "Bad Education"                                
    ##  [78] "Operation Brothers"                           
    ##  [79] "Scary Stories to Tell in the Dark"            
    ##  [80] "Yesterday"                                    
    ##  [81] "Cats"                                         
    ##  [82] "The Informer"                                 
    ##  [83] "The Jesus Rolls"                              
    ##  [84] "Extremely Wicked, Shockingly Evil and Vile"   
    ##  [85] "Dora and the Lost City of Gold"               
    ##  [86] "Polar"                                        
    ##  [87] "Angel Has Fallen"                             
    ##  [88] "3022"                                         
    ##  [89] "Babyteeth"                                    
    ##  [90] "A Hidden Life"                                
    ##  [91] "Rambo: Last Blood"                            
    ##  [92] "Monsoon"                                      
    ##  [93] "Saint Maud"                                   
    ##  [94] "El Camino: A Breaking Bad Movie"              
    ##  [95] "In the Shadow of the Moon"                    
    ##  [96] "Downton Abbey"                                
    ##  [97] "The Wretched"                                 
    ##  [98] "Murder Mystery"                               
    ##  [99] "True History of the Kelly Gang"               
    ## [100] "Blackbird"

Description

  • # Using CSS selectors to scrap the description section
    (description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted"))
    ## {xml_nodeset (100)}
    ##  [1] <p class="text-muted">\n    An American expat tries to sell off his high ...
    ##  [2] <p class="text-muted">\n    A detective investigates the death of a patr ...
    ##  [3] <p class="text-muted">\n    In Gotham City, mentally troubled comedian A ...
    ##  [4] <p class="text-muted">\n    Legendary performer <a href="/name/nm0000023 ...
    ##  [5] <p class="text-muted">\n    A heavy-metal drummer's life is thrown into  ...
    ##  [6] <p class="text-muted">\n    After the devastating events of <a href="/ti ...
    ##  [7] <p class="text-muted">\n    A faded television actor and his stunt doubl ...
    ##  [8] <p class="text-muted">\n    Greed and class discrimination threaten the  ...
    ##  [9] <p class="text-muted">\n    An over-the-hill hitman faces off against a  ...
    ## [10] <p class="text-muted">\n    American car designer <a href="/name/nm07909 ...
    ## [11] <p class="text-muted">\n    A young boy in Hitler's army finds out his m ...
    ## [12] <p class="text-muted">\n    World-renowned civil rights defense attorney ...
    ## [13] <p class="text-muted">\n    A young woman falls for a guy with a dark se ...
    ## [14] <p class="text-muted">\n    Years following the events of <a href="/titl ...
    ## [15] <p class="text-muted">\n    An embattled NYPD detective is thrust into a ...
    ## [16] <p class="text-muted">\n    The eccentrically macabre family moves to a  ...
    ## [17] <p class="text-muted">\n    A couple travels to Sweden to visit a rural  ...
    ## [18] <p class="text-muted">\n    An augmented human and Sarah Connor must sto ...
    ## [19] <p class="text-muted">\n    April 6th, 1917. As a regiment assembles to  ...
    ## [20] <p class="text-muted">\n    Two lighthouse keepers try to maintain their ...
    ## ...
    # Extract the text of each description node
    description_data <- description_data_html %>% html_text()
    # Peek at the first few entries
    head(description_data)
    ## [1] "\n    An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                           
    ## [2] "\n    A detective investigates the death of a patriarch of an eccentric, combative family."                                                                                                                                                   
    ## [3] "\n    In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker."
    ## [4] "\n    Legendary performer Judy Garland arrives in London in the winter of 1968 to perform a series of sold-out concerts."                                                                                                                     
    ## [5] "\n    A heavy-metal drummer's life is thrown into freefall when he begins to lose his hearing."                                                                                                                                               
    ## [6] "\n    After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
    # Remove the leading newline and the whitespace that follows it
    description_data <- description_data %>% str_replace("^\\n\\s+", "")
    head(description_data)
    ## [1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                           
    ## [2] "A detective investigates the death of a patriarch of an eccentric, combative family."                                                                                                                                                   
    ## [3] "In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker."
    ## [4] "Legendary performer Judy Garland arrives in London in the winter of 1968 to perform a series of sold-out concerts."                                                                                                                     
    ## [5] "A heavy-metal drummer's life is thrown into freefall when he begins to lose his hearing."                                                                                                                                               
    ## [6] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."

Runtime

# Scrape the runtimes in one pipeline: select the nodes, extract their text,
# drop the " min" suffix, and convert to integer
runtime_data <- webpage %>%
  html_nodes(".runtime") %>%
  html_text() %>%
  str_replace(" min", "") %>%
  as.integer()
runtime_data
##   [1] 113 130 122 118 130 181 161 132 117 152 108 137 105 152  99  86 148 128
##  [19] 119 109 144 109  97 123 135 141  95  87 118 169 118 135 108 209 140 138
##  [37] 129 123 116 122 108 123 103 110 128  98 122 131  94 128 102 137  99 130
##  [55] 123 132 137  98 121 100  97 120 113 119 102 132 119 118 129 109 111 112
##  [73]  89 126  98  94 108 129 108 116 110 113  85 110 102 118 121  91 118 174
##  [91]  89  85  84 122 115 122  95  97 124  97
# The same scrape, one step at a time: first select the runtime nodes
runtime_data_html <- webpage %>% html_nodes(".runtime")
# Extract the text of each runtime node
runtime_data <- runtime_data_html %>% html_text()
# Peek at the raw runtime strings
head(runtime_data)
## [1] "113 min" "130 min" "122 min" "118 min" "130 min" "181 min"
# Data-Preprocessing: drop the " min" suffix and convert to numeric
runtime_data <- as.numeric(str_replace(runtime_data, " min", ""))
# Let's have another look at the runtime data
head(runtime_data)
## [1] 113 130 122 118 130 181

Genre

  • Collect the (first) genre of each movie:

    # Select the genre nodes with the CSS selector
    genre_data_html <- webpage %>% html_nodes(".genre")
    # Extract the text of each genre node
    genre_data <- genre_data_html %>% html_text()
    # Peek at the raw genre strings
    head(genre_data)
    ## [1] "\nAction, Comedy, Crime            "    
    ## [2] "\nComedy, Crime, Drama            "     
    ## [3] "\nCrime, Drama, Thriller            "   
    ## [4] "\nBiography, Drama, Romance            "
    ## [5] "\nDrama, Music            "             
    ## [6] "\nAction, Adventure, Drama            "
    # Data-Preprocessing: keep the first genre of each movie. Trim the
    # surrounding whitespace, then take everything before the first comma so
    # that hyphenated genres such as "Sci-Fi" stay whole (the pattern
    # "[:alpha:]+" would stop at the hyphen and return "Sci").
    genre_data <- str_extract(str_trim(genre_data), "^[^,]+")
    # Converting each genre from text to factor
    #genre_data <- as.factor(genre_data)
    # Let's have another look at the genre data
    head(genre_data)
    ## [1] "Action"    "Comedy"    "Crime"     "Biography" "Drama"     "Action"

Rating

  • # Using CSS selectors to scrap the IMDB rating section
    rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
    # Extract the text of each rating node
    rating_data <- rating_data_html %>% html_text()
    # Peek at the first few ratings
    head(rating_data)
    ## [1] "7.9" "7.9" "8.5" "6.8" "7.0" "8.4"
    # Data-Preprocessing: the ratings arrive as text, so convert them to numeric
    rating_data <- rating_data %>% as.numeric()
    # Print the full vector of ratings
    rating_data
    ##   [1] 7.9 7.9 8.5 6.8 7.0 8.4 7.6 8.6 5.7 8.1 7.9 7.6 5.3 7.4 6.6 5.8 7.1 6.2
    ##  [19] 8.3 7.5 6.8 6.8 7.6 6.7 7.8 6.6 6.8 5.9 6.9 6.5 4.8 7.4 6.6 7.9 7.2 6.7
    ##  [37] 7.5 5.5 6.9 7.3 6.0 6.9 6.9 6.3 7.0 6.8 8.1 7.5 7.0 6.1 6.8 6.4 6.7 7.4
    ##  [55] 6.6 6.0 8.0 4.1 7.3 7.8 5.8 4.6 5.8 6.4 7.2 7.1 6.6 6.6 6.7 7.3 6.2 5.8
    ##  [73] 3.4 7.6 6.3 7.3 7.1 6.6 6.2 6.8 2.8 6.6 4.3 6.6 6.1 6.3 6.4 4.6 7.1 7.4
    ##  [91] 6.1 5.9 6.9 7.3 6.2 7.4 5.8 6.0 6.0 6.4

Votes

  • # Using CSS selectors to scrap the votes section
    votes_data_html <- webpage %>%
      html_nodes(".sort-num_votes-visible span:nth-child(2)")
    # Extract the text of each vote-count node
    votes_data <- votes_data_html %>% html_text()
    # Peek at the raw vote counts
    head(votes_data)
    ## [1] "189,868" "396,786" "875,700" "36,134"  "381"     "767,928"
    # Data-Preprocessing: remove every thousands separator. str_replace() drops
    # only the first comma, so a count of a million or more ("1,234,567")
    # would keep a comma and become NA in the numeric conversion below.
    votes_data <- str_replace_all(votes_data, ",", "")
    # Data-Preprocessing: converting votes to numerical
    votes_data <- as.numeric(votes_data)
    # Let's have another look at the votes data
    votes_data
    ##   [1] 189868 396786 875700  36134    381 767928 517451 487781  78802 262853
    ##  [11] 264068  39822  32707 127429  41749  24700 181502 137047 373982 125799
    ##  [21]  36334  73816  56243 173739 126331 352201 100029    888 202357 195488
    ##  [31]  51674 203531    755 308378  81408  59935 308543    304 212981 219723
    ##  [41]  26582 426110 125971  78510 218135  23389  52756  51111 149289 131383
    ##  [51]  30265 160281 129218 261386 188637 126840 231240    224 126091 191907
    ##  [61]  31073   2663 148296   8641  86634 243422  78676  56625 199778  52035
    ##  [71]  26373   4157   1024  47926  32549  24332  25659  21828  56304 107094
    ##  [81]  36619  16870   2465  74100  22177  71624  74166   3263   5189  15707
    ##  [91]  77768    371    656 174550  37827  36718   7813  96490   5793    623

Director

  • CSS selector reference

    # Select the first link in the credits paragraph of each movie
    (directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)"))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
    ##  [2] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
    ##  [3] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
    ##  [4] <a href="/name/nm3734458/?ref_=adv_li_dr_0">Rupert Goold</a>
    ##  [5] <a href="/name/nm2942187/?ref_=adv_li_dr_0">Darius Marder</a>
    ##  [6] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
    ##  [7] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
    ##  [8] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
    ##  [9] <a href="/name/nm0000487/?ref_=adv_li_dr_0">Ang Lee</a>
    ## [10] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
    ## [11] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
    ## [12] <a href="/name/nm2308774/?ref_=adv_li_dr_0">Destin Daniel Cretton</a>
    ## [13] <a href="/name/nm1788310/?ref_=adv_li_dr_0">Jenny Gage</a>
    ## [14] <a href="/name/nm1093039/?ref_=adv_li_dr_0">Mike Flanagan</a>
    ## [15] <a href="/name/nm1047532/?ref_=adv_li_dr_0">Brian Kirk</a>
    ## [16] <a href="/name/nm0862911/?ref_=adv_li_dr_0">Greg Tiernan</a>
    ## [17] <a href="/name/nm4170048/?ref_=adv_li_dr_0">Ari Aster</a>
    ## [18] <a href="/name/nm1783265/?ref_=adv_li_dr_0">Tim Miller</a>
    ## [19] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
    ## [20] <a href="/name/nm3211470/?ref_=adv_li_dr_0">Robert Eggers</a>
    ## ...
    # Extract the text of each director link
    directors_data <- directors_data_html %>% html_text()
    # Print the full vector of directors
    directors_data
    ##   [1] "Guy Ritchie"            "Rian Johnson"           "Todd Phillips"         
    ##   [4] "Rupert Goold"           "Darius Marder"          "Anthony Russo"         
    ##   [7] "Quentin Tarantino"      "Bong Joon Ho"           "Ang Lee"               
    ##  [10] "James Mangold"          "Taika Waititi"          "Destin Daniel Cretton" 
    ##  [13] "Jenny Gage"             "Mike Flanagan"          "Brian Kirk"            
    ##  [16] "Greg Tiernan"           "Ari Aster"              "Tim Miller"            
    ##  [19] "Sam Mendes"             "Robert Eggers"          "Edward Norton"         
    ##  [22] "Jay Roach"              "Tyler Nilson"           "Jake Kasdan"           
    ##  [25] "Greta Gerwig"           "J.J. Abrams"            "Matt Bettinelli-Olpin" 
    ##  [28] "Kurtis David Harder"    "Jon Favreau"            "Andy Muschietti"       
    ##  [31] "Elizabeth Banks"        "Benny Safdie"           "Cyrus Nowrasteh"       
    ##  [34] "Martin Scorsese"        "David Michôd"           "Roland Emmerich"       
    ##  [37] "Jon Watts"              "Lydia Dean Pilcher"     "Jordan Peele"          
    ##  [40] "Robert Rodriguez"       "Severin Fiala"          "Anna Boden"            
    ##  [43] "Chris Buck"             "Lorene Scafaria"        "Guy Ritchie"           
    ##  [46] "Adam Randall"           "Céline Sciamma"         "Clint Eastwood"        
    ##  [49] "Galder Gaztelu-Urrutia" "Michael Bay"            "Nick Bruno"            
    ##  [52] "David Leitch"           "Ruben Fleischer"        "Chad Stahelski"        
    ##  [55] "James Gray"             "Michael Dougherty"      "Noah Baumbach"         
    ##  [58] "Daniel Woltosz"         "Dexter Fletcher"        "Josh Cooley"           
    ##  [61] "Lorcan Finnegan"        "Oleg Stepchenko"        "Simon Kinberg"         
    ##  [64] "Armando Iannucci"       "Olivia Wilde"           "David F. Sandberg"     
    ##  [67] "Joachim Rønning"        "Luc Besson"             "M. Night Shyamalan"    
    ##  [70] "Marielle Heller"        "Richard Stanley"        "Ciro Guerra"           
    ##  [73] "Reinhard Klooss"        "Todd Haynes"            "Jason Lei Howden"      
    ##  [76] "Alma Har'el"            "Cory Finley"            "Gideon Raff"           
    ##  [79] "André Øvredal"          "Danny Boyle"            "Tom Hooper"            
    ##  [82] "Andrea Di Stefano"      "John Turturro"          "Joe Berlinger"         
    ##  [85] "James Bobin"            "Jonas Åkerlund"         "Ric Roman Waugh"       
    ##  [88] "John Suits"             "Shannon Murphy"         "Terrence Malick"       
    ##  [91] "Adrian Grunberg"        "Hong Khaou"             "Rose Glass"            
    ##  [94] "Vince Gilligan"         "Jim Mickle"             "Michael Engler"        
    ##  [97] "Brett Pierce"           "Kyle Newacheck"         "Justin Kurzel"         
    ## [100] "Roger Michell"

Actor

  • # Using CSS selectors to scrap the actors section
    (actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a"))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
    ##  [2] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
    ##  [3] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
    ##  [4] <a href="/name/nm0000250/?ref_=adv_li_st_0">Renée Zellweger</a>
    ##  [5] <a href="/name/nm1981893/?ref_=adv_li_st_0">Riz Ahmed</a>
    ##  [6] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
    ##  [7] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
    ##  [8] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
    ##  [9] <a href="/name/nm0000226/?ref_=adv_li_st_0">Will Smith</a>
    ## [10] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
    ## [11] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
    ## [12] <a href="/name/nm0430107/?ref_=adv_li_st_0">Michael B. Jordan</a>
    ## [13] <a href="/name/nm6466214/?ref_=adv_li_st_0">Josephine Langford</a>
    ## [14] <a href="/name/nm0000191/?ref_=adv_li_st_0">Ewan McGregor</a>
    ## [15] <a href="/name/nm1569276/?ref_=adv_li_st_0">Chadwick Boseman</a>
    ## [16] <a href="/name/nm1209966/?ref_=adv_li_st_0">Oscar Isaac</a>
    ## [17] <a href="/name/nm6073955/?ref_=adv_li_st_0">Florence Pugh</a>
    ## [18] <a href="/name/nm0000157/?ref_=adv_li_st_0">Linda Hamilton</a>
    ## [19] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
    ## [20] <a href="/name/nm1500155/?ref_=adv_li_st_0">Robert Pattinson</a>
    ## ...
    # Extract the text of each actor link
    actors_data <- actors_data_html %>% html_text()
    # Peek at the first few actors
    head(actors_data)
    ## [1] "Matthew McConaughey" "Daniel Craig"        "Joaquin Phoenix"    
    ## [4] "Renée Zellweger"     "Riz Ahmed"           "Robert Downey Jr."

Metascore

  • Be careful with missing data.

    # Select the metascore nodes with the CSS selector
    metascore_data_html <- webpage %>% html_nodes(".metascore")
    # Extract the text of each metascore node
    metascore_data <- metascore_data_html %>% html_text()
    # Peek at the raw metascore strings
    head(metascore_data)
    ## [1] "51        " "82        " "59        " "66        " "87        "
    ## [6] "78        "
    # Data-Preprocessing: strip the trailing whitespace and convert to numeric
    metascore_data <- as.numeric(str_replace(metascore_data, "\\s*$", ""))
    metascore_data
    ##  [1] 51 82 59 66 87 78 83 96 38 81 58 68 30 59 51 46 72 54 78 83 60 64 70 58 91
    ## [26] 53 64 55 58 52 91 94 62 47 69 67 81 53 64 64 64 79 53 65 95 68 73 41 54 60
    ## [51] 55 73 80 48 94 69 84 64 43 77 84 71 43 40 43 80 70 52 73 42 73 79 61 55 32
    ## [76] 61 44 52 63 19 45 77 78 26 77 72 48 64 61 38 75 54
    # Let's check the length of the metascore data: a count below the number of
    # movies scraped means some movies have no metascore
    length(metascore_data)
    ## [1] 92
    # Some movies have no metascore, so the scraped values cannot be matched to
    # the 100 movies by position. Hard-coding the missing positions is fragile:
    # excluding only positions 24, 85 and 100 left 97 slots for 92 values, so R
    # warned and recycled the first scores into the tail of the vector.
    # Instead, look up '.metascore' inside each movie's own container:
    # html_node() returns exactly one result per input node, and a missing
    # result becomes NA after html_text().
    ms <- webpage %>%
      html_nodes(".lister-item-content") %>%
      html_node(".metascore") %>%
      html_text() %>%
      str_trim() %>%
      as.numeric()
    # NOTE: the vector printed below comes from the earlier rendering with
    # hard-coded positions; re-knit the document to refresh it.
    (metascore_data <- ms)
    ##   [1] 51 82 59 66 87 78 83 96 38 81 58 68 30 59 51 46 72 54 78 83 60 64 70 NA 58
    ##  [26] 91 53 64 55 58 52 91 94 62 47 69 67 81 53 64 64 64 79 53 65 95 68 73 41 54
    ##  [51] 60 55 73 80 48 94 69 84 64 43 77 84 71 43 40 43 80 70 52 73 42 73 79 61 55
    ##  [76] 32 61 44 52 63 19 45 77 78 NA 26 77 72 48 64 61 38 75 54 51 82 59 66 87 NA

Gross

  • Be careful with missing data.

    # Select the gross revenue nodes with the CSS selector
    gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
    # Extract the text of each gross revenue node
    gross_data <- gross_data_html %>% html_text()
    # Peek at the raw gross strings
    head(gross_data)
    ## [1] "$165.36M" "$335.45M" "$858.37M" "$142.50M" "$53.37M"  "$20.55M"
    # Data-Preprocessing: remove the "M", skip the leading "$" by keeping
    # characters 2 through 10, then convert to numeric
    # Alternative: (gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
    gross_data <- gross_data %>%
      str_replace("M", "") %>%
      str_sub(2, 10) %>%
      as.numeric()
    # Let's check the length of gross data
    length(gross_data)
    ## [1] 51
    # Visual inspection finds below movies don't have gross
    #gs_data <- rep(NA, 100)
    #gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
    #(gross_data <- gs_data)

    49 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.

    # Scrape ranks and gross figures together so they come back interleaved
    rank_and_gross_html <- html_nodes(
      webpage, '.ghost~ .text-muted+ span , .text-primary'
    )
    # Drop the first whitespace run, then every '$' and 'M'
    rank_and_gross <- rank_and_gross_html %>%
      html_text() %>%
      str_replace("\\s+", "") %>%
      str_replace_all("[$M]", "")
    rank_and_gross
    ##   [1] "1."     "2."     "165.36" "3."     "335.45" "4."     "5."     "6."    
    ##   [9] "858.37" "7."     "142.50" "8."     "53.37"  "9."     "20.55"  "10."   
    ##  [17] "117.62" "11."    "0.35"   "12."    "13."    "12.14"  "14."    "15."   
    ##  [25] "16."    "100.04" "17."    "27.33"  "18."    "62.25"  "19."    "159.23"
    ##  [33] "20."    "0.43"   "21."    "22."    "23."    "13.12"  "24."    "316.83"
    ##  [41] "25."    "108.10" "26."    "515.20" "27."    "26.74"  "28."    "29."   
    ##  [49] "543.64" "30."    "211.59" "31."    "32."    "33."    "34."    "7.00"  
    ##  [57] "35."    "36."    "37."    "390.53" "38."    "39."    "175.08" "40."   
    ##  [65] "85.71"  "41."    "42."    "426.83" "43."    "477.37" "44."    "80.55" 
    ##  [73] "45."    "355.56" "46."    "47."    "3.76"   "48."    "49."    "50."   
    ##  [81] "51."    "52."    "173.96" "53."    "26.80"  "54."    "171.02" "55."   
    ##  [89] "35.40"  "56."    "110.50" "57."    "2.00"   "58."    "59."    "96.37" 
    ##  [97] "60."    "434.04" "61."    "62."    "63."    "65.85"  "64."    "65."   
    ## [105] "22.68"  "66."    "140.37" "67."    "113.93" "68."    "7.74"   "69."   
    ## [113] "111.05" "70."    "61.70"  "71."    "72."    "73."    "74."    "75."   
    ## [121] "76."    "77."    "78."    "79."    "62.74"  "80."    "73.29"  "81."   
    ## [129] "82."    "83."    "84."    "85."    "54.89"  "86."    "87."    "67.16" 
    ## [137] "88."    "89."    "90."    "91."    "18.87"  "92."    "93."    "94."   
    ## [145] "95."    "96."    "96.85"  "97."    "98."    "99."    "100."
    # Flag which entries are ranks (they end with a period, e.g. "12.")
    isrank <- str_detect(rank_and_gross, "\\.$")
    # A rank has no gross figure when the next entry is also a rank; padding the
    # shifted vector with TRUE also flags a rank that is the very last entry
    ismissing <- isrank & c(isrank[-1], TRUE)
    missingpos <- as.integer(rank_and_gross[ismissing])
    # One logical per movie: TRUE when a gross figure was scraped for it.
    # A logical mask (unlike `-missingpos`) still works when nothing is missing.
    has_gross <- !ismissing[isrank]
    # Guard against silent recycling if the two scrapes ever disagree
    stopifnot(sum(has_gross) == length(gross_data))
    gs_data <- rep(NA_real_, length(has_gross))
    gs_data[has_gross] <- gross_data
    (gross_data <- gs_data)
    ##   [1]     NA 165.36 335.45     NA     NA 858.37 142.50  53.37  20.55 117.62
    ##  [11]   0.35     NA  12.14     NA     NA 100.04  27.33  62.25 159.23   0.43
    ##  [21]     NA     NA  13.12 316.83 108.10 515.20  26.74     NA 543.64 211.59
    ##  [31]     NA     NA     NA   7.00     NA     NA 390.53     NA 175.08  85.71
    ##  [41]     NA 426.83 477.37  80.55 355.56     NA   3.76     NA     NA     NA
    ##  [51]     NA 173.96  26.80 171.02  35.40 110.50   2.00     NA  96.37 434.04
    ##  [61]     NA     NA  65.85     NA  22.68 140.37 113.93   7.74 111.05  61.70
    ##  [71]     NA     NA     NA     NA     NA     NA     NA     NA  62.74  73.29
    ##  [81]     NA     NA     NA     NA  54.89     NA  67.16     NA     NA     NA
    ##  [91]  18.87     NA     NA     NA     NA  96.85     NA     NA     NA     NA

Missing entries - more reproducible way

  • The following code programmatically figures out the missing entries for metascore.

    # Use CSS selectors to grab the rank nodes and the metascore nodes together
    rank_metascore_data_html <- webpage %>%
      html_nodes('.unfavorable , .favorable , .mixed , .text-primary')
    rank_metascore_data_html
    ## {xml_nodeset (192)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="metascore  mixed">51        </span>
    ##  [3] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [4] <span class="metascore  favorable">82        </span>
    ##  [5] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [6] <span class="metascore  mixed">59        </span>
    ##  [7] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [8] <span class="metascore  favorable">66        </span>
    ##  [9] <span class="lister-item-index unbold text-primary">5.</span>
    ## [10] <span class="metascore  favorable">87        </span>
    ## [11] <span class="lister-item-index unbold text-primary">6.</span>
    ## [12] <span class="metascore  favorable">78        </span>
    ## [13] <span class="lister-item-index unbold text-primary">7.</span>
    ## [14] <span class="metascore  favorable">83        </span>
    ## [15] <span class="lister-item-index unbold text-primary">8.</span>
    ## [16] <span class="metascore  favorable">96        </span>
    ## [17] <span class="lister-item-index unbold text-primary">9.</span>
    ## [18] <span class="metascore  unfavorable">38        </span>
    ## [19] <span class="lister-item-index unbold text-primary">10.</span>
    ## [20] <span class="metascore  favorable">81        </span>
    ## ...
    # Convert the rank and metascore nodes to text
    rank_metascore_data <- rank_metascore_data_html %>% html_text()
    rank_metascore_data
    ##   [1] "1."         "51        " "2."         "82        " "3."        
    ##   [6] "59        " "4."         "66        " "5."         "87        "
    ##  [11] "6."         "78        " "7."         "83        " "8."        
    ##  [16] "96        " "9."         "38        " "10."        "81        "
    ##  [21] "11."        "58        " "12."        "68        " "13."       
    ##  [26] "30        " "14."        "59        " "15."        "51        "
    ##  [31] "16."        "46        " "17."        "72        " "18."       
    ##  [36] "54        " "19."        "78        " "20."        "83        "
    ##  [41] "21."        "60        " "22."        "64        " "23."       
    ##  [46] "70        " "24."        "58        " "25."        "91        "
    ##  [51] "26."        "53        " "27."        "64        " "28."       
    ##  [56] "29."        "55        " "30."        "58        " "31."       
    ##  [61] "52        " "32."        "91        " "33."        "34."       
    ##  [66] "94        " "35."        "62        " "36."        "47        "
    ##  [71] "37."        "69        " "38."        "67        " "39."       
    ##  [76] "81        " "40."        "53        " "41."        "64        "
    ##  [81] "42."        "64        " "43."        "64        " "44."       
    ##  [86] "79        " "45."        "53        " "46."        "65        "
    ##  [91] "47."        "95        " "48."        "68        " "49."       
    ##  [96] "73        " "50."        "41        " "51."        "54        "
    ## [101] "52."        "60        " "53."        "55        " "54."       
    ## [106] "73        " "55."        "80        " "56."        "48        "
    ## [111] "57."        "94        " "58."        "59."        "69        "
    ## [116] "60."        "84        " "61."        "64        " "62."       
    ## [121] "63."        "43        " "64."        "77        " "65."       
    ## [126] "84        " "66."        "71        " "67."        "43        "
    ## [131] "68."        "40        " "69."        "43        " "70."       
    ## [136] "80        " "71."        "70        " "72."        "52        "
    ## [141] "73."        "74."        "73        " "75."        "42        "
    ## [146] "76."        "73        " "77."        "79        " "78."       
    ## [151] "79."        "61        " "80."        "55        " "81."       
    ## [156] "32        " "82."        "61        " "83."        "44        "
    ## [161] "84."        "52        " "85."        "63        " "86."       
    ## [166] "19        " "87."        "45        " "88."        "89."       
    ## [171] "77        " "90."        "78        " "91."        "26        "
    ## [176] "92."        "93."        "77        " "94."        "72        "
    ## [181] "95."        "48        " "96."        "64        " "97."       
    ## [186] "61        " "98."        "38        " "99."        "75        "
    ## [191] "100."       "54        "
    # Strip the whitespace padding that follows each metascore
    rank_metascore_data <- rank_metascore_data %>% str_replace("\\s+", "")
    rank_metascore_data
    ##   [1] "1."   "51"   "2."   "82"   "3."   "59"   "4."   "66"   "5."   "87"  
    ##  [11] "6."   "78"   "7."   "83"   "8."   "96"   "9."   "38"   "10."  "81"  
    ##  [21] "11."  "58"   "12."  "68"   "13."  "30"   "14."  "59"   "15."  "51"  
    ##  [31] "16."  "46"   "17."  "72"   "18."  "54"   "19."  "78"   "20."  "83"  
    ##  [41] "21."  "60"   "22."  "64"   "23."  "70"   "24."  "58"   "25."  "91"  
    ##  [51] "26."  "53"   "27."  "64"   "28."  "29."  "55"   "30."  "58"   "31." 
    ##  [61] "52"   "32."  "91"   "33."  "34."  "94"   "35."  "62"   "36."  "47"  
    ##  [71] "37."  "69"   "38."  "67"   "39."  "81"   "40."  "53"   "41."  "64"  
    ##  [81] "42."  "64"   "43."  "64"   "44."  "79"   "45."  "53"   "46."  "65"  
    ##  [91] "47."  "95"   "48."  "68"   "49."  "73"   "50."  "41"   "51."  "54"  
    ## [101] "52."  "60"   "53."  "55"   "54."  "73"   "55."  "80"   "56."  "48"  
    ## [111] "57."  "94"   "58."  "59."  "69"   "60."  "84"   "61."  "64"   "62." 
    ## [121] "63."  "43"   "64."  "77"   "65."  "84"   "66."  "71"   "67."  "43"  
    ## [131] "68."  "40"   "69."  "43"   "70."  "80"   "71."  "70"   "72."  "52"  
    ## [141] "73."  "74."  "73"   "75."  "42"   "76."  "73"   "77."  "79"   "78." 
    ## [151] "79."  "61"   "80."  "55"   "81."  "32"   "82."  "61"   "83."  "44"  
    ## [161] "84."  "52"   "85."  "63"   "86."  "19"   "87."  "45"   "88."  "89." 
    ## [171] "77"   "90."  "78"   "91."  "26"   "92."  "93."  "77"   "94."  "72"  
    ## [181] "95."  "48"   "96."  "64"   "97."  "61"   "98."  "38"   "99."  "75"  
    ## [191] "100." "54"
    # Ranks end with a period (e.g. "28."); metascores do not.
    # A rank followed by another rank means the metascore for the 1st rank is missing
    isrank <- rank_metascore_data %>% str_detect("\\.$")
    isrank
    ##   [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [13]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [25]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [37]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [49]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
    ##  [61] FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [73]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [85]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [97]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [109]  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [121]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [133]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
    ## [145] FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [157]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [169]  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE
    ## [181]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    # Compare every entry with its successor: a rank directly followed by another
    # rank has no metascore. Padding the shifted vector with TRUE also flags a
    # rank sitting in the very last position (nothing follows it).
    # (Avoids `1:length(x)-1`, which parses as `(1:length(x)) - 1`.)
    ismissing <- isrank & c(isrank[-1], TRUE)
    (missingpos <- as.integer(rank_metascore_data[ismissing]))
    ## [1] 28 33 58 62 73 78 88 92
    #(rank_metascore_data <- as.integer(rank_metascore_data))
  • You (students) should work out the code for finding missing positions for gross.

Visualizing movie data

  • Form a tibble:

    # Assemble the scraped vectors into one tibble, one column per variable
    movies <- tibble(
      Rank = rank_data,
      Title = title_data,
      Description = description_data,
      Runtime = runtime_data,
      Genre = genre_data,
      Rating = rating_data,
      Metascore = metascore_data,
      Votes = votes_data,
      Gross_Earning_in_Mil = gross_data,
      Director = directors_data,
      Actor = actors_data
    )
    # Show every column rather than truncating to the console width
    print(movies, width = Inf)
    ## # A tibble: 100 x 11
    ##     Rank Title                           
    ##    <int> <chr>                           
    ##  1     1 The Gentlemen                   
    ##  2     2 Knives Out                      
    ##  3     3 Joker                           
    ##  4     4 Judy                            
    ##  5     5 Sound of Metal                  
    ##  6     6 Avengers: Endgame               
    ##  7     7 Once Upon a Time... in Hollywood
    ##  8     8 Parasite                        
    ##  9     9 Gemini Man                      
    ## 10    10 Ford v Ferrari                  
    ##    Description                                                                  
    ##    <chr>                                                                        
    ##  1 An American expat tries to sell off his highly profitable marijuana empire i…
    ##  2 A detective investigates the death of a patriarch of an eccentric, combative…
    ##  3 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and m…
    ##  4 Legendary performer Judy Garland arrives in London in the winter of 1968 to …
    ##  5 A heavy-metal drummer's life is thrown into freefall when he begins to lose …
    ##  6 After the devastating events of Avengers: Infinity War (2018), the universe …
    ##  7 A faded television actor and his stunt double strive to achieve fame and suc…
    ##  8 Greed and class discrimination threaten the newly formed symbiotic relations…
    ##  9 An over-the-hill hitman faces off against a younger clone of himself.        
    ## 10 American car designer Carroll Shelby and driver Ken Miles battle corporate i…
    ##    Runtime Genre     Rating Metascore  Votes Gross_Earning_in_Mil
    ##      <dbl> <chr>      <dbl>     <dbl>  <dbl>                <dbl>
    ##  1     113 Action       7.9        51 189868                 NA  
    ##  2     130 Comedy       7.9        82 396786                165. 
    ##  3     122 Crime        8.5        59 875700                335. 
    ##  4     118 Biography    6.8        66  36134                 NA  
    ##  5     130 Drama        7          87    381                 NA  
    ##  6     181 Action       8.4        78 767928                858. 
    ##  7     161 Comedy       7.6        83 517451                142. 
    ##  8     132 Comedy       8.6        96 487781                 53.4
    ##  9     117 Action       5.7        38  78802                 20.6
    ## 10     152 Action       8.1        81 262853                118. 
    ##    Director          Actor              
    ##    <chr>             <chr>              
    ##  1 Guy Ritchie       Matthew McConaughey
    ##  2 Rian Johnson      Daniel Craig       
    ##  3 Todd Phillips     Joaquin Phoenix    
    ##  4 Rupert Goold      Renée Zellweger    
    ##  5 Darius Marder     Riz Ahmed          
    ##  6 Anthony Russo     Robert Downey Jr.  
    ##  7 Quentin Tarantino Leonardo DiCaprio  
    ##  8 Bong Joon Ho      Kang-ho Song       
    ##  9 Ang Lee           Will Smith         
    ## 10 James Mangold     Matt Damon         
    ## # … with 90 more rows
  • How many top 100 movies are in each genre? (Be careful with interpretation.)

    # Bar chart: number of movies per genre
    ggplot(data = movies, mapping = aes(x = Genre)) +
      geom_bar()

  • Which genre is most profitable in terms of average gross earnings?

    # Average gross per genre (ignoring movies without gross data), as columns
    movies %>%
      group_by(Genre) %>%
      summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
      ggplot(mapping = aes(x = Genre, y = avg_earning)) +
        geom_col() +
        ylab("avg earning in millions")
    ## `summarise()` ungrouping output (override with `.groups` argument)
    ## Warning: Removed 1 rows containing missing values (position_stack).

    # Distribution of gross earnings within each genre
    movies %>%
      ggplot(aes(x = Genre, y = Gross_Earning_in_Mil)) +
      geom_boxplot() +
      ylab("Gross earning in millions")
    ## Warning: Removed 49 rows containing non-finite values (stat_boxplot).

  • Is there a relationship between gross earning and rating? Find the best selling movie (by gross earning) in each genre

    library("ggrepel")
    # Within each genre keep the single row ranked first by descending gross.
    # Movies with NA gross get an NA rank, so filter() drops them.
    best_in_genre <- movies %>%
      group_by(Genre) %>%
      filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
    best_in_genre
    ## # A tibble: 8 x 11
    ## # Groups:   Genre [8]
    ##    Rank Title Description Runtime Genre Rating Metascore  Votes Gross_Earning_i…
    ##   <int> <chr> <chr>         <dbl> <chr>  <dbl>     <dbl>  <dbl>            <dbl>
    ## 1     2 Kniv… A detectiv…     130 Come…    7.9        82 396786            165. 
    ## 2     3 Joker In Gotham …     122 Crime    8.5        59 875700            335. 
    ## 3     6 Aven… After the …     181 Acti…    8.4        78 767928            858. 
    ## 4    29 The … After the …     118 Anim…    6.9        55 202357            544. 
    ## 5    30 It C… Twenty-sev…     169 Drama    6.5        58 195488            212. 
    ## 6    39 Us    A family's…     116 Horr…    6.9        53 212981            175. 
    ## 7    45 Alad… A kind-hea…     128 Adve…    7          65 218135            356. 
    ## 8    59 Rock… A musical …     121 Biog…    7.3        64 126091             96.4
    ## # … with 2 more variables: Director <chr>, Actor <chr>
    # Rating vs. gross: point size shows votes, colour shows genre, and the
    # top-grossing movie of each genre is labelled
    ggplot(data = movies, aes(x = Rating, y = Gross_Earning_in_Mil)) +
      geom_point(aes(size = Votes, color = Genre)) +
      ggrepel::geom_label_repel(data = best_in_genre, mapping = aes(label = Title)) +
      ylab("Gross earning in millions")
    ## Warning: Removed 49 rows containing missing values (geom_point).

RSelenium Example: FCC’s television broadcast signal strength

Many websites dynamically pull data from databases using JavaScript and jQuery, which makes them difficult to scrape.

The FCC’s dtvmaps webpage has a simple form in which you enter a zip code and it gives you the available local TV stations in that zip code and their signal strength.

You’ll also notice the URL stays fixed with different zip codes.

Why RSelenium

  • RSelenium loads the page that we want to scrape and downloads the HTML from that page.

    • particularly useful when scraping something behind a login

    • simulate human behavior on a website (e.g., mouse clicking)

  • rvest provides typical scraping tools

rm(list = ls()) # clean-up workspace
library("RSelenium")
library("tidyverse")
library("rvest")

Open up a browser

# Start a Selenium driver with a Firefox browser on port 7360 and keep the
# client handle, which is used to control the browser below.
# Use FALSE rather than F: F is an ordinary variable that can be reassigned.
rD <- rsDriver(browser = "firefox", port = 7360L, verbose = FALSE)
remDr <- rD[["client"]]

Open a webpage

remDr$navigate("https://www.fcc.gov/media/engineering/dtvmaps")

We want to send a string of text (zip code) into the form.

zip <- "70118"
# remDr$findElement(using = "id", value = "startpoint")$clearElement()
# Locate the zip-code input box by its id, then type the zip code into it
zip_box <- remDr$findElement(using = "id", value = "startpoint")
zip_box$sendKeysToElement(list(zip))
# other possible ("xpath", "css selector", "id", "name", "tag name", "class name", "link text", "partial link text")

Click on the button Go!

# Locate the "Go!" button by its id and click it. findElement() (singular)
# matches the lookup style used for the zip-code box and raises a Selenium
# error if the button is absent, instead of a bare subscript error.
remDr$findElement(using = "id", value = "btnSub")$clickElement()

Extract data from HTML

  • save HTML to an object

  • use rvest for the rest

# Wait for the JavaScript-rendered results, grab the page HTML from the
# browser, and let rvest parse the reception table out of it.
Sys.sleep(5) # give the page time to fully load, in seconds
html <- remDr$getPageSource()[[1]]

signals <- read_html(html) %>%
  html_nodes("table.tbl_mapReception") %>% # extract table nodes with class = "tbl_mapReception"
  .[[3]] %>% # keep the third of these tables, as a single node
  html_table(fill = TRUE) # have rvest turn it into a data frame
signals
##                        Callsign                     Callsign
## 1  Click on callsign for detail Click on callsign for detail
## 2                                                     WWL-TV
## 3                                                           
## 4                                                       WUPL
## 5                                                           
## 6                                                    WPXL-TV
## 7                                                           
## 8                                                       WHNO
## 9                                                           
## 10                                                   WVUE-DT
## 11                                                          
## 12                                                      WGNO
## 13                                                          
## 14                                                      WDSU
## 15                                                          
## 16                                                   WNOL-TV
## 17                                                          
## 18                                                   WYES-TV
## 19                                                          
## 20                                                   WTNO-LP
## 21                                                          
## 22                                                   WLAE-TV
## 23                                                          
## 24                                                   KNOV-CD
## 25                                                          
## 26                                                   WBXN-CD
## 27                                                          
## 28                                                   KGLA-DT
## 29                                                          
## 30                                                   WBRZ-TV
## 31                                                          
## 32                                                   WVLA-TV
## 33                                                          
## 34                                                   WGMB-TV
## 35                                                          
## 36                                                      WAFB
## 37                                                          
##                         Network                          Ch#
## 1  Click on callsign for detail Click on callsign for detail
## 2                           CBS                            4
## 3                                                           
## 4                          MYNE                           54
## 5                                                           
## 6                           ION                           49
## 7                                                           
## 8                           IND                           20
## 9                                                           
## 10                          FOX                            8
## 11                                                          
## 12                          ABC                           26
## 13                                                          
## 14                          NBC                            6
## 15                                                          
## 16                           CW                           38
## 17                                                          
## 18                          PBS                           12
## 19                                                          
## 20                                                          
## 21                                                          
## 22                          PBS                           32
## 23                                                          
## 24                                                          
## 25                                                          
## 26                                                          
## 27                                                          
## 28                          IND                           42
## 29                                                          
## 30                          ABC                            2
## 31                                                          
## 32                          NBC                           33
## 33                                                          
## 34                          FOX                           44
## 35                                                          
## 36                          CBS                            9
## 37                                                          
##                            Band
## 1  Click on callsign for detail
## 2                           UHF
## 3                              
## 4                           UHF
## 5                              
## 6                           UHF
## 7                              
## 8                           UHF
## 9                              
## 10                          UHF
## 11                             
## 12                          UHF
## 13                             
## 14                          UHF
## 15                             
## 16                          UHF
## 17                             
## 18                         Hi-V
## 19                             
## 20                          UHF
## 21                             
## 22                          UHF
## 23                             
## 24                          UHF
## 25                             
## 26                          UHF
## 27                             
## 28                          UHF
## 29                             
## 30                         Hi-V
## 31                             
## 32                          UHF
## 33                             
## 34                          UHF
## 35                             
## 36                         Hi-V
## 37                             
##                                                                                                                                                                                                                                                           IA
## 1                                                                                                                                                                                                                                                       <NA>
## 2  RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 3                                                                                                                                                                                                                                                           
## 4  RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 5                                                                                                                                                                                                                                                           
## 6  RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 7                                                                                                                                                                                                                                                           
## 8                                                                                                                                                                                                                                                           
## 9                                                                                                                                                                                                                                                           
## 10                                                                                                                                                                                                                                                          
## 11                                                                                                                                                                                                                                                          
## 12                                                                                                                                                                                                                                                          
## 13                                                                                                                                                                                                                                                          
## 14 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 15                                                                                                                                                                                                                                                          
## 16                                                                                                                                                                                                                                                          
## 17                                                                                                                                                                                                                                                          
## 18                                                                                                                                                                                                                                                          
## 19                                                                                                                                                                                                                                                          
## 20                                                                                                                                                                                                                                                          
## 21                                                                                                                                                                                                                                                          
## 22 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 23                                                                                                                                                                                                                                                          
## 24 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 25                                                                                                                                                                                                                                                          
## 26 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 27                                                                                                                                                                                                                                                          
## 28 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 29                                                                                                                                                                                                                                                          
## 30                                                                                                                                                                                                                                                          
## 31                                                                                                                                                                                                                                                          
## 32                                                                                                                                                                                                                                                          
## 33                                                                                                                                                                                                                                                          
## 34 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 35                                                                                                                                                                                                                                                          
## 36                                                                                                                                                                                                                                                          
## 37

More formatting on signals

# Give the six scraped columns working names; "rm" and "rm2" are placeholders
# for the columns that are discarded by select() below.
names(signals) <- c("rm", "callsign", "network", "ch_num", "band", "rm2")

signals <- signals %>%
  # Drop the unnecessary first row. A negative index is used instead of
  # `2:n()`, which counts backwards (2, 1) and keeps the row when n() == 1.
  slice(-1) %>%
  filter(callsign != "") %>% # drop blank rows
  select(callsign:band) # drop unnecessary columns
signals
##    callsign network ch_num band
## 1    WWL-TV     CBS      4  UHF
## 2      WUPL    MYNE     54  UHF
## 3   WPXL-TV     ION     49  UHF
## 4      WHNO     IND     20  UHF
## 5   WVUE-DT     FOX      8  UHF
## 6      WGNO     ABC     26  UHF
## 7      WDSU     NBC      6  UHF
## 8   WNOL-TV      CW     38  UHF
## 9   WYES-TV     PBS     12 Hi-V
## 10  WTNO-LP                 UHF
## 11  WLAE-TV     PBS     32  UHF
## 12  KNOV-CD                 UHF
## 13  WBXN-CD                 UHF
## 14  KGLA-DT     IND     42  UHF
## 15  WBRZ-TV     ABC      2 Hi-V
## 16  WVLA-TV     NBC     33  UHF
## 17  WGMB-TV     FOX     44  UHF
## 18     WAFB     CBS      9 Hi-V

Capture the detail text attached to each callsign's click handler (the onclick attribute)

# List the raw onclick attribute carried by every ".callsign" element
html %>%
  read_html() %>%
  html_nodes(css = ".callsign") %>%
  html_attr(name = "onclick")
##  [1] "getdetail(6810,74192,'WWL-TV Facility ID: 74192 <br>WWL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=74192 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/74192 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 27<br>RX Strength: 115 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 27<br>Repacking Dates: 10/19/2019 to 1/17/2020','WWL-TV<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WWL-TV')"    
##  [2] "getdetail(6787,13938,'WUPL Facility ID: 13938 <br>WUPL (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=13938 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/13938 target=_new>Public File</a>)<br>City of License: SLIDELL, LA<br>RF Channel: 17<br>RX Strength: 114 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 17<br>Repacking Dates: 10/19/2019 to 1/17/2020','WUPL<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WUPL')"                
##  [3] "getdetail(6862,21729,'WPXL-TV Facility ID: 21729 <br>WPXL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=21729 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/21729 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 33<br>RX Strength: 111 dbuV/m<br>Tower Distance: 11 mi; Direction: 74°<br>Repacked Channel: 33<br>Repacking Dates: 10/19/2019 to 1/17/2020','WPXL-TV<br>Distance to Tower: 11 miles<br>Direction to Tower: 74 deg',29.9827777777778,-89.9494444444445,'WPXL-TV')"
##  [4] "getdetail(7951,37106,'WHNO Facility ID: 37106 <br>WHNO (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=37106 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/37106 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 21<br>RX Strength: 111 dbuV/m<br>Tower Distance: 6 mi; Direction: 103°','WHNO<br>Distance to Tower: 6 miles<br>Direction to Tower: 103 deg',29.9203055555556,-90.0245833333333,'WHNO')"                                                                                
##  [5] "getdetail(7480,4149,'WVUE-DT Facility ID: 4149 <br>WVUE-DT (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=4149 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/4149 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 29<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 84°','WVUE-DT<br>Distance to Tower: 10 miles<br>Direction to Tower: 84 deg',29.9541388888889,-89.9495277777778,'WVUE-DT')"                                                                        
##  [6] "getdetail(7420,72119,'WGNO Facility ID: 72119 <br>WGNO (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=72119 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/72119 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 26<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°','WGNO<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WGNO')"                                                                                           
##  [7] "getdetail(6887,71357,'WDSU Facility ID: 71357 <br>WDSU (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=71357 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/71357 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 19<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°<br>Repacked Channel: 19<br>Repacking Dates: 10/19/2019 to 1/17/2020','WDSU<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WDSU')"                       
##  [8] "getdetail(7421,54280,'WNOL-TV Facility ID: 54280 <br>WNOL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=54280 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/54280 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 15<br>RX Strength: 110 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°','WNOL-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WNOL-TV')"                                                                               
##  [9] "getdetail(7608,25090,'WYES-TV Facility ID: 25090 <br>WYES-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=25090 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/25090 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 11<br>RX Strength: 102 dbuV/m<br>Tower Distance: 10 mi; Direction: 85°','WYES-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 85 deg',29.9538888888889,-89.9494444444445,'WYES-TV')"                                                                    
## [10] "getdetail(8313,24981,'WTNO-LP Facility ID: 24981 <br>WTNO-LP (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=24981 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/24981 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 22<br>RX Strength: 106 dbuV/m<br>Tower Distance: 3 mi; Direction: 330°','WTNO-LP<br>Distance to Tower: 3 miles<br>Direction to Tower: 330 deg',29.9746111111111,-90.1434722222222,'WTNO-LP')"                                                                    
## [11] "getdetail(6946,18819,'WLAE-TV Facility ID: 18819 <br>WLAE-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=18819 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/18819 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 23<br>RX Strength: 104 dbuV/m<br>Tower Distance: 10 mi; Direction: 74°<br>Repacked Channel: 23<br>Repacking Dates: 10/19/2019 to 1/17/2020','WLAE-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 74 deg',29.9827777777778,-89.9525,'WLAE-TV')"         
## [12] "getdetail(8180,64048,'KNOV-CD Facility ID: 64048 <br>KNOV-CD (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=64048 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/64048 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 31<br>RX Strength: 101 dbuV/m<br>Tower Distance: 3 mi; Direction: 74°<br>Repacked Channel: 31<br>Repacking Dates: 10/19/2019 to 1/17/2020','KNOV-CD<br>Distance to Tower: 3 miles<br>Direction to Tower: 74 deg',29.9521388888889,-90.0702777777778,'KNOV-CD')"  
## [13] "getdetail(8155,70419,'WBXN-CD Facility ID: 70419 <br>WBXN-CD (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=70419 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/70419 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 36<br>RX Strength: 98 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 36<br>Repacking Dates: 10/19/2019 to 1/17/2020','WBXN-CD<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WBXN-CD')" 
## [14] "getdetail(6465,83945,'KGLA-DT Facility ID: 83945 <br>KGLA-DT (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=83945 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/83945 target=_new>Public File</a>)<br>City of License: HAMMOND, LA<br>RF Channel: 35<br>RX Strength: 93 dbuV/m<br>Tower Distance: 11 mi; Direction: 76°<br>Repacked Channel: 35<br>Repacking Dates: 3/14/2020 to 5/1/2020','KGLA-DT<br>Distance to Tower: 11 miles<br>Direction to Tower: 76 deg',29.9783333333333,-89.9405555555556,'KGLA-DT')"       
## [15] "getdetail(7483,38616,'WBRZ-TV Facility ID: 38616 <br>WBRZ-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=38616 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/38616 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 13<br>RX Strength: 45 dbuV/m<br>Tower Distance: 69 mi; Direction: 291°','WBRZ-TV<br>Distance to Tower: 69 miles<br>Direction to Tower: 291 deg',30.2969444444444,-91.1936111111111,'WBRZ-TV')"                                                                   
## [16] "getdetail(7918,70021,'WVLA-TV Facility ID: 70021 <br>WVLA-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=70021 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/70021 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 34<br>RX Strength: 46 dbuV/m<br>Tower Distance: 74 mi; Direction: 291°','WVLA-TV<br>Distance to Tower: 74 miles<br>Direction to Tower: 291 deg',30.3262777777778,-91.2766944444444,'WVLA-TV')"                                                                   
## [17] "getdetail(6750,12520,'WGMB-TV Facility ID: 12520 <br>WGMB-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=12520 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/12520 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 24<br>RX Strength: 43 dbuV/m<br>Tower Distance: 74 mi; Direction: 291°<br>Repacked Channel: 24<br>Repacking Dates: 1/18/2020 to 3/13/2020','WGMB-TV<br>Distance to Tower: 74 miles<br>Direction to Tower: 291 deg',30.3262777777778,-91.2766944444444,'WGMB-TV')"
## [18] "getdetail(7664,589,'WAFB Facility ID: 589 <br>WAFB (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=589 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/589 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 9<br>RX Strength: 37 dbuV/m<br>Tower Distance: 72 mi; Direction: 295°','WAFB<br>Distance to Tower: 72 miles<br>Direction to Tower: 295 deg',30.3663888888889,-91.2130555555556,'WAFB')"

Extract signal strength using string operations

# Pull the "RX Strength" reading out of each callsign's onclick text.
strength <- read_html(html) %>%
  html_nodes(".callsign") %>%
  html_attr("onclick") %>%
  # (?<=...) is a positive lookbehind: match the number that follows
  # "RX Strength: " without including that label in the result
  str_extract("(?<=RX Strength: )\\s*\\-*[0-9.]+") %>%
  as.numeric() # store as numbers rather than character strings

# cbind() pairs values with rows purely by position (and recycles shorter
# vectors), so require exactly one reading per row before attaching.
stopifnot(length(strength) == nrow(signals))
signals <- cbind(signals, strength)
signals
##    callsign network ch_num band strength
## 1    WWL-TV     CBS      4  UHF      115
## 2      WUPL    MYNE     54  UHF      114
## 3   WPXL-TV     ION     49  UHF      111
## 4      WHNO     IND     20  UHF      111
## 5   WVUE-DT     FOX      8  UHF      111
## 6      WGNO     ABC     26  UHF      111
## 7      WDSU     NBC      6  UHF      111
## 8   WNOL-TV      CW     38  UHF      110
## 9   WYES-TV     PBS     12 Hi-V      102
## 10  WTNO-LP                 UHF      106
## 11  WLAE-TV     PBS     32  UHF      104
## 12  KNOV-CD                 UHF      101
## 13  WBXN-CD                 UHF       98
## 14  KGLA-DT     IND     42  UHF       93
## 15  WBRZ-TV     ABC      2 Hi-V       45
## 16  WVLA-TV     NBC     33  UHF       46
## 17  WGMB-TV     FOX     44  UHF       43
## 18     WAFB     CBS      9 Hi-V       37