HW2
subjective nature of visualization
longitudinal
tidy data
# Print the R version, platform, locale, and attached/loaded packages
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.2 magrittr_1.5 tools_4.0.2 htmltools_0.5.0
## [5] yaml_2.2.1 stringi_1.5.3 rmarkdown_2.3 knitr_1.30
## [9] stringr_1.4.0 xfun_0.17 digest_0.6.25 rlang_0.4.7
## [13] evaluate_0.14
Dr. Hua Zhou’s slides
Josh McCrain’s RSelenium tutorial
Load tidyverse and other packages for this lecture:
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
There is a wealth of data on the internet. How do we scrape and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow instructions in a Blog by SAURAV KAUSHIK to find the most popular feature films of 2019.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2019 can be accessed at page https://www.imdb.com/search/title?count=100&release_date=2019,2019&title_type=feature.
# Address of the IMDb search page to scrape (rvest and tidyverse are
# already attached at this point)
url <- "http://www.imdb.com/search/title?count=100&release_date=2019,2019&title_type=feature"
# Download and parse the page's HTML, then display the parsed document
webpage <- url %>%
  read_html()
webpage
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
Suppose we want to scrape the following 11 features from this page:
Use the CSS selector to get the rankings
# Select the ranking nodes by their CSS class, then display the node set
rank_data_html <- webpage %>%
  html_nodes(".text-primary")
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Alternative, more specific selector (kept disabled):
# (rank_data_html <- html_nodes(webpage, '.lister-item-content .text-primary'))
# Pull the text out of each ranking node and display it
rank_data <- rank_data_html %>%
  html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Parse the rank strings (e.g. "1.") into integers and display them
rank_data <- rank_data %>%
  as.integer()
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector .lister-item-header a.
# Select the title links by CSS selector, then display the node set
title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
## [2] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
## [3] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
## [4] <a href="/title/tt7549996/?ref_=adv_li_tt">Judy</a>
## [5] <a href="/title/tt5363618/?ref_=adv_li_tt">Sound of Metal</a>
## [6] <a href="/title/tt4154796/?ref_=adv_li_tt">Avengers: Endgame</a>
## [7] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... in Hollyw ...
## [8] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
## [9] <a href="/title/tt1025100/?ref_=adv_li_tt">Gemini Man</a>
## [10] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
## [11] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
## [12] <a href="/title/tt4916630/?ref_=adv_li_tt">Just Mercy</a>
## [13] <a href="/title/tt4126476/?ref_=adv_li_tt">After</a>
## [14] <a href="/title/tt5606664/?ref_=adv_li_tt">Doctor Sleep</a>
## [15] <a href="/title/tt8688634/?ref_=adv_li_tt">21 Bridges</a>
## [16] <a href="/title/tt1620981/?ref_=adv_li_tt">The Addams Family</a>
## [17] <a href="/title/tt8772262/?ref_=adv_li_tt">Midsommar</a>
## [18] <a href="/title/tt6450804/?ref_=adv_li_tt">Terminator: Dark Fate</a>
## [19] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
## [20] <a href="/title/tt7984734/?ref_=adv_li_tt">The Lighthouse</a>
## ...
# Pull the title text out of each link and display it
title_data <- title_data_html %>%
  html_text()
title_data
## [1] "The Gentlemen"
## [2] "Knives Out"
## [3] "Joker"
## [4] "Judy"
## [5] "Sound of Metal"
## [6] "Avengers: Endgame"
## [7] "Once Upon a Time... in Hollywood"
## [8] "Parasite"
## [9] "Gemini Man"
## [10] "Ford v Ferrari"
## [11] "Jojo Rabbit"
## [12] "Just Mercy"
## [13] "After"
## [14] "Doctor Sleep"
## [15] "21 Bridges"
## [16] "The Addams Family"
## [17] "Midsommar"
## [18] "Terminator: Dark Fate"
## [19] "1917"
## [20] "The Lighthouse"
## [21] "Motherless Brooklyn"
## [22] "Bombshell"
## [23] "The Peanut Butter Falcon"
## [24] "Jumanji: The Next Level"
## [25] "Little Women"
## [26] "Star Wars: Episode IX - The Rise of Skywalker"
## [27] "Ready or Not"
## [28] "Spiral"
## [29] "The Lion King"
## [30] "It Chapter Two"
## [31] "Charlie's Angels"
## [32] "Uncut Gems"
## [33] "Infidel"
## [34] "The Irishman"
## [35] "The King"
## [36] "Midway"
## [37] "Spider-Man: Far from Home"
## [38] "A Call to Spy"
## [39] "Us"
## [40] "Alita: Battle Angel"
## [41] "The Lodge"
## [42] "Captain Marvel"
## [43] "Frozen II"
## [44] "Hustlers"
## [45] "Aladdin"
## [46] "I See You"
## [47] "Portrait of a Lady on Fire"
## [48] "Richard Jewell"
## [49] "The Platform"
## [50] "6 Underground"
## [51] "Spies in Disguise"
## [52] "Fast & Furious Presents: Hobbs & Shaw"
## [53] "Zombieland: Double Tap"
## [54] "John Wick: Chapter 3 - Parabellum"
## [55] "Ad Astra"
## [56] "Godzilla: King of the Monsters"
## [57] "Marriage Story"
## [58] "Do Not Reply"
## [59] "Rocketman"
## [60] "Toy Story 4"
## [61] "Vivarium"
## [62] "Iron Mask"
## [63] "X-Men: Dark Phoenix"
## [64] "The Personal History of David Copperfield"
## [65] "Booksmart"
## [66] "Shazam!"
## [67] "Maleficent: Mistress of Evil"
## [68] "Anna"
## [69] "Glass"
## [70] "A Beautiful Day in the Neighborhood"
## [71] "Color Out of Space"
## [72] "Waiting for the Barbarians"
## [73] "Pets United"
## [74] "Dark Waters"
## [75] "Guns Akimbo"
## [76] "Honey Boy"
## [77] "Bad Education"
## [78] "Operation Brothers"
## [79] "Scary Stories to Tell in the Dark"
## [80] "Yesterday"
## [81] "Cats"
## [82] "The Informer"
## [83] "The Jesus Rolls"
## [84] "Extremely Wicked, Shockingly Evil and Vile"
## [85] "Dora and the Lost City of Gold"
## [86] "Polar"
## [87] "Angel Has Fallen"
## [88] "3022"
## [89] "Babyteeth"
## [90] "A Hidden Life"
## [91] "Rambo: Last Blood"
## [92] "Monsoon"
## [93] "Saint Maud"
## [94] "El Camino: A Breaking Bad Movie"
## [95] "In the Shadow of the Moon"
## [96] "Downton Abbey"
## [97] "The Wretched"
## [98] "Murder Mystery"
## [99] "True History of the Kelly Gang"
## [100] "Blackbird"
# Select the description paragraphs by CSS selector, then display the node set
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n An American expat tries to sell off his high ...
## [2] <p class="text-muted">\n A detective investigates the death of a patr ...
## [3] <p class="text-muted">\n In Gotham City, mentally troubled comedian A ...
## [4] <p class="text-muted">\n Legendary performer <a href="/name/nm0000023 ...
## [5] <p class="text-muted">\n A heavy-metal drummer's life is thrown into ...
## [6] <p class="text-muted">\n After the devastating events of <a href="/ti ...
## [7] <p class="text-muted">\n A faded television actor and his stunt doubl ...
## [8] <p class="text-muted">\n Greed and class discrimination threaten the ...
## [9] <p class="text-muted">\n An over-the-hill hitman faces off against a ...
## [10] <p class="text-muted">\n American car designer <a href="/name/nm07909 ...
## [11] <p class="text-muted">\n A young boy in Hitler's army finds out his m ...
## [12] <p class="text-muted">\n World-renowned civil rights defense attorney ...
## [13] <p class="text-muted">\n A young woman falls for a guy with a dark se ...
## [14] <p class="text-muted">\n Years following the events of <a href="/titl ...
## [15] <p class="text-muted">\n An embattled NYPD detective is thrust into a ...
## [16] <p class="text-muted">\n The eccentrically macabre family moves to a ...
## [17] <p class="text-muted">\n A couple travels to Sweden to visit a rural ...
## [18] <p class="text-muted">\n An augmented human and Sarah Connor must sto ...
## [19] <p class="text-muted">\n April 6th, 1917. As a regiment assembles to ...
## [20] <p class="text-muted">\n Two lighthouse keepers try to maintain their ...
## ...
# Pull the description text out of each node
description_data <- description_data_html %>%
  html_text()
# Preview the first six descriptions
description_data %>%
  head()
## [1] "\n An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [2] "\n A detective investigates the death of a patriarch of an eccentric, combative family."
## [3] "\n In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker."
## [4] "\n Legendary performer Judy Garland arrives in London in the winter of 1968 to perform a series of sold-out concerts."
## [5] "\n A heavy-metal drummer's life is thrown into freefall when he begins to lose his hearing."
## [6] "\n After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
# Drop the leading newline and the whitespace that follows it
description_data <- description_data %>%
  str_replace("^\\n\\s+", "")
# Preview the cleaned descriptions
description_data %>%
  head()
## [1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [2] "A detective investigates the death of a patriarch of an eccentric, combative family."
## [3] "In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker."
## [4] "Legendary performer Judy Garland arrives in London in the winter of 1968 to perform a series of sold-out concerts."
## [5] "A heavy-metal drummer's life is thrown into freefall when he begins to lose his hearing."
## [6] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
# Scrape the runtimes in one expression: select the nodes, take their text,
# remove the " min" suffix, convert to integer, then display the result
runtime_data <- as.integer(
  str_replace(
    html_text(html_nodes(webpage, ".runtime")),
    " min",
    ""
  )
)
runtime_data
## [1] 113 130 122 118 130 181 161 132 117 152 108 137 105 152 99 86 148 128
## [19] 119 109 144 109 97 123 135 141 95 87 118 169 118 135 108 209 140 138
## [37] 129 123 116 122 108 123 103 110 128 98 122 131 94 128 102 137 99 130
## [55] 123 132 137 98 121 100 97 120 113 119 102 132 119 118 129 109 111 112
## [73] 89 126 98 94 108 129 108 116 110 113 85 110 102 118 121 91 118 174
## [91] 89 85 84 122 115 122 95 97 124 97
# Same scrape as above, step by step: select the runtime nodes
runtime_data_html <- webpage %>%
  html_nodes(".runtime")
# Pull the runtime text out of each node
runtime_data <- runtime_data_html %>%
  html_text()
# Preview the raw runtime strings
runtime_data %>%
  head()
## [1] "113 min" "130 min" "122 min" "118 min" "130 min" "181 min"
# Remove the " min" suffix and parse what is left as numbers
runtime_data <- runtime_data %>%
  str_replace(" min", "") %>%
  as.numeric()
# Preview the numeric runtimes
runtime_data %>%
  head()
## [1] 113 130 122 118 130 181
Collect the (first) genre of each movie:
# Select the genre nodes by their CSS class
genre_data_html <- webpage %>%
  html_nodes(".genre")
# Pull the genre text out of each node
genre_data <- genre_data_html %>%
  html_text()
# Preview the raw genre strings
genre_data %>%
  head()
## [1] "\nAction, Comedy, Crime "
## [2] "\nComedy, Crime, Drama "
## [3] "\nCrime, Drama, Thriller "
## [4] "\nBiography, Drama, Romance "
## [5] "\nDrama, Music "
## [6] "\nAction, Adventure, Drama "
# Data-Preprocessing: keep only the first genre listed for each movie.
# Hyphens are part of the match so that a hyphenated genre such as "Sci-Fi"
# is kept whole instead of being cut at the hyphen ("Sci").
genre_data <- str_extract(genre_data, "[[:alpha:]\\-]+")
# Optional: convert each genre from text to factor (left disabled)
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Action" "Comedy" "Crime" "Biography" "Drama" "Action"
# Select the IMDb rating nodes by CSS selector
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")
# Pull the rating text out of each node
rating_data <- rating_data_html %>%
  html_text()
# Preview the first six ratings (still character strings)
rating_data %>%
  head()
## [1] "7.9" "7.9" "8.5" "6.8" "7.0" "8.4"
# Parse the rating strings into numbers
rating_data <- rating_data %>%
  as.numeric()
# Display all converted ratings
rating_data
## [1] 7.9 7.9 8.5 6.8 7.0 8.4 7.6 8.6 5.7 8.1 7.9 7.6 5.3 7.4 6.6 5.8 7.1 6.2
## [19] 8.3 7.5 6.8 6.8 7.6 6.7 7.8 6.6 6.8 5.9 6.9 6.5 4.8 7.4 6.6 7.9 7.2 6.7
## [37] 7.5 5.5 6.9 7.3 6.0 6.9 6.9 6.3 7.0 6.8 8.1 7.5 7.0 6.1 6.8 6.4 6.7 7.4
## [55] 6.6 6.0 8.0 4.1 7.3 7.8 5.8 4.6 5.8 6.4 7.2 7.1 6.6 6.6 6.7 7.3 6.2 5.8
## [73] 3.4 7.6 6.3 7.3 7.1 6.6 6.2 6.8 2.8 6.6 4.3 6.6 6.1 6.3 6.4 4.6 7.1 7.4
## [91] 6.1 5.9 6.9 7.3 6.2 7.4 5.8 6.0 6.0 6.4
# Select the vote-count nodes by CSS selector
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Pull the vote-count text out of each node
votes_data <- votes_data_html %>%
  html_text()
# Preview the raw vote strings
votes_data %>%
  head()
## [1] "189,868" "396,786" "875,700" "36,134" "381" "767,928"
# Data-Preprocessing: remove every thousands separator. str_replace() would
# drop only the first comma, so a count like "1,234,567" would still contain
# a comma and turn into NA in the numeric conversion below.
votes_data <- str_replace_all(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
# Let's have another look at the votes data
votes_data
## [1] 189868 396786 875700 36134 381 767928 517451 487781 78802 262853
## [11] 264068 39822 32707 127429 41749 24700 181502 137047 373982 125799
## [21] 36334 73816 56243 173739 126331 352201 100029 888 202357 195488
## [31] 51674 203531 755 308378 81408 59935 308543 304 212981 219723
## [41] 26582 426110 125971 78510 218135 23389 52756 51111 149289 131383
## [51] 30265 160281 129218 261386 188637 126840 231240 224 126091 191907
## [61] 31073 2663 148296 8641 86634 243422 78676 56625 199778 52035
## [71] 26373 4157 1024 47926 32549 24332 25659 21828 56304 107094
## [81] 36619 16870 2465 74100 22177 71624 74166 3263 5189 15707
## [91] 77768 371 656 174550 37827 36718 7813 96490 5793 623
# Select the director links by CSS selector, then display the node set
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")
directors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
## [2] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
## [3] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
## [4] <a href="/name/nm3734458/?ref_=adv_li_dr_0">Rupert Goold</a>
## [5] <a href="/name/nm2942187/?ref_=adv_li_dr_0">Darius Marder</a>
## [6] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
## [7] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
## [8] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
## [9] <a href="/name/nm0000487/?ref_=adv_li_dr_0">Ang Lee</a>
## [10] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
## [11] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
## [12] <a href="/name/nm2308774/?ref_=adv_li_dr_0">Destin Daniel Cretton</a>
## [13] <a href="/name/nm1788310/?ref_=adv_li_dr_0">Jenny Gage</a>
## [14] <a href="/name/nm1093039/?ref_=adv_li_dr_0">Mike Flanagan</a>
## [15] <a href="/name/nm1047532/?ref_=adv_li_dr_0">Brian Kirk</a>
## [16] <a href="/name/nm0862911/?ref_=adv_li_dr_0">Greg Tiernan</a>
## [17] <a href="/name/nm4170048/?ref_=adv_li_dr_0">Ari Aster</a>
## [18] <a href="/name/nm1783265/?ref_=adv_li_dr_0">Tim Miller</a>
## [19] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
## [20] <a href="/name/nm3211470/?ref_=adv_li_dr_0">Robert Eggers</a>
## ...
# Pull the director name out of each link
directors_data <- directors_data_html %>%
  html_text()
# Display all director names
directors_data
## [1] "Guy Ritchie" "Rian Johnson" "Todd Phillips"
## [4] "Rupert Goold" "Darius Marder" "Anthony Russo"
## [7] "Quentin Tarantino" "Bong Joon Ho" "Ang Lee"
## [10] "James Mangold" "Taika Waititi" "Destin Daniel Cretton"
## [13] "Jenny Gage" "Mike Flanagan" "Brian Kirk"
## [16] "Greg Tiernan" "Ari Aster" "Tim Miller"
## [19] "Sam Mendes" "Robert Eggers" "Edward Norton"
## [22] "Jay Roach" "Tyler Nilson" "Jake Kasdan"
## [25] "Greta Gerwig" "J.J. Abrams" "Matt Bettinelli-Olpin"
## [28] "Kurtis David Harder" "Jon Favreau" "Andy Muschietti"
## [31] "Elizabeth Banks" "Benny Safdie" "Cyrus Nowrasteh"
## [34] "Martin Scorsese" "David Michôd" "Roland Emmerich"
## [37] "Jon Watts" "Lydia Dean Pilcher" "Jordan Peele"
## [40] "Robert Rodriguez" "Severin Fiala" "Anna Boden"
## [43] "Chris Buck" "Lorene Scafaria" "Guy Ritchie"
## [46] "Adam Randall" "Céline Sciamma" "Clint Eastwood"
## [49] "Galder Gaztelu-Urrutia" "Michael Bay" "Nick Bruno"
## [52] "David Leitch" "Ruben Fleischer" "Chad Stahelski"
## [55] "James Gray" "Michael Dougherty" "Noah Baumbach"
## [58] "Daniel Woltosz" "Dexter Fletcher" "Josh Cooley"
## [61] "Lorcan Finnegan" "Oleg Stepchenko" "Simon Kinberg"
## [64] "Armando Iannucci" "Olivia Wilde" "David F. Sandberg"
## [67] "Joachim Rønning" "Luc Besson" "M. Night Shyamalan"
## [70] "Marielle Heller" "Richard Stanley" "Ciro Guerra"
## [73] "Reinhard Klooss" "Todd Haynes" "Jason Lei Howden"
## [76] "Alma Har'el" "Cory Finley" "Gideon Raff"
## [79] "André Øvredal" "Danny Boyle" "Tom Hooper"
## [82] "Andrea Di Stefano" "John Turturro" "Joe Berlinger"
## [85] "James Bobin" "Jonas Åkerlund" "Ric Roman Waugh"
## [88] "John Suits" "Shannon Murphy" "Terrence Malick"
## [91] "Adrian Grunberg" "Hong Khaou" "Rose Glass"
## [94] "Vince Gilligan" "Jim Mickle" "Michael Engler"
## [97] "Brett Pierce" "Kyle Newacheck" "Justin Kurzel"
## [100] "Roger Michell"
# Select the actor links by CSS selector, then display the node set
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")
actors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
## [2] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
## [3] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
## [4] <a href="/name/nm0000250/?ref_=adv_li_st_0">Renée Zellweger</a>
## [5] <a href="/name/nm1981893/?ref_=adv_li_st_0">Riz Ahmed</a>
## [6] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
## [7] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
## [8] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
## [9] <a href="/name/nm0000226/?ref_=adv_li_st_0">Will Smith</a>
## [10] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
## [11] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
## [12] <a href="/name/nm0430107/?ref_=adv_li_st_0">Michael B. Jordan</a>
## [13] <a href="/name/nm6466214/?ref_=adv_li_st_0">Josephine Langford</a>
## [14] <a href="/name/nm0000191/?ref_=adv_li_st_0">Ewan McGregor</a>
## [15] <a href="/name/nm1569276/?ref_=adv_li_st_0">Chadwick Boseman</a>
## [16] <a href="/name/nm1209966/?ref_=adv_li_st_0">Oscar Isaac</a>
## [17] <a href="/name/nm6073955/?ref_=adv_li_st_0">Florence Pugh</a>
## [18] <a href="/name/nm0000157/?ref_=adv_li_st_0">Linda Hamilton</a>
## [19] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
## [20] <a href="/name/nm1500155/?ref_=adv_li_st_0">Robert Pattinson</a>
## ...
# Pull the actor name out of each link
actors_data <- actors_data_html %>%
  html_text()
# Preview the first six actor names
actors_data %>%
  head()
## [1] "Matthew McConaughey" "Daniel Craig" "Joaquin Phoenix"
## [4] "Renée Zellweger" "Riz Ahmed" "Robert Downey Jr."
Be careful with missing data.
# Select the metascore nodes by their CSS class
metascore_data_html <- webpage %>%
  html_nodes(".metascore")
# Pull the metascore text out of each node
metascore_data <- metascore_data_html %>%
  html_text()
# Preview the raw metascore strings (note the trailing whitespace)
metascore_data %>%
  head()
## [1] "51 " "82 " "59 " "66 " "87 "
## [6] "78 "
# Remove the trailing whitespace and parse what is left as numbers
metascore_data <- metascore_data %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
# Display all scraped metascores
metascore_data
## [1] 51 82 59 66 87 78 83 96 38 81 58 68 30 59 51 46 72 54 78 83 60 64 70 58 91
## [26] 53 64 55 58 52 91 94 62 47 69 67 81 53 64 64 64 79 53 65 95 68 73 41 54 60
## [51] 55 73 80 48 94 69 84 64 43 77 84 71 43 40 43 80 70 52 73 42 73 79 61 55 32
## [76] 61 44 52 63 19 45 77 78 26 77 72 48 64 61 38 75 54
# Count how many metascores were scraped
metascore_data %>%
  length()
## [1] 92
# Locate the missing metascores programmatically instead of by visual
# inspection. A hand-written position list that is shorter than the real
# number of gaps makes the assignment recycle values (with a warning) and
# shifts scores onto the wrong movies.
# html_node() returns exactly one result per input node, so each movie's
# content block is paired with its own metascore, or NA when it has none.
ms <- webpage %>%
  html_nodes(".lister-item-content") %>%
  html_node(".metascore") %>%
  html_text() %>%
  str_trim() %>%
  as.numeric()
# The per-movie scrape must find exactly the scores scraped above
stopifnot(sum(!is.na(ms)) == length(metascore_data))
(metascore_data <- ms)
## [1] 51 82 59 66 87 78 83 96 38 81 58 68 30 59 51 46 72 54 78 83 60 64 70 58 91
## [26] 53 64 NA 55 58 52 91 NA 94 62 47 69 67 81 53 64 64 64 79 53 65 95 68 73 41
## [51] 54 60 55 73 80 48 94 NA 69 84 64 NA 43 77 84 71 43 40 43 80 70 52 NA 73 42
## [76] 73 79 NA 61 55 32 61 44 52 63 19 45 NA 77 78 26 NA 77 72 48 64 61 38 75 54
Be careful with missing data.
# Select the gross-revenue nodes by CSS selector
gross_data_html <- webpage %>%
  html_nodes(".ghost~ .text-muted+ span")
# Pull the gross-revenue text out of each node
gross_data <- gross_data_html %>%
  html_text()
# Preview the raw gross strings
gross_data %>%
  head()
## [1] "$165.36M" "$335.45M" "$858.37M" "$142.50M" "$53.37M" "$20.55M"
# Remove the "M" suffix, skip the leading "$" by keeping characters 2-10,
# and parse what is left as numbers
# Disabled alternative:
#(gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
gross_data <- gross_data %>%
  str_replace("M", "") %>%
  str_sub(2, 10) %>%
  as.numeric()
# Count how many gross values were scraped
gross_data %>%
  length()
## [1] 51
# Visual inspection finds below movies don't have gross
#gs_data <- rep(NA, 100)
#gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
#(gross_data <- gs_data)
49 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.
# Scrape ranks and gross values together so they come back interleaved in
# page order
rank_and_gross <- html_nodes(webpage, ".ghost~ .text-muted+ span , .text-primary")
rank_and_gross <- html_text(rank_and_gross)
# Remove the first run of whitespace in each string
rank_and_gross <- str_replace(rank_and_gross, "\\s+", "")
# Remove every "$" and "M" so that gross values are bare numbers
rank_and_gross <- str_replace_all(rank_and_gross, "[$M]", "")
rank_and_gross
## [1] "1." "2." "165.36" "3." "335.45" "4." "5." "6."
## [9] "858.37" "7." "142.50" "8." "53.37" "9." "20.55" "10."
## [17] "117.62" "11." "0.35" "12." "13." "12.14" "14." "15."
## [25] "16." "100.04" "17." "27.33" "18." "62.25" "19." "159.23"
## [33] "20." "0.43" "21." "22." "23." "13.12" "24." "316.83"
## [41] "25." "108.10" "26." "515.20" "27." "26.74" "28." "29."
## [49] "543.64" "30." "211.59" "31." "32." "33." "34." "7.00"
## [57] "35." "36." "37." "390.53" "38." "39." "175.08" "40."
## [65] "85.71" "41." "42." "426.83" "43." "477.37" "44." "80.55"
## [73] "45." "355.56" "46." "47." "3.76" "48." "49." "50."
## [81] "51." "52." "173.96" "53." "26.80" "54." "171.02" "55."
## [89] "35.40" "56." "110.50" "57." "2.00" "58." "59." "96.37"
## [97] "60." "434.04" "61." "62." "63." "65.85" "64." "65."
## [105] "22.68" "66." "140.37" "67." "113.93" "68." "7.74" "69."
## [113] "111.05" "70." "61.70" "71." "72." "73." "74." "75."
## [121] "76." "77." "78." "79." "62.74" "80." "73.29" "81."
## [129] "82." "83." "84." "85." "54.89" "86." "87." "67.16"
## [137] "88." "89." "90." "91." "18.87" "92." "93." "94."
## [145] "95." "96." "96.85" "97." "98." "99." "100."
# Flag the entries of rank_and_gross that are ranks (they end with a dot);
# every other entry is a gross value
isrank <- str_detect(rank_and_gross, "\\.$")
# A movie has no gross when its rank is directly followed by another rank, or
# when its rank is the very last entry. Appending TRUE covers that last entry
# and keeps the expression valid for vectors of length 0 or 1.
ismissing <- isrank & c(tail(isrank, -1), TRUE)
missingpos <- as.integer(rank_and_gross[ismissing])
# One logical per movie, in page order: TRUE when a gross value was scraped
hasgross <- !ismissing[isrank]
# Guard against silent recycling if the two scrapes disagree
stopifnot(sum(hasgross) == length(gross_data))
gs_data <- rep(NA_real_, length(hasgross))
# Logical indexing stays correct when no movie lacks gross; with an empty
# missingpos the negative index -missingpos would select nothing at all
gs_data[hasgross] <- gross_data
(gross_data <- gs_data)
## [1] NA 165.36 335.45 NA NA 858.37 142.50 53.37 20.55 117.62
## [11] 0.35 NA 12.14 NA NA 100.04 27.33 62.25 159.23 0.43
## [21] NA NA 13.12 316.83 108.10 515.20 26.74 NA 543.64 211.59
## [31] NA NA NA 7.00 NA NA 390.53 NA 175.08 85.71
## [41] NA 426.83 477.37 80.55 355.56 NA 3.76 NA NA NA
## [51] NA 173.96 26.80 171.02 35.40 110.50 2.00 NA 96.37 434.04
## [61] NA NA 65.85 NA 22.68 140.37 113.93 7.74 111.05 61.70
## [71] NA NA NA NA NA NA NA NA 62.74 73.29
## [81] NA NA NA NA 54.89 NA 67.16 NA NA NA
## [91] 18.87 NA NA NA NA 96.85 NA NA NA NA
The following code programmatically figures out the missing entries for metascore.
# Select rank and metascore nodes together so they come back interleaved in
# page order, then display the node set
rank_metascore_data_html <- webpage %>%
  html_nodes(".unfavorable , .favorable , .mixed , .text-primary")
rank_metascore_data_html
## {xml_nodeset (192)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="metascore mixed">51 </span>
## [3] <span class="lister-item-index unbold text-primary">2.</span>
## [4] <span class="metascore favorable">82 </span>
## [5] <span class="lister-item-index unbold text-primary">3.</span>
## [6] <span class="metascore mixed">59 </span>
## [7] <span class="lister-item-index unbold text-primary">4.</span>
## [8] <span class="metascore favorable">66 </span>
## [9] <span class="lister-item-index unbold text-primary">5.</span>
## [10] <span class="metascore favorable">87 </span>
## [11] <span class="lister-item-index unbold text-primary">6.</span>
## [12] <span class="metascore favorable">78 </span>
## [13] <span class="lister-item-index unbold text-primary">7.</span>
## [14] <span class="metascore favorable">83 </span>
## [15] <span class="lister-item-index unbold text-primary">8.</span>
## [16] <span class="metascore favorable">96 </span>
## [17] <span class="lister-item-index unbold text-primary">9.</span>
## [18] <span class="metascore unfavorable">38 </span>
## [19] <span class="lister-item-index unbold text-primary">10.</span>
## [20] <span class="metascore favorable">81 </span>
## ...
# Pull the text out of each rank/metascore node and display it
rank_metascore_data <- rank_metascore_data_html %>%
  html_text()
rank_metascore_data
## [1] "1." "51 " "2." "82 " "3."
## [6] "59 " "4." "66 " "5." "87 "
## [11] "6." "78 " "7." "83 " "8."
## [16] "96 " "9." "38 " "10." "81 "
## [21] "11." "58 " "12." "68 " "13."
## [26] "30 " "14." "59 " "15." "51 "
## [31] "16." "46 " "17." "72 " "18."
## [36] "54 " "19." "78 " "20." "83 "
## [41] "21." "60 " "22." "64 " "23."
## [46] "70 " "24." "58 " "25." "91 "
## [51] "26." "53 " "27." "64 " "28."
## [56] "29." "55 " "30." "58 " "31."
## [61] "52 " "32." "91 " "33." "34."
## [66] "94 " "35." "62 " "36." "47 "
## [71] "37." "69 " "38." "67 " "39."
## [76] "81 " "40." "53 " "41." "64 "
## [81] "42." "64 " "43." "64 " "44."
## [86] "79 " "45." "53 " "46." "65 "
## [91] "47." "95 " "48." "68 " "49."
## [96] "73 " "50." "41 " "51." "54 "
## [101] "52." "60 " "53." "55 " "54."
## [106] "73 " "55." "80 " "56." "48 "
## [111] "57." "94 " "58." "59." "69 "
## [116] "60." "84 " "61." "64 " "62."
## [121] "63." "43 " "64." "77 " "65."
## [126] "84 " "66." "71 " "67." "43 "
## [131] "68." "40 " "69." "43 " "70."
## [136] "80 " "71." "70 " "72." "52 "
## [141] "73." "74." "73 " "75." "42 "
## [146] "76." "73 " "77." "79 " "78."
## [151] "79." "61 " "80." "55 " "81."
## [156] "32 " "82." "61 " "83." "44 "
## [161] "84." "52 " "85." "63 " "86."
## [166] "19 " "87." "45 " "88." "89."
## [171] "77 " "90." "78 " "91." "26 "
## [176] "92." "93." "77 " "94." "72 "
## [181] "95." "48 " "96." "64 " "97."
## [186] "61 " "98." "38 " "99." "75 "
## [191] "100." "54 "
# Remove the first run of whitespace in each string and display the result
rank_metascore_data <- rank_metascore_data %>%
  str_replace("\\s+", "")
rank_metascore_data
## [1] "1." "51" "2." "82" "3." "59" "4." "66" "5." "87"
## [11] "6." "78" "7." "83" "8." "96" "9." "38" "10." "81"
## [21] "11." "58" "12." "68" "13." "30" "14." "59" "15." "51"
## [31] "16." "46" "17." "72" "18." "54" "19." "78" "20." "83"
## [41] "21." "60" "22." "64" "23." "70" "24." "58" "25." "91"
## [51] "26." "53" "27." "64" "28." "29." "55" "30." "58" "31."
## [61] "52" "32." "91" "33." "34." "94" "35." "62" "36." "47"
## [71] "37." "69" "38." "67" "39." "81" "40." "53" "41." "64"
## [81] "42." "64" "43." "64" "44." "79" "45." "53" "46." "65"
## [91] "47." "95" "48." "68" "49." "73" "50." "41" "51." "54"
## [101] "52." "60" "53." "55" "54." "73" "55." "80" "56." "48"
## [111] "57." "94" "58." "59." "69" "60." "84" "61." "64" "62."
## [121] "63." "43" "64." "77" "65." "84" "66." "71" "67." "43"
## [131] "68." "40" "69." "43" "70." "80" "71." "70" "72." "52"
## [141] "73." "74." "73" "75." "42" "76." "73" "77." "79" "78."
## [151] "79." "61" "80." "55" "81." "32" "82." "61" "83." "44"
## [161] "84." "52" "85." "63" "86." "19" "87." "45" "88." "89."
## [171] "77" "90." "78" "91." "26" "92." "93." "77" "94." "72"
## [181] "95." "48" "96." "64" "97." "61" "98." "38" "99." "75"
## [191] "100." "54"
# Rank tokens end with a period (e.g. "28."); metascore tokens do not.
# Two rank tokens in a row therefore mean the first rank has no metascore.
isrank <- rank_metascore_data %>%
  str_detect("\\.$")
isrank
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [25] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [49] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [61] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [73] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [85] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [97] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [109] TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [121] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [133] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [145] FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE
## [157] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [169] TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE
## [181] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
# Flag every rank token whose successor is also a rank token (its metascore is
# missing). `head(x, -1)` / `tail(x, -1)` pair each element with its successor.
# This avoids `1:length(x)-1`, which parses as `(1:length(x)) - 1`, i.e.
# 0:(n-1), and only works because R silently drops a zero index.
ismissing <- head(isrank, -1) & tail(isrank, -1)
# The last token has no successor: if it is a rank, its metascore is missing too.
ismissing <- c(ismissing, tail(isrank, 1))
# Rank tokens look like "28."; as.integer() parses them to the rank number.
(missingpos <- as.integer(rank_metascore_data[ismissing]))
## [1] 28 33 58 62 73 78 88 92
#(rank_metascore_data <- as.integer(rank_metascore_data))
You (students) should work out the code for finding missing positions for gross.
Form a tibble:
# Assemble the scraped vectors into a single tibble, one column per vector
movies <- tibble(
  Rank                 = rank_data,
  Title                = title_data,
  Description          = description_data,
  Runtime              = runtime_data,
  Genre                = genre_data,
  Rating               = rating_data,
  Metascore            = metascore_data,
  Votes                = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director             = directors_data,
  Actor                = actors_data
)
# width = Inf shows every column instead of truncating to the console width
print(movies, width = Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 The Gentlemen
## 2 2 Knives Out
## 3 3 Joker
## 4 4 Judy
## 5 5 Sound of Metal
## 6 6 Avengers: Endgame
## 7 7 Once Upon a Time... in Hollywood
## 8 8 Parasite
## 9 9 Gemini Man
## 10 10 Ford v Ferrari
## Description
## <chr>
## 1 An American expat tries to sell off his highly profitable marijuana empire i…
## 2 A detective investigates the death of a patriarch of an eccentric, combative…
## 3 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and m…
## 4 Legendary performer Judy Garland arrives in London in the winter of 1968 to …
## 5 A heavy-metal drummer's life is thrown into freefall when he begins to lose …
## 6 After the devastating events of Avengers: Infinity War (2018), the universe …
## 7 A faded television actor and his stunt double strive to achieve fame and suc…
## 8 Greed and class discrimination threaten the newly formed symbiotic relations…
## 9 An over-the-hill hitman faces off against a younger clone of himself.
## 10 American car designer Carroll Shelby and driver Ken Miles battle corporate i…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 113 Action 7.9 51 189868 NA
## 2 130 Comedy 7.9 82 396786 165.
## 3 122 Crime 8.5 59 875700 335.
## 4 118 Biography 6.8 66 36134 NA
## 5 130 Drama 7 87 381 NA
## 6 181 Action 8.4 78 767928 858.
## 7 161 Comedy 7.6 83 517451 142.
## 8 132 Comedy 8.6 96 487781 53.4
## 9 117 Action 5.7 38 78802 20.6
## 10 152 Action 8.1 81 262853 118.
## Director Actor
## <chr> <chr>
## 1 Guy Ritchie Matthew McConaughey
## 2 Rian Johnson Daniel Craig
## 3 Todd Phillips Joaquin Phoenix
## 4 Rupert Goold Renée Zellweger
## 5 Darius Marder Riz Ahmed
## 6 Anthony Russo Robert Downey Jr.
## 7 Quentin Tarantino Leonardo DiCaprio
## 8 Bong Joon Ho Kang-ho Song
## 9 Ang Lee Will Smith
## 10 James Mangold Matt Damon
## # … with 90 more rows
How many top 100 movies are in each genre? (Be careful with interpretation.)
# Bar chart: number of movies in each genre
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Mean gross earning per genre (NA earnings ignored), drawn as a column chart
movies %>%
  group_by(Genre) %>%
  summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = Genre, y = avg_earning)) +
  geom_col() +
  ylab("avg earning in millions")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 1 rows containing missing values (position_stack).
# Distribution of gross earning within each genre
movies %>%
  ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  ylab("Gross earning in millions")
## Warning: Removed 49 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? Find the best-selling movie (by gross earning) in each genre.
library("ggrepel")
# Within each genre keep the single row ranked first by descending gross
# earning. The result stays grouped by Genre.
best_in_genre <- movies %>%
  group_by(Genre) %>%
  filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
best_in_genre
## # A tibble: 8 x 11
## # Groups: Genre [8]
## Rank Title Description Runtime Genre Rating Metascore Votes Gross_Earning_i…
## <int> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2 Kniv… A detectiv… 130 Come… 7.9 82 396786 165.
## 2 3 Joker In Gotham … 122 Crime 8.5 59 875700 335.
## 3 6 Aven… After the … 181 Acti… 8.4 78 767928 858.
## 4 29 The … After the … 118 Anim… 6.9 55 202357 544.
## 5 30 It C… Twenty-sev… 169 Drama 6.5 58 195488 212.
## 6 39 Us A family's… 116 Horr… 6.9 53 212981 175.
## 7 45 Alad… A kind-hea… 128 Adve… 7 65 218135 356.
## 8 59 Rock… A musical … 121 Biog… 7.3 64 126091 96.4
## # … with 2 more variables: Director <chr>, Actor <chr>
# Rating vs. gross earning; point size encodes votes and colour encodes genre.
# Only the rows in best_in_genre receive a (repelled) title label.
movies %>%
  ggplot(mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(data = best_in_genre, mapping = aes(label = Title)) +
  ylab("Gross earning in millions")
## Warning: Removed 49 rows containing missing values (geom_point).
Many websites dynamically pull data from databases using JavaScript and jQuery, which makes them difficult to scrape.
The FCC’s dtvmaps webpage has a simple form in which you enter a zip code and it gives you the available local TV stations in that zip code and their signal strength.
You’ll also notice the URL stays fixed with different zip codes.
RSelenium loads the page that we want to scrape and downloads the HTML from that page.
particularly useful when scraping something behind a login
simulate human behavior on a website (e.g., mouse clicking)
rvest provides typical scraping tools
rm(list = ls()) # clean-up workspace
library("RSelenium")
library("tidyverse")
library("rvest")
# Start a Selenium driver for Firefox on port 7360.
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
rD <- rsDriver(browser = "firefox", port = 7360L, verbose = FALSE)
# The "client" element is the object used below to drive the browser
remDr <- rD[["client"]]
Open a webpage
# Load the FCC dtvmaps page in the browser session controlled by remDr
remDr$navigate("https://www.fcc.gov/media/engineering/dtvmaps")
We want to send a string of text (zip code) into the form.
zip <- "70118"
# Locate the form field with id "startpoint", then type the zip code into it
# remDr$findElement(using = "id", value = "startpoint")$clearElement()
zip_field <- remDr$findElement(using = "id", value = "startpoint")
zip_field$sendKeysToElement(list(zip))
# other possible ("xpath", "css selector", "id", "name", "tag name", "class name", "link text", "partial link text")
Click on the button Go!
# Click the element with id "btnSub". findElement() (singular) matches the
# lookup used for the zip-code field and reports a missing element directly,
# instead of failing on `[[1]]` with a subscript error.
remDr$findElement(using = "id", value = "btnSub")$clickElement()
save HTML to an object
use rvest for the rest
Sys.sleep(5) # give the page time to fully load, in seconds
# getPageSource() returns a list; its first element is the page HTML
html <- remDr$getPageSource()[[1]]
signals <- read_html(html) %>%
  html_nodes("table.tbl_mapReception") %>% # extract table nodes with class = "tbl_mapReception"
  .[[3]] %>% # take the third of these tables as a single node
  html_table(fill = TRUE) # have rvest turn it into a dataframe
signals
## Callsign Callsign
## 1 Click on callsign for detail Click on callsign for detail
## 2 WWL-TV
## 3
## 4 WUPL
## 5
## 6 WPXL-TV
## 7
## 8 WHNO
## 9
## 10 WVUE-DT
## 11
## 12 WGNO
## 13
## 14 WDSU
## 15
## 16 WNOL-TV
## 17
## 18 WYES-TV
## 19
## 20 WTNO-LP
## 21
## 22 WLAE-TV
## 23
## 24 KNOV-CD
## 25
## 26 WBXN-CD
## 27
## 28 KGLA-DT
## 29
## 30 WBRZ-TV
## 31
## 32 WVLA-TV
## 33
## 34 WGMB-TV
## 35
## 36 WAFB
## 37
## Network Ch#
## 1 Click on callsign for detail Click on callsign for detail
## 2 CBS 4
## 3
## 4 MYNE 54
## 5
## 6 ION 49
## 7
## 8 IND 20
## 9
## 10 FOX 8
## 11
## 12 ABC 26
## 13
## 14 NBC 6
## 15
## 16 CW 38
## 17
## 18 PBS 12
## 19
## 20
## 21
## 22 PBS 32
## 23
## 24
## 25
## 26
## 27
## 28 IND 42
## 29
## 30 ABC 2
## 31
## 32 NBC 33
## 33
## 34 FOX 44
## 35
## 36 CBS 9
## 37
## Band
## 1 Click on callsign for detail
## 2 UHF
## 3
## 4 UHF
## 5
## 6 UHF
## 7
## 8 UHF
## 9
## 10 UHF
## 11
## 12 UHF
## 13
## 14 UHF
## 15
## 16 UHF
## 17
## 18 Hi-V
## 19
## 20 UHF
## 21
## 22 UHF
## 23
## 24 UHF
## 25
## 26 UHF
## 27
## 28 UHF
## 29
## 30 Hi-V
## 31
## 32 UHF
## 33
## 34 UHF
## 35
## 36 Hi-V
## 37
## IA
## 1 <NA>
## 2 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 3
## 4 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 5
## 6 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 23
## 24 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 25
## 26 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 27
## 28 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 29
## 30
## 31
## 32
## 33
## 34 RThis station received a new RF channel as a result of the Incentive Auction; Click on the station's callsign to see information about when the station will be changing frequencies. The FCC will update this page as new information becomes available.
## 35
## 36
## 37
More formatting on signals
# Rename the six scraped columns; "rm" and "rm2" mark the ones dropped below
names(signals) <- c("rm", "callsign", "network", "ch_num", "band", "rm2") # rename columns
signals <- signals %>%
  slice(-1) %>% # drop unnecessary first row (2:n() would count backwards if n() < 2)
  filter(callsign != "") %>% # drop blank rows
  select(callsign:band) # drop unnecessary columns
signals
## callsign network ch_num band
## 1 WWL-TV CBS 4 UHF
## 2 WUPL MYNE 54 UHF
## 3 WPXL-TV ION 49 UHF
## 4 WHNO IND 20 UHF
## 5 WVUE-DT FOX 8 UHF
## 6 WGNO ABC 26 UHF
## 7 WDSU NBC 6 UHF
## 8 WNOL-TV CW 38 UHF
## 9 WYES-TV PBS 12 Hi-V
## 10 WTNO-LP UHF
## 11 WLAE-TV PBS 32 UHF
## 12 KNOV-CD UHF
## 13 WBXN-CD UHF
## 14 KGLA-DT IND 42 UHF
## 15 WBRZ-TV ABC 2 Hi-V
## 16 WVLA-TV NBC 33 UHF
## 17 WGMB-TV FOX 44 UHF
## 18 WAFB CBS 9 Hi-V
Capture all text by clicking on each Callsign
# Read the onclick attribute of every element with class "callsign"
html_attr(
  html_nodes(read_html(html), ".callsign"),
  "onclick"
)
## [1] "getdetail(6810,74192,'WWL-TV Facility ID: 74192 <br>WWL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=74192 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/74192 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 27<br>RX Strength: 115 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 27<br>Repacking Dates: 10/19/2019 to 1/17/2020','WWL-TV<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WWL-TV')"
## [2] "getdetail(6787,13938,'WUPL Facility ID: 13938 <br>WUPL (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=13938 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/13938 target=_new>Public File</a>)<br>City of License: SLIDELL, LA<br>RF Channel: 17<br>RX Strength: 114 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 17<br>Repacking Dates: 10/19/2019 to 1/17/2020','WUPL<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WUPL')"
## [3] "getdetail(6862,21729,'WPXL-TV Facility ID: 21729 <br>WPXL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=21729 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/21729 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 33<br>RX Strength: 111 dbuV/m<br>Tower Distance: 11 mi; Direction: 74°<br>Repacked Channel: 33<br>Repacking Dates: 10/19/2019 to 1/17/2020','WPXL-TV<br>Distance to Tower: 11 miles<br>Direction to Tower: 74 deg',29.9827777777778,-89.9494444444445,'WPXL-TV')"
## [4] "getdetail(7951,37106,'WHNO Facility ID: 37106 <br>WHNO (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=37106 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/37106 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 21<br>RX Strength: 111 dbuV/m<br>Tower Distance: 6 mi; Direction: 103°','WHNO<br>Distance to Tower: 6 miles<br>Direction to Tower: 103 deg',29.9203055555556,-90.0245833333333,'WHNO')"
## [5] "getdetail(7480,4149,'WVUE-DT Facility ID: 4149 <br>WVUE-DT (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=4149 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/4149 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 29<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 84°','WVUE-DT<br>Distance to Tower: 10 miles<br>Direction to Tower: 84 deg',29.9541388888889,-89.9495277777778,'WVUE-DT')"
## [6] "getdetail(7420,72119,'WGNO Facility ID: 72119 <br>WGNO (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=72119 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/72119 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 26<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°','WGNO<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WGNO')"
## [7] "getdetail(6887,71357,'WDSU Facility ID: 71357 <br>WDSU (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=71357 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/71357 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 19<br>RX Strength: 111 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°<br>Repacked Channel: 19<br>Repacking Dates: 10/19/2019 to 1/17/2020','WDSU<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WDSU')"
## [8] "getdetail(7421,54280,'WNOL-TV Facility ID: 54280 <br>WNOL-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=54280 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/54280 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 15<br>RX Strength: 110 dbuV/m<br>Tower Distance: 10 mi; Direction: 86°','WNOL-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 86 deg',29.95,-89.9577777777778,'WNOL-TV')"
## [9] "getdetail(7608,25090,'WYES-TV Facility ID: 25090 <br>WYES-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=25090 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/25090 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 11<br>RX Strength: 102 dbuV/m<br>Tower Distance: 10 mi; Direction: 85°','WYES-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 85 deg',29.9538888888889,-89.9494444444445,'WYES-TV')"
## [10] "getdetail(8313,24981,'WTNO-LP Facility ID: 24981 <br>WTNO-LP (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=24981 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/24981 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 22<br>RX Strength: 106 dbuV/m<br>Tower Distance: 3 mi; Direction: 330°','WTNO-LP<br>Distance to Tower: 3 miles<br>Direction to Tower: 330 deg',29.9746111111111,-90.1434722222222,'WTNO-LP')"
## [11] "getdetail(6946,18819,'WLAE-TV Facility ID: 18819 <br>WLAE-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=18819 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/18819 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 23<br>RX Strength: 104 dbuV/m<br>Tower Distance: 10 mi; Direction: 74°<br>Repacked Channel: 23<br>Repacking Dates: 10/19/2019 to 1/17/2020','WLAE-TV<br>Distance to Tower: 10 miles<br>Direction to Tower: 74 deg',29.9827777777778,-89.9525,'WLAE-TV')"
## [12] "getdetail(8180,64048,'KNOV-CD Facility ID: 64048 <br>KNOV-CD (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=64048 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/64048 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 31<br>RX Strength: 101 dbuV/m<br>Tower Distance: 3 mi; Direction: 74°<br>Repacked Channel: 31<br>Repacking Dates: 10/19/2019 to 1/17/2020','KNOV-CD<br>Distance to Tower: 3 miles<br>Direction to Tower: 74 deg',29.9521388888889,-90.0702777777778,'KNOV-CD')"
## [13] "getdetail(8155,70419,'WBXN-CD Facility ID: 70419 <br>WBXN-CD (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=70419 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/70419 target=_new>Public File</a>)<br>City of License: NEW ORLEANS, LA<br>RF Channel: 36<br>RX Strength: 98 dbuV/m<br>Tower Distance: 5 mi; Direction: 116°<br>Repacked Channel: 36<br>Repacking Dates: 10/19/2019 to 1/17/2020','WBXN-CD<br>Distance to Tower: 5 miles<br>Direction to Tower: 116 deg',29.9063611111111,-90.0394722222222,'WBXN-CD')"
## [14] "getdetail(6465,83945,'KGLA-DT Facility ID: 83945 <br>KGLA-DT (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=83945 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/83945 target=_new>Public File</a>)<br>City of License: HAMMOND, LA<br>RF Channel: 35<br>RX Strength: 93 dbuV/m<br>Tower Distance: 11 mi; Direction: 76°<br>Repacked Channel: 35<br>Repacking Dates: 3/14/2020 to 5/1/2020','KGLA-DT<br>Distance to Tower: 11 miles<br>Direction to Tower: 76 deg',29.9783333333333,-89.9405555555556,'KGLA-DT')"
## [15] "getdetail(7483,38616,'WBRZ-TV Facility ID: 38616 <br>WBRZ-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=38616 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/38616 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 13<br>RX Strength: 45 dbuV/m<br>Tower Distance: 69 mi; Direction: 291°','WBRZ-TV<br>Distance to Tower: 69 miles<br>Direction to Tower: 291 deg',30.2969444444444,-91.1936111111111,'WBRZ-TV')"
## [16] "getdetail(7918,70021,'WVLA-TV Facility ID: 70021 <br>WVLA-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=70021 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/70021 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 34<br>RX Strength: 46 dbuV/m<br>Tower Distance: 74 mi; Direction: 291°','WVLA-TV<br>Distance to Tower: 74 miles<br>Direction to Tower: 291 deg',30.3262777777778,-91.2766944444444,'WVLA-TV')"
## [17] "getdetail(6750,12520,'WGMB-TV Facility ID: 12520 <br>WGMB-TV (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=12520 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/12520 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 24<br>RX Strength: 43 dbuV/m<br>Tower Distance: 74 mi; Direction: 291°<br>Repacked Channel: 24<br>Repacking Dates: 1/18/2020 to 3/13/2020','WGMB-TV<br>Distance to Tower: 74 miles<br>Direction to Tower: 291 deg',30.3262777777778,-91.2766944444444,'WGMB-TV')"
## [18] "getdetail(7664,589,'WAFB Facility ID: 589 <br>WAFB (<a href=https://enterpriseefiling.fcc.gov/dataentry/public/tv/publicFacilityDetails.html?facilityId=589 target=_new>Licensing</a>) (<a href=https://publicfiles.fcc.gov/tv-profile/589 target=_new>Public File</a>)<br>City of License: BATON ROUGE, LA<br>RF Channel: 9<br>RX Strength: 37 dbuV/m<br>Tower Distance: 72 mi; Direction: 295°','WAFB<br>Distance to Tower: 72 miles<br>Direction to Tower: 295 deg',30.3663888888889,-91.2130555555556,'WAFB')"
Extract signal by string operations
# Pull the number that follows "RX Strength: " out of each onclick string
strength <- read_html(html) %>%
  html_nodes(".callsign") %>%
  html_attr("onclick") %>%
  str_extract("(?<=RX Strength: )\\s*\\-*[0-9.]+") %>%
  as.numeric() # store as a number so it sorts and compares numerically
# (?<=…) is a special regex expression for positive lookbehind
# cbind() can recycle a vector whose length divides the row count, so insist
# on exactly one strength value per row of signals before combining
stopifnot(length(strength) == nrow(signals))
signals <- cbind(signals, strength)
signals
## callsign network ch_num band strength
## 1 WWL-TV CBS 4 UHF 115
## 2 WUPL MYNE 54 UHF 114
## 3 WPXL-TV ION 49 UHF 111
## 4 WHNO IND 20 UHF 111
## 5 WVUE-DT FOX 8 UHF 111
## 6 WGNO ABC 26 UHF 111
## 7 WDSU NBC 6 UHF 111
## 8 WNOL-TV CW 38 UHF 110
## 9 WYES-TV PBS 12 Hi-V 102
## 10 WTNO-LP UHF 106
## 11 WLAE-TV PBS 32 UHF 104
## 12 KNOV-CD UHF 101
## 13 WBXN-CD UHF 98
## 14 KGLA-DT IND 42 UHF 93
## 15 WBRZ-TV ABC 2 Hi-V 45
## 16 WVLA-TV NBC 33 UHF 46
## 17 WGMB-TV FOX 44 UHF 43
## 18 WAFB CBS 9 Hi-V 37