Give read_html() its own documentation (#390)

Fixes #388
tidyverse · Feb 2, 2024 · e6e7480 · e6e7480
1 parent a018808
commit e6e7480
Show file tree

Hide file tree

Showing 4 changed files with 127 additions and 4 deletions.
diff --git a/R/rvest-package.R b/R/rvest-package.R
@@ -3,8 +3,50 @@
 #' @importFrom lifecycle deprecated
 "_PACKAGE"
 
+
+#' Static web scraping (with xml2)
+#'
+#' @description
+#' [read_html()] works by performing an HTTP request then parsing the HTML
+#' received using the xml2 package. This is "static" scraping because it
+#' operates only on the raw HTML file. While this works for most sites,
+#' in some cases you will need to use [read_html_live()] if the parts of
+#' the page you want to scrape are dynamically generated with JavaScript.
+#'
+#' Generally, we recommend using `read_html()` if it works, as it will be
+#' faster and more robust because it has fewer external dependencies (i.e. it
+#' doesn't rely on the Chrome web browser being installed on your computer).
+#'
+#' @inheritParams xml2::read_html
+#' @param x Usually a string representing a URL. See [xml2::read_html()] for
+#'   other options.
+#' @rdname read_html
 #' @importFrom xml2 read_html
 #' @export
+#' @examples
+#' # Start by reading an HTML page with read_html():
+#' starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html")
+#'
+#' # Then find elements that match a css selector or XPath expression
+#' # using html_elements(). In this example, each <section> corresponds
+#' # to a different film
+#' films <- starwars %>% html_elements("section")
+#' films
+#'
+#' # Then use html_element() to extract one element per film. Here
+#' # the title is given by the text inside <h2>
+#' title <- films %>%
+#'   html_element("h2") %>%
+#'   html_text2()
+#' title
+#'
+#' # Or use html_attr() to get data out of attributes. html_attr() always
+#' # returns a string so we convert it to an integer using a readr function
+#' episode <- films %>%
+#'   html_element("h2") %>%
+#'   html_attr("data-id") %>%
+#'   readr::parse_integer()
+#' episode
 xml2::read_html
 
 #' @importFrom xml2 url_absolute

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -18,7 +18,7 @@ reference:
   - read_html_live
   - LiveHTML
 
-- title: Extract data from HTML
+- title: Extract data
   contents:
   - html_attr
   - html_children
@@ -27,7 +27,7 @@ reference:
   - html_table
   - html_text
 
-- title: Session
+- title: Move from page to page
   contents:
   - session
   - html_form

diff --git a/man/read_html.Rd b/man/read_html.Rd
diff --git a/man/reexports.Rd b/man/reexports.Rd