diff --git a/data-raw/parse_rdfs.R b/data-raw/parse_rdfs.R index bcbd52b..f2a5b69 100644 --- a/data-raw/parse_rdfs.R +++ b/data-raw/parse_rdfs.R @@ -1,10 +1,10 @@ library(fs) library(dplyr) -library(gutenbergr) library(here) library(purrr) library(stringr) library(tibble) +# library(tictoc) library(xml2) source(here::here("data-raw", "parsers.R")) @@ -14,6 +14,7 @@ source(here::here("data-raw", "parsers.R")) # this timestamp yet, other than to inform users. updated <- lubridate::date(lubridate::now(tzone = "UTC")) +# tictoc::tic() cache_dir <- download_raw_data() rdf_paths <- unname(fs::dir_ls(cache_dir, recurse = TRUE, glob = "*.rdf")) @@ -40,18 +41,6 @@ new_gutenberg_subjects <- purrr::map_dfr(all_metadata, ~ .x$subjects) |> dplyr::distinct() |> dplyr::arrange(gutenberg_id) -# waldo::compare(nrow(gutenberg_authors), nrow(new_gutenberg_authors)) -# waldo::compare(nrow(gutenberg_subjects), nrow(new_gutenberg_subjects)) -# waldo::compare(nrow(gutenberg_languages), nrow(new_gutenberg_languages)) -# waldo::compare(nrow(gutenberg_metadata), nrow(new_gutenberg_metadata)) -# dplyr::distinct(new_gutenberg_metadata, gutenberg_id, has_text) |> -# dplyr::left_join( -# dplyr::distinct(gutenberg_metadata, gutenberg_id, has_text), -# by = "gutenberg_id" -# ) |> -# dplyr::filter(has_text.x != has_text.y) |> -# dplyr::filter(!has_text.x) - gutenberg_authors <- new_gutenberg_authors gutenberg_subjects <- new_gutenberg_subjects gutenberg_languages <- new_gutenberg_languages @@ -87,3 +76,4 @@ rm( parse_subject, updated ) +# tictoc::toc() diff --git a/data/gutenberg_authors.rda b/data/gutenberg_authors.rda index 3cd523f..08d709b 100644 Binary files a/data/gutenberg_authors.rda and b/data/gutenberg_authors.rda differ diff --git a/data/gutenberg_languages.rda b/data/gutenberg_languages.rda index e0b8dbb..9a4c8c2 100644 Binary files a/data/gutenberg_languages.rda and b/data/gutenberg_languages.rda differ diff --git a/data/gutenberg_metadata.rda b/data/gutenberg_metadata.rda index 18a2166..b1ae97c 100644 Binary files a/data/gutenberg_metadata.rda and b/data/gutenberg_metadata.rda differ diff --git a/data/gutenberg_subjects.rda b/data/gutenberg_subjects.rda index 4a57c53..988d780 100644 Binary files a/data/gutenberg_subjects.rda and b/data/gutenberg_subjects.rda differ