From 0c86c0a3ea843ace813f6d57a0040550adbf3b1a Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Thu, 8 Aug 2024 21:31:37 +1000 Subject: [PATCH] Fixed excessive writes (#2). --- CHANGELOG.md | 4 ++++ README.md | 2 +- pyproject.toml | 2 +- src/oalc_creator/creator.py | 27 ++++++++++++++++++++------- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d6712a..573ee93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ ## Changelog 🔄 All notable changes to the Open Australian Legal Corpus Creator will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.0.4] - 2024-08-08 +### Fixed +- Fixed the Creator unnecessarily rewriting the entire Corpus on every run in order to detect and remove duplicates and outdated documents and otherwise repair it, which caused excessive writes and wore out disks. The Creator now first reads the Corpus and only overwrites it if that is found to be necessary; although this can sometimes double read time, reading is much cheaper than writing on SSDs (which most modern drives are) ([#2](https://github.com/umarbutler/open-australian-legal-corpus-creator/issues/2)). + ## [3.0.3] - 2024-08-05 ### Fixed - Fixed a bug preventing the scraping of documents from the NSW Legislation database that are stored as PDFs but are reported by the database's web server as being HTML files. diff --git a/README.md b/README.md index 1d2b999..7a098e6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Open Australian Legal Corpus Creator - + The [Open Australian Legal Corpus](https://huggingface.co/datasets/umarbutler/open-australian-legal-corpus) is the first and only multijurisdictional open corpus of Australian legislative and judicial documents. This repository contains the code used to create and update the Corpus. 
diff --git a/pyproject.toml b/pyproject.toml index 88f6bf9..9711148 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "oalc-creator" -version = "3.0.3" +version = "3.0.4" authors = [ {name="Umar Butler", email="umar@umar.au"}, ] diff --git a/src/oalc_creator/creator.py b/src/oalc_creator/creator.py index 915534b..1de8ea3 100644 --- a/src/oalc_creator/creator.py +++ b/src/oalc_creator/creator.py @@ -247,28 +247,41 @@ async def create(self) -> None: for entry in entries.entries } - # Deduplicate (and, if necessary, repair) the Corpus and remove any documents that have the same source as the sources being scraped and do not appear in the sources' indices; and also store the version ids of documents not removed from the Corpus in order to later identify missing documents to be added to the Corpus. + # Index the version ids of documents in the Corpus from the desired sources in order to later identify missing documents to be added to the Corpus and also determine whether it is necessary to deduplicate, repair and/or remove outdated documents (in particular, documents that have the same source as the sources being scraped and do not appear in the sources' indices) from the Corpus. corpus_version_ids = set() + corpus_line_numbers_to_remove = set() - with open(self.corpus_path, 'rb') as corpus_file, open(f'{self.corpus_path}.tmp', 'wb') as tmp_file: + with open(self.corpus_path, 'rb') as corpus_file: for i, line in enumerate(corpus_file): try: doc = document_decoder(line) except DecodeError as e: warning(f"Failed to decode document #{i + 1:,} when loading the Corpus. The error encountered was: '{e}'. 
The document will be treated as corrupted and will be removed from the Corpus.") - + corpus_line_numbers_to_remove.add(i) continue version_id = doc.version_id + source = doc.source + + if version_id in corpus_version_ids or (version_id not in entries and source in self.scrapers): + corpus_line_numbers_to_remove.add(i) - if version_id not in corpus_version_ids and (version_id in entries or doc.source not in self.scrapers): - tmp_file.write(line) + elif source in self.scrapers: corpus_version_ids.add(version_id) - # Overwrite the Corpus with the temporary file. - os.replace(f'{self.corpus_path}.tmp', self.corpus_path) + # Deduplicate, repair and/or remove outdated documents (in particular, documents that have the same source as the sources being scraped and do not appear in the sources' indices) from the Corpus. + if corpus_line_numbers_to_remove: + console.print('\nDeduplicating, repairing and/or removing outdated documents from the Corpus.', style='light_cyan1 bold') + + with open(self.corpus_path, 'rb') as corpus_file, open(f'{self.corpus_path}.tmp', 'wb') as tmp_file: + for i, line in enumerate(corpus_file): + if i not in corpus_line_numbers_to_remove: + tmp_file.write(line) + # Overwrite the Corpus with the temporary file. + os.replace(f'{self.corpus_path}.tmp', self.corpus_path) + # Identify missing documents by filtering out from the document entries any documents that already appear in the Corpus. missing_entries = [scraper_entry for version_id, scraper_entry in entries.items() if version_id not in corpus_version_ids]