Fixed excessive writes (#2).

umarbutler · Aug 8, 2024 · 0c86c0a · 0c86c0a
1 parent 0b925d1
commit 0c86c0a
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to the Open Australian Legal Corpus Creator will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.4] - 2024-08-08
+### Fixed
+- Fixed the Creator unnecessarily rewriting the entire Corpus every time it was run in order to detect and remove duplicates and outdated documents and otherwise repair the Corpus, which caused excessive writes and wore out disks. The Creator now first reads the Corpus and only overwrites it if that is found to be necessary; although this can sometimes double read time, reading is much cheaper than writing on SSDs (which most modern drives are) ([#2](https://github.com/umarbutler/open-australian-legal-corpus-creator/issues/2)).
+
 ## [3.0.3] - 2024-08-05
 ### Fixed
 - Fixed a bug preventing the scraping of documents from the NSW Legislation database that are stored as PDFs but are reported by the database's web server as being HTML files.

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Open Australian Legal Corpus Creator
-<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.3-green"></a>
+<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.4-green"></a>
 
 The [Open Australian Legal Corpus](https://huggingface.co/datasets/umarbutler/open-australian-legal-corpus) is the first and only multijurisdictional open corpus of Australian legislative and judicial documents. This repository contains the code used to create and update the Corpus.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "oalc-creator"
-version = "3.0.3"
+version = "3.0.4"
 authors = [
   {name="Umar Butler", email="[email protected]"},
 ]

diff --git a/src/oalc_creator/creator.py b/src/oalc_creator/creator.py
@@ -247,28 +247,41 @@ async def create(self) -> None:
                 for entry in entries.entries
             }
 
-            # Deduplicate (and, if necessary, repair) the Corpus and remove any documents that have the same source as the sources being scraped and do not appear in the sources' indices; and also store the version ids of documents not removed from the Corpus in order to later identify missing documents to be added to the Corpus.
+            # Index the version ids of documents in the Corpus from the desired sources in order to later identify missing documents to be added to the Corpus and also determine whether it is necessary to deduplicate, repair and/or remove outdated documents (in particular, documents that have the same source as the sources being scraped and do not appear in the sources' indices) from the Corpus.
             corpus_version_ids = set()
+            corpus_line_numbers_to_remove = set()
 
-            with open(self.corpus_path, 'rb') as corpus_file, open(f'{self.corpus_path}.tmp', 'wb') as tmp_file:
+            with open(self.corpus_path, 'rb') as corpus_file:
                 for i, line in enumerate(corpus_file):
                     try:
                         doc = document_decoder(line)
 
                     except DecodeError as e:
                         warning(f"Failed to decode document #{i + 1:,} when loading the Corpus. The error encountered was: '{e}'. The document will be treated as corrupted and will be removed from the Corpus.")
-
+                        corpus_line_numbers_to_remove.add(i)
                         continue
 
                     version_id = doc.version_id
+                    source = doc.source
+
+                    if version_id in corpus_version_ids or (version_id not in entries and source in self.scrapers):
+                        corpus_line_numbers_to_remove.add(i)
 
-                    if version_id not in corpus_version_ids and (version_id in entries or doc.source not in self.scrapers):
-                        tmp_file.write(line)
+                    elif source in self.scrapers:
                         corpus_version_ids.add(version_id)
 
-            # Overwrite the Corpus with the temporary file.
-            os.replace(f'{self.corpus_path}.tmp', self.corpus_path)
+            # Deduplicate, repair and/or remove outdated documents (in particular, documents that have the same source as the sources being scraped and do not appear in the sources' indices) from the Corpus.
+            if corpus_line_numbers_to_remove:
+                console.print('\nDeduplicating, repairing and/or removing outdated documents from the Corpus.', style='light_cyan1 bold')
+
+                with open(self.corpus_path, 'rb') as corpus_file, open(f'{self.corpus_path}.tmp', 'wb') as tmp_file:
+                    for i, line in enumerate(corpus_file):
+                        if i not in corpus_line_numbers_to_remove:
+                            tmp_file.write(line)
 
+                # Overwrite the Corpus with the temporary file.
+                os.replace(f'{self.corpus_path}.tmp', self.corpus_path)
+
             # Identify missing documents by filtering out from the document entries any documents that already appear in the Corpus.
             missing_entries = [scraper_entry for version_id, scraper_entry in entries.items() if version_id not in corpus_version_ids]