Fixed bug with NSW Legislation database.

umarbutler · Aug 5, 2024 · 5c316de · 5c316de
1 parent 0741561
commit 5c316de
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to the Open Australian Legal Corpus Creator will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2024-08-05
+### Fixed
+- Fixed a bug preventing the scraping of documents from the NSW Legislation database that are stored as PDFs but are reported by the database's web server as being HTML files.
+
 ## [3.0.2] - 2024-08-04
 ### Fixed
 - Fixed a bug that caused only the first volume of multivolume documents on the Federal Register of Legislation available in a HTML format to be scraped instead of all volumes.

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Open Australian Legal Corpus Creator
-<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.2-green"></a>
+<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.3-green"></a>
 
 The [Open Australian Legal Corpus](https://huggingface.co/datasets/umarbutler/open-australian-legal-corpus) is the first and only multijurisdictional open corpus of Australian legislative and judicial documents. This repository contains the code used to create and update the Corpus.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "oalc-creator"
-version = "3.0.2"
+version = "3.0.3"
 authors = [
   {name="Umar Butler", email="[email protected]"},
 ]
@@ -66,6 +66,7 @@ dependencies = [
     "uvloop; platform_system != 'Windows'",
     "tesserocr",
     "pypdfium2",
+    "pillow",
 ]
 
 [project.urls]

diff --git a/src/oalc_creator/scrapers/nsw_legislation.py b/src/oalc_creator/scrapers/nsw_legislation.py
@@ -143,45 +143,51 @@ async def _get_doc(self, entry: Entry) -> Document | None:
 
             return
 
-        match resp.type:
-            case 'text/html':
-                # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031).
-                if 'No fragments found.' in resp.text:
-                    warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. Returning `None`.")
-                    return
-
-                # Create an etree from the response.
+        resp_type = resp.type
+
+        if resp_type == 'text/html':
+            # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031).
+            if 'No fragments found.' in resp.text:
+                warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. Returning `None`.")
+                return
+
+            # Create an etree from the response. If a UnicodeDecodeError is encountered instead, assume that the document is actually a PDF.
+            try:
                 etree = lxml.html.fromstring(resp.text)
-
-                # Select the element containing the text of the document.
-                text_elm = etree.xpath('//div[@id="frag-col"]')[0]
-
-                # Remove the toolbar.
-                text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree()
-
-                # Remove the search results (they are supposed to be hidden by Javascript).
-                text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree()
+
+            except UnicodeDecodeError:
+                resp_type = 'application/pdf'
+
+        if resp_type == 'text/html':
+            # Select the element containing the text of the document.
+            text_elm = etree.xpath('//div[@id="frag-col"]')[0]
+
+            # Remove the toolbar.
+            text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree()
+
+            # Remove the search results (they are supposed to be hidden by Javascript).
+            text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree()
 
-                # Remove footnotes (they are supposed to be hidden by Javascript).
-                for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree()
+            # Remove footnotes (they are supposed to be hidden by Javascript).
+            for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree()
 
-                # Extract the text of the document.
-                text = CustomInscriptis(text_elm, self._inscriptis_config).get_text()
+            # Extract the text of the document.
+            text = CustomInscriptis(text_elm, self._inscriptis_config).get_text()
 
-            case 'application/pdf':
-                # Extract the text of the document from the PDF with OCR.
-                text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore)
+        elif 'application/pdf':
+            # Extract the text of the document from the PDF with OCR.
+            text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore)
 
-            case _:
-                raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp.type}.')
+        else:
+            raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp_type}.')
 
         # Return the document.
         return make_doc(
             version_id=entry.version_id,
             type=entry.type,
             jurisdiction=entry.jurisdiction,
             source=entry.source,
-            mime=resp.type,
+            mime=resp_type,
             date=entry.date,
             citation=entry.title,
             url=entry.request.path,