From 5c316deadc4af9b75790ffdcd7a13753d6c628ab Mon Sep 17 00:00:00 2001
From: Umar Butler <umar@umarbutler.com>
Date: Mon, 5 Aug 2024 12:30:19 +1000
Subject: [PATCH] Fixed bug with NSW Legislation database.

---
 CHANGELOG.md                                 |  4 ++
 README.md                                    |  2 +-
 pyproject.toml                               |  3 +-
 src/oalc_creator/scrapers/nsw_legislation.py | 60 +++++++++++---------
 4 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 02dec9f..002a6fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to the Open Australian Legal Corpus Creator will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2024-08-05
+### Fixed
+- Fixed a bug preventing the scraping of documents from the NSW Legislation database that are stored as PDFs but are reported by the database's web server as being HTML files.
+
 ## [3.0.2] - 2024-08-04
 ### Fixed
 - Fixed a bug that caused only the first volume of multivolume documents on the Federal Register of Legislation available in a HTML format to be scraped instead of all volumes.
diff --git a/README.md b/README.md
index 4c1ab4e..1d2b999 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Open Australian Legal Corpus Creator
-<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.2-green"></a>
+<a href="https://github.com/umarbutler/open-australian-legal-corpus-creator" alt="Release"><img src="https://img.shields.io/badge/release-v3.0.3-green"></a>
 
 The [Open Australian Legal Corpus](https://huggingface.co/datasets/umarbutler/open-australian-legal-corpus) is the first and only multijurisdictional open corpus of Australian legislative and judicial documents. This repository contains the code used to create and update the Corpus.
 
diff --git a/pyproject.toml b/pyproject.toml
index 09859d6..88f6bf9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "oalc-creator"
-version = "3.0.2"
+version = "3.0.3"
 authors = [
   {name="Umar Butler", email="umar@umar.au"},
 ]
@@ -66,6 +66,7 @@ dependencies = [
     "uvloop; platform_system != 'Windows'",
     "tesserocr",
     "pypdfium2",
+    "pillow",
 ]
 
 [project.urls]
diff --git a/src/oalc_creator/scrapers/nsw_legislation.py b/src/oalc_creator/scrapers/nsw_legislation.py
index fba41e6..d585f12 100644
--- a/src/oalc_creator/scrapers/nsw_legislation.py
+++ b/src/oalc_creator/scrapers/nsw_legislation.py
@@ -143,37 +143,43 @@ async def _get_doc(self, entry: Entry) -> Document | None:
             
             return
         
-        match resp.type:
-            case 'text/html':
-                # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031).
-                if 'No fragments found.' in resp.text:
-                    warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. Returning `None`.")
-                    return
-                
-                # Create an etree from the response.
+        resp_type = resp.type
+        
+        if resp_type == 'text/html':
+            # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031).
+            if 'No fragments found.' in resp.text:
+                warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. Returning `None`.")
+                return
+            
+            # Create an etree from the response. If a UnicodeDecodeError is encountered, assume that the document is actually a PDF.
+            try:
                 etree = lxml.html.fromstring(resp.text)
-                
-                # Select the element containing the text of the document.
-                text_elm = etree.xpath('//div[@id="frag-col"]')[0]
-                
-                # Remove the toolbar.
-                text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree()
-                
-                # Remove the search results (they are supposed to be hidden by Javascript).
-                text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree()
+            
+            except UnicodeDecodeError:
+                resp_type = 'application/pdf'
+            
+        if resp_type == 'text/html':
+            # Select the element containing the text of the document.
+            text_elm = etree.xpath('//div[@id="frag-col"]')[0]
+            
+            # Remove the toolbar.
+            text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree()
+            
+            # Remove the search results (they are supposed to be hidden by Javascript).
+            text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree()
 
-                # Remove footnotes (they are supposed to be hidden by Javascript).
-                for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree()
+            # Remove footnotes (they are supposed to be hidden by Javascript).
+            for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree()
 
-                # Extract the text of the document.
-                text = CustomInscriptis(text_elm, self._inscriptis_config).get_text()
+            # Extract the text of the document.
+            text = CustomInscriptis(text_elm, self._inscriptis_config).get_text()
             
-            case 'application/pdf':
-                # Extract the text of the document from the PDF with OCR.
-                text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore)
+        elif 'application/pdf':
+            # Extract the text of the document from the PDF with OCR.
+            text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore)
             
-            case _:
-                raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp.type}.')
+        else:
+            raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp_type}.')
         
         # Return the document.
         return make_doc(
@@ -181,7 +187,7 @@ async def _get_doc(self, entry: Entry) -> Document | None:
             type=entry.type,
             jurisdiction=entry.jurisdiction,
             source=entry.source,
-            mime=resp.type,
+            mime=resp_type,
             date=entry.date,
             citation=entry.title,
             url=entry.request.path,