From 5c316deadc4af9b75790ffdcd7a13753d6c628ab Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 5 Aug 2024 12:30:19 +1000 Subject: [PATCH] Fixed bug with NSW Legislation database. --- CHANGELOG.md | 4 ++ README.md | 2 +- pyproject.toml | 3 +- src/oalc_creator/scrapers/nsw_legislation.py | 60 +++++++++++--------- 4 files changed, 40 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02dec9f..002a6fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ ## Changelog 🔄 All notable changes to the Open Australian Legal Corpus Creator will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.0.3] - 2024-08-05 +### Fixed +- Fixed a bug preventing the scraping of documents from the NSW Legislation database that are stored as PDFs but are reported by the database's web server as being HTML files. + ## [3.0.2] - 2024-08-04 ### Fixed - Fixed a bug that caused only the first volume of multivolume documents on the Federal Register of Legislation available in a HTML format to be scraped instead of all volumes. diff --git a/README.md b/README.md index 4c1ab4e..1d2b999 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Open Australian Legal Corpus Creator - + The [Open Australian Legal Corpus](https://huggingface.co/datasets/umarbutler/open-australian-legal-corpus) is the first and only multijurisdictional open corpus of Australian legislative and judicial documents. This repository contains the code used to create and update the Corpus. 
diff --git a/pyproject.toml b/pyproject.toml index 09859d6..88f6bf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "oalc-creator" -version = "3.0.2" +version = "3.0.3" authors = [ {name="Umar Butler", email="umar@umar.au"}, ] @@ -66,6 +66,7 @@ dependencies = [ "uvloop; platform_system != 'Windows'", "tesserocr", "pypdfium2", + "pillow", ] [project.urls] diff --git a/src/oalc_creator/scrapers/nsw_legislation.py b/src/oalc_creator/scrapers/nsw_legislation.py index fba41e6..d585f12 100644 --- a/src/oalc_creator/scrapers/nsw_legislation.py +++ b/src/oalc_creator/scrapers/nsw_legislation.py @@ -143,37 +143,43 @@ async def _get_doc(self, entry: Entry) -> Document | None: return - match resp.type: - case 'text/html': - # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031). - if 'No fragments found.' in resp.text: - warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. Returning `None`.") - return - - # Create an etree from the response. + resp_type = resp.type + + if resp_type == 'text/html': + # If the response contains the substring 'No fragments found.', then return `None` as there is a bug in the NSW Legislation database preventing the retrieval of certain documents (see, eg, https://legislation.nsw.gov.au/view/whole/html/inforce/2021-03-25/act-1944-031). + if 'No fragments found.' in resp.text: + warning(f"Unable to retrieve document from {entry.request.path}. 'No fragments found.' encountered in the response, indicating that the document is missing from the NSW Legislation database. 
Returning `None`.") + return + + # Create an etree from the response if a UnicodeDecodeError is not encountered otherwise assume that the document is a PDF. + try: etree = lxml.html.fromstring(resp.text) - - # Select the element containing the text of the document. - text_elm = etree.xpath('//div[@id="frag-col"]')[0] - - # Remove the toolbar. - text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree() - - # Remove the search results (they are supposed to be hidden by Javascript). - text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree() + + except UnicodeDecodeError: + resp_type = 'application/pdf' + + if resp_type == 'text/html': + # Select the element containing the text of the document. + text_elm = etree.xpath('//div[@id="frag-col"]')[0] + + # Remove the toolbar. + text_elm.xpath('//div[@id="fragToolbar"]')[0].drop_tree() + + # Remove the search results (they are supposed to be hidden by Javascript). + text_elm.xpath('//div[@class="nav-result display-none"]')[0].drop_tree() - # Remove footnotes (they are supposed to be hidden by Javascript). - for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree() + # Remove footnotes (they are supposed to be hidden by Javascript). + for elm in text_elm.xpath("//*[contains(concat(' ', normalize-space(@class), ' '), ' view-history-note ')]"): elm.drop_tree() - # Extract the text of the document. - text = CustomInscriptis(text_elm, self._inscriptis_config).get_text() + # Extract the text of the document. + text = CustomInscriptis(text_elm, self._inscriptis_config).get_text() - case 'application/pdf': - # Extract the text of the document from the PDF with OCR. - text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore) + elif resp_type == 'application/pdf': + # Extract the text of the document from the PDF with OCR. 
+ text = await pdf2txt(resp.stream, self.ocr_batch_size, self.thread_pool_executor, self.ocr_semaphore) - case _: - raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp.type}.') + else: + raise ValueError(f'Unable to retrieve document from {entry.request.path}. Invalid content type: {resp_type}.') # Return the document. return make_doc( @@ -181,7 +187,7 @@ async def _get_doc(self, entry: Entry) -> Document | None: type=entry.type, jurisdiction=entry.jurisdiction, source=entry.source, - mime=resp.type, + mime=resp_type, date=entry.date, citation=entry.title, url=entry.request.path,