Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ScraperExtractFromText): add fail case testing #1290

Merged
merged 2 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions juriscraper/opinions/united_states/administrative_agency/bia.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from datetime import datetime
from typing import Any, Dict

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -70,8 +71,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
date = re.findall(
r"Decided (by (Acting\s)?Attorney General )?(.*\d{4})",
scraped_text,
)[0][-1]
date_filed = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
)
if not date:
logger.error("bia: unable to extract_from_text a date_filed")
return {}

date_filed = datetime.strptime(date[0][-1], "%B %d, %Y").strftime(
"%Y-%m-%d"
)
metadata = {
"OpinionCluster": {
"date_filed": date_filed,
Expand Down
7 changes: 4 additions & 3 deletions juriscraper/opinions/united_states/federal_bankruptcy/bap1.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,10 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""
months = "|".join(calendar.month_name[1:])
date_pattern = re.compile(rf"({months})\s+\d{{1,2}}\s?,?\s+\d{{4}}")
match = re.search(date_pattern, scraped_text)
date_extracted = match.group(0) if match else ""
date_filed = re.sub(r"\s+", " ", date_extracted).strip()
if match := re.search(date_pattern, scraped_text):
date_filed = re.sub(r"\s+", " ", match.group(0)).strip()
else:
return {}

metadata = {
"OpinionCluster": {
Expand Down
8 changes: 6 additions & 2 deletions juriscraper/opinions/united_states/state/nm.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
:param scraped_text: Text of scraped content
:return: metadata
"""
docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)[0]
docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)
if not docket_number:
logger.error("nm: unable to extract_from_text a docket_number")
return {}

metadata = {
"OpinionCluster": {
"docket_number": docket_number,
"docket_number": docket_number[0],
},
}
return metadata
9 changes: 6 additions & 3 deletions juriscraper/opinions/united_states/state/sd.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,15 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""

# The docket number appears to be the first text on the page.
# So I crop the text to avoid any confusion that might occur in the
# So we crop the text to avoid any confusion that might occur in the
# body of an opinion.
docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])[0]
docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])
if not docket:
return {}

metadata = {
"Docket": {
"docket_number": docket,
"docket_number": docket[0],
},
}
return metadata
12 changes: 12 additions & 0 deletions tests/local/test_ScraperExtractFromTextTest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import logging
import unittest

from juriscraper.lib.importer import build_module_list
Expand Down Expand Up @@ -752,16 +753,27 @@ class ScraperExtractFromText(unittest.TestCase):

def test_extract_from_text(self):
"""Test that extract_from_text returns the expected data."""
# prevent logger.error calls to be triggered
logging.disable(logging.CRITICAL)
for module_string, test_cases in self.test_data.items():
package, module = module_string.rsplit(".", 1)
mod = __import__(
f"{package}.{module}", globals(), locals(), [module]
)
site = mod.Site()

# ensure that if no data is parsed, a dict is returned
# also, this ensures that there are no uncontrolled exceptions
self.assertTrue(
isinstance(
site.extract_from_text("Lorem ipsum dolorem..."), dict
)
)
for test_case in test_cases:
self.assertEqual(
site.extract_from_text(test_case[0]), test_case[1]
)
logging.disable(logging.NOTSET)

def test_extract_from_text_properly_implemented(self):
"""Ensure that extract_from_text is properly implemented."""
Expand Down
Loading