Merge pull request #848 from grossir/nymisc
feat(nymisc): Add scraper for nyfam, nycity, nycounty, nysupreme, nycciv, nyccrim, nysurrogate, nydistrict, nyjustice, nyctclaims
Showing 40 changed files with 23,466 additions and 227 deletions.
@@ -0,0 +1,7 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"City? (Ct|Court)"
    # Most start with the regex, but there are special cases
    # such as 'Utica City Ct' in Dec 2023
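As an aside (not part of the diff): the shared nytrial template, shown further down, filters index rows by running re.search with this court_regex against the child-court string, so the pattern only needs to appear somewhere in the name. A minimal illustrative check, using two court strings from the test data at the end of this diff plus one hypothetical non-matching one:

import re

court_regex = r"City? (Ct|Court)"  # same pattern as the scraper above

courts = [
    "City Ct Middletown, Orange County",  # typical form, from the example data
    "Utica City Ct",                      # the Dec 2023 special case noted in the comment
    "Fam Ct Bronx County",                # hypothetical non-city court
]
for court in courts:
    print(court, "->", bool(re.search(court_regex, court)))
# City Ct Middletown, Orange County -> True
# Utica City Ct -> True
# Fam Ct Bronx County -> False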
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"Civ(il)? C[our]*t|[HC]CIV|Hous Part"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"^Ct Cl|C(our)?t( [Oo]f)? Cl(aims)?$"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"^County|(Co(unty?)? Ct)"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"Cri?m(inal)? C[our]*t"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"Dist\.?(ric[tk])? C(our)?t"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"Fam Ct|Family Court|Youth Part"
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"(Just|Village|Town) Ct|Just(ice)? Cour+t"
@@ -1,12 +1,5 @@
-# Scraper and Back Scraper for New York Commercial Division
-# CourtID: nysupct
-# Court Short Name: NY
-from juriscraper.opinions.united_states.state import nyappterm_1st
+from juriscraper.opinions.united_states.state import nytrial


-class Site(nyappterm_1st.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.court = "Commercial Division"
-        self.parameters.update({"court": self.court})
+class Site(nytrial.Site):
+    court_regex = r"Sup[rt]?\.? ?[Cc]o?u?r?t?|[sS]ur?pu?rem?e? C(our)?t|Sur?pe?r?me?|Suoreme|Sup County|Integrated Domestic Violence|Soho Fashions LTD"
juriscraper/opinions/united_states/state/nysupct_commercial.py: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
"""Scraper and Back Scraper for New York Commercial Division
CourtID: nysupct_commercial
Court Short Name: NY
History:
 - 2024-01-05, grossir: modified to use nytrial template
"""
from datetime import date

from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    base_url = "https://nycourts.gov/reporter/slipidx/com_div_idxtable.shtml"
    court_regex = r".*"
    first_opinion_date = date(2013, 7, 1)
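For orientation (not part of the diff), this is roughly what the inherited nytrial template, shown further down, does with these three class attributes: court_regex = r".*" keeps every row of the Commercial Division index, while base_url and first_opinion_date drive the monthly backscrape URLs.

from datetime import date

from dateutil.rrule import MONTHLY, rrule

base_url = "https://nycourts.gov/reporter/slipidx/com_div_idxtable.shtml"
first_opinion_date = date(2013, 7, 1)

# One backscrape target per month, from July 2013 through December 2023
months = [d.date() for d in rrule(MONTHLY, dtstart=first_opinion_date, until=date(2023, 12, 30))]

# Each target becomes a month-specific index URL (see build_url in the template below);
# month name comes from strftime('%B'), English locale assumed
first = months[0]
print(len(months), base_url.replace(".shtml", f"_{first.year}_{first.strftime('%B')}.shtml"))
# 126 https://nycourts.gov/reporter/slipidx/com_div_idxtable_2013_July.shtml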
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    court_regex = r"Sur{1,}oa?gate|Sur[.r]* Ct"
@@ -0,0 +1,170 @@
"""
Scraper template for the 'Other Courts' of the NY Reporter
Court Contact: phone: (518) 453-6900
Author: Gianfranco Rossi
History:
 - 2024-01-05, grossir: created
"""
import re
from datetime import date
from itertools import chain
from typing import Any, Dict, List, Optional

from dateutil.rrule import MONTHLY, rrule
from lxml.html import fromstring

from juriscraper.AbstractSite import logger
from juriscraper.lib.judge_parsers import normalize_judge_string
from juriscraper.lib.string_utils import harmonize
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    court_regex: str  # to be defined on inheriting classes
    base_url = "https://nycourts.gov/reporter/slipidx/miscolo.shtml"
    first_opinion_date = date(2003, 12, 1)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.court_id = self.__module__
        self.url = self.build_url()

        date_keys = rrule(
            MONTHLY, dtstart=self.first_opinion_date, until=date(2023, 12, 30)
        )
        self.back_scrape_iterable = [i.date() for i in date_keys]

    def build_url(self, target_date: Optional[date] = None) -> str:
        """The URL as is loads the most recent month's page.
        There is a URL for each month of each year back to Dec 2003

        :param target_date: used to extract month and year for backscraping
        :returns str: formatted url
        """
        if not target_date:
            return self.base_url

        end = f"_{target_date.year}_{target_date.strftime('%B')}.shtml"

        return self.base_url.replace(".shtml", end)

    def is_court_of_interest(self, court: str) -> bool:
        """The 'Other Courts' of the NY Reporter consist of 10 different
        families of sources. Each family has a scraper that inherits
        from this class and defines a `court_regex` to capture the courts
        that belong to its family

        For example,
        "Civ Ct City NY, Queens County" and "Civ Ct City NY, NY County"
        belong to the nycivct family

        :param court: court name
        :return: true if the court name matches the
                 family of courts of the calling scraper
        """
        return bool(re.search(self.court_regex, court))

    def _process_html(self) -> None:
        """Parses a page's HTML into opinion dictionaries

        :return: None
        """
        row_xpath = "//table[caption]//tr[position()>1 and td]"
        for row in self.html.xpath(row_xpath):
            court = re.sub(
                r"\s+", " ", row.xpath("td[2]")[0].text_content()
            ).strip(", ")

            if not self.is_court_of_interest(court):
                logger.debug("Skipping %s", court)
                continue

            url = row.xpath("td[1]/a/@href")[0]
            name = harmonize(row.xpath("td[1]/a")[0].text_content())
            opinion_date = row.xpath("td[3]")[0].text_content()
            slip_cite = row.xpath("td[4]")[0].text_content()
            status = "Unpublished" if "(U)" in slip_cite else "Published"

            self.cases.append(
                {
                    "name": name,
                    "date": opinion_date,
                    "status": status,
                    "url": url,
                    "citation": slip_cite,
                    "child_court": court,
                }
            )

    def _get_docket_numbers(self) -> List[str]:
        """Overrides OpinionSiteLinear, since docket numbers are
        not in the HTML and they are required.
        We will get them at the extract_from_text stage on CourtListener

        :return: list of empty strings
        """
        return ["" for _ in self.cases]

    def _download_backwards(self, target_date: date) -> None:
        """Method used by the backscraper to download historical records

        :param target_date: an element of self.back_scrape_iterable
        :return: None
        """
        self.url = self.build_url(target_date)

    def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
        """Extract values from the opinion's text

        :param scraped_text: pdf or html string contents
        :return: dict whose keys match CourtListener model objects
        """
        pattern = r"Judge:\s?(.+)|([\w .,]+), [JS]\.\s"
        judge = self.match(scraped_text, pattern)

        pattern = r"</table><br><br\s?/?>\s?(.*)\r?\n|Docket Number:\s?(.+)"
        docket_number = self.match(scraped_text, pattern)

        pattern = r"\[(?P<volume>\d+) (?P<reporter>Misc 3d) (?P<page>.+)\]"
        cite_match = re.search(pattern, scraped_text[:2000])

        # Only for .htm links
        full_case = None
        if scraped_text.find("<table") != -1:
            # replace <br> with newlines because text_content() replaces <br>
            # with whitespace. If not, case names would lack proper separation
            scraped_text = scraped_text.replace("<br>", "\n")
            full_case = fromstring(scraped_text).xpath("//table")
            full_case = full_case[1].text_content() if full_case else ""

        metadata = {
            "Docket": {"docket_number": docket_number},
        }

        if judge:
            metadata["Opinion"] = {
                "author_str": normalize_judge_string(judge)[0]
            }
        if cite_match:
            metadata["Citation"] = cite_match.groupdict("")
        if full_case:
            full_case = harmonize(full_case)
            metadata["Docket"]["case_name_full"] = full_case
            metadata["OpinionCluster"] = {"case_name_full": full_case}

        return metadata

    @staticmethod
    def match(scraped_text: str, pattern: str) -> str:
        """Returns the first match

        :param scraped_text: HTML or PDF string content
        :param pattern: regex string
        :returns: first match
        """
        m = re.findall(pattern, scraped_text)
        r = list(filter(None, chain.from_iterable(m)))
        return r[0].strip() if r else ""
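A small illustration (not from the diff) of the citation pattern used in extract_from_text above; the snippet of opinion text is invented, and only the bracketed "[... Misc 3d ...]" portion mirrors what the regex expects:

import re

pattern = r"\[(?P<volume>\d+) (?P<reporter>Misc 3d) (?P<page>.+)\]"

# Hypothetical header text; the cite value is made up for illustration
snippet = "Cooley v Vanslyke [81 Misc 3d 1215(A)] Decided on December 8, 2023"
cite_match = re.search(pattern, snippet[:2000])
if cite_match:
    print(cite_match.groupdict(""))
# {'volume': '81', 'reporter': 'Misc 3d', 'page': '1215(A)'}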
tests/examples/opinions/united_states/nycityct_example.compare.json: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
[
    {
        "case_dates": "2023-12-11",
        "case_names": "Morel v. Aviles",
        "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_51355.htm",
        "precedential_statuses": "Unpublished",
        "blocked_statuses": false,
        "date_filed_is_approximate": false,
        "docket_numbers": "",
        "citations": "2023 NY Slip Op 51355(U)",
        "case_name_shorts": "Morel",
        "child_courts": "City Ct Middletown, Orange County"
    },
    {
        "case_dates": "2023-12-08",
        "case_names": "Cooley v. Vanslyke",
        "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23382.htm",
        "precedential_statuses": "Published",
        "blocked_statuses": false,
        "date_filed_is_approximate": false,
        "docket_numbers": "",
        "citations": "2023 NY Slip Op 23382",
        "case_name_shorts": "Cooley",
        "child_courts": "City Ct Little Falls, Herkimer County"
    },
    {
        "case_dates": "2023-12-07",
        "case_names": "Hughes v. Qingling Zhao",
        "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23383.htm",
        "precedential_statuses": "Published",
        "blocked_statuses": false,
        "date_filed_is_approximate": false,
        "docket_numbers": "",
        "citations": "2023 NY Slip Op 23383",
        "case_name_shorts": "Hughes",
        "child_courts": "City Ct Long Beach, Nassau County"
    },
    {
        "case_dates": "2023-12-01",
        "case_names": "Potentia Mgt. Group, LLC v. D.W.",
        "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23374.htm",
        "precedential_statuses": "Published",
        "blocked_statuses": false,
        "date_filed_is_approximate": false,
        "docket_numbers": "",
        "citations": "2023 NY Slip Op 23374",
        "case_name_shorts": "D.W.",
        "child_courts": "Utica City Ct"
    }
]