Skip to content

Commit

Permalink
Merge pull request #848 from grossir/nymisc
Browse files Browse the repository at this point in the history
feat(nymisc): Add scraper for nyfam, nycity, nycounty, nysupreme, nycciv, nyccrim, nysurrogate, nydistrict, nyjustice, nyctclaims
  • Loading branch information
flooie authored Jan 9, 2024
2 parents c23f27a + 87d9fcd commit c15d259
Show file tree
Hide file tree
Showing 40 changed files with 23,466 additions and 227 deletions.
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def __init__(self, *args, **kwargs):
"parallel_citations",
"summaries",
"case_name_shorts",
"child_courts",
]
self._req_attrs = [
"case_dates",
Expand Down Expand Up @@ -108,6 +109,9 @@ def _get_precedential_statuses(self):
def _get_summaries(self):
    """Default getter for the `summaries` attribute.

    :return: always None in this base implementation
    """
    return None

def _get_child_courts(self):
    """Default getter for the `child_courts` attribute.

    :return: always None in this base implementation
    """
    return None

def extract_from_text(self, scraped_text):
"""Pass scraped text into function and return data as a dictionary
Expand Down
3 changes: 3 additions & 0 deletions juriscraper/OpinionSiteLinear.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,6 @@ def _get_summaries(self):

def _get_lower_courts(self):
    """Collect the optional "lower_court" field.

    :return: whatever `_get_optional_field_by_id` yields for that key
    """
    lower_courts = self._get_optional_field_by_id("lower_court")
    return lower_courts

def _get_child_courts(self):
    """Collect the optional "child_court" field.

    :return: whatever `_get_optional_field_by_id` yields for that key
    """
    child_courts = self._get_optional_field_by_id("child_court")
    return child_courts
10 changes: 10 additions & 0 deletions juriscraper/opinions/united_states/state/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,17 @@
"nyappdiv_4th",
"nyappterm_1st",
"nyappterm_2nd",
"nysupct_commercial",
"nysupct",
"nyfamct",
"nycityct",
"nycountyct",
"nycivct",
"nycrimct",
"nysurct",
"nydistct",
"nyjustct",
"nyclaimsct",
"ohio",
"ohioctapp_1",
"ohioctapp_2",
Expand Down
7 changes: 7 additions & 0 deletions juriscraper/opinions/united_states/state/nycityct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY city courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # Not anchored with ^: most court names start with this pattern, but
    # there are special cases such as 'Utica City Ct' in Dec 2023.
    # NOTE(review): `City?` makes the final "y" optional, so "Cit Ct" also
    # matches -- confirm this is intended.
    court_regex = r"City? (Ct|Court)"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nycivct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY civil courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`), e.g.
    "Civ Ct City NY, Queens County".
    """

    # Alternatives: "Civ"/"Civil" followed by "C<any of o,u,r>t"
    # (e.g. "Ct", "Court"), the codes "HCIV"/"CCIV", or "Hous Part"
    court_regex = r"Civ(il)? C[our]*t|[HC]CIV|Hous Part"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nyclaimsct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY Court of Claims scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # Two alternatives with different anchors: a name that starts with
    # "Ct Cl", or a name that ends with "Ct Cl" / "Court of Claims" and
    # similar spellings ("of"/"Of" and "aims" are optional)
    court_regex = r"^Ct Cl|C(our)?t( [Oo]f)? Cl(aims)?$"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nycountyct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY county courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # Either the name starts with "County", or it contains
    # "Co Ct", "Count Ct" or "County Ct" anywhere
    court_regex = r"^County|(Co(unty?)? Ct)"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nycrimct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY criminal courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # "Crm"/"Crim"/"Crminal"/"Criminal" followed by "C<any of o,u,r>t"
    # (e.g. "Ct", "Court")
    court_regex = r"Cri?m(inal)? C[our]*t"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nydistct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY district courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # "Dist", optional ".", optional "rict"/"rick" (the latter presumably
    # covers a misspelling in the source data -- confirm), then "Ct"/"Court"
    court_regex = r"Dist\.?(ric[tk])? C(our)?t"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nyfamct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY family courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # Matches names containing "Fam Ct", "Family Court" or "Youth Part"
    court_regex = r"Fam Ct|Family Court|Youth Part"
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nyjustct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY justice courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # "Just Ct", "Village Ct" or "Town Ct", or "Just"/"Justice" followed
    # by "Court" (`r+` also accepts a doubled "r", e.g. "Courrt")
    court_regex = r"(Just|Village|Town) Ct|Just(ice)? Cour+t"
13 changes: 3 additions & 10 deletions juriscraper/opinions/united_states/state/nysupct.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
# Scraper and Back Scraper for New York Commercial Division
# CourtID: nysupct
# Court Short Name: NY
from juriscraper.opinions.united_states.state import nytrial

from juriscraper.opinions.united_states.state import nyappterm_1st


class Site(nyappterm_1st.Site):
    """Earlier nysupct implementation based on the nyappterm_1st scraper,
    with the court set to "Commercial Division".
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court = "Commercial Division"
        # `parameters` comes from the parent scraper (not visible here);
        # presumably the request parameters -- confirm against nyappterm_1st
        self.parameters.update({"court": self.court})
class Site(nytrial.Site):
    """NY Supreme Court scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # Besides "Sup Ct"/"Supreme Court" style abbreviations, the alternation
    # lists misspellings (e.g. "Suoreme") and literal values such as
    # "Integrated Domestic Violence" and "Soho Fashions LTD"; presumably
    # these mirror court-name values found in the source index -- confirm
    court_regex = r"Sup[rt]?\.? ?[Cc]o?u?r?t?|[sS]ur?pu?rem?e? C(our)?t|Sur?pe?r?me?|Suoreme|Sup County|Integrated Domestic Violence|Soho Fashions LTD"
15 changes: 15 additions & 0 deletions juriscraper/opinions/united_states/state/nysupct_commercial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Scraper and Back Scraper for New York Commercial Division
CourtID: nysupct_commercial
Court Short Name: NY
History:
- 2024-01-05, grossir: modified to use nytrial template
"""
from datetime import date

from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY Commercial Division scraper built on the nytrial template."""

    # Commercial Division index page, replacing the template's default URL
    base_url = "https://nycourts.gov/reporter/slipidx/com_div_idxtable.shtml"
    # Matches any court name, so no index row is skipped because of its court
    court_regex = r".*"
    # First month of the backscrape range built by the template
    first_opinion_date = date(2013, 7, 1)
5 changes: 5 additions & 0 deletions juriscraper/opinions/united_states/state/nysurct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from juriscraper.opinions.united_states.state import nytrial


class Site(nytrial.Site):
    """NY surrogate's courts scraper built on the nytrial template.

    The template keeps only the index rows whose court name matches
    `court_regex` (checked with `re.search`).
    """

    # "Sur" + extra "r"s allowed + "ogate"/"oagate" (tolerates spelling
    # variants of "Surrogate"), or "Sur" followed by any mix of "." and "r"
    # and then " Ct" (e.g. "Sur Ct", "Surr Ct", "Sur. Ct")
    court_regex = r"Sur{1,}oa?gate|Sur[.r]* Ct"
170 changes: 170 additions & 0 deletions juriscraper/opinions/united_states/state/nytrial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""
Scraper template for the 'Other Courts' of the NY Reporter
Court Contact: phone: (518) 453-6900
Author: Gianfranco Rossi
History:
- 2024-01-05, grossir: created
"""
import re
from datetime import date
from itertools import chain
from typing import Any, Dict, List, Optional

from dateutil.rrule import MONTHLY, rrule
from lxml.html import fromstring

from juriscraper.AbstractSite import logger
from juriscraper.lib.judge_parsers import normalize_judge_string
from juriscraper.lib.string_utils import harmonize
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """Scraper template for the 'Other Courts' index of the NY Reporter

    The index page lists opinions from several families of courts in a
    single table. Inheriting scrapers define `court_regex`; only the rows
    whose court name matches it are turned into cases.
    """

    court_regex: str  # to be defined on inheriting classes
    base_url = "https://nycourts.gov/reporter/slipidx/miscolo.shtml"
    first_opinion_date = date(2003, 12, 1)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.court_id = self.__module__
        self.url = self.build_url()

        # One date per month starting at `first_opinion_date`; each one is
        # later handed to `_download_backwards`.
        # NOTE(review): the upper bound is hard-coded to December 2023, so
        # later months never enter the backscrape range -- confirm intended.
        date_keys = rrule(
            MONTHLY, dtstart=self.first_opinion_date, until=date(2023, 12, 30)
        )
        self.back_scrape_iterable = [i.date() for i in date_keys]

    def build_url(self, target_date: Optional[date] = None) -> str:
        """URL as is loads most recent month page
        There is an URL for each month of each year back to Dec 2003

        The monthly URL is `base_url` with its ".shtml" suffix replaced by
        "_<year>_<full month name>.shtml".

        :param target_date: used to extract month and year for backscraping
        :returns str: formatted url
        """
        if not target_date:
            return self.base_url

        end = f"_{target_date.year}_{target_date.strftime('%B')}.shtml"

        return self.base_url.replace(".shtml", end)

    def is_court_of_interest(self, court: str) -> bool:
        """'Other Courts' of NY Reporter consists of 10 different
        family of sources. Each family has a scraper that inherits
        from this class and defines a `court_regex` to capture those
        that belong to its family

        For example
        "Civ Ct City NY, Queens County" and "Civ Ct City NY, NY County"
        belong to nycivct family

        :param court: court name
        :return: true if court name matches
                family of courts of calling scraper
        """
        return bool(re.search(self.court_regex, court))

    def _process_html(self) -> None:
        """Parses a page's HTML into opinion dictionaries

        Rows whose court (second cell) does not match `court_regex`
        are skipped.

        :return: None
        """
        row_xpath = "//table[caption]//tr[position()>1 and td]"
        for row in self.html.xpath(row_xpath):
            # Collapse whitespace runs, then trim stray commas and spaces
            court = re.sub(
                r"\s+", " ", row.xpath("td[2]")[0].text_content()
            ).strip(", ")

            if not self.is_court_of_interest(court):
                logger.debug("Skipping %s", court)
                continue

            url = row.xpath("td[1]/a/@href")[0]
            name = harmonize(row.xpath("td[1]/a")[0].text_content())
            opinion_date = row.xpath("td[3]")[0].text_content()
            slip_cite = row.xpath("td[4]")[0].text_content()
            # A "(U)" marker in the slip citation flags an unpublished opinion
            status = "Unpublished" if "(U)" in slip_cite else "Published"

            self.cases.append(
                {
                    "name": name,
                    "date": opinion_date,
                    "status": status,
                    "url": url,
                    "citation": slip_cite,
                    "child_court": court,
                }
            )

    def _get_docket_numbers(self) -> List[str]:
        """Overriding from OpinionSiteLinear, since docket numbers are
        not in the HTML and they are required

        We will get them on the extract_from_text stage on courtlistener

        :return: list of empty strings values
        """
        return ["" for _ in self.cases]

    def _download_backwards(self, target_date: date) -> None:
        """Method used by backscraper to download historical records

        Only points `self.url` at the page for the month of `target_date`.

        :param target_date: an element of self.back_scrape_iterable
        :return: None
        """
        self.url = self.build_url(target_date)

    def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
        """Extract values from opinion's text

        Always returns the docket number (possibly an empty string). The
        judge, citation and full case name are added only when found.

        :param scraped_text: pdf or html string contents
        :return: dict where keys match courtlistener model objects
        """
        pattern = r"Judge:\s?(.+)|([\w .,]+), [JS]\.\s"
        judge = self.match(scraped_text, pattern)

        pattern = r"</table><br><br\s?/?>\s?(.*)\r?\n|Docket Number:\s?(.+)"
        docket_number = self.match(scraped_text, pattern)

        # The citation is only looked for near the top of the document
        pattern = r"\[(?P<volume>\d+) (?P<reporter>Misc 3d) (?P<page>.+)\]"
        cite_match = re.search(pattern, scraped_text[:2000])

        # Only for .htm links
        full_case = None
        if "<table" in scraped_text:
            # replace <br> with newlines because text_content() replaces <br>
            # with whitespace. If not, case names would lack proper separation
            scraped_text = scraped_text.replace("<br>", "\n")
            tables = fromstring(scraped_text).xpath("//table")
            # The case name is read from the second table; documents with
            # fewer tables yield no full case name instead of an IndexError
            full_case = tables[1].text_content() if len(tables) > 1 else ""

        metadata = {
            "Docket": {"docket_number": docket_number},
        }

        if judge:
            metadata["Opinion"] = {
                "author_str": normalize_judge_string(judge)[0]
            }
        if cite_match:
            metadata["Citation"] = cite_match.groupdict("")
        if full_case:
            full_case = harmonize(full_case)
            metadata["Docket"]["case_name_full"] = full_case
            metadata["OpinionCluster"] = {"case_name_full": full_case}

        return metadata

    @staticmethod
    def match(scraped_text: str, pattern: str) -> str:
        """Returns first match

        Matches are scanned in order and, within each match, capture groups
        from left to right; the first non-empty value wins. Patterns
        without capture groups fall back to the whole matched text.

        :param scraped_text: HTML or PDF string content
        :param pattern: regex string
        :returns: first match, stripped; empty string if nothing matched
        """
        for found in re.finditer(pattern, scraped_text):
            # groups() is empty when the pattern defines no capture groups
            candidates = found.groups() or (found.group(0),)
            for value in candidates:
                if value:
                    return value.strip()
        return ""
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
[
{
"case_dates": "2023-12-11",
"case_names": "Morel v. Aviles",
"download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_51355.htm",
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "",
"citations": "2023 NY Slip Op 51355(U)",
"case_name_shorts": "Morel",
"child_courts": "City Ct Middletown, Orange County"
},
{
"case_dates": "2023-12-08",
"case_names": "Cooley v. Vanslyke",
"download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23382.htm",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "",
"citations": "2023 NY Slip Op 23382",
"case_name_shorts": "Cooley",
"child_courts": "City Ct Little Falls, Herkimer County"
},
{
"case_dates": "2023-12-07",
"case_names": "Hughes v. Qingling Zhao",
"download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23383.htm",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "",
"citations": "2023 NY Slip Op 23383",
"case_name_shorts": "Hughes",
"child_courts": "City Ct Long Beach, Nassau County"
},
{
"case_dates": "2023-12-01",
"case_names": "Potentia Mgt. Group, LLC v. D.W.",
"download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23374.htm",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "",
"citations": "2023 NY Slip Op 23374",
"case_name_shorts": "D.W.",
"child_courts": "Utica City Ct"
}
]
Loading

0 comments on commit c15d259

Please sign in to comment.