Merge pull request #848 from grossir/nymisc

feat(nymisc): Add scraper for nyfam, nycity, nycounty, nysupreme, nycciv, nyccrim, nysurrogate, nydistrict, nyjustice, nyctclaims
freelawproject · Jan 9, 2024 · c15d259 · c15d259
2 parents c23f27a + 87d9fcd
commit c15d259
Show file tree

Hide file tree

Showing 40 changed files with 23,466 additions and 227 deletions.
diff --git a/juriscraper/OpinionSite.py b/juriscraper/OpinionSite.py
@@ -30,6 +30,7 @@ def __init__(self, *args, **kwargs):
             "parallel_citations",
             "summaries",
             "case_name_shorts",
+            "child_courts",
         ]
         self._req_attrs = [
             "case_dates",
@@ -108,6 +109,9 @@ def _get_precedential_statuses(self):
     def _get_summaries(self):
         return None
 
+    def _get_child_courts(self):
+        return None
+
     def extract_from_text(self, scraped_text):
         """Pass scraped text into function and return data as a dictionary
 

diff --git a/juriscraper/OpinionSiteLinear.py b/juriscraper/OpinionSiteLinear.py
@@ -73,3 +73,6 @@ def _get_summaries(self):
 
     def _get_lower_courts(self):
         return self._get_optional_field_by_id("lower_court")
+
+    def _get_child_courts(self):
+        return self._get_optional_field_by_id("child_court")
diff --git a/juriscraper/opinions/united_states/state/__init__.py b/juriscraper/opinions/united_states/state/__init__.py
@@ -113,7 +113,17 @@
     "nyappdiv_4th",
     "nyappterm_1st",
     "nyappterm_2nd",
+    "nysupct_commercial",
     "nysupct",
+    "nyfamct",
+    "nycityct",
+    "nycountyct",
+    "nycivct",
+    "nycrimct",
+    "nysurct",
+    "nydistct",
+    "nyjustct",
+    "nyclaimsct",
     "ohio",
     "ohioctapp_1",
     "ohioctapp_2",

diff --git a/juriscraper/opinions/united_states/state/nycityct.py b/juriscraper/opinions/united_states/state/nycityct.py
@@ -0,0 +1,7 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"City? (Ct|Court)"
+    # Most court names start with this pattern, but some do not (e.g.
+    # 'Utica City Ct' in Dec 2023) -- presumably why it is left unanchored
diff --git a/juriscraper/opinions/united_states/state/nycivct.py b/juriscraper/opinions/united_states/state/nycivct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"Civ(il)? C[our]*t|[HC]CIV|Hous Part"
diff --git a/juriscraper/opinions/united_states/state/nyclaimsct.py b/juriscraper/opinions/united_states/state/nyclaimsct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"^Ct Cl|C(our)?t( [Oo]f)? Cl(aims)?$"
diff --git a/juriscraper/opinions/united_states/state/nycountyct.py b/juriscraper/opinions/united_states/state/nycountyct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"^County|(Co(unty?)? Ct)"
diff --git a/juriscraper/opinions/united_states/state/nycrimct.py b/juriscraper/opinions/united_states/state/nycrimct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"Cri?m(inal)? C[our]*t"
diff --git a/juriscraper/opinions/united_states/state/nydistct.py b/juriscraper/opinions/united_states/state/nydistct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"Dist\.?(ric[tk])? C(our)?t"
diff --git a/juriscraper/opinions/united_states/state/nyfamct.py b/juriscraper/opinions/united_states/state/nyfamct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"Fam Ct|Family Court|Youth Part"
diff --git a/juriscraper/opinions/united_states/state/nyjustct.py b/juriscraper/opinions/united_states/state/nyjustct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"(Just|Village|Town) Ct|Just(ice)? Cour+t"
diff --git a/juriscraper/opinions/united_states/state/nysupct.py b/juriscraper/opinions/united_states/state/nysupct.py
@@ -1,12 +1,5 @@
-# Scraper and Back Scraper for New York Commercial Division
-# CourtID: nysupct
-# Court Short Name: NY
+from juriscraper.opinions.united_states.state import nytrial
 
-from juriscraper.opinions.united_states.state import nyappterm_1st
 
-
-class Site(nyappterm_1st.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.court = "Commercial Division"
-        self.parameters.update({"court": self.court})
+class Site(nytrial.Site):
+    court_regex = r"Sup[rt]?\.? ?[Cc]o?u?r?t?|[sS]ur?pu?rem?e? C(our)?t|Sur?pe?r?me?|Suoreme|Sup County|Integrated Domestic Violence|Soho Fashions LTD"
diff --git a/juriscraper/opinions/united_states/state/nysupct_commercial.py b/juriscraper/opinions/united_states/state/nysupct_commercial.py
@@ -0,0 +1,15 @@
+"""Scraper and Back Scraper for New York Commercial Division
+CourtID: nysupct_commercial
+Court Short Name: NY
+History:
+ - 2024-01-05, grossir: modified to use nytrial template
+"""
+from datetime import date
+
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    base_url = "https://nycourts.gov/reporter/slipidx/com_div_idxtable.shtml"
+    court_regex = r".*"
+    first_opinion_date = date(2013, 7, 1)
diff --git a/juriscraper/opinions/united_states/state/nysurct.py b/juriscraper/opinions/united_states/state/nysurct.py
@@ -0,0 +1,5 @@
+from juriscraper.opinions.united_states.state import nytrial
+
+
+class Site(nytrial.Site):
+    court_regex = r"Sur{1,}oa?gate|Sur[.r]* Ct"
diff --git a/juriscraper/opinions/united_states/state/nytrial.py b/juriscraper/opinions/united_states/state/nytrial.py
@@ -0,0 +1,170 @@
+"""
+Scraper template for the 'Other Courts' of the NY Reporter
+Court Contact: phone: (518) 453-6900
+Author: Gianfranco Rossi
+History:
+ - 2024-01-05, grossir: created
+"""
+import re
+from datetime import date
+from itertools import chain
+from typing import Any, Dict, List, Optional
+
+from dateutil.rrule import MONTHLY, rrule
+from lxml.html import fromstring
+
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.judge_parsers import normalize_judge_string
+from juriscraper.lib.string_utils import harmonize
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+
+
+class Site(OpinionSiteLinear):
+    court_regex: str  # to be defined on inheriting classes
+    base_url = "https://nycourts.gov/reporter/slipidx/miscolo.shtml"
+    first_opinion_date = date(2003, 12, 1)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.court_id = self.__module__
+        self.url = self.build_url()
+
+        date_keys = rrule(
+            MONTHLY, dtstart=self.first_opinion_date, until=date(2023, 12, 30)
+        )
+        self.back_scrape_iterable = [i.date() for i in date_keys]
+
+    def build_url(self, target_date: Optional[date] = None) -> str:
+        """Build the index URL; `base_url` as is loads the most recent month
+        There is a URL for each month of each year back to Dec 2003
+
+        :param target_date: used to extract month and year for backscraping
+        :returns str: `base_url`, or its "_{year}_{MonthName}.shtml" variant
+        """
+        if not target_date:
+            return self.base_url
+
+        end = f"_{target_date.year}_{target_date.strftime('%B')}.shtml"
+
+        return self.base_url.replace(".shtml", end)
+
+    def is_court_of_interest(self, court: str) -> bool:
+        """'Other Courts' of NY Reporter consists of 10 different
+        families of sources. Each family has a scraper that inherits
+        from this class and defines a `court_regex` to capture those
+        that belong to its family
+
+        For example
+        "Civ Ct City NY, Queens County" and "Civ Ct City NY, NY County"
+        belong to nycivct family
+
+        :param court: court name
+        :return: true if `court_regex` is found anywhere in the court name
+                (`re.search`, so not only at the start)
+        """
+        return bool(re.search(self.court_regex, court))
+
+    def _process_html(self) -> None:
+        """Collects table rows whose court name matches `court_regex`
+
+        :return: None; one dict per matching row is appended to self.cases
+        """
+        row_xpath = "//table[caption]//tr[position()>1 and td]"
+        for row in self.html.xpath(row_xpath):
+            court = re.sub(
+                r"\s+", " ", row.xpath("td[2]")[0].text_content()
+            ).strip(", ")
+
+            if not self.is_court_of_interest(court):
+                logger.debug("Skipping %s", court)
+                continue
+
+            url = row.xpath("td[1]/a/@href")[0]
+            name = harmonize(row.xpath("td[1]/a")[0].text_content())
+            opinion_date = row.xpath("td[3]")[0].text_content()
+            slip_cite = row.xpath("td[4]")[0].text_content()
+            status = "Unpublished" if "(U)" in slip_cite else "Published"
+
+            self.cases.append(
+                {
+                    "name": name,
+                    "date": opinion_date,
+                    "status": status,
+                    "url": url,
+                    "citation": slip_cite,
+                    "child_court": court,
+                }
+            )
+
+    def _get_docket_numbers(self) -> List[str]:
+        """Override of the OpinionSiteLinear getter: docket numbers are
+        required, but the HTML index does not contain them
+
+        They are filled in later, on courtlistener's extract_from_text stage
+
+        :return: list with one empty string per entry in self.cases
+        """
+        return [""] * len(self.cases)
+
+    def _download_backwards(self, target_date: date) -> None:
+        """Method used by backscraper to download historical records
+
+        :param target_date: an element of self.back_scrape_iterable
+        :return: None
+        """
+        self.url = self.build_url(target_date)
+
+    def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
+        """Extract judge, docket number, citation and full case name
+
+        :param scraped_text: pdf or html string contents
+        :return: dict where keys match courtlistener model objects
+        """
+        pattern = r"Judge:\s?(.+)|([\w .,]+), [JS]\.\s"
+        judge = self.match(scraped_text, pattern)
+
+        pattern = r"</table><br><br\s?/?>\s?(.*)\r?\n|Docket Number:\s?(.+)"
+        docket_number = self.match(scraped_text, pattern)
+
+        pattern = r"\[(?P<volume>\d+) (?P<reporter>Misc 3d) (?P<page>.+)\]"
+        cite_match = re.search(pattern, scraped_text[:2000])
+
+        # Only for .htm links; the full case name is read from the 2nd table
+        full_case = None
+        if scraped_text.find("<table") != -1:
+            # replace <br> with newlines because text_content() replaces <br>
+            # with whitespace. If not, case names would lack proper separation
+            scraped_text = scraped_text.replace("<br>", "\n")
+            tables = fromstring(scraped_text).xpath("//table")
+            full_case = tables[1].text_content() if len(tables) > 1 else ""
+
+        metadata = {
+            "Docket": {"docket_number": docket_number},
+        }
+
+        if judge:
+            metadata["Opinion"] = {
+                "author_str": normalize_judge_string(judge)[0]
+            }
+        if cite_match:
+            metadata["Citation"] = cite_match.groupdict("")
+        if full_case:
+            full_case = harmonize(full_case)
+            metadata["Docket"]["case_name_full"] = full_case
+            metadata["OpinionCluster"] = {"case_name_full": full_case}
+
+        return metadata
+
+    @staticmethod
+    def match(scraped_text: str, pattern: str) -> str:
+        """Returns first non-empty group, or the whole match if no groups
+
+        :param scraped_text: HTML or PDF string content
+        :param pattern: regex string
+
+        :returns: first match, whitespace-stripped; "" when nothing matches
+        """
+        hits = re.finditer(pattern, scraped_text)
+        groups = chain.from_iterable(m.groups() or (m[0],) for m in hits)
+        return next(filter(None, groups), "").strip()
diff --git a/tests/examples/opinions/united_states/nycityct_example.compare.json b/tests/examples/opinions/united_states/nycityct_example.compare.json
@@ -0,0 +1,50 @@
+[
+  {
+    "case_dates": "2023-12-11",
+    "case_names": "Morel v. Aviles",
+    "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_51355.htm",
+    "precedential_statuses": "Unpublished",
+    "blocked_statuses": false,
+    "date_filed_is_approximate": false,
+    "docket_numbers": "",
+    "citations": "2023 NY Slip Op 51355(U)",
+    "case_name_shorts": "Morel",
+    "child_courts": "City Ct Middletown, Orange County"
+  },
+  {
+    "case_dates": "2023-12-08",
+    "case_names": "Cooley v. Vanslyke",
+    "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23382.htm",
+    "precedential_statuses": "Published",
+    "blocked_statuses": false,
+    "date_filed_is_approximate": false,
+    "docket_numbers": "",
+    "citations": "2023 NY Slip Op 23382",
+    "case_name_shorts": "Cooley",
+    "child_courts": "City Ct Little Falls, Herkimer County"
+  },
+  {
+    "case_dates": "2023-12-07",
+    "case_names": "Hughes v. Qingling Zhao",
+    "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23383.htm",
+    "precedential_statuses": "Published",
+    "blocked_statuses": false,
+    "date_filed_is_approximate": false,
+    "docket_numbers": "",
+    "citations": "2023 NY Slip Op 23383",
+    "case_name_shorts": "Hughes",
+    "child_courts": "City Ct Long Beach, Nassau County"
+  },
+  {
+    "case_dates": "2023-12-01",
+    "case_names": "Potentia Mgt. Group, LLC v. D.W.",
+    "download_urls": "https://nycourts.gov/reporter/3dseries/2023/2023_23374.htm",
+    "precedential_statuses": "Published",
+    "blocked_statuses": false,
+    "date_filed_is_approximate": false,
+    "docket_numbers": "",
+    "citations": "2023 NY Slip Op 23374",
+    "case_name_shorts": "D.W.",
+    "child_courts": "Utica City Ct"
+  }
+]