diff --git a/clean/ca/ventura_county_sheriff.py b/clean/ca/ventura_county_sheriff.py new file mode 100644 index 0000000..27622e1 --- /dev/null +++ b/clean/ca/ventura_county_sheriff.py @@ -0,0 +1,219 @@ +import logging +import time +import urllib.parse +from pathlib import Path +from typing import List + +from bs4 import BeautifulSoup + +from .. import utils +from ..cache import Cache +from ..utils import MetadataDict + +logger = logging.getLogger(__name__) + + +class Site: + """Scrape file metadata and download files for the Ventura County Sheriff for SB1421/AB748 data. + + Attributes: + name (str): The official name of the agency + """ + + name = "Ventura County Sheriff" + + agency_slug = "ca_ventura_county_sheriff" + + def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR): + # Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748 + # videos and files along with additional index pages + self.base_url = "https://www.venturasheriff.org" + self.index_urls = { + f"{self.base_url}/sb1421/officer-involved-shooting-ois/": "ois.html", + f"{self.base_url}/sb1421/use-of-force-great-bodily-injury-cases-gbi/": "gbi.html", + f"{self.base_url}/ab748/": "ab748.html", + } + + self.cache = Cache(cache_dir) # ~/.clean-scraper/cache/ + self.data_dir = data_dir + self.cache_dir = cache_dir + + # Use module path to construct agency slug, which we'll use downstream + # to create a subdir inside the main cache directory to stash files for this agency + mod = Path(__file__) + state_postal = mod.parent.stem + self.cache_suffix = f"{state_postal}_{mod.stem}" # ca_ventura_county_sheriff + self.cache_root = cache_dir / (self.cache_suffix) + self.subpages_dir = self.cache_root / "subpages" + + def scrape_meta(self, throttle: int = 0) -> Path: + metadata: List[MetadataDict] = [] + page_urls = [] + # Scrape index pages for both assets and links to case directories/subpages + for index_url in self.index_urls: + detail_page_links, local_metadata = self._process_index_page(index_url) + page_urls.extend(detail_page_links) + metadata.extend(local_metadata) + time.sleep(throttle) + + # Now, process the links of case directories/subpages + for page_url in page_urls: + local_metadata = self._process_detail_page(page_url) + metadata.extend(local_metadata) + time.sleep(throttle) + + outfile = self.data_dir.joinpath(f"{self.cache_suffix}.json") + logger.debug(f"Attempting to save metadata to {outfile}") + full_filename = self.cache.write_json(outfile, metadata) + return full_filename + + # Helper/Private Methods + def _process_detail_page(self, target_url) -> List[MetadataDict]: + """Extract links to files such as videos from a detail page and write to JSON file.""" + local_metadata: List[MetadataDict] = [] + + # Build a complete URL and determine the subdirectory name + if target_url.endswith("/"): + target_url = target_url[:-1] + if "http" not in target_url: + target_url = urllib.parse.urljoin(self.base_url, target_url) + + full_filename = self.subpages_dir / (target_url.split("/")[-1] + ".html") + relative_filename = str(full_filename.relative_to(self.cache_dir)).replace( + "\\", "/" + ) + + # Download the index page, which saves to local cache + self.cache.download( + full_filename, + target_url, + force=False, # Do NOT automatically rescrape subpages + ) + + html = self.cache.read(full_filename) + soup = BeautifulSoup(html, "html.parser") + # Find the title of the page + title = soup.find("h1") + if title: + title = title.get_text().strip() # type: ignore + + h2split = "", "

").replace("", "

") + + soup = BeautifulSoup(html, "html.parser") + h2split = "