Skip to content

Commit

Permalink
Add more information to nypd scrape
Browse files Browse the repository at this point in the history
  • Loading branch information
RyEggGit committed Jan 11, 2024
1 parent bc9d8db commit d81bdb3
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 63 deletions.
5 changes: 3 additions & 2 deletions backend/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,16 @@ def scrape_cpdp():

@app.cli.command("scrape-v2")
@dev_only
def scrape_v2():
@click.argument("debug", default=False)
def scrape_v2(debug: bool = False):
"""Scrape from public data into the database.
This is a handy way to populate the database to start with publicly
available data.
"""
from backend.scraper.run_scrape import scrape

scrape(True)
scrape(debug)


def register_routes(app: Flask):
Expand Down
48 changes: 24 additions & 24 deletions backend/database/models/incident.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,30 @@ class VictimStatus(enum.Enum):
DECEASED = 5


# Source metadata for an incident record. Each row points at one incident
# through the non-nullable incident_id foreign key; the remaining columns are
# grouped by the kind of record they describe and are declared without
# nullable=False.
class SourceDetails(db.Model, CrudMixin):
id = db.Column(db.Integer, primary_key=True) # source details id
# Foreign key to incident.id; declared nullable=False, so required.
incident_id = db.Column(
db.Integer, db.ForeignKey("incident.id"), nullable=False
)
# Kind of record, stored as a RecordType enum value.
record_type = db.Column(db.Enum(RecordType))
# For Journalistic Publications
publication_name = db.Column(db.Text)
publication_date = db.Column(db.Date)
publication_url = db.Column(db.Text)
author = db.Column(db.Text)
author_url = db.Column(db.Text)
author_email = db.Column(db.Text)
# For Government Records
reporting_organization = db.Column(db.Text)
reporting_organization_url = db.Column(db.Text)
reporting_organization_email = db.Column(db.Text)
# For Legal Records
court = db.Column(db.Text)
judge = db.Column(db.Text)
docket_number = db.Column(db.Text)
date_of_action = db.Column(db.Date)


class Incident(db.Model, CrudMixin):
"""The incident table is the fact table."""

Expand Down Expand Up @@ -137,27 +161,3 @@ def create(self, refresh: bool = True):
# # Does an existing warrant count here?
# criminal_case_brought = db.Column(db.Boolean)
# case_id = db.Column(db.Integer) # TODO: foreign key of some sort?


class SourceDetails(db.Model, CrudMixin):
id = db.Column(db.Integer, primary_key=True) # source details id
incident_id = db.Column(
db.Integer, db.ForeignKey("incident.id"), nullable=False
)
record_type = db.Column(db.Enum(RecordType))
# For Journalistic Publications
publication_name = db.Column(db.Text)
publication_date = db.Column(db.Date)
publication_url = db.Column(db.Text)
author = db.Column(db.Text)
author_url = db.Column(db.Text)
author_email = db.Column(db.Text)
# For Government Records
reporting_organization = db.Column(db.Text)
reporting_organization_url = db.Column(db.Text)
reporting_organization_email = db.Column(db.Text)
# For Legal Records
court = db.Column(db.Text)
judge = db.Column(db.Text)
docket_number = db.Column(db.Text)
date_of_action = db.Column(db.Date)
4 changes: 2 additions & 2 deletions backend/database/queries/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def officer_exists(db: Session, stateID: StateID) -> bool:
)


def incident_exists(db: Session, case_id: str) -> bool:
def incident_exists(db: Session, incident: Incident) -> bool:
return (
db.query(Incident).filter(Incident.case_id == case_id).first()
db.query(Incident).filter(Incident.case_id == incident.case_id).first()
is not None
)
6 changes: 3 additions & 3 deletions backend/scraper/run_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ def add_to_database(
if table == "officer":
model_exists = officer_exists(
db.session, # type: ignore
model.stateId.value, # type: ignore
model.stateId, # type: ignore
)
elif table == "incident":
model_exists = incident_exists(
db.session, # type: ignore
model.case_id, # type: ignore
model, # type: ignore
)
else:
raise ValueError(f"Invalid table {table}")
Expand Down Expand Up @@ -83,7 +83,7 @@ def scrape(debug: bool = False):
nypd_officer, nypd_incidents = nypd.extract_data(debug=debug)

logger.info(
f"Found {len(nypd_officer)} officers and {len(nypd_incidents)}incidents from NYPD" # noqa: E501
f"Found {len(nypd_officer)} officers and {len(nypd_incidents)} incidents from NYPD" # noqa: E501
)

officers = fiftya_officer + nypd_officer
Expand Down
1 change: 1 addition & 0 deletions backend/scraper/websites/FiftyA/FiftyA.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def _find_incidents(self, complaints: list[str]) -> list[Incident]:
self.logger.info(f"Scrapped {index} complaints")
response = self.fetch(f"{self.SEED}{complaint}")
if not response:
self.logger.error(f"Could not fetch {complaint}")
continue

incident = incident_parser.parse_complaint(response, complaint)
Expand Down
48 changes: 23 additions & 25 deletions backend/scraper/websites/FiftyA/FiftyAIncidentParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
UseOfForce,
SourceDetails,
RecordType,
Perpetrator,
)


Expand Down Expand Up @@ -104,9 +105,26 @@ def _get_force(self, soup: BeautifulSoup) -> Optional[list[UseOfForce]]:
)
]

def _get_officer_badges(self, soup: BeautifulSoup):
officer_involved = soup.find_all("a", class_="name")
return list({officer.get("title", "") for officer in officer_involved})
def _get_officers(self, soup: BeautifulSoup) -> list[Perpetrator]:
    """Build a Perpetrator for every distinct officer link in a complaint.

    Officer links are the ``<a class="name">`` elements found in ``soup``.
    For each one, the link text supplies the first/last name and, when the
    text ends in ``#<badge>``, the badge number; the first word of the
    link's ``title`` attribute supplies the rank.

    :param soup: Parsed complaint page.
    :return: One Perpetrator per distinct link, in the order the links are
        returned by ``find_all``. ``badge`` is None when the link text does
        not end in a ``#<badge>`` token.
    """
    badge_number_pattern = re.compile(r"#(\w+)$")
    # dict.fromkeys drops duplicate links exactly like set() would, but
    # keeps first-seen order so the result is the same on every run.
    officer_links = dict.fromkeys(soup.find_all("a", class_="name"))
    perps: list[Perpetrator] = []
    for officer in officer_links:
        description = officer.get_text(strip=True)
        name_parts = description.split(" ")
        badge_match = badge_number_pattern.search(description)
        perps.append(
            Perpetrator(
                badge=badge_match.group(1) if badge_match else None,
                first_name=name_parts[0],
                # NOTE(review): if the link text ends with a
                # space-separated "#<badge>" token, this is that token
                # rather than a surname - confirm against the page markup.
                last_name=name_parts[-1],
                rank=officer.get("title", "").split(" ")[0],
            )
        )
    return perps

def _get_witnesses(self, details: list[str]) -> Optional[list[str]]:
witnesses: list[str] = []
Expand Down Expand Up @@ -188,26 +206,6 @@ def parse_complaint(
incident.source_details = source # type: ignore
incident.victims = victim # type: ignore
incident.use_of_force = force # type: ignore

# table = soup.find('tbody')
# perps = soup.find_all('a', class_="name")
# perpetrators = list(set([perp.text for perp in perps]))
# data = {}
# data["victim"] = victim
# data["perpetrators"] = list(set(officer_involved_badges))
# data["tags"] = None
# data["agencies_present"] = [
# f"NYPD {precinct_number} Precinct {precinct_name}"
# if precinct_number and precinct_name
# else None
# ]
# data["participants"] = witnesses
# data["attachments"] = None
# data["investigations"] = None
# data["results_of_stop"] = None
# data["actions"] = None
# data["use_of_force"] = force
# data["legal_case"] = None
# data["incident"] = incident

incident.case_id = int(self.complaint_number(complaint_link))
incident.perpetrators = self._get_officers(soup) # type: ignore
return incident
5 changes: 4 additions & 1 deletion backend/scraper/websites/NYPD/NYPDParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ def parse_incidents(self, incidents: list[str]) -> list[Incident]:
incident.was_victim_arrested = False
incident.arrest_id = None
incident.criminal_case_brought = None
incident.case_id = incident_csv[12]
case_id = int(incident_csv[12])
if not case_id:
continue
incident.case_id = case_id
source = SourceDetails(
**{
"record_type": RecordType.GOVERNMENT_RECORD,
Expand Down
6 changes: 3 additions & 3 deletions backend/scraper/websites/NYPD/Nypd.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _find_incidents(self) -> list[str]:
return []
return resp.split("\n")[1:]

def extract_data(self, debug: int = 0):
def extract_data(self, debug: bool = False):
"""
Extract the officer profiles from NYPD
:param debug: If true, extract a random sample of at most 5 officers and 5 incidents instead of all of them
Expand All @@ -46,12 +46,12 @@ def extract_data(self, debug: int = 0):

officers = self._find_officers()
if debug:
officers = random.sample(officers, debug)
officers = random.sample(officers, min(5, len(officers)))
self.logger.info(f"Found {len(officers)} officers")

incidents = self._find_incidents()
if debug:
incidents = random.sample(incidents, debug)
incidents = random.sample(incidents, min(5, len(incidents)))
self.logger.info(f"Found {len(incidents)} incidents")

parser = NYPDParser()
Expand Down
2 changes: 1 addition & 1 deletion backend/tests/scraper/test_nypd.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,4 @@ def test_extract_data(
assert incidents[0].was_victim_arrested is False
assert incidents[0].arrest_id is None
assert incidents[0].criminal_case_brought is None
assert incidents[0].case_id == "202201611"
assert incidents[0].case_id == 202201611
4 changes: 2 additions & 2 deletions backend/tests/test_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_incident_exists(db_session: Any):
)

# Test that the incident exists
assert incident_exists(db_session, case_id_value)
assert incident_exists(db_session, incident)

# Test that a non-existing incident returns False
assert not incident_exists(db_session, "654321")
assert not incident_exists(db_session, Incident(**{"case_id": "654321"}))

0 comments on commit d81bdb3

Please sign in to comment.