Skip to content

Commit

Permalink
Update to scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
RyEggGit committed Jan 10, 2024
1 parent ce625a2 commit 1ab5182
Show file tree
Hide file tree
Showing 28 changed files with 1,337 additions and 51 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,5 @@ dmypy.json
# Data scraper
/backend/scraper/data_scrapers/notebooks/chicago-police-data
/backend/scraper/data_scrapers/counted/scraper_data
/backend/scraper/data_scrapers/fatal_force/temp.csv
/backend/scraper/data_scrapers/fatal_force/temp.csv
/scraper-cache
12 changes: 9 additions & 3 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
FROM python:3.9.5-slim-buster AS base

RUN apt-get update && apt-get install curl -y && apt-get install g++ libpq-dev gcc -y
RUN apt-get update && \
apt-get install -y curl g++ libpq-dev gcc cron && \
rm -rf /var/lib/apt/lists/*

ADD https://github.com/ufoscout/docker-compose-wait/releases/download/2.7.3/wait /wait
RUN chmod +x /wait


FROM base
WORKDIR /app/

Expand All @@ -20,5 +21,10 @@ COPY . .

ENV PORT=$PDT_API_PORT

CMD /wait && ./run_dev.sh
# Install the scraper schedule as a crontab and create the log file that is
# streamed when the container starts.
COPY backend/scraper/crontab /etc/cron.d/scrape-cron
RUN chmod 0644 /etc/cron.d/scrape-cron
RUN crontab /etc/cron.d/scrape-cron
RUN touch /var/log/cron.log

# `tail -f` never exits, so chaining it with `&&` in front of ./run_dev.sh
# meant the dev server was never started. Stream the cron log from a
# background subshell and keep ./run_dev.sh as the foreground process.
CMD /wait && cron && (tail -f /var/log/cron.log &) && ./run_dev.sh
13 changes: 12 additions & 1 deletion backend/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ def create_app(config: Optional[str] = None):
# @app.before_first_request
# def _():
# db.create_all()

return app


Expand Down Expand Up @@ -124,6 +123,18 @@ def scrape_cpdp():
]
)

@app.cli.command("scrape-v2")
@dev_only
def scrape_v2():
    """Scrape from public data into the database.
    This is a handy way to populate the database to start with publicly
    available data.
    """
    # The import lives inside the command body, so the scraper module is only
    # loaded when this command actually runs.
    from backend.scraper import run_scrape

    run_scrape.scrape(True)


def register_routes(app: Flask):
app.register_blueprint(partners_bp)
Expand Down
1 change: 1 addition & 0 deletions backend/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@
from .models.user import *
from .models.victim import *
from .models.partner import *
from .models.action import *
46 changes: 33 additions & 13 deletions backend/database/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""
import os
from typing import Any, Optional

from datetime import datetime
import click
import pandas as pd
import psycopg2.errors
Expand Down Expand Up @@ -47,10 +47,36 @@ def get(cls, id: Any, abort_if_null: bool = True):
abort(404)
return obj

"""https://stackoverflow.com/questions/18147435/how-to-exclude-specific-fields-on-serialization-with-jsonpickle"""

QUERIES_DIR = os.path.abspath(
os.path.join(os.path.dirname(__file__), "queries")
)
def __getstate__(
    self, stringify_dates: Optional[list[str]] = None
) -> dict[str, Any]:
    """
    Get the state of the object for pickling.

    Args:
        stringify_dates (Optional[list[str]]): Accepted for backward
            compatibility; this method does not currently read it.

    Returns:
        dict: A shallow copy of the instance ``__dict__`` without the
            ``_sa_instance_state`` and ``id`` entries and without any
            entry whose value is None.
    """
    excluded = ("_sa_instance_state", "id")
    # Filter while copying instead of deleting keys afterwards: ``del``
    # raised KeyError when an excluded key was absent, and also when ``id``
    # was None because the key was then queued for removal twice.
    return {
        key: value
        for key, value in self.__dict__.items()
        if key not in excluded and value is not None
    }

def __setstate__(self, state: dict[str, Any]):
    """
    Restore the object's attributes from a state dictionary.

    Args:
        state (dict[str, Any]): Attribute names mapped to the values that
            are written into the instance ``__dict__``.
    """
    vars(self).update(state)


QUERIES_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "queries"))


def execute_query(filename: str) -> Optional[pd.DataFrame]:
Expand Down Expand Up @@ -102,17 +128,13 @@ def db_cli(ctx: click.Context):
@pass_psql_admin_connection
@click.pass_context
@dev_only
def create_database(
ctx: click.Context, conn: connection, overwrite: bool = False
):
def create_database(ctx: click.Context, conn: connection, overwrite: bool = False):
"""Create the database from nothing."""
database = current_app.config["POSTGRES_DB"]
cursor = conn.cursor()

if overwrite:
cursor.execute(
f"SELECT bool_or(datname = '{database}') FROM pg_database;"
)
cursor.execute(f"SELECT bool_or(datname = '{database}') FROM pg_database;")
exists = cursor.fetchall()[0][0]
if exists:
ctx.invoke(delete_database)
Expand Down Expand Up @@ -173,9 +195,7 @@ def delete_database(conn: connection, test_db: bool):
)
confirmation = click.prompt("Database name")
if database != confirmation:
click.echo(
"The input does not match. " "The database will not be deleted."
)
click.echo("The input does not match. " "The database will not be deleted.")
return None

try:
Expand Down
52 changes: 25 additions & 27 deletions backend/database/models/incident.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,7 @@ class Incident(db.Model, CrudMixin):
"""The incident table is the fact table."""

id = db.Column(db.Integer, primary_key=True, autoincrement=True)
source_id = db.Column(
db.Integer, db.ForeignKey("partner.id"))
source_details = db.relationship(
"SourceDetails", backref="incident", uselist=False)
source_id = db.Column(db.Integer, db.ForeignKey("partner.id"))
date_record_created = db.Column(db.DateTime)
time_of_incident = db.Column(db.DateTime)
time_confidence = db.Column(db.Integer)
Expand All @@ -84,12 +81,15 @@ class Incident(db.Model, CrudMixin):
# Does an existing warrant count here?
criminal_case_brought = db.Column(db.Boolean)
case_id = db.Column(db.Integer) # TODO: foreign key of some sort?

source_details = db.relationship("SourceDetails", backref="incident", uselist=False)
victims = db.relationship("Victim", backref="incident")
perpetrators = db.relationship("Perpetrator", backref="incident")
# descriptions = db.relationship("Description", backref="incident")
tags = db.relationship("Tag", secondary=incident_tag, backref="incidents")
agencies_present = db.relationship(
"Agency", secondary=incident_agency, backref="recorded_incidents")
"Agency", secondary=incident_agency, backref="recorded_incidents"
)
participants = db.relationship("Participant", backref="incident")
attachments = db.relationship("Attachment", backref="incident")
investigations = db.relationship("Investigation", backref="incident")
Expand Down Expand Up @@ -119,29 +119,27 @@ def create(self, refresh: bool = True):
# )
# text = db.Column(db.Text)
# type = db.Column(db.Text) # TODO: enum
# TODO: are there rules for this column other than text?
# organization_id = db.Column(db.Text)
# location = db.Column(db.Text) # TODO: location object
# # TODO: neighborhood seems like a weird identifier that may not always
# # apply in consistent ways across municipalities.
# neighborhood = db.Column(db.Text)
# stop_type = db.Column(db.Text) # TODO: enum
# call_type = db.Column(db.Text) # TODO: enum
# has_multimedia = db.Column(db.Boolean)
# from_report = db.Column(db.Boolean)
# # These may require an additional table. Also can dox a victim
# was_victim_arrested = db.Column(db.Boolean)
# arrest_id = db.Column(db.Integer) # TODO: foreign key of some sort?
# # Does an existing warrant count here?
# criminal_case_brought = db.Column(db.Boolean)
# case_id = db.Column(db.Integer) # TODO: foreign key of some sort?


class SourceDetails(db.Model):
# TODO: are there rules for this column other than text?
# organization_id = db.Column(db.Text)
# location = db.Column(db.Text) # TODO: location object
# # TODO: neighborhood seems like a weird identifier that may not always
# # apply in consistent ways across municipalities.
# neighborhood = db.Column(db.Text)
# stop_type = db.Column(db.Text) # TODO: enum
# call_type = db.Column(db.Text) # TODO: enum
# has_multimedia = db.Column(db.Boolean)
# from_report = db.Column(db.Boolean)
# # These may require an additional table. Also can dox a victim
# was_victim_arrested = db.Column(db.Boolean)
# arrest_id = db.Column(db.Integer) # TODO: foreign key of some sort?
# # Does an existing warrant count here?
# criminal_case_brought = db.Column(db.Boolean)
# case_id = db.Column(db.Integer) # TODO: foreign key of some sort?


class SourceDetails(db.Model, CrudMixin):
id = db.Column(db.Integer, primary_key=True) # source details id
incident_id = db.Column(
db.Integer, db.ForeignKey("incident.id"), nullable=False
)
incident_id = db.Column(db.Integer, db.ForeignKey("incident.id"), nullable=False)
record_type = db.Column(db.Enum(RecordType))
# For Journalistic Publications
publication_name = db.Column(db.Text)
Expand Down
12 changes: 6 additions & 6 deletions backend/database/models/officer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import enum

from .. import db
from ..core import CrudMixin, db


class Rank(str, enum.Enum):
Expand Down Expand Up @@ -69,15 +68,15 @@ class State(str, enum.Enum):
WY = "WY"


class StateID(db.Model):
class StateID(db.Model, CrudMixin):
"""
Represents a Statewide ID that follows an officer even as they move between
law enforcement agencies. For example, in New York, this
would be the Tax ID Number.
"""

id = db.Column(db.Integer, primary_key=True)
officer_id = db.Column(
db.Integer, db.ForeignKey("officer.id"))
officer_id = db.Column(db.Integer, db.ForeignKey("officer.id"))
id_name = db.Column(db.Text) # e.g. "Tax ID Number"
state = db.Column(db.Enum(State)) # e.g. "NY"
value = db.Column(db.Text) # e.g. "958938"
Expand All @@ -86,14 +85,15 @@ def __repr__(self):
return f"<StateID {self.id}>"


class Officer(db.Model):
class Officer(db.Model, CrudMixin):
    """A law enforcement officer record with basic demographic fields."""

    id = db.Column(db.Integer, primary_key=True) # officer id
    first_name = db.Column(db.Text)
    last_name = db.Column(db.Text)
    race = db.Column(db.Text)
    ethnicity = db.Column(db.Text)
    gender = db.Column(db.Text)
    date_of_birth = db.Column(db.Date)
    # Loaded as a single StateID object rather than a list (uselist=False);
    # the backref exposes the owning officer on StateID as `.officer`.
    stateId = db.relationship("StateID", backref="officer", uselist=False)

    def __repr__(self):
        """Return a short debug representation containing the officer id."""
        return f"<Officer {self.id}>"
1 change: 1 addition & 0 deletions backend/database/models/participant.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ class Participant(db.Model):
gender = db.Column(db.Enum(Gender))
race = db.Column(db.Enum(Race))
age = db.Column(db.Integer)
name = db.Column(db.Text)
Empty file removed backend/scraper/__init__.py
Empty file.
1 change: 1 addition & 0 deletions backend/scraper/crontab
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* * * * * flask scrape-v2 > /backend/scraper/scrape.log 2>&1
53 changes: 53 additions & 0 deletions backend/scraper/mixins/Parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from bs4 import BeautifulSoup, Tag
from typing import Union
import logging


class ParserMixin:
    """
    Mixin offering a small text-extraction helper for BeautifulSoup trees.

    Args:
        logger (Union[logging.Logger, None], optional): Logger that receives
            warnings about missing elements. When omitted, the logger
            returned by ``logging.getLogger(__name__)`` is used.

    Attributes:
        logger (logging.Logger): The logger used for warnings.

    Methods:
        _find_and_extract: Look up one element and return its text.
    """

    def __init__(self, logger: Union[logging.Logger, None] = None):
        if not logger:
            logger = logging.getLogger(__name__)
        self.logger = logger

    def _find_and_extract(
        self,
        soup: Union[BeautifulSoup, Tag],
        tag: str,
        class_: str,
        error_message: str,
        replace_text: Union[str, None] = None,
    ) -> Union[str, None]:
        """
        Return the text of the first element matching a tag and CSS class.

        Args:
            soup (Union[BeautifulSoup, Tag]): Document or element to search.
            tag (str): Name of the HTML tag to look for.
            class_ (str): CSS class the element must carry.
            error_message (str): Message logged as a warning when nothing
                matches.
            replace_text (Union[str, None], optional): Substring removed from
                the extracted text. Defaults to None (text returned as is).

        Returns:
            Union[str, None]: The element text, with every occurrence of
            ``replace_text`` removed when it is given, or None when no
            element matched.
        """
        match = soup.find(tag, class_=class_)
        if match:
            content = match.text
            if not replace_text:
                return content
            return content.replace(replace_text, "")
        # Nothing matched: report it and signal the miss to the caller.
        self.logger.warning(error_message)
        return None
Loading

0 comments on commit 1ab5182

Please sign in to comment.