Merge branch 'develop' into 96-ui-spell-check-crossfeeds-pages
ameliav committed Mar 19, 2024
2 parents 819ba86 + 2d1781a commit ae63e6d
Showing 23 changed files with 820 additions and 420 deletions.
4 changes: 1 addition & 3 deletions .bandit.yml
@@ -6,8 +6,6 @@
# If `tests` is empty, all tests are considered included.

tests:
# - B101
# - B102

skips:
# - B101 # skip "assert used" check since assertions are required in pytests
- B101 # skip "assert used" check since assertions are required in pytests
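For context on the B101 skip: pytest's failure introspection is built around bare `assert` statements, which Bandit's B101 check would flag in every test file. A minimal illustration of why the skip is needed (hypothetical test, not from this repo):

# test_example.py -- pytest rewrites this assert to produce a detailed failure report
def test_title_case():
    assert "san jose".title() == "San Jose"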
2 changes: 2 additions & 0 deletions .github/workflows/backend.yml
@@ -111,6 +111,8 @@ jobs:
uses: actions/[email protected]
with:
python-version: '3.10'
- name: Copy .env file
run: cp ../dev.env.example .env
- uses: actions/cache@v3
with:
path: ~/.cache/pip
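The new step copies `dev.env.example` into place so the backend has a `.env` file to read during CI. A minimal sketch of how such a file is typically consumed, assuming the backend loads it with python-dotenv (not confirmed by this diff; the variable name below is hypothetical):

# Standard Python Libraries
import os

# Third-Party Libraries
from dotenv import load_dotenv

load_dotenv(".env")  # reads KEY=VALUE pairs into the process environment
db_url = os.environ.get("DATABASE_URL")  # hypothetical variable name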
5 changes: 4 additions & 1 deletion .gitignore
@@ -2,6 +2,10 @@
# Files already tracked by Git are not affected.
# See: https://git-scm.com/docs/gitignore

# python
__pycache__
.mypy_cache
.python-version

# terraform
.terraform
@@ -50,4 +54,3 @@ minio-data
infrastructure/lambdas/security_headers.zip
*.hcl
.iac-data

2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -103,6 +103,8 @@ repos:
rev: v1.5.1
hooks:
- id: mypy
additional_dependencies:
- types-requests
- repo: https://github.com/asottile/pyupgrade
rev: v3.10.1
hooks:
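`requests` ships without inline type hints, so mypy needs the `types-requests` stub package; listing it under `additional_dependencies` installs it inside pre-commit's isolated mypy environment. A small example of the kind of code this unblocks:

# Without the stubs, mypy reports an error along the lines of:
#   error: Library stubs not installed for "requests"
# Third-Party Libraries
import requests

def fetch(url: str) -> str:
    return requests.get(url, timeout=5).text  # fully typed once the stubs are present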
48 changes: 40 additions & 8 deletions backend/scripts/populateCountiesCities/cities.py
@@ -1,18 +1,44 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
"""
This module contains the script for populating cities data.
It includes functions for parsing titles, pulling cities data from Wikipedia,
and writing the data to a CSV file.
"""

# Standard Python Libraries
import json
import re
import time
from urllib.parse import unquote

# Third-Party Libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests


def title_parse(title):
"""
Parse the title by unquoting it.
Args:
title (str): The title to be parsed.
Returns:
str: The parsed title.
"""
title = unquote(title)
return title


def pull_cities():
"""
Process and pull cities data from Wikipedia.
This function reads the Wikipedia US cities data from a JSON file, processes each entry,
fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information,
and writes the data to a CSV file.
"""
print("Processing Cities...")
with open("wikipedia_US_cities.json") as f:
wikipedia_us_city_data = json.load(f)
@@ -23,7 +49,10 @@ def pull_cities():
print(entry["name"])
# get the response in the form of html
wikiurl = "https://en.wikipedia.org/wiki/" + entry["url"]
response = requests.get(wikiurl)
try:
response = requests.get(wikiurl, timeout=5)
except requests.exceptions.Timeout:
print("The request timed out")

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, "html.parser")
@@ -52,7 +81,9 @@ def pull_cities():
if "," in link.get("title"):
county_pieces = link.get("title").split(",")
# OPEN WIKIPEDIA PAGE UP
x = requests.get("https://en.wikipedia.org/" + link.get("href"))
x = requests.get(
"https://en.wikipedia.org/" + link.get("href"), timeout=5
)

# PULL COUNTY OR PARISH FROM WIKIPEDIA PAGE
county_parish_matches = re.findall(
@@ -85,7 +116,8 @@
}
)
time.sleep(1)
except:
except Exception as e:
print(f"Error: {e}")
pass

df = pd.DataFrame(holding_pen, columns=["State", "County", "City", "URL"])
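One caveat with the timeout handling above: the `except` branch only prints, so execution falls through to `BeautifulSoup(response.text, ...)` with `response` unbound after a timeout. A minimal sketch of a fetch helper that makes the failure explicit -- an illustration of the pattern, not code from this commit:

# Standard Python Libraries
from typing import Optional

# Third-Party Libraries
import requests

def fetch_page(url: str, timeout: float = 5.0) -> Optional[str]:
    """Return the page HTML, or None if the request fails or times out."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request for {url} failed: {e}")
        return None

Callers can then skip an entry when fetch_page returns None rather than parsing a response that never arrived.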
32 changes: 26 additions & 6 deletions backend/scripts/populateCountiesCities/counties.py
@@ -1,16 +1,35 @@
"""
This module contains the script for populating counties data.
It includes functions for pulling counties data from Wikipedia,
and writing the data to a CSV file.
"""

# Standard Python Libraries
import re
import time

# Third-Party Libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re


def pull_counties():
"""
Process and pull counties data from Wikipedia.
This function fetches the Wikipedia page for the list of United States counties,
parses the page to extract county, state, and URL information,
and writes the data to a CSV file.
"""
print("Processing Counties...")
# get the response in the form of html
wikiurl = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"
table_class = "wikitable sortable jquery-tablesorter"
response = requests.get(wikiurl)
try:
response = requests.get(wikiurl, timeout=5)
except requests.exceptions.Timeout:
print("The request timed out")

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, "html.parser")
@@ -24,7 +43,7 @@ def pull_counties():
try:
county_pieces = link.get("title").split(", ")
# OPEN WIKIPEDIA PAGE UP
x = requests.get("https://en.wikipedia.org/" + link.get("href"))
x = requests.get("https://en.wikipedia.org/" + link.get("href"), timeout=5)

# PULL WEBSITE FROM WIKIPEDIA PAGE
w = re.findall(
@@ -43,6 +62,7 @@ def pull_counties():
}
)
except Exception as e:
print(f"Error: {e}")
pass

time.sleep(1)
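For reference, the link extraction that feeds this loop is the standard BeautifulSoup wikitable walk. A rough sketch under the same assumptions as the script; note that the `jquery-tablesorter` part of `table_class` is added client-side by JavaScript, so the HTML that `requests` receives only carries `wikitable sortable`:

# Third-Party Libraries
from bs4 import BeautifulSoup
import requests

wikiurl = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"
response = requests.get(wikiurl, timeout=5)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table", {"class": "wikitable"})  # matches despite the missing jQuery class
for link in table.find_all("a"):
    title = link.get("title")  # e.g. "Autauga County, Alabama"
    if title and ", " in title:
        county, state = title.split(", ", 1)
        print(county, "|", state, "|", link.get("href"))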
33 changes: 32 additions & 1 deletion backend/scripts/populateCountiesCities/main.py
@@ -1,22 +1,53 @@
import typer
"""
This module contains the main script for populating counties and cities data.
It includes commands for processing cities and counties data separately or both at once.
"""

# Third-Party Libraries
import cities
import counties
import typer

app = typer.Typer()


@app.command()
def process_cities():
"""
Process and pull cities data from Wikipedia.
This function calls the pull_cities function from the cities module,
which reads the Wikipedia US cities data from a JSON file, processes each entry,
fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information,
and writes the data to a CSV file.
"""
cities.pull_cities()


@app.command()
def process_counties():
"""
Process and pull counties data from Wikipedia.
This function calls the pull_counties function from the counties module,
which fetches the Wikipedia page for the list of United States counties,
parses the page to extract county, state, and URL information,
and writes the data to a CSV file.
"""
counties.pull_counties()


@app.command()
def process_both():
"""
Process and pull both cities and counties data from Wikipedia.
This function calls both the pull_cities function from the cities module and the pull_counties function from the counties module,
which fetches the Wikipedia pages for the list of United States cities and counties,
parses the pages to extract city, county, state, and URL information,
and writes the data to CSV files.
"""
counties.pull_counties()
cities.pull_cities()

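Typer derives subcommand names from the function names, mapping underscores to hyphens. Assuming the module ends with the usual entry point (not visible in this hunk), the script is driven like so:

# Assumed entry point at the bottom of main.py:
if __name__ == "__main__":
    app()

# Which yields three subcommands:
#   python main.py process-cities
#   python main.py process-counties
#   python main.py process-both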
2 changes: 1 addition & 1 deletion backend/scripts/populateCountiesCities/requirements.txt
@@ -1,4 +1,4 @@
beautifulsoup4==4.11.2
pandas==1.5.1
requests==2.28.2
beautifulsoup4==4.11.2
typer==0.7.0
5 changes: 5 additions & 0 deletions backend/worker/__init__.py
@@ -0,0 +1,5 @@
"""
This package contains the worker tasks for the backend.
It includes modules for processing data, interacting with databases, and other backend tasks.
"""