
Commit

working w/ docker
Sawyer committed Oct 12, 2024
1 parent 166e02c commit 7d7aaa4
Showing 5 changed files with 145 additions and 466 deletions.
28 changes: 28 additions & 0 deletions gef-portal-scraper/Dockerfile.get_project_ids
@@ -0,0 +1,28 @@
FROM python:3.11-slim as builder

RUN pip install poetry==1.8.3

ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

COPY pyproject.toml poetry.lock ./
RUN touch README.md

RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

FROM python:3.11-slim as runtime

WORKDIR /app

ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

COPY gef_portal_scraper/get_project_ids.py ./src/

CMD ["python", "./src/get_project_ids.py"]
43 changes: 0 additions & 43 deletions gef-portal-scraper/Dockerfile.selenium

This file was deleted.

194 changes: 83 additions & 111 deletions gef-portal-scraper/gef_portal_scraper/get_project_ids.py
@@ -1,169 +1,141 @@
import asyncio
import logging
import os
from datetime import datetime

import aiohttp
import backoff
import ujson as json

# import ujson as json
import json
from aiolimiter import AsyncLimiter
from bs4 import BeautifulSoup
from pydantic_settings import BaseSettings
from tqdm.asyncio import tqdm

# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Switches
USE_RATE_LIMITER = True
USE_TQDM = True
USE_FILE_TIMESTAMPS = True


class Settings(BaseSettings):
BASE_URL: str = "https://www.thegef.org/projects-operations/projects/{project_id}"
DATABASE_URL: str = "https://www.thegef.org/projects-operations/database"
OUTPUT_DIR: str = "/app/data/"
JSON_FILENAME: str = "project_ids.json"
ID_START: int = 1
RATE_LIMIT: int = 125
BATCH_SIZE: int = 999999
RESPONSE_CODES_LOGGED: list[int] = [429]
USE_RATE_LIMITER: bool = True
USE_TQDM: bool = True
USE_FILE_TIMESTAMPS: bool = True

class Config:
env_file = ".env"
env_file_encoding = "utf-8"

# Constants
BASE_URL = "https://www.thegef.org/projects-operations/projects/{project_id}"
OUTPUT_DIR = "../data/"
JSON_FILENAME = "project_ids.json"
ID_START = 1
RATE_LIMIT = 125
BATCH_SIZE = 99999
RESPONSE_CODES_LOGGED = [429]

# Set up limiter
rate_limiter = AsyncLimiter(RATE_LIMIT, 1)  # RATE_LIMIT requests per second
config = Settings()

# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Set up rate limiter
rate_limiter = AsyncLimiter(config.RATE_LIMIT, 1)


async def get_highest_project_id():
    """
    Gets the highest project ID from the GEF website.

    Returns:
        int: the highest project ID, or None if the table or data could not be found.
    """
url = "https://www.thegef.org/projects-operations/database"
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="table-hover")
if table:
rows = table.find("tbody").find_all("tr") # type: ignore
if rows:
return int(rows[0].find_all("td")[1].text.strip())

async def get_highest_project_id(session) -> int | None:
"""Gets the highest project ID from the GEF website."""
async with session.get(config.DATABASE_URL) as response:
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="table-hover")
if table and (rows := table.find("tbody").find_all("tr")): # type: ignore
return int(rows[0].find_all("td")[1].text.strip())
logging.error("Could not find the project ID table or data.")
return None


@backoff.on_predicate(backoff.expo, lambda value: value == -429, max_time=60)
async def check_project_id(session, project_id):
"""
Check if a project ID exists in the GEF database.
Args:
session (aiohttp.ClientSession): The aiohttp client session to use for the request.
project_id (int): The project ID to check.
Returns:
int: The project ID if it exists, otherwise None.
"""

url = BASE_URL.format(project_id=project_id)
@backoff.on_predicate(backoff.expo, lambda x: x == -429, max_time=60)
async def check_project_id(session, project_id) -> int | None:
"""Check if a project ID exists in the GEF database."""
url = config.BASE_URL.format(project_id=project_id)

async def make_request():
async with session.head(url, allow_redirects=False) as response:
global total_retry_after
if response.status in RESPONSE_CODES_LOGGED:
if response.status in config.RESPONSE_CODES_LOGGED:
logging.error(
f"Failed to access website: {url}. Status code: {response.status}"
)

return int(response.status) * -1 # trigger backoff w/ status

return -response.status
return project_id if response.status == 200 else None

try:
if USE_RATE_LIMITER:
if config.USE_RATE_LIMITER:
async with rate_limiter:
return await make_request()
else:
return await make_request()

except aiohttp.ClientError as e:
logging.error(f"Failed to access website: {url}")
raise e # Re-raise the exception to trigger backoff
raise e
except Exception as e:
# Log status code and exception type
logging.error(f"Failed to access website: {url}. Exception type: {type(e)}")
raise e
return None
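
(Editorial aside, not part of the committed file.) The negated status code returned above is what drives the retry logic: for a logged status such as 429, make_request returns -429, and the backoff.on_predicate decorator keeps retrying with exponential waits (capped at 60 seconds total) while it sees that value. A small sketch of the same pattern, with a dummy function standing in for the HTTP call:

import backoff

attempts = {"count": 0}


# Same predicate as check_project_id: retry with exponential backoff,
# for at most 60 seconds, while the wrapped call returns -429.
@backoff.on_predicate(backoff.expo, lambda value: value == -429, max_time=60)
def fake_check() -> int:
    attempts["count"] += 1
    # Pretend the first two calls are rate-limited, then succeed.
    return -429 if attempts["count"] < 3 else 12345


print(fake_check())       # -> 12345
print(attempts["count"])  # -> 3 (two backed-off retries before success)
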


async def check_project_ids(start_id, end_id):
"""
Check all project IDs in a given range.
Args:
start_id (int): The lowest project ID to check.
end_id (int): The highest project ID to check (inclusive).

Returns:
list[int]: A list of valid project IDs in the given range.
"""

async def check_project_ids(session, start_id: int, end_id: int) -> list[int]:
"""Check all project IDs in a given range."""
logging.info(f"Checking project IDs from {start_id} to {end_id}")
async with aiohttp.ClientSession() as session:
tasks = [check_project_id(session, id) for id in range(start_id, end_id + 1)]

if USE_TQDM:
results = await tqdm.gather(
*tasks,
desc=f"Checking Project IDs: {start_id}-{end_id}",
total=len(tasks),
)
else:
results = await asyncio.gather(*tasks)
tasks = [check_project_id(session, id) for id in range(start_id, end_id + 1)]

valid_ids = [id for id in results if id is not None]
logging.info(f"Found {len(valid_ids)} valid IDs in range {start_id}-{end_id}")
return valid_ids
if config.USE_TQDM:
results = await tqdm.gather(
*tasks, desc=f"Checking Project IDs: {start_id}-{end_id}", total=len(tasks)
)
else:
results = await asyncio.gather(*tasks)

valid_ids = [id for id in results if id is not None]
logging.info(f"Found {len(valid_ids)} valid IDs in range {start_id}-{end_id}")
return valid_ids

def save_to_json(data, filename=JSON_FILENAME):
if USE_FILE_TIMESTAMPS:

def save_to_json(data: list[int], filename: str = config.JSON_FILENAME):
"""Save the valid project IDs to a JSON file."""
if config.USE_FILE_TIMESTAMPS:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = filename.replace(".json", f"_{timestamp}.json")

with open(OUTPUT_DIR + filename, "w") as f:
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
with open(os.path.join(config.OUTPUT_DIR, filename), "w") as f:
json.dump({"valid_project_ids": data}, f, indent=2)

logging.info(f"Saved {len(data)} project IDs to JSON file: {filename}")


async def main():
highest_id = await get_highest_project_id()

if highest_id is None:
logging.error("Could not get the highest project ID. Exiting.")
return

logging.info(f"Highest project ID: {highest_id}")

all_valid_ids: list[int] = []

# Batched check
batches = [
(start_id, min(start_id + BATCH_SIZE - 1, highest_id))
for start_id in range(ID_START, highest_id + 1, BATCH_SIZE)
]

for start_id, end_id in batches:
valid_ids = await check_project_ids(start_id, end_id)
all_valid_ids.extend(valid_ids)

logging.info(
f"Found {len(all_valid_ids)} valid IDs in range {ID_START}-{highest_id}"
)

save_to_json(all_valid_ids)
async with aiohttp.ClientSession() as session:
highest_id = await get_highest_project_id(session)
if highest_id is None:
logging.error("Could not get the highest project ID. Exiting.")
return

logging.info(f"Highest project ID: {highest_id}")
all_valid_ids = []

batches = [
(start_id, min(start_id + config.BATCH_SIZE - 1, highest_id))
for start_id in range(config.ID_START, highest_id + 1, config.BATCH_SIZE)
]

for start_id, end_id in batches:
valid_ids = await check_project_ids(session, start_id, end_id)
all_valid_ids.extend(valid_ids)

logging.info(
f"Found {len(all_valid_ids)} valid IDs in range {config.ID_START}-{highest_id}"
)
save_to_json(all_valid_ids)


if __name__ == "__main__":
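
A final note on the batching in main() above: the (start_id, end_id) ranges are computed up front and clamped to highest_id, so a BATCH_SIZE larger than the ID range (the new default is 999999) collapses everything into a single batch. A quick illustration with smaller, purely hypothetical numbers:

# Hypothetical values, only to show the shape of the computed batches.
ID_START, BATCH_SIZE, highest_id = 1, 5000, 11500

batches = [
    (start_id, min(start_id + BATCH_SIZE - 1, highest_id))
    for start_id in range(ID_START, highest_id + 1, BATCH_SIZE)
]
print(batches)  # -> [(1, 5000), (5001, 10000), (10001, 11500)]
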

0 comments on commit 7d7aaa4
