
Commit

working w/ docker
Sawyer committed Oct 12, 2024
1 parent 166e02c commit 7d7aaa4
Showing 5 changed files with 145 additions and 466 deletions.
28 changes: 28 additions & 0 deletions gef-portal-scraper/Dockerfile.get_project_ids
@@ -0,0 +1,28 @@
FROM python:3.11-slim as builder

RUN pip install poetry==1.8.3

ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

COPY pyproject.toml poetry.lock ./
RUN touch README.md

RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

FROM python:3.11-slim as runtime

WORKDIR /app

ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

COPY gef_portal_scraper/get_project_ids.py ./src/

CMD ["python", "./src/get_project_ids.py"]
43 changes: 0 additions & 43 deletions gef-portal-scraper/Dockerfile.selenium

This file was deleted.

194 changes: 83 additions & 111 deletions gef-portal-scraper/gef_portal_scraper/get_project_ids.py
@@ -1,169 +1,141 @@
import asyncio
import logging
import os
from datetime import datetime

import aiohttp
import backoff
import ujson as json

# import ujson as json
import json
from aiolimiter import AsyncLimiter
from bs4 import BeautifulSoup
from pydantic_settings import BaseSettings
from tqdm.asyncio import tqdm

# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Switches
USE_RATE_LIMITER = True
USE_TQDM = True
USE_FILE_TIMESTAMPS = True


class Settings(BaseSettings):
BASE_URL: str = "https://www.thegef.org/projects-operations/projects/{project_id}"
DATABASE_URL: str = "https://www.thegef.org/projects-operations/database"
OUTPUT_DIR: str = "/app/data/"
JSON_FILENAME: str = "project_ids.json"
ID_START: int = 1
RATE_LIMIT: int = 125
BATCH_SIZE: int = 999999
RESPONSE_CODES_LOGGED: list[int] = [429]
USE_RATE_LIMITER: bool = True
USE_TQDM: bool = True
USE_FILE_TIMESTAMPS: bool = True

class Config:
env_file = ".env"
env_file_encoding = "utf-8"

# Constants
BASE_URL = "https://www.thegef.org/projects-operations/projects/{project_id}"
OUTPUT_DIR = "../data/"
JSON_FILENAME = "project_ids.json"
ID_START = 1
RATE_LIMIT = 125
BATCH_SIZE = 99999
RESPONSE_CODES_LOGGED = [429]

# Set up limiter
rate_limiter = AsyncLimiter(RATE_LIMIT, 1)  # RATE_LIMIT requests per second
config = Settings()

# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Set up rate limiter
rate_limiter = AsyncLimiter(config.RATE_LIMIT, 1)


async def get_highest_project_id():
    """
    Gets the highest project ID from the GEF website.

    Returns:
        int: the highest project ID, or None if the table or data could not be found.
    """
url = "https://www.thegef.org/projects-operations/database"
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="table-hover")
if table:
rows = table.find("tbody").find_all("tr") # type: ignore
if rows:
return int(rows[0].find_all("td")[1].text.strip())

async def get_highest_project_id(session) -> int | None:
"""Gets the highest project ID from the GEF website."""
async with session.get(config.DATABASE_URL) as response:
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="table-hover")
if table and (rows := table.find("tbody").find_all("tr")): # type: ignore
return int(rows[0].find_all("td")[1].text.strip())
logging.error("Could not find the project ID table or data.")
return None


@backoff.on_predicate(backoff.expo, lambda value: value == -429, max_time=60)
async def check_project_id(session, project_id):
"""
Check if a project ID exists in the GEF database.
Args:
session (aiohttp.ClientSession): The aiohttp client session to use for the request.
project_id (int): The project ID to check.
Returns:
int: The project ID if it exists, otherwise None.
"""

url = BASE_URL.format(project_id=project_id)
@backoff.on_predicate(backoff.expo, lambda x: x == -429, max_time=60)
async def check_project_id(session, project_id) -> int | None:
"""Check if a project ID exists in the GEF database."""
url = config.BASE_URL.format(project_id=project_id)

async def make_request():
async with session.head(url, allow_redirects=False) as response:
global total_retry_after
if response.status in RESPONSE_CODES_LOGGED:
if response.status in config.RESPONSE_CODES_LOGGED:
logging.error(
f"Failed to access website: {url}. Status code: {response.status}"
)

return int(response.status) * -1 # trigger backoff w/ status

return -response.status
return project_id if response.status == 200 else None

try:
if USE_RATE_LIMITER:
if config.USE_RATE_LIMITER:
async with rate_limiter:
return await make_request()
else:
return await make_request()

except aiohttp.ClientError as e:
logging.error(f"Failed to access website: {url}")
raise e # Re-raise the exception to trigger backoff
raise e
except Exception as e:
# Log status code and exception type
logging.error(f"Failed to access website: {url}. Exception type: {type(e)}")
raise e
return None
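
(Editorial aside, not part of the committed file.) The negated status code returned above is what drives the retry logic: for a logged status such as 429, make_request returns -429, and the backoff.on_predicate decorator keeps retrying with exponential waits (capped at 60 seconds total) while it sees that value. A small sketch of the same pattern, with a dummy function standing in for the HTTP call:

import backoff

attempts = {"count": 0}


# Same predicate as check_project_id: retry with exponential backoff,
# for at most 60 seconds, while the wrapped call returns -429.
@backoff.on_predicate(backoff.expo, lambda value: value == -429, max_time=60)
def fake_check() -> int:
    attempts["count"] += 1
    # Pretend the first two calls are rate-limited, then succeed.
    return -429 if attempts["count"] < 3 else 12345


print(fake_check())       # -> 12345
print(attempts["count"])  # -> 3 (two backed-off retries before success)
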


async def check_project_ids(start_id, end_id):
"""
Check all project IDs in a given range.
Args:
start_id (int): The lowest project ID to check.
end_id (int): The highest project ID to check (inclusive).

Returns:
list[int]: A list of valid project IDs in the given range.
"""

async def check_project_ids(session, start_id: int, end_id: int) -> list[int]:
"""Check all project IDs in a given range."""
logging.info(f"Checking project IDs from {start_id} to {end_id}")
async with aiohttp.ClientSession() as session:
tasks = [check_project_id(session, id) for id in range(start_id, end_id + 1)]

if USE_TQDM:
results = await tqdm.gather(
*tasks,
desc=f"Checking Project IDs: {start_id}-{end_id}",
total=len(tasks),
)
else:
results = await asyncio.gather(*tasks)
tasks = [check_project_id(session, id) for id in range(start_id, end_id + 1)]

valid_ids = [id for id in results if id is not None]
logging.info(f"Found {len(valid_ids)} valid IDs in range {start_id}-{end_id}")
return valid_ids
if config.USE_TQDM:
results = await tqdm.gather(
*tasks, desc=f"Checking Project IDs: {start_id}-{end_id}", total=len(tasks)
)
else:
results = await asyncio.gather(*tasks)

valid_ids = [id for id in results if id is not None]
logging.info(f"Found {len(valid_ids)} valid IDs in range {start_id}-{end_id}")
return valid_ids

def save_to_json(data, filename=JSON_FILENAME):
if USE_FILE_TIMESTAMPS:

def save_to_json(data: list[int], filename: str = config.JSON_FILENAME):
"""Save the valid project IDs to a JSON file."""
if config.USE_FILE_TIMESTAMPS:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = filename.replace(".json", f"_{timestamp}.json")

with open(OUTPUT_DIR + filename, "w") as f:
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
with open(os.path.join(config.OUTPUT_DIR, filename), "w") as f:
json.dump({"valid_project_ids": data}, f, indent=2)

logging.info(f"Saved {len(data)} project IDs to JSON file: {filename}")


async def main():
highest_id = await get_highest_project_id()

if highest_id is None:
logging.error("Could not get the highest project ID. Exiting.")
return

logging.info(f"Highest project ID: {highest_id}")

all_valid_ids: list[int] = []

# Batched check
batches = [
(start_id, min(start_id + BATCH_SIZE - 1, highest_id))
for start_id in range(ID_START, highest_id + 1, BATCH_SIZE)
]

for start_id, end_id in batches:
valid_ids = await check_project_ids(start_id, end_id)
all_valid_ids.extend(valid_ids)

logging.info(
f"Found {len(all_valid_ids)} valid IDs in range {ID_START}-{highest_id}"
)

save_to_json(all_valid_ids)
async with aiohttp.ClientSession() as session:
highest_id = await get_highest_project_id(session)
if highest_id is None:
logging.error("Could not get the highest project ID. Exiting.")
return

logging.info(f"Highest project ID: {highest_id}")
all_valid_ids = []

batches = [
(start_id, min(start_id + config.BATCH_SIZE - 1, highest_id))
for start_id in range(config.ID_START, highest_id + 1, config.BATCH_SIZE)
]

for start_id, end_id in batches:
valid_ids = await check_project_ids(session, start_id, end_id)
all_valid_ids.extend(valid_ids)

logging.info(
f"Found {len(all_valid_ids)} valid IDs in range {config.ID_START}-{highest_id}"
)
save_to_json(all_valid_ids)


if __name__ == "__main__":
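
A final note on the batching in main() above: the (start_id, end_id) ranges are computed up front and clamped to highest_id, so a BATCH_SIZE larger than the ID range (the new default is 999999) collapses everything into a single batch. A quick illustration with smaller, purely hypothetical numbers:

# Hypothetical values, only to show the shape of the computed batches.
ID_START, BATCH_SIZE, highest_id = 1, 5000, 11500

batches = [
    (start_id, min(start_id + BATCH_SIZE - 1, highest_id))
    for start_id in range(ID_START, highest_id + 1, BATCH_SIZE)
]
print(batches)  # -> [(1, 5000), (5001, 10000), (10001, 11500)]
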

0 comments on commit 7d7aaa4
