Configure browsertrix proxies (#1847)
Resolves #1354

Supports crawling through pre-configured proxy servers, allowing users to select which proxy server to use (requires Browsertrix Crawler 1.3+)

Config:
- proxies defined in btrix-proxies subchart
- proxies can be configured via the btrix-proxies key or a separate proxies.yaml file passed to the separate subchart (see the example sketch after this list)
- proxy list is refreshed automatically when crawler_proxies.json changes, if the subchart is deployed
- support for SSH and SOCKS5 proxies
- proxy keys are added to secrets in the subchart
- support for a default proxy that is always used when no other proxy is configured; the cluster will not start if the default proxy is unavailable
- prevent starting a manual crawl, returning an error, if a previously configured proxy is no longer available
- force the 'btrix' username and group name on the browsertrix-crawler non-root user to support SSH
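
The sketch below shows what a couple of proxy entries might look like once rendered into crawler_proxies.json, plus the org-level access rule. The field names mirror what get_crawler_proxies_map() and can_org_use_proxy() read in the diff below; the ids, URLs, and key material are illustrative assumptions, not real values.

```python
# Hypothetical proxy entries, in the same shape as the JSON list the backend
# reads from crawler_proxies.json. Field names match get_crawler_proxies_map();
# ids, URLs, and key material are made up for illustration.
example_proxy_list = [
    {
        "id": "us-east-ssh",
        "label": "US East (SSH)",
        "description": "Example SSH proxy",
        "country_code": "US",
        "url": "ssh://proxy-user@proxy.example.com:22",
        "ssh_private_key": "<private key material>",
        "ssh_host_public_key": "<host public key>",
        "shared": True,
    },
    {
        "id": "eu-socks5",
        "label": "EU (SOCKS5)",
        "country_code": "DE",
        "url": "socks5://user:pass@proxy.example.com:1080",
        "shared": False,
    },
]

# Org-level access rule, mirroring can_org_use_proxy(): a proxy is usable if it
# is shared and the org allows shared proxies, or if its id is explicitly
# listed in the org's allowedProxies.
def can_org_use(proxy: dict, allow_shared_proxies: bool, allowed_proxies: set) -> bool:
    shared = proxy.get("shared", False)
    return (shared and allow_shared_proxies) or proxy["id"] in allowed_proxies
```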

Operator:
- support crawling through proxies by passing proxyId in the CrawlJob
- support running profile browsers with a designated proxy by passing proxyId to the ProfileJob
- prevent scheduled crawls from starting if a previously configured proxy is no longer available (see the resolution sketch after this list)
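
A minimal sketch of that resolution logic, under assumed helper names: the job falls back to DEFAULT_PROXY_ID as in crawlmanager.py below, and refuses to start when the configured proxy is no longer available, as run_now_internal() does.

```python
# Sketch only: helper names are assumptions, the fallback and the
# "proxy_not_found" failure mirror create_crawl_job() / run_now_internal().
import os

DEFAULT_PROXY_ID = os.environ.get("DEFAULT_PROXY_ID", "")

def resolve_proxy_id(configured_proxy_id: str, available_ids: set) -> str:
    """Return the proxy id to pass into the CrawlJob/ProfileJob params."""
    proxy_id = configured_proxy_id or DEFAULT_PROXY_ID
    if proxy_id and proxy_id not in available_ids:
        # the backend raises HTTPException(404, "proxy_not_found") here
        raise ValueError("proxy_not_found")
    return proxy_id
```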

API / Access:
- /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only)
- /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to a particular org
- /api/orgs/{oid}/proxies - update allowed proxies for a particular org (superadmin only)
- superadmin can configure which orgs can use which proxies, stored on the org
- superadmin can also allow an org to access all 'shared' proxies, avoiding the need to allow a shared proxy on each org (see the request sketch after this list)
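
A rough usage sketch of these endpoints, assuming a deployment at a hypothetical base URL and a bearer token with the appropriate role. The payload field names for the update call are taken from the org model's allowSharedProxies/allowedProxies in this change, but the HTTP method and exact payload shape are assumptions, not confirmed API details.

```python
# Hedged sketch: BASE_URL, the token, org id, and proxy ids are placeholders.
import requests

BASE_URL = "https://app.example.com/api"
HEADERS = {"Authorization": "Bearer <jwt>"}
OID = "<org-uuid>"

# Superadmin: list every configured proxy
all_proxies = requests.get(
    f"{BASE_URL}/orgs/all/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# Org member with crawl access: list proxies available to this org
org_proxies = requests.get(
    f"{BASE_URL}/orgs/{OID}/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# Superadmin: update which proxies the org may use (method and payload assumed)
resp = requests.post(
    f"{BASE_URL}/orgs/{OID}/proxies",
    headers=HEADERS,
    json={"allowSharedProxies": True, "allowedProxies": ["us-east-ssh"]},
)
```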

UI:
- Superadmin has an 'Edit Proxies' dialog to configure, for each org, whether it has dedicated proxies and whether it has access to shared proxies
- Users can select a proxy in Crawl Workflow browser settings
- Users can choose to launch a browser profile with a particular proxy
- The profile selector displays which proxy was used to create each profile
- Users can choose which default proxy to use for new workflows in Crawling Defaults

---------
Co-authored-by: Ilya Kreymer <[email protected]>
Co-authored-by: Tessa Walsh <[email protected]>
3 people authored Oct 3, 2024
1 parent 08aa2f8 commit bb6e703
Showing 60 changed files with 1,164 additions and 60 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/publish-helm-chart.yaml
@@ -23,6 +23,7 @@ jobs:
run: |
mkdir .chart-out
helm package chart/ --destination .chart-out
helm package chart/proxies/ --destination .chart-out
- name: Get Version
run: |
@@ -49,7 +50,9 @@
See [the development guide](https://docs.browsertrix.com/deploy/) for more info how to deploy Browsertrix.
files: .chart-out/browsertrix-v${{ env.version }}.tgz
files: |
.chart-out/browsertrix-v${{ env.version }}.tgz
.chart-out/btrix-proxies-0.1.0.tgz
tag_name: v${{ env.version }}
draft: true
fail_on_unmatched_files: true
114 changes: 113 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -10,6 +10,7 @@
import json
import re
import os
import traceback
from datetime import datetime
from uuid import UUID, uuid4
import urllib.parse
@@ -39,6 +40,8 @@
CrawlConfigSearchValues,
CrawlConfigUpdateResponse,
CrawlConfigDeletedResponse,
CrawlerProxy,
CrawlerProxies,
)
from .utils import dt_now, slug_from_name

@@ -63,6 +66,8 @@
"name",
)

DEFAULT_PROXY_ID: str | None = os.environ.get("DEFAULT_PROXY_ID")


# ============================================================================
class CrawlConfigOps:
@@ -125,6 +130,14 @@ def __init__(
if "default" not in self.crawler_images_map:
raise TypeError("The channel list must include a 'default' channel")

self._crawler_proxies_last_updated = None
self._crawler_proxies_map = None

if DEFAULT_PROXY_ID and DEFAULT_PROXY_ID not in self.get_crawler_proxies_map():
raise ValueError(
f"Configured proxies must include DEFAULT_PROXY_ID: {DEFAULT_PROXY_ID}"
)

def set_crawl_ops(self, ops):
"""set crawl ops reference"""
self.crawl_ops = ops
@@ -168,7 +181,9 @@ async def get_profile_filename(
if not profileid:
return ""

profile_filename = await self.profiles.get_profile_storage_path(profileid, org)
profile_filename, _ = await self.profiles.get_profile_storage_path_and_proxy(
profileid, org
)
if not profile_filename:
raise HTTPException(status_code=400, detail="invalid_profile_id")

@@ -195,6 +210,11 @@ async def add_crawl_config(
if profileid:
await self.profiles.get_profile(profileid, org)

# ensure proxyId is valid and available for org
if config_in.proxyId:
if not self.can_org_use_proxy(org, config_in.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

now = dt_now()
crawlconfig = CrawlConfig(
id=uuid4(),
@@ -218,6 +238,7 @@
profileid=profileid,
crawlerChannel=config_in.crawlerChannel,
crawlFilenameTemplate=config_in.crawlFilenameTemplate,
proxyId=config_in.proxyId,
)

if config_in.runNow:
@@ -331,6 +352,8 @@ async def update_crawl_config(
and ((not update.profileid) != (not orig_crawl_config.profileid))
)

changed = changed or (orig_crawl_config.proxyId != update.proxyId)

metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
metadata_changed = metadata_changed or self.check_attr_changed(
orig_crawl_config, update, "description"
@@ -829,6 +852,9 @@ async def run_now_internal(
if await self.get_running_crawl(crawlconfig.id):
raise HTTPException(status_code=400, detail="crawl_already_running")

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -848,6 +874,7 @@

except Exception as exc:
# pylint: disable=raise-missing-from
print(traceback.format_exc())
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

async def set_config_current_crawl_info(
@@ -897,6 +924,68 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")

def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
"""Load CrawlerProxy mapping from config"""
proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]

if not os.path.isfile(proxies_last_update_path):
return {}

# return cached data, when last_update timestamp hasn't changed
if self._crawler_proxies_last_updated and self._crawler_proxies_map:
with open(proxies_last_update_path, encoding="utf-8") as fh:
proxies_last_update = int(fh.read().strip())
if proxies_last_update == self._crawler_proxies_last_updated:
return self._crawler_proxies_map
self._crawler_proxies_last_updated = proxies_last_update

crawler_proxies_map: dict[str, CrawlerProxy] = {}
with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh:
proxy_list = json.loads(fh.read())
for proxy_data in proxy_list:
proxy = CrawlerProxy(
id=proxy_data["id"],
label=proxy_data["label"],
description=proxy_data.get("description", ""),
country_code=proxy_data.get("country_code", ""),
url=proxy_data["url"],
has_host_public_key=bool(proxy_data.get("ssh_host_public_key")),
has_private_key=bool(proxy_data.get("ssh_private_key")),
shared=proxy_data.get("shared", False)
or proxy_data["id"] == DEFAULT_PROXY_ID,
)

crawler_proxies_map[proxy.id] = proxy

self._crawler_proxies_map = crawler_proxies_map
return self._crawler_proxies_map

def get_crawler_proxies(self):
"""Get CrawlerProxy configuration"""
return CrawlerProxies(
default_proxy_id=DEFAULT_PROXY_ID,
servers=list(self.get_crawler_proxies_map().values()),
)

def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]:
"""Get crawlerProxy by id"""
return self.get_crawler_proxies_map().get(proxy_id)

def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
"""Checks if org is able to use proxy"""

if isinstance(proxy, str):
_proxy = self.get_crawler_proxy(proxy)
else:
_proxy = proxy

if _proxy is None:
return False

return (
_proxy.shared and org.allowSharedProxies
) or _proxy.id in org.allowedProxies

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
@@ -983,6 +1072,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_crawl_config_api(
app,
dbclient,
mdb,
user_dep,
@@ -1060,6 +1150,28 @@ async def get_crawler_channels(
):
return ops.crawler_channels

@router.get("/crawler-proxies", response_model=CrawlerProxies)
async def get_crawler_proxies(
org: Organization = Depends(org_crawl_dep),
):
return CrawlerProxies(
default_proxy_id=DEFAULT_PROXY_ID,
servers=[
proxy
for proxy in ops.get_crawler_proxies_map().values()
if ops.can_org_use_proxy(org, proxy)
],
)

@app.get("/orgs/all/crawlconfigs/crawler-proxies", response_model=CrawlerProxies)
async def get_all_crawler_proxies(
user: User = Depends(user_dep),
):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")

return ops.get_crawler_proxies()

@router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
async def get_crawl_config_seeds(
cid: UUID,
13 changes: 7 additions & 6 deletions backend/btrixcloud/crawlmanager.py
@@ -1,7 +1,6 @@
""" shared crawl manager implementation """

import os
import asyncio
import secrets

from typing import Optional, Dict
@@ -16,13 +15,12 @@


# ============================================================================
class CrawlManager(K8sAPI):
"""abstract crawl manager"""
DEFAULT_PROXY_ID: str = os.environ.get("DEFAULT_PROXY_ID", "")

def __init__(self):
super().__init__()

self.loop = asyncio.get_running_loop()
# ============================================================================
class CrawlManager(K8sAPI):
"""abstract crawl manager"""

# pylint: disable=too-many-arguments
async def run_profile_browser(
@@ -34,6 +32,7 @@ async def run_profile_browser(
crawler_image: str,
baseprofile: str = "",
profile_filename: str = "",
proxy_id: str = "",
) -> str:
"""run browser for profile creation"""

@@ -55,6 +54,7 @@ async def run_profile_browser(
"vnc_password": secrets.token_hex(16),
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
"crawler_image": crawler_image,
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
}

data = self.templates.env.get_template("profile_job.yaml").render(params)
@@ -138,6 +138,7 @@ async def create_crawl_job(
warc_prefix=warc_prefix,
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
)

async def create_qa_crawl_job(
1 change: 1 addition & 0 deletions backend/btrixcloud/crawls.py
@@ -379,6 +379,7 @@ async def add_new_crawl(
tags=crawlconfig.tags,
name=crawlconfig.name,
crawlerChannel=crawlconfig.crawlerChannel,
proxyId=crawlconfig.proxyId,
image=image,
)

6 changes: 5 additions & 1 deletion backend/btrixcloud/k8sapi.py
@@ -2,8 +2,8 @@

import os
import traceback

from typing import Optional

import yaml

from kubernetes_asyncio import client, config
@@ -93,6 +93,7 @@ def new_crawl_job_yaml(
storage_filename: str = "",
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
):
"""load job template from yaml"""
if not crawl_id:
@@ -115,6 +116,7 @@
"storage_filename": storage_filename,
"profile_filename": profile_filename,
"qa_source": qa_source,
"proxy_id": proxy_id,
}

data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -136,6 +138,7 @@
storage_filename: str = "",
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
) -> str:
"""load and init crawl job via k8s api"""
crawl_id, data = self.new_crawl_job_yaml(
@@ -153,6 +156,7 @@
storage_filename=storage_filename,
profile_filename=profile_filename,
qa_source=qa_source,
proxy_id=proxy_id,
)

# create job directly
1 change: 1 addition & 0 deletions backend/btrixcloud/main.py
@@ -205,6 +205,7 @@ def main() -> None:
)

crawl_config_ops = init_crawl_config_api(
app,
dbclient,
mdb,
current_active_user,
Expand Down