Configure browsertrix proxies (#1847)
Resolves #1354

Supports crawling through pre-configured proxy servers, allowing users to select which proxy server to use (requires Browsertrix Crawler 1.3+)

Config:
- proxies defined in btrix-proxies subchart
- proxies can be configured via the btrix-proxies key or a separate proxies.yaml file passed to the separate subchart (see the example sketch after this list)
- proxy list is refreshed automatically when crawler_proxies.json changes, if the subchart is deployed
- support for SSH and SOCKS5 proxies
- proxy keys are added to secrets in the subchart
- support for a default proxy that is always used when no other proxy is configured; the cluster will not start if the default proxy is unavailable
- prevent starting a manual crawl, returning an error, if a previously configured proxy is no longer available
- force the 'btrix' username and group name on the browsertrix-crawler non-root user to support SSH
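
The sketch below shows what a couple of proxy entries might look like once rendered into crawler_proxies.json, plus the org-level access rule. The field names mirror what get_crawler_proxies_map() and can_org_use_proxy() read in the diff below; the ids, URLs, and key material are illustrative assumptions, not real values.

```python
# Hypothetical proxy entries, in the same shape as the JSON list the backend
# reads from crawler_proxies.json. Field names match get_crawler_proxies_map();
# ids, URLs, and key material are made up for illustration.
example_proxy_list = [
    {
        "id": "us-east-ssh",
        "label": "US East (SSH)",
        "description": "Example SSH proxy",
        "country_code": "US",
        "url": "ssh://proxy-user@proxy.example.com:22",
        "ssh_private_key": "<private key material>",
        "ssh_host_public_key": "<host public key>",
        "shared": True,
    },
    {
        "id": "eu-socks5",
        "label": "EU (SOCKS5)",
        "country_code": "DE",
        "url": "socks5://user:pass@proxy.example.com:1080",
        "shared": False,
    },
]

# Org-level access rule, mirroring can_org_use_proxy(): a proxy is usable if it
# is shared and the org allows shared proxies, or if its id is explicitly
# listed in the org's allowedProxies.
def can_org_use(proxy: dict, allow_shared_proxies: bool, allowed_proxies: set) -> bool:
    shared = proxy.get("shared", False)
    return (shared and allow_shared_proxies) or proxy["id"] in allowed_proxies
```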

Operator:
- support crawling through proxies by passing proxyId in the CrawlJob
- support running profile browsers with a designated proxy by passing proxyId to the ProfileJob
- prevent scheduled crawls from starting if a previously configured proxy is no longer available (see the resolution sketch after this list)
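
A minimal sketch of that resolution logic, under assumed helper names: the job falls back to DEFAULT_PROXY_ID as in crawlmanager.py below, and refuses to start when the configured proxy is no longer available, as run_now_internal() does.

```python
# Sketch only: helper names are assumptions, the fallback and the
# "proxy_not_found" failure mirror create_crawl_job() / run_now_internal().
import os

DEFAULT_PROXY_ID = os.environ.get("DEFAULT_PROXY_ID", "")

def resolve_proxy_id(configured_proxy_id: str, available_ids: set) -> str:
    """Return the proxy id to pass into the CrawlJob/ProfileJob params."""
    proxy_id = configured_proxy_id or DEFAULT_PROXY_ID
    if proxy_id and proxy_id not in available_ids:
        # the backend raises HTTPException(404, "proxy_not_found") here
        raise ValueError("proxy_not_found")
    return proxy_id
```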

API / Access:
- /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only)
- /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to a particular org
- /api/orgs/{oid}/proxies - update allowed proxies for a particular org (superadmin only)
- superadmin can configure which orgs can use which proxies, stored on the org
- superadmin can also allow an org to access all 'shared' proxies, avoiding the need to allow a shared proxy on each org (see the request sketch after this list)
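
A rough usage sketch of these endpoints, assuming a deployment at a hypothetical base URL and a bearer token with the appropriate role. The payload field names for the update call are taken from the org model's allowSharedProxies/allowedProxies in this change, but the HTTP method and exact payload shape are assumptions, not confirmed API details.

```python
# Hedged sketch: BASE_URL, the token, org id, and proxy ids are placeholders.
import requests

BASE_URL = "https://app.example.com/api"
HEADERS = {"Authorization": "Bearer <jwt>"}
OID = "<org-uuid>"

# Superadmin: list every configured proxy
all_proxies = requests.get(
    f"{BASE_URL}/orgs/all/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# Org member with crawl access: list proxies available to this org
org_proxies = requests.get(
    f"{BASE_URL}/orgs/{OID}/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# Superadmin: update which proxies the org may use (method and payload assumed)
resp = requests.post(
    f"{BASE_URL}/orgs/{OID}/proxies",
    headers=HEADERS,
    json={"allowSharedProxies": True, "allowedProxies": ["us-east-ssh"]},
)
```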

UI:
- Superadmin has an 'Edit Proxies' dialog to configure, for each org, whether it has dedicated proxies and whether it has access to shared proxies
- Users can select a proxy in Crawl Workflow browser settings
- Users can choose to launch a browser profile with a particular proxy
- The profile selector displays which proxy was used to create each profile
- Users can choose which default proxy to use for new workflows in Crawling Defaults

---------
Co-authored-by: Ilya Kreymer <[email protected]>
Co-authored-by: Tessa Walsh <[email protected]>
3 people authored Oct 3, 2024
1 parent 08aa2f8 commit bb6e703
Showing 60 changed files with 1,164 additions and 60 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/publish-helm-chart.yaml
@@ -23,6 +23,7 @@ jobs:
run: |
mkdir .chart-out
helm package chart/ --destination .chart-out
helm package chart/proxies/ --destination .chart-out
- name: Get Version
run: |
@@ -49,7 +50,9 @@
See [the development guide](https://docs.browsertrix.com/deploy/) for more info how to deploy Browsertrix.
files: .chart-out/browsertrix-v${{ env.version }}.tgz
files: |
.chart-out/browsertrix-v${{ env.version }}.tgz
.chart-out/btrix-proxies-0.1.0.tgz
tag_name: v${{ env.version }}
draft: true
fail_on_unmatched_files: true
114 changes: 113 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -10,6 +10,7 @@
import json
import re
import os
import traceback
from datetime import datetime
from uuid import UUID, uuid4
import urllib.parse
@@ -39,6 +40,8 @@
CrawlConfigSearchValues,
CrawlConfigUpdateResponse,
CrawlConfigDeletedResponse,
CrawlerProxy,
CrawlerProxies,
)
from .utils import dt_now, slug_from_name

@@ -63,6 +66,8 @@
"name",
)

DEFAULT_PROXY_ID: str | None = os.environ.get("DEFAULT_PROXY_ID")


# ============================================================================
class CrawlConfigOps:
@@ -125,6 +130,14 @@ def __init__(
if "default" not in self.crawler_images_map:
raise TypeError("The channel list must include a 'default' channel")

self._crawler_proxies_last_updated = None
self._crawler_proxies_map = None

if DEFAULT_PROXY_ID and DEFAULT_PROXY_ID not in self.get_crawler_proxies_map():
raise ValueError(
f"Configured proxies must include DEFAULT_PROXY_ID: {DEFAULT_PROXY_ID}"
)

def set_crawl_ops(self, ops):
"""set crawl ops reference"""
self.crawl_ops = ops
@@ -168,7 +181,9 @@ async def get_profile_filename(
if not profileid:
return ""

profile_filename = await self.profiles.get_profile_storage_path(profileid, org)
profile_filename, _ = await self.profiles.get_profile_storage_path_and_proxy(
profileid, org
)
if not profile_filename:
raise HTTPException(status_code=400, detail="invalid_profile_id")

@@ -195,6 +210,11 @@ async def add_crawl_config(
if profileid:
await self.profiles.get_profile(profileid, org)

# ensure proxyId is valid and available for org
if config_in.proxyId:
if not self.can_org_use_proxy(org, config_in.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

now = dt_now()
crawlconfig = CrawlConfig(
id=uuid4(),
@@ -218,6 +238,7 @@
profileid=profileid,
crawlerChannel=config_in.crawlerChannel,
crawlFilenameTemplate=config_in.crawlFilenameTemplate,
proxyId=config_in.proxyId,
)

if config_in.runNow:
@@ -331,6 +352,8 @@ async def update_crawl_config(
and ((not update.profileid) != (not orig_crawl_config.profileid))
)

changed = changed or (orig_crawl_config.proxyId != update.proxyId)

metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
metadata_changed = metadata_changed or self.check_attr_changed(
orig_crawl_config, update, "description"
@@ -829,6 +852,9 @@ async def run_now_internal(
if await self.get_running_crawl(crawlconfig.id):
raise HTTPException(status_code=400, detail="crawl_already_running")

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -848,6 +874,7 @@

except Exception as exc:
# pylint: disable=raise-missing-from
print(traceback.format_exc())
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

async def set_config_current_crawl_info(
@@ -897,6 +924,68 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")

def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
"""Load CrawlerProxy mapping from config"""
proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]

if not os.path.isfile(proxies_last_update_path):
return {}

# return cached data, when last_update timestamp hasn't changed
if self._crawler_proxies_last_updated and self._crawler_proxies_map:
with open(proxies_last_update_path, encoding="utf-8") as fh:
proxies_last_update = int(fh.read().strip())
if proxies_last_update == self._crawler_proxies_last_updated:
return self._crawler_proxies_map
self._crawler_proxies_last_updated = proxies_last_update

crawler_proxies_map: dict[str, CrawlerProxy] = {}
with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh:
proxy_list = json.loads(fh.read())
for proxy_data in proxy_list:
proxy = CrawlerProxy(
id=proxy_data["id"],
label=proxy_data["label"],
description=proxy_data.get("description", ""),
country_code=proxy_data.get("country_code", ""),
url=proxy_data["url"],
has_host_public_key=bool(proxy_data.get("ssh_host_public_key")),
has_private_key=bool(proxy_data.get("ssh_private_key")),
shared=proxy_data.get("shared", False)
or proxy_data["id"] == DEFAULT_PROXY_ID,
)

crawler_proxies_map[proxy.id] = proxy

self._crawler_proxies_map = crawler_proxies_map
return self._crawler_proxies_map

def get_crawler_proxies(self):
"""Get CrawlerProxy configuration"""
return CrawlerProxies(
default_proxy_id=DEFAULT_PROXY_ID,
servers=list(self.get_crawler_proxies_map().values()),
)

def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]:
"""Get crawlerProxy by id"""
return self.get_crawler_proxies_map().get(proxy_id)

def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
"""Checks if org is able to use proxy"""

if isinstance(proxy, str):
_proxy = self.get_crawler_proxy(proxy)
else:
_proxy = proxy

if _proxy is None:
return False

return (
_proxy.shared and org.allowSharedProxies
) or _proxy.id in org.allowedProxies

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
@@ -983,6 +1072,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_crawl_config_api(
app,
dbclient,
mdb,
user_dep,
@@ -1060,6 +1150,28 @@ async def get_crawler_channels(
):
return ops.crawler_channels

@router.get("/crawler-proxies", response_model=CrawlerProxies)
async def get_crawler_proxies(
org: Organization = Depends(org_crawl_dep),
):
return CrawlerProxies(
default_proxy_id=DEFAULT_PROXY_ID,
servers=[
proxy
for proxy in ops.get_crawler_proxies_map().values()
if ops.can_org_use_proxy(org, proxy)
],
)

@app.get("/orgs/all/crawlconfigs/crawler-proxies", response_model=CrawlerProxies)
async def get_all_crawler_proxies(
user: User = Depends(user_dep),
):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")

return ops.get_crawler_proxies()

@router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
async def get_crawl_config_seeds(
cid: UUID,
13 changes: 7 additions & 6 deletions backend/btrixcloud/crawlmanager.py
@@ -1,7 +1,6 @@
""" shared crawl manager implementation """

import os
import asyncio
import secrets

from typing import Optional, Dict
@@ -16,13 +15,12 @@


# ============================================================================
class CrawlManager(K8sAPI):
"""abstract crawl manager"""
DEFAULT_PROXY_ID: str = os.environ.get("DEFAULT_PROXY_ID", "")

def __init__(self):
super().__init__()

self.loop = asyncio.get_running_loop()
# ============================================================================
class CrawlManager(K8sAPI):
"""abstract crawl manager"""

# pylint: disable=too-many-arguments
async def run_profile_browser(
@@ -34,6 +32,7 @@ async def run_profile_browser(
crawler_image: str,
baseprofile: str = "",
profile_filename: str = "",
proxy_id: str = "",
) -> str:
"""run browser for profile creation"""

@@ -55,6 +54,7 @@ async def run_profile_browser(
"vnc_password": secrets.token_hex(16),
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
"crawler_image": crawler_image,
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
}

data = self.templates.env.get_template("profile_job.yaml").render(params)
@@ -138,6 +138,7 @@ async def create_crawl_job(
warc_prefix=warc_prefix,
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
)

async def create_qa_crawl_job(
1 change: 1 addition & 0 deletions backend/btrixcloud/crawls.py
@@ -379,6 +379,7 @@ async def add_new_crawl(
tags=crawlconfig.tags,
name=crawlconfig.name,
crawlerChannel=crawlconfig.crawlerChannel,
proxyId=crawlconfig.proxyId,
image=image,
)

6 changes: 5 additions & 1 deletion backend/btrixcloud/k8sapi.py
@@ -2,8 +2,8 @@

import os
import traceback

from typing import Optional

import yaml

from kubernetes_asyncio import client, config
@@ -93,6 +93,7 @@ def new_crawl_job_yaml(
storage_filename: str = "",
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
):
"""load job template from yaml"""
if not crawl_id:
@@ -115,6 +116,7 @@
"storage_filename": storage_filename,
"profile_filename": profile_filename,
"qa_source": qa_source,
"proxy_id": proxy_id,
}

data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -136,6 +138,7 @@
storage_filename: str = "",
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
) -> str:
"""load and init crawl job via k8s api"""
crawl_id, data = self.new_crawl_job_yaml(
@@ -153,6 +156,7 @@
storage_filename=storage_filename,
profile_filename=profile_filename,
qa_source=qa_source,
proxy_id=proxy_id,
)

# create job directly
1 change: 1 addition & 0 deletions backend/btrixcloud/main.py
@@ -205,6 +205,7 @@ def main() -> None:
)

crawl_config_ops = init_crawl_config_api(
app,
dbclient,
mdb,
current_active_user,
Expand Down