diff --git a/cmoncrawl/aggregator/athena_query.py b/cmoncrawl/aggregator/athena_query.py
index 8d984dba..d7fadc99 100644
--- a/cmoncrawl/aggregator/athena_query.py
+++ b/cmoncrawl/aggregator/athena_query.py
@@ -14,6 +14,7 @@
     crawl_url_to_name,
     prepare_athena_sql_query,
 )
+from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
 from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes
 from cmoncrawl.common.loggers import all_purpose_logger
 
@@ -86,7 +87,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
 
     Args:
         domains (List[str]): A list of domains to search for.
-        cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
         match_type (MatchType, optional): Match type for cdx-api. Defaults to MatchType.EXACT.
         cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
         since (datetime, optional): The start date for the search. Defaults to datetime.min.
@@ -112,7 +112,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
     def __init__(
         self,
         domains: List[str],
-        cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
         match_type: MatchType = MatchType.EXACT,
         cc_servers: List[str] = [],
         since: datetime = datetime.min,
@@ -129,7 +128,6 @@ def __init__(
         table_name: str = "ccindex",
     ) -> None:
         self.domains = domains
-        self.cc_indexes_server = cc_indexes_server
         self.match_type = match_type
         self.cc_servers = cc_servers
         self.since = since
@@ -174,9 +172,7 @@ async def aopen(self) -> AthenaAggregator:
         )
         async with ClientSession() as client:
             if len(self.cc_servers) == 0:
-                self.cc_servers = await get_all_CC_indexes(
-                    client, self.cc_indexes_server
-                )
+                self.cc_servers = await get_all_CC_indexes(client, CC_INDEXES_SERVER)
         # create bucket if not exists
         async with self.aws_client.client("s3") as s3:
             # Check if bucket exists
diff --git a/cmoncrawl/aggregator/index_query.py b/cmoncrawl/aggregator/index_query.py
index fbca1fd9..75b87126 100644
--- a/cmoncrawl/aggregator/index_query.py
+++ b/cmoncrawl/aggregator/index_query.py
@@ -2,6 +2,7 @@
 from collections import deque
 from datetime import datetime
 import re
+from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
 from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes, retrieve
 from types import TracebackType
 
@@ -40,7 +41,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
 
     Args:
         domains (List[str]): A list of domains to search for.
-        cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
         match_type (MatchType, optional): Match type for cdx-api. Defaults to None.
         cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
         since (datetime, optional): The start date for the search. Defaults to datetime.min.
@@ -60,7 +60,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
     def __init__(
         self,
         domains: List[str],
-        cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
         match_type: MatchType | None = None,
         cc_servers: List[str] = [],
         since: datetime = datetime.min,
@@ -71,7 +70,6 @@ def __init__(
         sleep_step: int = 20,
     ) -> None:
         self.domains = domains
-        self.cc_indexes_server = cc_indexes_server
         self.cc_servers = cc_servers
         self.since = since
         self.to = to
@@ -87,9 +85,7 @@ async def aopen(self) -> IndexAggregator:
         await self.client.__aenter__()
 
         if len(self.cc_servers) == 0:
-            self.cc_servers = await get_all_CC_indexes(
-                self.client, self.cc_indexes_server
-            )
+            self.cc_servers = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
         return self
 
     async def __aenter__(self) -> IndexAggregator:
diff --git a/cmoncrawl/aggregator/utils/constants.py b/cmoncrawl/aggregator/utils/constants.py
new file mode 100644
index 00000000..cb92395e
--- /dev/null
+++ b/cmoncrawl/aggregator/utils/constants.py
@@ -0,0 +1 @@
+CC_INDEXES_SERVER = "https://index.commoncrawl.org/collinfo.json"
diff --git a/tests/aggregator_tests.py b/tests/aggregator_tests.py
index 72075f49..ecf0fdd7 100644
--- a/tests/aggregator_tests.py
+++ b/tests/aggregator_tests.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 import boto3
+from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
 from tests.utils import MySQLRecordsDB
 
 import aioboto3
@@ -72,7 +73,7 @@ async def test_indexer_num_pages(self):
         self.assertEqual(size, 5)
 
     async def test_indexer_all_CC(self):
-        indexes = await get_all_CC_indexes(self.client, self.di.cc_indexes_server)
+        indexes = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
         indexes = sorted(indexes)
         indexes = indexes[
             : indexes.index("https://index.commoncrawl.org/CC-MAIN-2022-27-index") + 1