Skip to content

Commit

Permalink
🔥 removal of cc indexes
Browse files Browse the repository at this point in the history
  • Loading branch information
hynky1999 committed Oct 24, 2023
1 parent 2e9bd48 commit 826f886
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 13 deletions.
8 changes: 2 additions & 6 deletions cmoncrawl/aggregator/athena_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
crawl_url_to_name,
prepare_athena_sql_query,
)
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes
from cmoncrawl.common.loggers import all_purpose_logger

Expand Down Expand Up @@ -86,7 +87,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
Args:
domains (List[str]): A list of domains to search for.
cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
match_type (MatchType, optional): Match type for cdx-api. Defaults to MatchType.EXACT.
cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
since (datetime, optional): The start date for the search. Defaults to datetime.min.
Expand All @@ -112,7 +112,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
def __init__(
self,
domains: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType = MatchType.EXACT,
cc_servers: List[str] = [],
since: datetime = datetime.min,
Expand All @@ -129,7 +128,6 @@ def __init__(
table_name: str = "ccindex",
) -> None:
self.domains = domains
self.cc_indexes_server = cc_indexes_server
self.match_type = match_type
self.cc_servers = cc_servers
self.since = since
Expand Down Expand Up @@ -174,9 +172,7 @@ async def aopen(self) -> AthenaAggregator:
)
async with ClientSession() as client:
if len(self.cc_servers) == 0:
self.cc_servers = await get_all_CC_indexes(
client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(client, CC_INDEXES_SERVER)
# create bucket if not exists
async with self.aws_client.client("s3") as s3:
# Check if bucket exists
Expand Down
8 changes: 2 additions & 6 deletions cmoncrawl/aggregator/index_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import deque
from datetime import datetime
import re
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes, retrieve

from types import TracebackType
Expand Down Expand Up @@ -40,7 +41,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
Args:
domains (List[str]): A list of domains to search for.
cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
match_type (MatchType, optional): Match type for cdx-api. Defaults to None.
cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
since (datetime, optional): The start date for the search. Defaults to datetime.min.
Expand All @@ -60,7 +60,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
def __init__(
self,
domains: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType | None = None,
cc_servers: List[str] = [],
since: datetime = datetime.min,
Expand All @@ -71,7 +70,6 @@ def __init__(
sleep_step: int = 20,
) -> None:
self.domains = domains
self.cc_indexes_server = cc_indexes_server
self.cc_servers = cc_servers
self.since = since
self.to = to
Expand All @@ -87,9 +85,7 @@ async def aopen(self) -> IndexAggregator:
await self.client.__aenter__()

if len(self.cc_servers) == 0:
self.cc_servers = await get_all_CC_indexes(
self.client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
return self

async def __aenter__(self) -> IndexAggregator:
Expand Down
1 change: 1 addition & 0 deletions cmoncrawl/aggregator/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# URL of the Common Crawl collection-info endpoint: returns a JSON list of all
# available crawl index servers. Aggregators fall back to fetching this list
# (via get_all_CC_indexes) when no explicit cc_servers list is supplied.
CC_INDEXES_SERVER = "https://index.commoncrawl.org/collinfo.json"
3 changes: 2 additions & 1 deletion tests/aggregator_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path

import boto3
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from tests.utils import MySQLRecordsDB
import aioboto3

Expand Down Expand Up @@ -72,7 +73,7 @@ async def test_indexer_num_pages(self):
self.assertEqual(size, 5)

async def test_indexer_all_CC(self):
indexes = await get_all_CC_indexes(self.client, self.di.cc_indexes_server)
indexes = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
indexes = sorted(indexes)
indexes = indexes[
: indexes.index("https://index.commoncrawl.org/CC-MAIN-2022-27-index") + 1
Expand Down

0 comments on commit 826f886

Please sign in to comment.