Skip to content

Commit

Permalink
🔥 removal of cc indexes
Browse files Browse the repository at this point in the history
  • Loading branch information
hynky1999 committed Oct 24, 2023
1 parent 2e9bd48 commit 826f886
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 13 deletions.
8 changes: 2 additions & 6 deletions cmoncrawl/aggregator/athena_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
crawl_url_to_name,
prepare_athena_sql_query,
)
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes
from cmoncrawl.common.loggers import all_purpose_logger

Expand Down Expand Up @@ -86,7 +87,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
Args:
domains (List[str]): A list of domains to search for.
cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
match_type (MatchType, optional): Match type for cdx-api. Defaults to MatchType.EXACT.
cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
since (datetime, optional): The start date for the search. Defaults to datetime.min.
Expand All @@ -112,7 +112,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]):
def __init__(
self,
domains: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType = MatchType.EXACT,
cc_servers: List[str] = [],
since: datetime = datetime.min,
Expand All @@ -129,7 +128,6 @@ def __init__(
table_name: str = "ccindex",
) -> None:
self.domains = domains
self.cc_indexes_server = cc_indexes_server
self.match_type = match_type
self.cc_servers = cc_servers
self.since = since
Expand Down Expand Up @@ -174,9 +172,7 @@ async def aopen(self) -> AthenaAggregator:
)
async with ClientSession() as client:
if len(self.cc_servers) == 0:
self.cc_servers = await get_all_CC_indexes(
client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(client, CC_INDEXES_SERVER)
# create bucket if not exists
async with self.aws_client.client("s3") as s3:
# Check if bucket exists
Expand Down
8 changes: 2 additions & 6 deletions cmoncrawl/aggregator/index_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import deque
from datetime import datetime
import re
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes, retrieve

from types import TracebackType
Expand Down Expand Up @@ -40,7 +41,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
Args:
domains (List[str]): A list of domains to search for.
cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json".
match_type (MatchType, optional): Match type for cdx-api. Defaults to None.
cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to [].
since (datetime, optional): The start date for the search. Defaults to datetime.min.
Expand All @@ -60,7 +60,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]):
def __init__(
self,
domains: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType | None = None,
cc_servers: List[str] = [],
since: datetime = datetime.min,
Expand All @@ -71,7 +70,6 @@ def __init__(
sleep_step: int = 20,
) -> None:
self.domains = domains
self.cc_indexes_server = cc_indexes_server
self.cc_servers = cc_servers
self.since = since
self.to = to
Expand All @@ -87,9 +85,7 @@ async def aopen(self) -> IndexAggregator:
await self.client.__aenter__()

if len(self.cc_servers) == 0:
self.cc_servers = await get_all_CC_indexes(
self.client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
return self

async def __aenter__(self) -> IndexAggregator:
Expand Down
1 change: 1 addition & 0 deletions cmoncrawl/aggregator/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# URL of the Common Crawl collection-info endpoint: returns a JSON list of all
# available crawl index servers. Aggregators fall back to fetching this list
# (via get_all_CC_indexes) when no explicit cc_servers list is supplied.
CC_INDEXES_SERVER = "https://index.commoncrawl.org/collinfo.json"
3 changes: 2 additions & 1 deletion tests/aggregator_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path

import boto3
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from tests.utils import MySQLRecordsDB
import aioboto3

Expand Down Expand Up @@ -72,7 +73,7 @@ async def test_indexer_num_pages(self):
self.assertEqual(size, 5)

async def test_indexer_all_CC(self):
indexes = await get_all_CC_indexes(self.client, self.di.cc_indexes_server)
indexes = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
indexes = sorted(indexes)
indexes = indexes[
: indexes.index("https://index.commoncrawl.org/CC-MAIN-2022-27-index") + 1
Expand Down

0 comments on commit 826f886

Please sign in to comment.