Skip to content

Commit

Permalink
feat: --crawl for common crawl
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Lindahl committed Sep 8, 2024
1 parent dbdb739 commit b6b1bde
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cdx_toolkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ def get(self, url, **kwargs):
validate_timestamps(params)
params['url'] = url
params['output'] = 'json'
if 'crawl' not in params:
params['crawl'] = self.crawl
if 'filter' in params:
if isinstance(params['filter'], str):
params['filter'] = (params['filter'],)
Expand Down Expand Up @@ -275,6 +277,8 @@ def iter(self, url, **kwargs):
validate_timestamps(params)
params['url'] = url
params['output'] = 'json'
if 'crawl' not in params:
params['crawl'] = self.crawl
if 'filter' in params:
if isinstance(params['filter'], str):
params['filter'] = (params['filter'],)
Expand Down
2 changes: 2 additions & 0 deletions cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ def match_cc_crawls(crawls, raw_index_list):
# match crawls requested on the command line to actual crawls
# note that from/to are not considered here
# crawls should be normalized so it's supposed to be a list of str
if crawls is None:
return raw_index_list
if len(crawls) == 1 and crawls[0].isdigit():
num = int(crawls[0])
raw_index_list = raw_index_list[-num:]
Expand Down
12 changes: 12 additions & 0 deletions tests/test_cc_crawl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def test_cc_crawl():
    """Placeholder test for how the 'crawl' setting is resolved.

    'crawl' can be specified for a CDXFetcher, and later changed in an
    iter() or get() call. Nothing is asserted yet; the cases that still
    need coverage are listed below.
    """
    # TODO: exercise all four combinations:
    #   1. init without crawl, iter/get without crawl
    #   2. init with crawl,    iter/get without crawl
    #   3. init without crawl, iter/get with crawl
    #   4. init with crawl,    iter/get with crawl

    # this is a lot of work to test 4 lines of code :/
    return None

0 comments on commit b6b1bde

Please sign in to comment.