From b6b1bdeb4d2c20bb43fba2a8b9ea7935a648a4d4 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 20:21:06 +0000 Subject: [PATCH] feat: --crawl for common crawl --- cdx_toolkit/__init__.py | 4 ++++ cdx_toolkit/commoncrawl.py | 2 ++ tests/test_cc_crawl.py | 12 ++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 tests/test_cc_crawl.py diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py index e413a71..c27845a 100644 --- a/cdx_toolkit/__init__.py +++ b/cdx_toolkit/__init__.py @@ -246,6 +246,8 @@ def get(self, url, **kwargs): validate_timestamps(params) params['url'] = url params['output'] = 'json' + if 'crawl' not in params: + params['crawl'] = self.crawl if 'filter' in params: if isinstance(params['filter'], str): params['filter'] = (params['filter'],) @@ -275,6 +277,8 @@ def iter(self, url, **kwargs): validate_timestamps(params) params['url'] = url params['output'] = 'json' + if 'crawl' not in params: + params['crawl'] = self.crawl if 'filter' in params: if isinstance(params['filter'], str): params['filter'] = (params['filter'],) diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index 8ef7b23..6834217 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -133,6 +133,8 @@ def match_cc_crawls(crawls, raw_index_list): # match crawls requested on the command line to actual crawls # note that from/to are not considered here # crawls should be normalized so it's supposed to be a list of str + if crawls is None: + return raw_index_list if len(crawls) == 1 and crawls[0].isdigit(): num = int(crawls[0]) raw_index_list = raw_index_list[-num:] diff --git a/tests/test_cc_crawl.py b/tests/test_cc_crawl.py new file mode 100644 index 0000000..3ec9c60 --- /dev/null +++ b/tests/test_cc_crawl.py @@ -0,0 +1,12 @@ +def test_cc_crawl(): + # 'crawl' can be specified for a CDXFetcher, and later changed in + # an iter() or get() call. 
+ + # init no crawl, iter/get no crawl + # init crawl, iter/get no crawl + # init no crawl, iter/get crawl + # init crawl, iter/get crawl + + # this is a lot of work to test 4 lines of code :/ + pass +