From e5d122a98b00885c65e737cd540389a6f6d957ef Mon Sep 17 00:00:00 2001
From: Greg Lindahl
Date: Mon, 9 Sep 2024 02:49:00 +0000
Subject: [PATCH] feat: add --crawl (#39)

---
 .github/workflows/ci.yaml  |   2 +-
 CHANGELOG.md               |   5 +-
 Makefile                   |   7 +-
 README.md                  |  97 ++++++++++++++++++++------
 cdx_toolkit/__init__.py    |  19 ++++--
 cdx_toolkit/cli.py         |   8 ++-
 cdx_toolkit/commoncrawl.py | 136 +++++++++++++++++++++++++++----------
 tests/test_cc_crawl.py     |  12 ++++
 tests/test_cli.py          |  19 +++++-
 tests/unit/test_cc.py      |  49 +++++++++++--
 10 files changed, 276 insertions(+), 78 deletions(-)
 create mode 100644 tests/test_cc_crawl.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4875825..c43c4ca 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
-      #max-parallel: 1
+      max-parallel: 1  # avoids ever triggering a rate limit
       matrix:
         python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
         os: [ubuntu-latest]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7ba2a3..ff27f91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,10 @@
+- 0.9.37
+  + --crawl for CCF
+
 - 0.9.36
   + ratelimit code; both IA and CCF are rate limiting their cdx endpoints
   + cache collinfo.json in ~/.cache/cdx_toolkit/
-  + py3.11 and py3.12 pass testing
+  + py3.11 and py3.12 pass testing; windows and macos pass testing
 
 - 0.9.35
   + exponential backoff retries now that IA is sending 429
diff --git a/Makefile b/Makefile
index 99cfc19..c6e3452 100644
--- a/Makefile
+++ b/Makefile
@@ -33,14 +33,13 @@ distcheck: distclean
 	twine check dist/*
 
 dist: distclean
-	echo " Finishe CHANGELOG and commit it.
+	echo " Finish CHANGELOG.md and commit it."
 	echo " git tag --list"
-	echo " git tag v0.x.x"
+	echo " git tag 0.x.x  # no v"
 	echo " git push --tags"
 	python ./setup.py sdist
 	twine check dist/*
 	twine upload dist/* -r pypi
 
 install:
-	python ./setup.py install
-
+	pip install .
diff --git a/README.md b/README.md
index 3ce1d26..f22aec7 100644
--- a/README.md
+++ b/README.md
@@ -3,37 +3,36 @@
 [![build](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml) [![coverage](https://codecov.io/gh/cocrawler/cdx_toolkit/graph/badge.svg?token=M1YJB998LE)](https://codecov.io/gh/cocrawler/cdx_toolkit) [![Apache License 2.0](https://img.shields.io/github/license/cocrawler/cdx_toolkit.svg)](LICENSE)
 
 cdx_toolkit is a set of tools for working with CDX indices of web
-crawls and archives, including those at CommonCrawl and the Internet
-Archive's Wayback Machine.
+crawls and archives, including those at the Common Crawl Foundation
+(CCF) and those at the Internet Archive's Wayback Machine.
 
-CommonCrawl uses Ilya Kreymer's pywb to serve the CDX API, which is
-somewhat different from the Internet Archive's CDX API server. cdx_toolkit
-hides these differences as best it can. cdx_toolkit also knits
-together the monthly Common Crawl CDX indices into a single, virtual
-index.
+Common Crawl uses Ilya Kreymer's pywb to serve the CDX API, which is
+somewhat different from the Internet Archive's CDX API server.
+cdx_toolkit hides these differences as best it can. cdx_toolkit also
+knits together the monthly Common Crawl CDX indices into a single,
+virtual index.
 
 Finally, cdx_toolkit allows extracting archived pages from CC and IA
-into WARC files. If you're looking to create subsets of CC or IA data
-and then process them into WET or WAT files, this is a feature you'll
-find useful.
+into WARC files. If you're looking to create subsets of CC or IA data
+and then further process them, this is a feature you'll find useful.
 
 ## Installing
 
-cdx toolkit requires Python 3.
-
 ```
 $ pip install cdx_toolkit
 ```
 
-or clone this repo and use `python ./setup.py install`.
+or clone this repo and use `pip install .`
 
 ## Command-line tools
 
 ```
 $ cdxt --cc size 'commoncrawl.org/*'
-$ cdxt --cc --limit 10 iter 'commoncrawl.org/*'
+$ cdxt --cc --limit 10 iter 'commoncrawl.org/*'  # returns the most recent year
+$ cdxt --crawl 3 --limit 10 iter 'commoncrawl.org/*'  # returns the most recent 3 crawls
 $ cdxt --cc --limit 10 --filter '=status:200' iter 'commoncrawl.org/*'
-$ cdxt --ia --limit 10 iter 'commoncrawl.org/*'
+
+$ cdxt --ia --limit 10 iter 'commoncrawl.org/*'  # will show the beginning of IA's crawl
 $ cdxt --ia --limit 10 warc 'commoncrawl.org/*'
 ```
 
@@ -41,15 +40,70 @@ cdxt takes a large number of command line switches, controlling the
 time period and all other CDX query options. cdxt can generate WARC,
 jsonl, and csv outputs.
 
-** Note that by default, cdxt --cc will iterate over the previous
-year of captures. **
+If you don't specify much about the crawls or dates or number of
+records you're interested in, some default limits will kick in to
+prevent overly-large queries. These default limits include a maximum
+of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes.
+To exceed these limits, use `--limit` and `--crawl` or `--from` and
+`--to`.
+
+If it seems like nothing is happening, add `-v` or `-vv` at the start:
+
+```
+$ cdxt -vv --cc size 'commoncrawl.org/*'
+```
+
+## Selecting particular CCF crawls
+
+Common Crawl's data is divided into "crawls", which were yearly at the
+start, and are currently done monthly. There are over 100 of them.
+[You can find details about these crawls here.](https://data.commoncrawl.org/crawl-data/index.html)
+
+Unlike some web archives, CCF doesn't have a single CDX index that
+covers all of these crawls -- we have 1 index per crawl. The way
+you ask for a particular crawl is:
+
+```
+$ cdxt --crawl CC-MAIN-2024-33 iter 'commoncrawl.org/*'
+```
+
+- `--crawl CC-MAIN-2024-33` is a single crawl.
+- `--crawl 3` is the latest 3 crawls.
+- `--crawl CC-MAIN-2018` will match all of the crawls from 2018.
+- `--crawl CC-MAIN-2018,CC-MAIN-2019` will match all of the crawls from 2018 and 2019.
+
+CCF also has a hive-sharded parquet index (called the columnar index)
+that covers all of our crawls. Querying broad time ranges is much
+faster with the columnar index. You can find more information about
+this index at [the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format).
+
+The Internet Archive cdx index is organized as a single crawl that goes
+from the very beginning until now. That's why there is no `--crawl` for
+`--ia`. Note that cdx queries to `--ia` will default to one year
+and limit 1000 entries if you do not specify `--from`, `--to`, and `--limit`.
+
+## Selecting by time
+
+In most cases you'll probably use --crawl to select the time range for
+Common Crawl queries, but for the Internet Archive you'll need to specify
+a time range like this:
+
+```
+$ cdxt --ia --limit 1 --from 2008 --to 200906302359 iter 'commoncrawl.org/*'
+```
+
+In this example the time range starts at the beginning of 2008 and
+ends on June 30, 2009 at 23:59. All times are in UTC. If you do not
+specify a time range (and also don't use `--crawl`), you'll get the
+most recent year.
 
-See
+## The full syntax for command-line tools
 
 ```
 $ cdxt --help
 $ cdxt iter --help
 $ cdxt warc --help
+$ cdxt size --help
 ```
 
 for full details. Note that argument order really matters; each switch
@@ -57,7 +111,10 @@ is valid only either before or after the {iter,warc,size} command.
 
 Add -v (or -vv) to see what's going on under the hood.
 
-## Programming example
+## Python programming example
+
+Everything that you can do on the command line, and much more, can
+be done by writing a Python program.
 
 ```
 import cdx_toolkit
@@ -231,5 +288,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-
-
diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py
index e1ba73c..c27845a 100644
--- a/cdx_toolkit/__init__.py
+++ b/cdx_toolkit/__init__.py
@@ -197,12 +197,14 @@ def __next__(self):
             LOGGER.debug('getting more in __next__')
             self.get_more()
             if len(self.captures) <= 0:
+                # XXX print out a warning if this hits the default limit of 1000
                 raise StopIteration
 
 
 class CDXFetcher:
-    def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
+    def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
         self.source = source
+        self.crawl = crawl
         self.cc_sort = cc_sort
         self.source = source
         if wb is not None and warc_download_prefix is not None:
@@ -211,12 +213,11 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No
         self.warc_download_prefix = warc_download_prefix
 
         if source == 'cc':
-            self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/'
-            self.raw_index_list = get_cc_endpoints(self.cc_mirror)
             if wb is not None:
                 raise ValueError('cannot specify wb= for source=cc')
+            self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/'
+            self.raw_index_list = get_cc_endpoints(self.cc_mirror)
             self.warc_download_prefix = warc_download_prefix or 'https://data.commoncrawl.org'
-            #https://commoncrawl.s3.amazonaws.com
         elif source == 'ia':
             self.index_list = ('https://web.archive.org/cdx/search/cdx',)
             if self.warc_download_prefix is None and self.wb is None:
@@ -230,8 +231,10 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No
         LOGGER.setLevel(level=loglevel)
 
     def customize_index_list(self, params):
-        if self.source == 'cc' and ('from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
+        if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
             LOGGER.info('making a custom cc index list')
+            if self.crawl and 'crawl' not in params:
+                params['crawl'] = self.crawl
             return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params)
         else:
             return self.index_list
@@ -243,6 +246,8 @@ def get(self, url, **kwargs):
         validate_timestamps(params)
         params['url'] = url
         params['output'] = 'json'
+        if 'crawl' not in params:
+            params['crawl'] = self.crawl
         if 'filter' in params:
             if isinstance(params['filter'], str):
                 params['filter'] = (params['filter'],)
@@ -272,13 +277,15 @@ def iter(self, url, **kwargs):
         validate_timestamps(params)
         params['url'] = url
         params['output'] = 'json'
+        if 'crawl' not in params:
+            params['crawl'] = self.crawl
         if 'filter' in params:
             if isinstance(params['filter'], str):
                 params['filter'] = (params['filter'],)
             params['filter'] = munge_filter(params['filter'], self.source)
 
         if self.source == 'cc':
-            apply_cc_defaults(params)
+            apply_cc_defaults(params, crawl_present=bool(self.crawl))
 
         index_list = self.customize_index_list(params)
         return CDXFetcherIter(self, params=params, index_list=index_list)
diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
index 22a2e1e..6ffa393 100644
--- a/cdx_toolkit/cli.py
+++ b/cdx_toolkit/cli.py
@@ -6,6 +6,7 @@
 import os
 
 import cdx_toolkit
+from cdx_toolkit.commoncrawl import normalize_crawl
 
 LOGGER = logging.getLogger(__name__)
 
@@ -17,13 +18,14 @@ def main(args=None):
     parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)')
     parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs')
+    parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc')
     parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback')
     parser.add_argument('--source', action='store', help='direct the query to this CDX server')
     parser.add_argument('--wb', action='store', help='direct replays for content to this wayback')
     parser.add_argument('--limit', type=int, action='store')
     parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror')
     parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending')
-    parser.add_argument('--from', action='store')  # XXX default for cc
+    parser.add_argument('--from', action='store')
     parser.add_argument('--to', action='store')
     parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
     parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration.  default limit=1000')
@@ -93,13 +95,15 @@ def get_version():
 
 
 def setup(cmd):
     kwargs = {}
-    kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None
+    kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
     if kwargs['source'] is None:
         raise ValueError('must specify --cc, --ia, or a --source')
     if cmd.wb:
         kwargs['wb'] = cmd.wb
     if cmd.cc_mirror:
         kwargs['cc_mirror'] = cmd.cc_mirror
+    if cmd.crawl:
+        kwargs['crawl'] = normalize_crawl([cmd.crawl])  # currently a string, not a list
     if getattr(cmd, 'warc_download_prefix', None) is not None:
         kwargs['warc_download_prefix'] = cmd.warc_download_prefix
diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py
index 2d6b9e5..6834217 100644
--- a/cdx_toolkit/commoncrawl.py
+++ b/cdx_toolkit/commoncrawl.py
@@ -10,11 +10,23 @@
 import logging
 
 from .myrequests import myrequests_get
-from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
+from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
 
 LOGGER = logging.getLogger(__name__)
 
 
+def normalize_crawl(crawl):
+    crawls = []
+    for c in crawl:
+        if ',' in c:
+            crawls.extend(c.split(','))
+        else:
+            crawls.append(c)
+    if len(crawls) > 1 and (any(x.isdigit() for x in crawls)):
+        raise ValueError('If you specify an integer, only one crawl is allowed')
+    return crawls
+
+
 def get_cache_names(cc_mirror):
     cache = os.path.expanduser('~/.cache/cdx_toolkit/')
     filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', ''))
@@ -67,41 +79,81 @@ def get_cc_endpoints(cc_mirror):
         raise ValueError('Surprisingly few endpoints for common crawl index')  # pragma: no cover
 
     LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints))
-    # endpoints arrive sorted oldest to newest, but let's force that anyawy
+    # endpoints arrive descending, make them ascending
    endpoints = sorted(endpoints)
 
     return endpoints
 
 
-def apply_cc_defaults(params, now=None):
-    three_months = 3 * 30 * 86400
-    year = 365*86400
-    if params.get('from_ts') is None:
-        if params.get('closest') is not None:
-            closest_t = timestamp_to_time(params['closest'])
+def apply_cc_defaults(params, crawl_present=False, now=None):
+    # closest has needs
+    # if crawl, do nothing (expect the user to have picked the correct crawls)
+    # XXX ? check sort order, which happens later?
+    # if no from or to, set them -/+ 3 months from the closest timestamp
+    # crawl? nothing
+    # no crawl? 1 year if not specified
+
+    if params.get('closest') is not None:
+        closest_t = timestamp_to_time(params['closest'])
+        three_months = 3 * 30 * 86400
+        if params.get('from_ts') is None:
             params['from_ts'] = time_to_timestamp(closest_t - three_months)
             LOGGER.info('no from but closest, setting from=%s', params['from_ts'])
+        if params.get('to') is None:
+            params['to'] = time_to_timestamp(closest_t + three_months)
+            LOGGER.info('no to but closest, setting to=%s', params['to'])
+        # XXX set sort order to funky? which does not exist yet
+    elif not crawl_present:
+        # can't check params for 'crawl' because crawl is not ever set in params
+        year = 365*86400
+        if params.get('from_ts') is not None:
             if params.get('to') is None:
-                params['to'] = time_to_timestamp(closest_t + three_months)
-                LOGGER.info('no to but closest, setting to=%s', params['to'])
+                #from_ts = pad_timestamp(params['from_ts'])
+                #params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
+                #LOGGER.info('no to, setting to=%s', params['to'])
+                LOGGER.info('from but no to, not doing anything')
         elif params.get('to') is not None:
-            to = pad_timestamp_up(params['to'])
-            params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year)
-            LOGGER.info('no from but to, setting from=%s', params['from_ts'])
+            if params.get('from_ts') is None:
+                to = pad_timestamp_up(params['to'])
+                params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year)
+                LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
         else:
             if not now:
+                # now is passed in by tests. if not set, use actual now.
+                # XXX could be changed to mock
                 now = time.time()
             params['from_ts'] = time_to_timestamp(now - year)
-            LOGGER.info('no from, setting from=%s', params['from_ts'])
-    if params.get('to') is None:
-        if params.get('closest') is not None:
-            closest_t = timestamp_to_time(params['closest'])
-            # 3 months later
-            params['to'] = time_to_timestamp(closest_t + three_months)
-            LOGGER.info('no to but closest, setting from=%s', params['to'])
-        else:
-            # no to or closest; from was set above, we will not set to
-            pass
+            LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
+    else:
+        # crawl -- assume the user picked the right things
+        pass
+
+
+def match_cc_crawls(crawls, raw_index_list):
+    # match crawls requested on the command line to actual crawls
+    # note that from/to are not considered here
+    # crawls should be normalized so it's supposed to be a list of str
+    if crawls is None:
+        return raw_index_list
+    if len(crawls) == 1 and crawls[0].isdigit():
+        num = int(crawls[0])
+        raw_index_list = raw_index_list[-num:]
+    else:
+        selected = set()
+        used = set()
+        for asked in crawls:
+            for available in raw_index_list:
+                if asked in available:
+                    used.add(asked)
+                    selected.add(available)
+        if not used:
+            raise ValueError('No matches for crawls '+','.join(crawls))
+        missed = set(crawls).difference(used)
+        if missed:
+            LOGGER.warning('No matches for these crawl args: '+','.join(missed))
+        raw_index_list = sorted(selected)
+    LOGGER.info('matched crawls are: '+','.join(raw_index_list))
+    return raw_index_list
 
 
 def make_cc_maps(raw_index_list):
@@ -131,6 +183,8 @@ def check_cc_from_to(params):
     # given caller's time specification, select from and to times; enforce limit on combinations
+    # closest: both from and to must be present
+    # otherwise: expect from to exist (due to the cc default 1 year)
     if 'closest' in params:
         if 'from_ts' not in params or params['from_ts'] is None:
             raise ValueError('Cannot happen')
@@ -170,22 +224,33 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t):
 
 
 def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
-    cc_map, cc_times = make_cc_maps(raw_index_list)
+    crawl_present = False
+    if 'crawl' in params:
+        crawl_present = True
+        crawls = params['crawl']
+        del params['crawl']
+        index_list = match_cc_crawls(crawls, raw_index_list)
 
-    from_ts_t, to_t = check_cc_from_to(params)
-
-    index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t)
+    else:
+        # date-based selection. if --crawl was specified, raw_index_list has already been narrowed
+        # YYY this does not yet use collinfo.json from, to
+        # YYY shouldn't this be skipped if crawl_present?
+        cc_map, cc_times = make_cc_maps(raw_index_list)
+        from_ts_t, to_t = check_cc_from_to(params)
+        index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t)
 
-    # write the fully-adjusted from and to into params XXX necessasry?
-    # XXX wut? should we only do this when we've changed or added these ?!
-    params['from_ts'] = time_to_timestamp(from_ts_t)
-    if to_t is not None:
-        params['to'] = time_to_timestamp(to_t)
+        # write the fully-adjusted from and to into params XXX necessary?
+        # XXX wut? should we only do this when we've changed or added these ?!
+        # to_t might have been padded. does from_ts ever get padded?
+        params['from_ts'] = time_to_timestamp(from_ts_t)
+        if to_t is not None:
+            params['to'] = time_to_timestamp(to_t)
 
     # adjust index_list order based on cc_sort order
     if 'closest' in params:
         # XXX funky ordering not implemented, inform the caller
-        # cli already prints a warning for iter + closer, telling user to use get instead
+        # cli already prints a warning for iter + closest, telling user to use get instead
+        # no need to warn if it's a single crawl
         # this routine is called for both get and iter
         pass
     if cc_sort == 'ascending':
@@ -196,7 +261,10 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
         raise ValueError('unknown cc_sort arg of '+cc_sort)
 
     if index_list:
-        LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1])
+        if crawl_present:
+            LOGGER.info('using cc crawls '+','.join(index_list))
+        else:
+            LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1])
     else:
         LOGGER.warning('empty cc index range found')
 
diff --git a/tests/test_cc_crawl.py b/tests/test_cc_crawl.py
new file mode 100644
index 0000000..3ec9c60
--- /dev/null
+++ b/tests/test_cc_crawl.py
@@ -0,0 +1,12 @@
+def test_cc_crawl():
+    # 'crawl' can be specified for a CDXFetcher, and later change in
+    # an iter() or get() call.
+
+    # init no crawl, iter/get no crawl
+    # init crawl, iter/get no crawl
+    # init no crawl, iter/get crawl
+    # init crawl, iter/get crawl
+
+    # this is a lot of work to test 4 lines of code :/
+    pass
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 2284e56..8a3be51 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -41,6 +41,16 @@ def test_basics(capsys):
         # this might be commoncrawl.org./ or commoncrawl.org/
         assert 'commoncrawl.org' in line
 
+    args = '--crawl 2 --limit 10 iter commoncrawl.org/*'.split()
+    main(args=args)
+    out, err = capsys.readouterr()
+
+    split = out.splitlines()
+    assert len(split) == 10
+    for line in out.splitlines():
+        # this might be commoncrawl.org./ or commoncrawl.org/
+        assert 'commoncrawl.org' in line
+
 
 def multi_helper(t, capsys, caplog):
     inputs = t[0]
@@ -83,8 +93,8 @@ def test_multi_cc1(capsys, caplog):
          {'count': 10, 'linefgrep': 'commoncrawl.org'}],
         [{'service': '--cc', 'mods': '--limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
          {'count': 11, 'linefgrep': 'commoncrawl.org'}],
-#        [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'},
-#         {'count': 0}],  # should limit to 1 index because it runs slowly!
+        [{'service': '--crawl 1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'},
+         {'count': 0}],  # runs slowly if we don't limit crawl to 1
         [{'service': '--cc', 'mods': '--cc-mirror https://index.commoncrawl.org/ --limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
          {'count': 11, 'linefgrep': 'commoncrawl.org'}],
         [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/* --all-fields'},
@@ -156,6 +166,11 @@ def test_multi_misc_not_ia(capsys, caplog):
         [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
          {'exception': ValueError}],
 
+        [{'service': '--crawl 1,1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
+         {'exception': ValueError}],
+        [{'service': '--crawl 1,CC-MAIN-2024', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
+         {'exception': ValueError}],
+
         [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
          {'count': 1, 'is_int': True}],
         [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py
index e23f70f..65bd29f 100644
--- a/tests/unit/test_cc.py
+++ b/tests/unit/test_cc.py
@@ -9,6 +9,22 @@
 logging.basicConfig(level='INFO')
 
 
+def test_normalize_crawl():
+    tests = [
+        [['1'], ['1']],
+        [['a'], ['a']],
+        [['a', 'b'], ['a', 'b']],
+        [['a,b', 'c'], ['a', 'b', 'c']],
+        [['a,b,c,d'], ['a', 'b', 'c', 'd']],
+        [['a', 'b,c'], ['a', 'b', 'c']],
+    ]
+
+    for t in tests:
+        assert cdx_toolkit.commoncrawl.normalize_crawl(t[0]) == t[1]
+    with pytest.raises(ValueError):
+        cdx_toolkit.commoncrawl.normalize_crawl(['1', '2'])
+
+
 def test_apply_cc_defaults():
     # no from
     # closest -- sets from, to
@@ -24,33 +40,52 @@
     now = 1524962339.157388  # 20180429003859
 
     tests = [
+        [{'crawl': 'foo'}, {}],
         [{'closest': '20180101'}, {'from_ts': '20171003000000', 'to': '20180401000000'}],
         [{'closest': '20180101', 'to': '20181201'}, {'from_ts': '20171003000000'}],
         [{'to': '20180101'}, {'from_ts': '20170131235959'}],
-        [{}, {'from_ts': '20170429003859'}],  # hits both elses, uses now
+        [{}, {'from_ts': '20170429003859'}],  # uses now
         [{'from_ts': '20100101', 'closest': '20150301'}, {'to': '20150530000000'}],
-        [{'from_ts': '20100101'}, {}],  # hits the second else only
+        [{'from_ts': '20100101'}, {}],
     ]
 
     for test_in, test_out in tests:
+        crawl_present = bool(test_in.pop('crawl', None))
         test_out.update(test_in)
-        cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, now=now)
+        cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, crawl_present=crawl_present, now=now)
         assert test_in == test_out
 
 
 my_cc_endpoints = [
+    # expected to be ascending
+    'https://index.commoncrawl.org/CC-MAIN-2008-2009-index',
+    'https://index.commoncrawl.org/CC-MAIN-2009-2010-index',
+    'https://index.commoncrawl.org/CC-MAIN-2012-index',
     'https://index.commoncrawl.org/CC-MAIN-2013-20-index',
     'https://index.commoncrawl.org/CC-MAIN-2017-51-index',
     'https://index.commoncrawl.org/CC-MAIN-2018-05-index',
     'https://index.commoncrawl.org/CC-MAIN-2018-09-index',
     'https://index.commoncrawl.org/CC-MAIN-2018-13-index',
-    # and the specials
-    'https://index.commoncrawl.org/CC-MAIN-2012-index',
-    'https://index.commoncrawl.org/CC-MAIN-2009-2010-index',
-    'https://index.commoncrawl.org/CC-MAIN-2008-2009-index',
 ]
 
 
+def test_match_cc_crawls():
+    tests = [
+        [['CC-MAIN-2013-20'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']],
+        [['CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2017-51-index']],
+        [['CC-MAIN-2018'], ['https://index.commoncrawl.org/CC-MAIN-2018-05-index',
+                            'https://index.commoncrawl.org/CC-MAIN-2018-09-index',
+                            'https://index.commoncrawl.org/CC-MAIN-2018-13-index']],
+        [['CC-MAIN-2013', 'CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index',
+                                            'https://index.commoncrawl.org/CC-MAIN-2017-51-index']],
+        [['CC-MAIN-2013-20', 'no match'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']],  # .warning
+    ]
+    for t in tests:
+        assert cdx_toolkit.commoncrawl.match_cc_crawls(t[0], my_cc_endpoints) == t[1]
+    with pytest.raises(ValueError):
+        cdx_toolkit.commoncrawl.match_cc_crawls(['no match'], my_cc_endpoints)
+
+
 def test_make_cc_maps():
     cc_map, cc_times = cdx_toolkit.commoncrawl.make_cc_maps(my_cc_endpoints)
     t = cc_times[0]
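---

For reference, the new `crawl=` keyword that this patch adds to `CDXFetcher.__init__` (and threads through `iter()`/`get()` via `params['crawl']`) can also be used directly from Python, not just via `cdxt --crawl`. The snippet below is a minimal sketch, not part of the patch: it assumes cdx_toolkit with this change applied, and the `obj['url']`/`obj['status']` field access follows the library's existing iteration interface rather than anything introduced here.

```python
import cdx_toolkit

# A single named crawl; crawl= takes a list of crawl-name strings,
# matching what normalize_crawl()/match_cc_crawls() above expect.
cdx = cdx_toolkit.CDXFetcher(source='cc', crawl=['CC-MAIN-2024-33'])
for obj in cdx.iter('commoncrawl.org/*', limit=10):
    print(obj['url'], obj['status'])

# Rough equivalent of `cdxt --crawl 3`: a single integer-like string
# selects the N most recent crawls (see match_cc_crawls above).
cdx_recent = cdx_toolkit.CDXFetcher(source='cc', crawl=['3'])
```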