From f026065ddafb1241b9b71eba3733aaae6cac8f55 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Wed, 4 Sep 2024 23:07:55 +0000 Subject: [PATCH 1/8] work in progress --- cdx_toolkit/cli.py | 6 +++++- cdx_toolkit/commoncrawl.py | 25 ++++++++++++++++++++++++- tests/unit/test_cc.py | 16 ++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py index 22a2e1e..a1aa528 100644 --- a/cdx_toolkit/cli.py +++ b/cdx_toolkit/cli.py @@ -6,6 +6,7 @@ import os import cdx_toolkit +from cdx_toolkit.commoncrawl import normalize_crawl LOGGER = logging.getLogger(__name__) @@ -23,6 +24,7 @@ def main(args=None): parser.add_argument('--limit', type=int, action='store') parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror') parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending') + parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc') parser.add_argument('--from', action='store') # XXX default for cc parser.add_argument('--to', action='store') parser.add_argument('--filter', action='append', help='see CDX API documentation for usage') @@ -93,13 +95,15 @@ def get_version(): def setup(cmd): kwargs = {} - kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None + kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None if kwargs['source'] is None: raise ValueError('must specify --cc, --ia, or a --source') if cmd.wb: kwargs['wb'] = cmd.wb if cmd.cc_mirror: kwargs['cc_mirror'] = cmd.cc_mirror + if cmd.crawl: + kwargs['crawl'] = normalize_crawl(cmd.crawl) if getattr(cmd, 'warc_download_prefix', None) is not None: kwargs['warc_download_prefix'] = cmd.warc_download_prefix diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index a8608c7..545f97f 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -15,6 +15,18 @@ LOGGER = logging.getLogger(__name__) +def normalize_crawl(crawl): + crawls = [] + for c in crawl: + if ',' in c: + crawls.extend(c.split(',')) + else: + crawls.append(c) + if len(crawls) > 1 and any(x.isdigit() for x in crawls): + raise ValueError('If you specify an integer, only one crawl is allowed') + return crawls + + def get_cache_names(cc_mirror): cache = os.path.expanduser('~/.cache/cdx_toolkit/') filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', '')) @@ -75,6 +87,9 @@ def get_cc_endpoints(cc_mirror): def apply_cc_defaults(params, now=None): + if 'crawl' in params: + return + three_months = 3 * 30 * 86400 year = 365*86400 if params.get('from_ts') is None: @@ -171,6 +186,13 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t): def filter_cc_endpoints(raw_index_list, cc_sort, params={}): + # YYY with --crawl, just check that the list is crawls that exist + # YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here + # YYY we do need to reorder according to cc_sort + # what is the type of raw_index_list -- it is from collinfo.json cdx-api + # "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index" + + # if no --crawl cc_map, cc_times = make_cc_maps(raw_index_list) from_ts_t, to_t = check_cc_from_to(params) @@ -186,7 +208,8 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}): # adjust index_list order based on cc_sort order if 'closest' in params: # XXX funky ordering not implemented, inform the caller - # cli already prints a warning for iter + closer, telling user to use 
get instead + # cli already prints a warning for iter + closest, telling user to use get instead + # no need to warn if it's a single crawl # this routine is called for both get and iter pass if cc_sort == 'ascending': diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py index e23f70f..17aa2e6 100644 --- a/tests/unit/test_cc.py +++ b/tests/unit/test_cc.py @@ -9,6 +9,22 @@ logging.basicConfig(level='INFO') +def test_normalize_crawl(): + tests = [ + [['1'], ['1']], + [['a'], ['a']], + [['a', 'b'], ['a', 'b']], + [['a,b', 'c'], ['a', 'b', 'c']], + [['a,b,c,d'], ['a', 'b', 'c', 'd']], + [['a', 'b,c'], ['a', 'b', 'c']], + ] + + for t in tests: + assert cdx_toolkit.commoncrawl.normalize_crawl(t[0]) == t[1] + with pytest.raises(ValueError): + cdx_toolkit.commoncrawl.normalize_crawl(['1', '2']) + + def test_apply_cc_defaults(): # no from # closest -- sets from, to From a7b7b402b45fe51f3f1d2ea55c380846365acdb2 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 07:58:26 +0000 Subject: [PATCH 2/8] feat: cdxt --crawl, plus partial docs --- README.md | 67 +++++++++++++------ cdx_toolkit/__init__.py | 15 +++-- cdx_toolkit/cli.py | 6 +- cdx_toolkit/commoncrawl.py | 131 ++++++++++++++++++++++++------------- tests/test_cli.py | 19 +++++- tests/unit/test_cc.py | 33 ++++++++-- 6 files changed, 190 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 3ce1d26..9032d0f 100644 --- a/README.md +++ b/README.md @@ -3,37 +3,36 @@ [![build](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml) [![coverage](https://codecov.io/gh/cocrawler/cdx_toolkit/graph/badge.svg?token=M1YJB998LE)](https://codecov.io/gh/cocrawler/cdx_toolkit) [![Apache License 2.0](https://img.shields.io/github/license/cocrawler/cdx_toolkit.svg)](LICENSE) cdx_toolkit is a set of tools for working with CDX indices of web -crawls and archives, including those at CommonCrawl and the Internet -Archive's Wayback Machine. +crawls and archives, including those at the Common Crawl Foundation +(CCF) and those at the Internet Archive's Wayback Machine. -CommonCrawl uses Ilya Kreymer's pywb to serve the CDX API, which is -somewhat different from the Internet Archive's CDX API server. cdx_toolkit -hides these differences as best it can. cdx_toolkit also knits -together the monthly Common Crawl CDX indices into a single, virtual -index. +Common Crawl uses Ilya Kreymer's pywb to serve the CDX API, which is +somewhat different from the Internet Archive's CDX API server. +cdx_toolkit hides these differences as best it can. cdx_toolkit also +knits together the monthly Common Crawl CDX indices into a single, +virtual index. Finally, cdx_toolkit allows extracting archived pages from CC and IA -into WARC files. If you're looking to create subsets of CC or IA data -and then process them into WET or WAT files, this is a feature you'll -find useful. +into WARC files. If you're looking to create subsets of CC or IA data +and then further process them, this is a feature you'll find useful. ## Installing -cdx toolkit requires Python 3. - ``` $ pip install cdx_toolkit ``` -or clone this repo and use `python ./setup.py install`. 
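As a quick reference for the new `--crawl` handling in this patch series, the sketch below shows how `normalize_crawl` (added to `cdx_toolkit/commoncrawl.py` earlier in this patch) flattens and validates its input. The expected values in the comments follow the `test_normalize_crawl` cases; the snippet assumes the patched module is importable and is illustrative rather than part of the change itself.

```
from cdx_toolkit.commoncrawl import normalize_crawl

# comma-separated values are split and flattened into one list
print(normalize_crawl(['CC-MAIN-2024-33']))            # ['CC-MAIN-2024-33']
print(normalize_crawl(['CC-MAIN-2018,CC-MAIN-2019']))  # ['CC-MAIN-2018', 'CC-MAIN-2019']

# a single integer means "the most recent N crawls"
print(normalize_crawl(['3']))                          # ['3']

# an integer mixed with anything else is rejected
try:
    normalize_crawl(['2', 'CC-MAIN-2024-33'])
except ValueError as e:
    print(e)  # If you specify an integer, only one crawl is allowed
```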
+or clone this repo and use `pip install .` ## Command-line tools ``` $ cdxt --cc size 'commoncrawl.org/*' -$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' +$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' # returns the most recent year +$ cdxt --crawl 3 --limit 10 iter 'commoncrawl.org/*' # returns the most recent 3 crawls $ cdxt --cc --limit 10 --filter '=status:200' iter 'commoncrawl.org/*' -$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' + +$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' # will show the beginning of IA's crawl $ cdxt --ia --limit 10 warc 'commoncrawl.org/*' ``` @@ -41,15 +40,42 @@ cdxt takes a large number of command line switches, controlling the time period and all other CDX query options. cdxt can generate WARC, jsonl, and csv outputs. -** Note that by default, cdxt --cc will iterate over the previous -year of captures. ** +If you don't specify much about the crawls or dates or number of +records you're interested in, some default limits will kick in to +prevent overly-large queries. These default limits include a maximum +of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes. +To exceed these limits, use `--limit` and `--crawl` or `--from` and +`--to`. + +## Selecting particular CCF crawls + +Common Crawl's data is divided into "crawls", which were yearly at the +start, and are currently done monthly. There are over 100 of them. + +XXX -See +Unlike some web archives, CCF doesn't have a single CDX index that +covers all of these crawls. CCF does have a hive-sharded Parquet index +(called the columnar index) that covers all of our indexes. You +can find more information about this index at +[the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). + +The Internet Archive cdx index is organized as a single crawl that goes +from the very beginning until now. That's why there is no `--crawl` for +`--ia`. Note that cdx queries to `--ia` will default to one year year +and limit 1000 entries if you do not specify `--from`, `--to`, and `--limit`. + +## Selecting by time + +XXX + +## The full syntax for command-line tools ``` $ cdxt --help $ cdxt iter --help $ cdxt warc --help +$ cdxt size --help ``` for full details. Note that argument order really matters; each switch @@ -57,7 +83,10 @@ is valid only either before or after the {iter,warc,size} command. Add -v (or -vv) to see what's going on under the hood. -## Programming example +## Python programming example + +Everything that you can do on the command line, and much more, can +be done by writing a Python program. 
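Because this patch adds a `crawl=` keyword to `CDXFetcher`, a crawl-scoped variant of the example below might look like the following sketch. It assumes the constructor change in `cdx_toolkit/__init__.py` later in this patch, and passes the crawl list in the already-normalized form that the CLI hands to the constructor.

```
import cdx_toolkit

# scope the fetcher to the two most recent Common Crawl crawls;
# crawl= takes a list: crawl names, or a single integer-as-string
cdx = cdx_toolkit.CDXFetcher(source='cc', crawl=['2'])

for obj in cdx.iter('commoncrawl.org/*', limit=5):
    print(obj['timestamp'], obj['status'], obj['url'])
```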
``` import cdx_toolkit diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py index e1ba73c..e413a71 100644 --- a/cdx_toolkit/__init__.py +++ b/cdx_toolkit/__init__.py @@ -197,12 +197,14 @@ def __next__(self): LOGGER.debug('getting more in __next__') self.get_more() if len(self.captures) <= 0: + # XXX print out a warning if this hits the default limit of 1000 raise StopIteration class CDXFetcher: - def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): + def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): self.source = source + self.crawl = crawl self.cc_sort = cc_sort self.source = source if wb is not None and warc_download_prefix is not None: @@ -211,12 +213,11 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No self.warc_download_prefix = warc_download_prefix if source == 'cc': - self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' - self.raw_index_list = get_cc_endpoints(self.cc_mirror) if wb is not None: raise ValueError('cannot specify wb= for source=cc') + self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' + self.raw_index_list = get_cc_endpoints(self.cc_mirror) self.warc_download_prefix = warc_download_prefix or 'https://data.commoncrawl.org' - #https://commoncrawl.s3.amazonaws.com elif source == 'ia': self.index_list = ('https://web.archive.org/cdx/search/cdx',) if self.warc_download_prefix is None and self.wb is None: @@ -230,8 +231,10 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No LOGGER.setLevel(level=loglevel) def customize_index_list(self, params): - if self.source == 'cc' and ('from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): + if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): LOGGER.info('making a custom cc index list') + if self.crawl and 'crawl' not in params: + params['crawl'] = self.crawl return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params) else: return self.index_list @@ -278,7 +281,7 @@ def iter(self, url, **kwargs): params['filter'] = munge_filter(params['filter'], self.source) if self.source == 'cc': - apply_cc_defaults(params) + apply_cc_defaults(params, crawl_present=bool(self.crawl)) index_list = self.customize_index_list(params) return CDXFetcherIter(self, params=params, index_list=index_list) diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py index a1aa528..6ffa393 100644 --- a/cdx_toolkit/cli.py +++ b/cdx_toolkit/cli.py @@ -18,14 +18,14 @@ def main(args=None): parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)') parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs') + parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. 
Implies --cc') parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback') parser.add_argument('--source', action='store', help='direct the query to this CDX server') parser.add_argument('--wb', action='store', help='direct replays for content to this wayback') parser.add_argument('--limit', type=int, action='store') parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror') parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending') - parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc') - parser.add_argument('--from', action='store') # XXX default for cc + parser.add_argument('--from', action='store') parser.add_argument('--to', action='store') parser.add_argument('--filter', action='append', help='see CDX API documentation for usage') parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. default limit=1000') @@ -103,7 +103,7 @@ def setup(cmd): if cmd.cc_mirror: kwargs['cc_mirror'] = cmd.cc_mirror if cmd.crawl: - kwargs['crawl'] = normalize_crawl(cmd.crawl) + kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list if getattr(cmd, 'warc_download_prefix', None) is not None: kwargs['warc_download_prefix'] = cmd.warc_download_prefix diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index d3f26c3..8ef7b23 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -10,7 +10,7 @@ import logging from .myrequests import myrequests_get -from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special +from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special LOGGER = logging.getLogger(__name__) @@ -22,7 +22,7 @@ def normalize_crawl(crawl): crawls.extend(c.split(',')) else: crawls.append(c) - if len(crawls) > 1 and any(x.isdigit() for x in crawls): + if len(crawls) > 1 and (any(x.isdigit() for x in crawls)): raise ValueError('If you specify an integer, only one crawl is allowed') return crawls @@ -79,44 +79,79 @@ def get_cc_endpoints(cc_mirror): raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints)) - # endpoints arrive sorted oldest to newest, but let's force that anyawy + # endpoints arrive descending, make them ascending endpoints = sorted(endpoints) return endpoints -def apply_cc_defaults(params, now=None): - if 'crawl' in params: - return +def apply_cc_defaults(params, crawl_present=False, now=None): + # closest has needs + # if crawl, do nothing (expect the user to have picked the correct crawls) + # XXX ? check sort order, which happens later? + # if no from or to, set them -/+ 3 months from the closest timestamp + # crawl? nothing + # no crawl? 
1 year if not specified - three_months = 3 * 30 * 86400 - year = 365*86400 - if params.get('from_ts') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) + if params.get('closest') is not None: + closest_t = timestamp_to_time(params['closest']) + three_months = 3 * 30 * 86400 + if params.get('from_ts') is None: params['from_ts'] = time_to_timestamp(closest_t - three_months) LOGGER.info('no from but closest, setting from=%s', params['from_ts']) + if params.get('to') is None: + params['to'] = time_to_timestamp(closest_t + three_months) + LOGGER.info('no to but closest, setting to=%s', params['to']) + # XXX set sort order to funky? which does not exist yet + elif not crawl_present: + # can't check params for 'crawl' because crawl is not ever set in params + year = 365*86400 + if params.get('from_ts') is not None: if params.get('to') is None: - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting to=%s', params['to']) + #from_ts = pad_timestamp(params['from_ts']) + #params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year) + #LOGGER.info('no to, setting to=%s', params['to']) + LOGGER.info('from but no to, not doing anything') elif params.get('to') is not None: - to = pad_timestamp_up(params['to']) - params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) - LOGGER.info('no from but to, setting from=%s', params['from_ts']) + if params.get('from_ts') is None: + to = pad_timestamp_up(params['to']) + params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) + LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts']) else: if not now: + # now is passed in by tests. if not set, use actual now. + # XXX could be changed to mock now = time.time() params['from_ts'] = time_to_timestamp(now - year) - LOGGER.info('no from, setting from=%s', params['from_ts']) - if params.get('to') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) - # 3 months later - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting from=%s', params['to']) - else: - # no to or closest; from was set above, we will not set to - pass + LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts']) + else: + # crawl -- assume the user picked the right things + pass + + +def match_cc_crawls(crawls, raw_index_list): + # match crawls requested on the command line to actual crawls + # note that from/to are not considered here + # crawls should be normalized so it's supposed to be a list of str + if len(crawls) == 1 and crawls[0].isdigit(): + num = int(crawls[0]) + raw_index_list = raw_index_list[-num:] + else: + selected = set() + used = set() + for asked in crawls: + for available in raw_index_list: + if asked in available: + used.add(asked) + selected.add(available) + if not used: + raise ValueError('No matches for crawls '+','.join(crawls)) + missed = set(crawls).difference(used) + if missed: + LOGGER.warning('No matches for these crawl args: '+','.join(missed)) + raw_index_list = sorted(selected) + LOGGER.info('matched crawls are: '+','.join(raw_index_list)) + return raw_index_list def make_cc_maps(raw_index_list): @@ -146,6 +181,8 @@ def make_cc_maps(raw_index_list): def check_cc_from_to(params): # given caller's time specification, select from and to times; enforce limit on combinations + # closest: both from and to must be present + # otherwise: expect from to exist (due to the 
cc default 1 year) if 'closest' in params: if 'from_ts' not in params or params['from_ts'] is None: raise ValueError('Cannot happen') @@ -185,24 +222,27 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t): def filter_cc_endpoints(raw_index_list, cc_sort, params={}): - # YYY with --crawl, just check that the list is crawls that exist - # YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here - # YYY we do need to reorder according to cc_sort - # what is the type of raw_index_list -- it is from collinfo.json cdx-api - # "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index" - - # if no --crawl - cc_map, cc_times = make_cc_maps(raw_index_list) - - from_ts_t, to_t = check_cc_from_to(params) - - index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + crawl_present = False + if 'crawl' in params: + crawl_present = True + crawls = params['crawl'] + del params['crawl'] + index_list = match_cc_crawls(crawls, raw_index_list) - # write the fully-adjusted from and to into params XXX necessasry? - # XXX wut? should we only do this when we've changed or added these ?! - params['from_ts'] = time_to_timestamp(from_ts_t) - if to_t is not None: - params['to'] = time_to_timestamp(to_t) + else: + # date-based selection. if --crawl was specified, raw_index_list has already been narrowed + # YYY this does not yet use collinfo.json from, to + # YYY shouldn't this be skipped if crawl_present? + cc_map, cc_times = make_cc_maps(raw_index_list) + from_ts_t, to_t = check_cc_from_to(params) + index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + + # write the fully-adjusted from and to into params XXX necessasry? + # XXX wut? should we only do this when we've changed or added these ?! + # to_t might have been padded. does from_ts ever get padded? + params['from_ts'] = time_to_timestamp(from_ts_t) + if to_t is not None: + params['to'] = time_to_timestamp(to_t) # adjust index_list order based on cc_sort order if 'closest' in params: @@ -219,7 +259,10 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}): raise ValueError('unknown cc_sort arg of '+cc_sort) if index_list: - LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) + if crawl_present: + LOGGER.info('using cc crawls '+','.join(index_list)) + else: + LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) else: LOGGER.warning('empty cc index range found') diff --git a/tests/test_cli.py b/tests/test_cli.py index 2284e56..8a3be51 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -41,6 +41,16 @@ def test_basics(capsys): # this might be commoncrawl.org./ or commoncrawl.org/ assert 'commoncrawl.org' in line + args = '--crawl 2 --limit 10 iter commoncrawl.org/*'.split() + main(args=args) + out, err = capsys.readouterr() + + split = out.splitlines() + assert len(split) == 10 + for line in out.splitlines(): + # this might be commoncrawl.org./ or commoncrawl.org/ + assert 'commoncrawl.org' in line + def multi_helper(t, capsys, caplog): inputs = t[0] @@ -83,8 +93,8 @@ def test_multi_cc1(capsys, caplog): {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], -# [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, -# {'count': 0}], # should limit to 1 index because it runs slowly! 
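To make the crawl-selection rules above concrete, here is a hedged sketch of how `match_cc_crawls` resolves arguments against a collinfo-style endpoint list. The endpoint URLs are invented for illustration, and the expected results mirror the `test_match_cc_crawls` unit tests later in this patch.

```
from cdx_toolkit.commoncrawl import match_cc_crawls

endpoints = [  # ascending, in the collinfo.json cdx-api URL format
    'https://index.commoncrawl.org/CC-MAIN-2023-50-index',
    'https://index.commoncrawl.org/CC-MAIN-2024-10-index',
    'https://index.commoncrawl.org/CC-MAIN-2024-18-index',
]

# a single integer selects the most recent N crawls
match_cc_crawls(['2'], endpoints)
# -> ['...CC-MAIN-2024-10-index', '...CC-MAIN-2024-18-index']

# names match as substrings, so a prefix selects every crawl it covers
match_cc_crawls(['CC-MAIN-2024'], endpoints)
# -> ['...CC-MAIN-2024-10-index', '...CC-MAIN-2024-18-index']

# unmatched extra names only log a warning; no matches at all raises ValueError
match_cc_crawls(['CC-MAIN-2024-18', 'no such crawl'], endpoints)
# -> ['...CC-MAIN-2024-18-index']
```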
+ [{'service': '--crawl 1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, + {'count': 0}], # runs slowly if we don't limit crawl to 1 [{'service': '--cc', 'mods': '--cc-mirror https://index.commoncrawl.org/ --limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/* --all-fields'}, @@ -156,6 +166,11 @@ def test_multi_misc_not_ia(capsys, caplog): [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'exception': ValueError}], + [{'service': '--crawl 1,1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--crawl 1,CC-MAIN-2024', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'}, {'count': 1, 'is_int': True}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'}, diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py index 17aa2e6..65bd29f 100644 --- a/tests/unit/test_cc.py +++ b/tests/unit/test_cc.py @@ -40,33 +40,52 @@ def test_apply_cc_defaults(): now = 1524962339.157388 # 20180429003859 tests = [ + [{'crawl': 'foo'}, {}], [{'closest': '20180101'}, {'from_ts': '20171003000000', 'to': '20180401000000'}], [{'closest': '20180101', 'to': '20181201'}, {'from_ts': '20171003000000'}], [{'to': '20180101'}, {'from_ts': '20170131235959'}], - [{}, {'from_ts': '20170429003859'}], # hits both elses, uses now + [{}, {'from_ts': '20170429003859'}], # uses now [{'from_ts': '20100101', 'closest': '20150301'}, {'to': '20150530000000'}], - [{'from_ts': '20100101'}, {}], # hits the second else only + [{'from_ts': '20100101'}, {}], ] for test_in, test_out in tests: + crawl_present = bool(test_in.pop('crawl', None)) test_out.update(test_in) - cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, now=now) + cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, crawl_present=crawl_present, now=now) assert test_in == test_out my_cc_endpoints = [ + # expected to be ascending + 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', + 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', + 'https://index.commoncrawl.org/CC-MAIN-2012-index', 'https://index.commoncrawl.org/CC-MAIN-2013-20-index', 'https://index.commoncrawl.org/CC-MAIN-2017-51-index', 'https://index.commoncrawl.org/CC-MAIN-2018-05-index', 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', 'https://index.commoncrawl.org/CC-MAIN-2018-13-index', - # and the specials - 'https://index.commoncrawl.org/CC-MAIN-2012-index', - 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', - 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', ] +def test_match_cc_crawls(): + tests = [ + [['CC-MAIN-2013-20'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], + [['CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2018'], ['https://index.commoncrawl.org/CC-MAIN-2018-05-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-13-index']], + [['CC-MAIN-2013', 'CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index', + 'https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2013-20', 'no match'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], # 
.warning + ] + for t in tests: + assert cdx_toolkit.commoncrawl.match_cc_crawls(t[0], my_cc_endpoints) == t[1] + with pytest.raises(ValueError): + cdx_toolkit.commoncrawl.match_cc_crawls(['no match'], my_cc_endpoints) + + def test_make_cc_maps(): cc_map, cc_times = cdx_toolkit.commoncrawl.make_cc_maps(my_cc_endpoints) t = cc_times[0] From 56edccb334ad994bd04b24fe9405b6770c9d2052 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 18:41:39 +0000 Subject: [PATCH 3/8] doc: update README [skip ci] --- README.md | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9032d0f..277e1e1 100644 --- a/README.md +++ b/README.md @@ -47,15 +47,30 @@ of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes. To exceed these limits, use `--limit` and `--crawl` or `--from` and `--to`. +If it seems like nothing is happening, add `-v` or `-vv` at the start: + +``` +$ cdxt -vv --cc size 'commoncrawl.org/*' +``` + ## Selecting particular CCF crawls Common Crawl's data is divided into "crawls", which were yearly at the start, and are currently done monthly. There are over 100 of them. - -XXX +[You can find details about these crawls here.](https://data.commoncrawl.org/crawl-data/index.html) Unlike some web archives, CCF doesn't have a single CDX index that -covers all of these crawls. CCF does have a hive-sharded Parquet index +covers all of these crawls -- we have 1 index per crawl. The way +you ask for a particular crawl is: + +``` +$ cdxt --crawl CC-MAIN-2024-33 iter 'commoncrawl.org/*' +``` + +`--crawl 3` is the latest 3 crawls. `--crawl CC-MAIN-2018` will match all +of the crawls from 2018. + +CCF also has a hive-sharded parquet index (called the columnar index) that covers all of our indexes. You can find more information about this index at [the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). @@ -67,7 +82,18 @@ and limit 1000 entries if you do not specify `--from`, `--to`, and `--limit`. ## Selecting by time -XXX +In most cases you'll probably use --crawl to select the time range for +Common Crawl queries, but for the Internet Archive you'll need to specify +a time range like this: + +``` +cdxt --ia --from 2008 --to 200906302359 size 'commoncrawl.org/*' +``` + +In this example the time range starts at the beginning of 2008 and +ends on June 30, 2009 at 23:59. All times are in UTC. If you do not +specify a time range (and also don't use `--crawl`), you'll get the +most recent year. ## The full syntax for command-line tools From dbdb73989b1a88f1b7d5a4bb5c0b3c0b4e83d6fe Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 19:06:52 +0000 Subject: [PATCH 4/8] doc: update README [skip ci] --- README.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 277e1e1..d87a4d5 100644 --- a/README.md +++ b/README.md @@ -67,13 +67,15 @@ you ask for a particular crawl is: $ cdxt --crawl CC-MAIN-2024-33 iter 'commoncrawl.org/*' ``` -`--crawl 3` is the latest 3 crawls. `--crawl CC-MAIN-2018` will match all -of the crawls from 2018. +- `--crawl CC-MAIN-2024-33` is a single crawl. +- `--crawl 3` is the latest 3 crawls. +- `--crawl CC-MAIN-2018` will match all of the crawls from 2018. +- `--crawl CC-MAIN-2018,CC-MAIN-2019` will match all of the crawls from 2018 and 2019. -CCF also has a hive-sharded parquet index -(called the columnar index) that covers all of our indexes. 
You -can find more information about this index at -[the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). +CCF also has a hive-sharded parquet index (called the columnar index) +that covers all of our crawls. Querying broad time ranges is much +faster with the columnar index. You can find more information about +this index at [the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). The Internet Archive cdx index is organized as a single crawl that goes from the very beginning until now. That's why there is no `--crawl` for @@ -87,7 +89,7 @@ Common Crawl queries, but for the Internet Archive you'll need to specify a time range like this: ``` -cdxt --ia --from 2008 --to 200906302359 size 'commoncrawl.org/*' +$ cdxt --ia --limit 1 --from 2008 --to 200906302359 iter 'commoncrawl.org/*' ``` In this example the time range starts at the beginning of 2008 and From b6b1bdeb4d2c20bb43fba2a8b9ea7935a648a4d4 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 20:21:06 +0000 Subject: [PATCH 5/8] feat: --crawl for common crawl --- cdx_toolkit/__init__.py | 4 ++++ cdx_toolkit/commoncrawl.py | 2 ++ tests/test_cc_crawl.py | 12 ++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 tests/test_cc_crawl.py diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py index e413a71..c27845a 100644 --- a/cdx_toolkit/__init__.py +++ b/cdx_toolkit/__init__.py @@ -246,6 +246,8 @@ def get(self, url, **kwargs): validate_timestamps(params) params['url'] = url params['output'] = 'json' + if 'crawl' not in params: + params['crawl'] = self.crawl if 'filter' in params: if isinstance(params['filter'], str): params['filter'] = (params['filter'],) @@ -275,6 +277,8 @@ def iter(self, url, **kwargs): validate_timestamps(params) params['url'] = url params['output'] = 'json' + if 'crawl' not in params: + params['crawl'] = self.crawl if 'filter' in params: if isinstance(params['filter'], str): params['filter'] = (params['filter'],) diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index 8ef7b23..6834217 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -133,6 +133,8 @@ def match_cc_crawls(crawls, raw_index_list): # match crawls requested on the command line to actual crawls # note that from/to are not considered here # crawls should be normalized so it's supposed to be a list of str + if crawls is None: + return raw_index_list if len(crawls) == 1 and crawls[0].isdigit(): num = int(crawls[0]) raw_index_list = raw_index_list[-num:] diff --git a/tests/test_cc_crawl.py b/tests/test_cc_crawl.py new file mode 100644 index 0000000..3ec9c60 --- /dev/null +++ b/tests/test_cc_crawl.py @@ -0,0 +1,12 @@ +def test_cc_crawl(): + # 'crawl' can be specified for a CDXFetcher, and later change in + # an iter() or get() call. 
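One way the "init crawl, iter/get crawl" case in this placeholder could eventually be filled in is sketched below. This is not part of the patch: it needs network access to the live CC index (or a mocked `get_cc_endpoints`), and the crawl names are recent crawls chosen only for illustration.

```
import cdx_toolkit

def test_cc_crawl_override():
    # constructor crawl is the default for every query on this fetcher
    cdx = cdx_toolkit.CDXFetcher(source='cc', crawl=['CC-MAIN-2024-33'])
    assert len(list(cdx.iter('commoncrawl.org/*', limit=1))) == 1

    # a per-call crawl= should take precedence over the constructor value
    got = list(cdx.iter('commoncrawl.org/*', limit=1, crawl=['CC-MAIN-2024-18']))
    assert len(got) == 1
```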
+ + # init no crawl, iter/get no crawl + # init crawl, iter/get no crawl + # init no crawl, iter/get crawl + # init crawl, iter/get crawl + + # this is a lot of work to test 4 lines of code :/ + pass + From 1749d6d1ec3f7c6a89bec58f985cf0811eab7bba Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 20:35:00 +0000 Subject: [PATCH 6/8] feat: --crawl for common crawl [skip ci] --- CHANGELOG.md | 5 ++++- Makefile | 7 +++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7ba2a3..ff27f91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ +- 0.9.37 + + --crawl for CCF + - 0.9.36 + ratelimit code; both IA and CCF are rate limiting their cdx endpoints + cache collinfo.json in ~/.cache/cdx_toolkit/ - + py3.11 and py3.12 pass testing + + py3.11 and py3.12 pass testing; windows and macos pass testing - 0.9.35 + exponential backoff retries now that IA is sending 429 diff --git a/Makefile b/Makefile index 99cfc19..c6e3452 100644 --- a/Makefile +++ b/Makefile @@ -33,14 +33,13 @@ distcheck: distclean twine check dist/* dist: distclean - echo " Finishe CHANGELOG and commit it. + echo " Finishe CHANGELOG.md and commit it." echo " git tag --list" - echo " git tag v0.x.x" + echo " git tag 0.x.x # no v" echo " git push --tags" python ./setup.py sdist twine check dist/* twine upload dist/* -r pypi install: - python ./setup.py install - + pip install . From ab1b0de887779c86598cd2cfab55003098168d75 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 22:27:49 +0000 Subject: [PATCH 7/8] feat: --crawl for common crawl [skip ci] --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4875825..c43c4ca 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: fail-fast: false - #max-parallel: 1 + max-parallel: 1 # avoids ever triggering a rate limit matrix: python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest] From dd0b1d33b8838ee20350ddd0e0b35e0cf3336424 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 22:29:28 +0000 Subject: [PATCH 8/8] intentionally trigger ci --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index d87a4d5..f22aec7 100644 --- a/README.md +++ b/README.md @@ -288,5 +288,3 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - -
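Tying the pieces of this patch series together, the CLI invocation `cdxt --crawl CC-MAIN-2018 --limit 10 iter 'commoncrawl.org/*'` should be roughly equivalent to the following Python. This is a sketch based on `setup()` in `cdx_toolkit/cli.py` above, not a verbatim trace of the CLI.

```
import cdx_toolkit
from cdx_toolkit.commoncrawl import normalize_crawl

# --crawl implies --cc, and the flag value is normalized into a list first
crawl = normalize_crawl(['CC-MAIN-2018'])
cdx = cdx_toolkit.CDXFetcher(source='cc', crawl=crawl)

for obj in cdx.iter('commoncrawl.org/*', limit=10):
    print(obj['timestamp'], obj['url'])
```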