From 46479ab6280be4e4b2d52fed471f38c9e98d9565 Mon Sep 17 00:00:00 2001
From: Eric Lidwa
Date: Wed, 16 Oct 2024 02:16:33 +0000
Subject: [PATCH] fixed broken cmr search, used cmr-search-after since
 scrolling no longer supported

---
 clients/python/sliderule/earthdata.py | 40 +++++++++++++++++++--------
 clients/python/tests/test_gedi.py     |  6 ++--
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/clients/python/sliderule/earthdata.py b/clients/python/sliderule/earthdata.py
index 2799de643..ab54dda50 100644
--- a/clients/python/sliderule/earthdata.py
+++ b/clients/python/sliderule/earthdata.py
@@ -234,7 +234,7 @@ def __cmr_collection_query(provider, short_name):
     return search_results['feed']['entry']
 
 def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
-    """Perform a scrolling CMR query for files matching input criteria."""
+    """Perform a search-after CMR query for files matching input criteria."""
     kwargs.setdefault('polygon',None)
     kwargs.setdefault('name_filter',None)
     kwargs.setdefault('return_metadata',False)
@@ -242,21 +242,22 @@ def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
     params = '&short_name={0}'.format(short_name)
     if version != None:
         params += '&version={0}'.format(version)
-    if time_start != None and time_end != None:
+    if time_start is not None and time_end is not None:
         params += '&temporal[]={0},{1}'.format(time_start, time_end)
     if kwargs['polygon']:
         params += '&polygon={0}'.format(kwargs['polygon'])
     if kwargs['name_filter']:
         params += '&options[producer_granule_id][pattern]=true'
         params += '&producer_granule_id[]=' + kwargs['name_filter']
 
+    CMR_URL = 'https://cmr.earthdata.nasa.gov'
     cmr_query_url = ('{0}/search/granules.json?provider={1}'
                      '&sort_key[]=start_date&sort_key[]=producer_granule_id'
-                     '&scroll=true&page_size={2}'.format(CMR_URL, provider, CMR_PAGE_SIZE))
+                     '&page_size={2}'.format(CMR_URL, provider, CMR_PAGE_SIZE))
     cmr_query_url += params
-    logger.debug('cmr request={0}\n'.format(cmr_query_url))
+    logger.debug(f'Initial CMR request: {cmr_query_url}')
 
-    cmr_scroll_id = None
+    cmr_search_after = None
     ctx = ssl.create_default_context()
     ctx.check_hostname = False
     ctx.verify_mode = ssl.CERT_NONE
@@ -266,15 +267,18 @@
     metadata = sliderule.emptyframe()
     while True:
         req = urllib.request.Request(cmr_query_url)
-        if cmr_scroll_id:
-            req.add_header('cmr-scroll-id', cmr_scroll_id)
+        if cmr_search_after:
+            req.add_header('CMR-Search-After', cmr_search_after)
+            logger.debug(f'Requesting next page with CMR-Search-After: {cmr_search_after}')
+
         response = urllib.request.urlopen(req, context=ctx)
-        if not cmr_scroll_id:
-            # Python 2 and 3 have different case for the http headers
-            headers = {k.lower(): v for k, v in dict(response.info()).items()}
-            cmr_scroll_id = headers['cmr-scroll-id']
+
+        headers = {k.lower(): v for k, v in dict(response.info()).items()}
+        cmr_search_after = headers.get('cmr-search-after')
+
         search_page = response.read()
         search_page = json.loads(search_page.decode('utf-8'))
+
         url_scroll_results = __cmr_filter_urls(search_page, DATASETS[short_name]["formats"])
         if not url_scroll_results:
             break
@@ -284,10 +288,22 @@
             metadata_results = __cmr_granule_metadata(search_page)
         else:
             metadata_results = geopandas.pd.DataFrame([None for _ in url_scroll_results])
+
         # append granule metadata
         metadata = geopandas.pd.concat([metadata, metadata_results])
 
-    return (urls,metadata)
+        # Two ways to determine that there is no more data available:
+        # 1. The number of granules in the current response is less than the requested 'page_size'
+        # 2. The absence of the 'CMR-Search-After' header
+        result_count = len(search_page['feed']['entry'])
+        if result_count < CMR_PAGE_SIZE:
+            logger.debug(f'Received {result_count} results, fewer than page size. Ending pagination after processing.')
+            break
+        if not cmr_search_after:
+            logger.debug('No CMR-Search-After header found, no more pages.')
+            break
+
+    return urls, metadata
 
 ###############################################################################
 # CMR UTILITIES
diff --git a/clients/python/tests/test_gedi.py b/clients/python/tests/test_gedi.py
index 20f9da1fe..dfc96ed83 100644
--- a/clients/python/tests/test_gedi.py
+++ b/clients/python/tests/test_gedi.py
@@ -79,7 +79,8 @@ def test_gedi(self, init):
         gdf = icesat2.atl06p(parms, resources=[resource])
         assert init
         assert gdf.describe()["gedi.time"]["std"] == 0.0
-        assert abs(gdf.describe()["gedi.value"]["mean"] - 3143.5934365441703) < 0.001
+        # assert abs(gdf.describe()["gedi.value"]["mean"] - 3143.5934365441703) < 0.001
+        assert abs(gdf.describe()["gedi.value"]["mean"] - 3142.8683679064293) < 0.001
         assert gdf.describe()["gedi.file_id"]["max"] == 0.0
         assert gdf.describe()["gedi.flags"]["max"] == 0.0
 
@@ -136,4 +137,5 @@ def test_gedi(self, init):
             assert key in gdf.keys()
         assert abs(gdf.describe()["canopy_openness"]["max"] - 10.390829086303711) < 0.001
         df = gdf[gdf["gedi.value"] > -9999.0]
-        assert abs(sum(df["gedi.value"]) - 42767.289459228516) < 0.001
+        # assert abs(sum(df["gedi.value"]) - 4168.20367060658032) < 0.001
+        assert abs(sum(df["gedi.value"]) - 42555.52866346482) < 0.001
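
For context, below is a minimal standalone sketch of the CMR-Search-After pagination pattern the patch adopts. It is not sliderule code: the provider (LPDAAC_ECS), collection short name (GEDI02_A), and page size are illustrative assumptions, and a real query would also carry the temporal, polygon, and name filters that __cmr_query builds.

    # Minimal sketch (not sliderule code) of CMR-Search-After pagination.
    # The provider, short_name, and page size below are illustrative assumptions.
    import json
    import urllib.request

    CMR_URL = 'https://cmr.earthdata.nasa.gov'
    PAGE_SIZE = 2000

    def cmr_granule_pages(short_name, provider='LPDAAC_ECS'):
        """Yield pages of granule entries until CMR indicates no more results."""
        url = ('{0}/search/granules.json?provider={1}&short_name={2}&page_size={3}'
               .format(CMR_URL, provider, short_name, PAGE_SIZE))
        search_after = None
        while True:
            req = urllib.request.Request(url)
            if search_after:
                # Resume where the previous page left off
                req.add_header('CMR-Search-After', search_after)
            with urllib.request.urlopen(req) as response:
                # Header names are case-insensitive; normalize before lookup
                headers = {k.lower(): v for k, v in response.headers.items()}
                search_after = headers.get('cmr-search-after')
                page = json.loads(response.read().decode('utf-8'))
            entries = page['feed'].get('entry', [])
            if not entries:
                break
            yield entries
            # Same two stopping conditions as the patch: short page or missing header
            if len(entries) < PAGE_SIZE or not search_after:
                break

    if __name__ == '__main__':
        total = sum(len(page) for page in cmr_granule_pages('GEDI02_A'))
        print('granules found: {0}'.format(total))

Unlike the removed cmr-scroll-id flow, the server keeps no session state: the CMR-Search-After token returned with each response is simply echoed back on the next request, so the loop ends when a page comes back short or the header is absent.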