Skip to content

Commit

Permalink
fixed broken cmr search, used cmr-search-after since scrolling no longer supported
Browse files Browse the repository at this point in the history
  • Loading branch information
elidwa committed Oct 16, 2024
1 parent d723f98 commit 46479ab
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 14 deletions.
40 changes: 28 additions & 12 deletions clients/python/sliderule/earthdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,29 +234,30 @@ def __cmr_collection_query(provider, short_name):
return search_results['feed']['entry']

def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
"""Perform a scrolling CMR query for files matching input criteria."""
"""Perform a search-after CMR query for files matching input criteria."""
kwargs.setdefault('polygon',None)
kwargs.setdefault('name_filter',None)
kwargs.setdefault('return_metadata',False)
# build params
params = '&short_name={0}'.format(short_name)
if version != None:
params += '&version={0}'.format(version)
if time_start != None and time_end != None:
if time_start is not None and time_end is not None:
params += '&temporal[]={0},{1}'.format(time_start, time_end)
if kwargs['polygon']:
params += '&polygon={0}'.format(kwargs['polygon'])
if kwargs['name_filter']:
params += '&options[producer_granule_id][pattern]=true'
params += '&producer_granule_id[]=' + kwargs['name_filter']

CMR_URL = 'https://cmr.earthdata.nasa.gov'
cmr_query_url = ('{0}/search/granules.json?provider={1}'
'&sort_key[]=start_date&sort_key[]=producer_granule_id'
'&scroll=true&page_size={2}'.format(CMR_URL, provider, CMR_PAGE_SIZE))
'&page_size={2}'.format(CMR_URL, provider, CMR_PAGE_SIZE))
cmr_query_url += params
logger.debug('cmr request={0}\n'.format(cmr_query_url))
logger.debug(f'Initial CMR request: {cmr_query_url}')

cmr_scroll_id = None
cmr_search_after = None
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
Expand All @@ -266,15 +267,18 @@ def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
metadata = sliderule.emptyframe()
while True:
req = urllib.request.Request(cmr_query_url)
if cmr_scroll_id:
req.add_header('cmr-scroll-id', cmr_scroll_id)
if cmr_search_after:
req.add_header('CMR-Search-After', cmr_search_after)
logger.debug(f'Requesting next page with CMR-Search-After: {cmr_search_after}')

response = urllib.request.urlopen(req, context=ctx)
if not cmr_scroll_id:
# Python 2 and 3 have different case for the http headers
headers = {k.lower(): v for k, v in dict(response.info()).items()}
cmr_scroll_id = headers['cmr-scroll-id']

headers = {k.lower(): v for k, v in dict(response.info()).items()}
cmr_search_after = headers.get('cmr-search-after')

search_page = response.read()
search_page = json.loads(search_page.decode('utf-8'))

url_scroll_results = __cmr_filter_urls(search_page, DATASETS[short_name]["formats"])
if not url_scroll_results:
break
Expand All @@ -284,10 +288,22 @@ def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
metadata_results = __cmr_granule_metadata(search_page)
else:
metadata_results = geopandas.pd.DataFrame([None for _ in url_scroll_results])

# append granule metadata
metadata = geopandas.pd.concat([metadata, metadata_results])

return (urls,metadata)
# Two ways to determine that there is no more data available:
# 1. The number of granules in the current response is less than the requested 'page_size'.
# 2. The 'CMR-Search-After' header is absent from the response.
result_count = len(search_page['feed']['entry'])
if result_count < CMR_PAGE_SIZE:
logger.debug(f'Received {result_count} results, fewer than page size. Ending pagination after processing.')
break
if not cmr_search_after:
logger.debug('No CMR-Search-After header found, no more pages.')
break

return urls, metadata

###############################################################################
# CMR UTILITIES
Expand Down
6 changes: 4 additions & 2 deletions clients/python/tests/test_gedi.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ def test_gedi(self, init):
gdf = icesat2.atl06p(parms, resources=[resource])
assert init
assert gdf.describe()["gedi.time"]["std"] == 0.0
assert abs(gdf.describe()["gedi.value"]["mean"] - 3143.5934365441703) < 0.001
# assert abs(gdf.describe()["gedi.value"]["mean"] - 3143.5934365441703) < 0.001
assert abs(gdf.describe()["gedi.value"]["mean"] - 3142.8683679064293) < 0.001
assert gdf.describe()["gedi.file_id"]["max"] == 0.0
assert gdf.describe()["gedi.flags"]["max"] == 0.0

Expand Down Expand Up @@ -136,4 +137,5 @@ def test_gedi(self, init):
assert key in gdf.keys()
assert abs(gdf.describe()["canopy_openness"]["max"] - 10.390829086303711) < 0.001
df = gdf[gdf["gedi.value"] > -9999.0]
assert abs(sum(df["gedi.value"]) - 42767.289459228516) < 0.001
# assert abs(sum(df["gedi.value"]) - 4168.20367060658032) < 0.001
assert abs(sum(df["gedi.value"]) - 42555.52866346482) < 0.001

0 comments on commit 46479ab

Please sign in to comment.