From 9911087c866747ff16c0cb5dc66cb92c615402f6 Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Thu, 31 Oct 2024 14:38:23 +0100
Subject: [PATCH] fixes for tdx

---
 pixi.lock                       |  2 +-
 src/coincident/datasets/csda.py | 17 +++++++++++++++++
 src/coincident/search/main.py   | 36 ++++++++++++++++++++++++++++++------
 src/coincident/search/stac.py   | 29 ++++++++++++++++++++++++++++-
 tests/test_search.py            | 11 +++++++++++
 5 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/pixi.lock b/pixi.lock
index 5061201..f584b7e 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -3117,7 +3117,7 @@ packages:
   requires_python: '>=3.8'
 - kind: pypi
   name: coincident
-  version: 0.1.dev43+g16468bc.d20241031
+  version: 0.1.dev30+g3960508
   path: .
   sha256: ff33a9cdefb73e562804affbed3bda6ba7e9ee34533e3112f68164a7babf7107
   requires_dist:
diff --git a/src/coincident/datasets/csda.py b/src/coincident/datasets/csda.py
index 04cd0ab..ac25ab4 100644
--- a/src/coincident/datasets/csda.py
+++ b/src/coincident/datasets/csda.py
@@ -14,6 +14,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
+from typing import Any
 
 from coincident.datasets.general import Dataset
 
@@ -53,3 +54,19 @@ class TDX(Dataset):
     end: str | None = None
     type: str = "sar"
     provider: str = "csda"
+    stac_kwargs: dict[str, Any] = field(
+        default_factory=lambda: {
+            "limit": 1000,
+            "filter": {
+                "op": "and",
+                "args": [
+                    # exclude PAZ, only SSC products
+                    {
+                        "op": "in",
+                        "args": [{"property": "platform"}, ["TDX-1", "TSX-1"]],
+                    },
+                    {"op": "=", "args": [{"property": "sar:product_type"}, "SSC"]},
+                ],
+            },
+        }
+    )
diff --git a/src/coincident/search/main.py b/src/coincident/search/main.py
index 6bf3096..8d15e39 100644
--- a/src/coincident/search/main.py
+++ b/src/coincident/search/main.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import warnings
 from typing import Any
 
 import geopandas as gpd
@@ -51,10 +52,10 @@ def search(
         try:
             dataset = _alias_to_Dataset[dataset]
         except KeyError as e:
-            message = (
+            msg_unsupported = (
                 f"{dataset} is not a supported dataset: {_alias_to_Dataset.keys()}"
             )
-            raise ValueError(message) from e
+            raise ValueError(msg_unsupported) from e
 
     # Validate Datetimes
     _validate_temporal_bounds(dataset, datetime)
@@ -81,6 +82,11 @@ def search(
             shapely_geometry = shapely_geometry.reverse()
         aoi = _pystac_client._format_intersects(shapely_geometry)  # to JSON geometry
     else:
+        if "bbox" not in kwargs:
+            msg_unconstrained = (
+                "Neither `bbox` nor `intersects` provided... search will be global"
+            )
+            warnings.warn(msg_unconstrained, stacklevel=2)
         aoi = None
 
     # STAC API Searches
@@ -94,8 +100,8 @@ def search(
         if dataset.provider == "maxar":
            # NOTE: not sure how to avoid incompatible type "str | None"; expected "str" for Dataset.attrs
             client = stac.configure_maxar_client(dataset.area_based_calc)  # type: ignore[attr-defined]
-            results = stac.search(client, **stac_api_kwargs)
-            gf = stac.to_geopandas(results)
+            item_collection = stac.search(client, **stac_api_kwargs)
+            gf = stac.to_geopandas(item_collection)
             # Client-side reduce to only acquisitions having stereo pairs
             gf = gf.loc[gf.stereo_pair_identifiers.str[0].dropna().index]
 
@@ -110,8 +116,26 @@ def search(
         # Generic STAC endpoint w/o additional config
         else:
             client = stac.configure_stac_client(dataset.search)  # type: ignore[arg-type]
-            results = stac.search(client, **stac_api_kwargs)
-            gf = stac.to_geopandas(results)
+            item_collection = stac.search(client, **stac_api_kwargs)
+
+            # Per-dataset munging
+            # https://github.com/uw-cryo/coincident/issues/8#issuecomment-2449810481
+            if dataset.alias == "tdx":
+                # Drop columns with messy schema
+                dropcols = [
+                    "sceneInfo",
+                    "missionInfo",
+                    "previewInfo",
+                    "imageDataInfo",
+                    "generationInfo",
+                    "acquisitionInfo",
+                    "productVariantInfo",
+                ]
+                for item in item_collection:
+                    for col in dropcols:
+                        item.properties.pop(col)
+
+            gf = stac.to_geopandas(item_collection)
 
     # Non-STAC Searches
     elif dataset.alias == "3dep":
diff --git a/src/coincident/search/stac.py b/src/coincident/search/stac.py
index a5c5a9a..4c38215 100644
--- a/src/coincident/search/stac.py
+++ b/src/coincident/search/stac.py
@@ -30,7 +30,32 @@
 def to_geopandas(
     collection: pystac.item_collection.ItemCollection,
 ) -> gpd.GeoDataFrame:
-    """Convert returned from STAC API to geodataframe via arrow"""
+    """
+    Convert a STAC ItemCollection to a GeoDataFrame.
+    This function converts a given STAC ItemCollection to a GeoDataFrame using the
+    `stac_geoparquet.arrow.parse_stac_items_to_arrow` method. It also adds an additional
+    column 'dayofyear' for convenience.
+
+    Parameters
+    ----------
+    collection : pystac.item_collection.ItemCollection
+        The STAC ItemCollection to be converted.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        A GeoDataFrame containing the data from the STAC ItemCollection.
+
+    Raises
+    ------
+    ValueError
+        If the provided ItemCollection is empty.
+    """
+    # Catch if no items are passed
+    if len(collection) == 0:
+        message = "ItemCollection is empty, cannot convert to GeoDataFrame"
+        raise ValueError(message)
+
     record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(collection)
     gf = gpd.GeoDataFrame.from_arrow(record_batch_reader)  # doesn't keep arrow dtypes
 
@@ -65,6 +90,8 @@ def search(
     client: pystac_client.client.Client, **kwargs: dict[str, Any] | None
 ) -> pystac_client.item_search.ItemSearch:
     """Search any STAC API (e.g. https://github.com/nasa/cmr-stac)"""
+    # NOTE: add logging for kwargs?
+    # print(kwargs)
     results = client.search(
         **kwargs,
     )
diff --git a/tests/test_search.py b/tests/test_search.py
index a42d928..9c0cc70 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -42,6 +42,16 @@ def test_polygon_invalid_type():
         m.search.search(dataset="3dep", intersects="-120, 40, -121, 41")
 
 
+def test_to_geopandas_empty_search_result():
+    with pytest.raises(ValueError, match="ItemCollection is empty"):
+        m.search.stac.to_geopandas([])
+
+
+def test_unconstrained_search_warns():
+    with pytest.warns(match="Neither `bbox` nor `intersects` provided"):
+        m.search.search(dataset="tdx")
+
+
 # TODO: add more assertions / tests for this section
 @network
 @pytest.mark.filterwarnings("ignore:Server does not conform")
@@ -98,6 +108,7 @@ def test_gedi_search(aoi):
 def test_tdx_search(aoi):
     gf = m.search.search(dataset="tdx", intersects=aoi, datetime=["2009", "2020"])
     assert len(gf) == 48
+    assert gf["sar:product_type"].unique() == "SSC"
 
 
 # MS PLANETARY COMPUTER
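
Usage sketch (illustrative, not part of the diff): the call below mirrors tests/test_search.py; the `import coincident as m` alias, the AOI polygon, and network access are assumptions, not guarantees from this patch.

import geopandas as gpd
from shapely.geometry import box

import coincident as m

# Hypothetical AOI: a small WGS84 bounding box; any single-polygon GeoDataFrame works.
aoi = gpd.GeoDataFrame(geometry=[box(-108.3, 39.2, -108.0, 39.4)], crs="EPSG:4326")

# The TDX dataset now ships default stac_kwargs (a CQL2 filter keeping only
# TDX-1/TSX-1 SSC products), and per-item metadata columns with messy schemas
# are dropped before results are converted to a GeoDataFrame.
gf = m.search.search(dataset="tdx", intersects=aoi, datetime=["2009", "2020"])
print(len(gf), gf["sar:product_type"].unique())

# Omitting both `intersects` and `bbox` now warns that the search will be global,
# and an empty ItemCollection raises ValueError inside stac.to_geopandas().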