diff --git a/docs/examples/opentopo.ipynb b/docs/examples/opentopo.ipynb
index 41d05ab..b4d57b7 100644
--- a/docs/examples/opentopo.ipynb
+++ b/docs/examples/opentopo.ipynb
@@ -225,11 +225,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "gf_noaa = coincident.search.search(\n",
-    "    dataset=\"noaa\",\n",
-    "    intersects=aoi,\n",
-    "    datetime=[\"2020\"]\n",
-    ")"
+    "gf_noaa = coincident.search.search(dataset=\"noaa\", intersects=aoi, datetime=[\"2020\"])"
    ]
   },
   {
@@ -595,11 +591,7 @@
    "outputs": [],
    "source": [
     "# now, let's see if there were any NCALM missions from the same year\n",
-    "gf_ncalm = coincident.search.search(\n",
-    "    dataset=\"ncalm\",\n",
-    "    intersects=aoi,\n",
-    "    datetime=[\"2020\"]\n",
-    ")"
+    "gf_ncalm = coincident.search.search(dataset=\"ncalm\", intersects=aoi, datetime=[\"2020\"])"
    ]
   },
   {
@@ -954,8 +946,8 @@
    }
   ],
   "source": [
-   "m = gf_noaa.explore(color='black')\n",
-   "gf_ncalm.explore(m=m, color='deeppink')"
+   "m = gf_noaa.explore(color=\"black\")\n",
+   "gf_ncalm.explore(m=m, color=\"deeppink\")"
   ]
  }
 ],
diff --git a/src/coincident/datasets/__init__.py b/src/coincident/datasets/__init__.py
index 25a7a96..1cbd508 100644
--- a/src/coincident/datasets/__init__.py
+++ b/src/coincident/datasets/__init__.py
@@ -12,7 +12,15 @@
 
 from __future__ import annotations
 
-from coincident.datasets import csda, maxar, nasa, opentopo, planetary_computer, usgs
+from coincident.datasets import (
+    csda,
+    maxar,
+    nasa,
+    neon,
+    opentopo,
+    planetary_computer,
+    usgs,
+)
 from coincident.datasets.general import Dataset
 
 # Convenience mapping of string aliases to supported dataset classes
@@ -26,6 +34,7 @@
     csda.TDX(),
     opentopo.NOAA(),
     opentopo.NCALM(),
+    neon.NEON(),
 ]
 
 aliases = [x.alias for x in _datasets]
diff --git a/src/coincident/datasets/neon.py b/src/coincident/datasets/neon.py
new file mode 100644
index 0000000..e34b8c0
--- /dev/null
+++ b/src/coincident/datasets/neon.py
@@ -0,0 +1,24 @@
+"""
+NEON Discrete LiDAR Datasets
+
+https://data.neonscience.org/data-products/DP3.30024.001
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from coincident.datasets.general import Dataset
+
+
+@dataclass
+class NEON(Dataset):
+    """Essential metadata for NEON LiDAR"""
+
+    has_stac_api: bool = False
+    search: str = "http://data.neonscience.org/api/v0/sites"
+    start: str = "2005-06-01"
+    end: str | None = None
+    type: str = "lidar"
+    alias: str = "neon"
+    provider: str = "neon"
diff --git a/src/coincident/search/main.py b/src/coincident/search/main.py
index 1975edd..b222d2b 100644
--- a/src/coincident/search/main.py
+++ b/src/coincident/search/main.py
@@ -11,7 +11,7 @@
 from coincident.datasets import _alias_to_Dataset
 from coincident.datasets.general import Dataset
 from coincident.overlaps import subset_by_minimum_area
-from coincident.search import opentopo_api, stac, wesm
+from coincident.search import neon_api, opentopo_api, stac, wesm
 
 _pystac_client = _ItemSearch("no_url")
 
@@ -147,6 +147,13 @@
             **kwargs,
         )
 
+    elif dataset.alias == "neon":
+        gf = neon_api.search_bboxes(
+            intersects=intersects,
+            search_start=search_start,
+            search_end=search_end,
+        )
+
     elif dataset.provider == "opentopography":
         gf = opentopo_api.search_opentopo(
             intersects=intersects,
diff --git a/src/coincident/search/neon_api.py b/src/coincident/search/neon_api.py
new file mode 100644
index 0000000..3314430
--- /dev/null
+++ b/src/coincident/search/neon_api.py
@@ -0,0 +1,234 @@
+"""
+Searching for NEON data with spatial and temporal filters is actually quite difficult.
+I couldn't find any STAC catalogs or code to make this easier.
+
+https://data.neonscience.org/data-products/DP3.30024.001
+https://www.neonscience.org/resources/learning-hub/tutorials/neon-api-intro-requests-py
+
+The process goes as follows:
+- Use requests to get some basic metadata for all NEON sites that contain
+  discrete-return LiDAR point cloud data
+    - Here, we grab the site code, site name, available months, and product URLs
+    - Note that grabbing the specific collection time for each flight requires a lot
+      more key-value digging and, I believe, is not worth the effort
+    - We also grab the 'centroid' (top left of the center-most tile of the footprint),
+      because there is no easy way to search on an input geometry and the 'centroid'
+      makes that process easier
+- Subset the resulting GeoDataFrame on the user-input start and end times
+- Subset that GeoDataFrame to sites whose 'centroids' lie within 1 degree (~111 km) of
+  the input geometry. We do this because grabbing the actual footprint geometry is
+  time-consuming: the only way to do it dynamically (short of pulling in more
+  unnecessary, niche libraries/packages, from what I found) is to make a GET request
+  for each site and then read in the footprint hosted on storage.googleapis.com
+- Grab the footprint geometry for each remaining site. These are hosted on
+  storage.googleapis.com as a shapefile that has all of the complex tile footprints as
+  their own row geometries
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import geopandas as gpd
+import pandas as pd
+import pyogrio
+import requests
+from pandas import Timestamp
+from shapely.geometry import Point, box
+
+warnings.filterwarnings(
+    "ignore", message=".*Geometry is in a geographic CRS.*", category=UserWarning
+)
+
+
+def build_neon_point_gf(sites_url: str) -> gpd.GeoDataFrame:
+    """
+    Build a GeoDataFrame of NEON site information, including product URLs and available months.
+
+    Parameters
+    ----------
+    sites_url : str
+        The URL to the NEON API for retrieving site data.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        A GeoDataFrame with site information and 'centroid' geometries. Note that these centroids
+        are the top left of the center-most tile for each NEON flight. Also note that there is no
+        easy way to pull the flight footprint geometry from NEON.
+    """
+    sites_request = requests.get(sites_url)
+    sites_json = sites_request.json()
+
+    # One row per site per available month: for each site, pull out the discrete-return
+    # lidar product (DP3.30024.001) via next(), skipping sites that don't carry it
+    df_neon = pd.concat(
+        [
+            pd.DataFrame(
+                {
+                    "id": site["siteCode"],
+                    "title": site["siteName"],
+                    "start_datetime": dict_lidar["availableMonths"],
+                    "end_datetime": dict_lidar["availableMonths"],
+                    "product_url": dict_lidar["availableDataUrls"],
+                    "latitude": site["siteLatitude"],
+                    "longitude": site["siteLongitude"],
+                }
+            )
+            for site in sites_json["data"]
+            for dict_lidar in [
+                next(
+                    (
+                        item
+                        for item in site["dataProducts"]
+                        if item["dataProductCode"] == "DP3.30024.001"
+                    ),
+                    None,
+                )
+            ]
+            if dict_lidar is not None
+        ],
+        ignore_index=True,
+    )
+
+    df_neon["geometry"] = df_neon.apply(
+        lambda row: Point(row["longitude"], row["latitude"]), axis=1
+    )
+    gf_neon = gpd.GeoDataFrame(df_neon, geometry="geometry", crs="EPSG:4326")
+
+    return gf_neon.drop(columns=["latitude", "longitude"])
+
+
+def temporal_filter_neon(
+    gf_neon: gpd.GeoDataFrame, search_start: Timestamp, search_end: Timestamp
+) -> gpd.GeoDataFrame:
+    """
+    Filter a NEON GeoDataFrame by temporal range.
+
+    Parameters
+    ----------
+    gf_neon : gpd.GeoDataFrame
+        The GeoDataFrame containing NEON site data from build_neon_point_gf().
+    search_start : pd.Timestamp
+        The start of the time range to filter by.
+    search_end : pd.Timestamp
+        The end of the time range to filter by.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        A filtered GeoDataFrame based on the temporal range.
+    """
+    # 'availableMonths' strings look like 'YYYY-MM'; append '-01' so they parse as timestamps
+    return gf_neon[
+        (
+            gf_neon["start_datetime"].apply(lambda x: pd.to_datetime(x + "-01"))
+            >= search_start
+        )
+        & (
+            gf_neon["start_datetime"].apply(lambda x: pd.to_datetime(x + "-01"))
+            <= search_end
+        )
+    ].reset_index(drop=True)
+
+
+def get_neon_bboxes(url: str | None, fallback_geometry: gpd.GeoSeries) -> gpd.GeoSeries:
+    """
+    Fetch and return bounding boxes for NEON data products.
+
+    Parameters
+    ----------
+    url : str | None
+        The URL to the NEON product data.
+    fallback_geometry : gpd.GeoSeries
+        A fallback geometry to return if the footprint cannot be retrieved.
+
+    Returns
+    -------
+    gpd.GeoSeries
+        The bounding box geometry for the NEON data product.
+    """
+    if url is None:
+        return fallback_geometry
+    response = requests.get(url)
+    if response.status_code != 200:
+        return fallback_geometry
+    data = response.json().get("data", {})
+    files = data.get("files", [])
+    # Find the file that ends with 'merged_tiles.shp' dynamically.
+    # I really can't think of a better, more efficient way to do this after dissecting the NEON API:
+    # the merged tiles shapefiles have similar filepaths, but we're missing some extra metadata
+    # that contains another ID needed to hardcode those paths effectively (see 'D02' in the link below)
+    shp_file_url = next(
+        (f["url"] for f in files if f["name"].endswith("merged_tiles.shp")), None
+    )
+
+    try:
+        df_tiles = pyogrio.read_dataframe(shp_file_url).to_crs(4326)
+    except Exception:
+        # TODO: fix provisional footprint grab
+        # Provisional (usually synonymous with new) NEON flights typically don't have merged_tiles.shp.
+        # I tried to iterate over all the individual tiles and grab bboxes, but that was
+        # too inefficient, so we just return the point geometry for now
+        return fallback_geometry
+
+    try:
+        return df_tiles.union_all().envelope
+    except Exception:
+        # see the link below for an example where make_valid() is necessary
+        # https://storage.googleapis.com/neon-aop-products/2019/FullSite/D02/2019_SCBI_3/Metadata/DiscreteLidar/TileBoundary/shps/2019_SCBI_3_merged_tiles.shp
+        return df_tiles.geometry.make_valid().union_all().envelope
+
+
+def search_bboxes(
+    intersects: gpd.GeoDataFrame | gpd.GeoSeries,
+    search_start: Timestamp,
+    search_end: Timestamp,
+) -> gpd.GeoDataFrame:
+    """
+    Perform a search for NEON metadata and respective bbox footprints. Note that this
+    search can take a while for a large aoi or a long time range.
+
+    Parameters
+    ----------
+    intersects : gpd.GeoDataFrame | gpd.GeoSeries
+        The geometry to restrict the search to.
+    search_start : pd.Timestamp
+        The start of the time range to filter by.
+    search_end : pd.Timestamp
+        The end of the time range to filter by.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        A GeoDataFrame with NEON metadata and respective bbox footprints.
+    """
+    if search_start is None and search_end is None:
+        search_start = pd.Timestamp("2005-06-01")  # June 2005: first NEON dataset
+        search_end = pd.Timestamp.today()  # default to today's date
+        # note that the above will make the search take a very long time
+        # if you're searching a larger area
+    if intersects is None:
+        # Default to a global bounding box so every site passes the spatial filter
+        intersects = gpd.GeoDataFrame(
+            geometry=[box(-180, -90, 180, 90)], crs="EPSG:4326"
+        )
+
+    sites_url = "http://data.neonscience.org/api/v0/sites"
+    # Build NEON point GeoDataFrame
+    gf_neon = build_neon_point_gf(sites_url)
+
+    # Apply temporal filter
+    gf_neon = temporal_filter_neon(gf_neon, search_start, search_end)
+
+    # Keep only sites whose footprint 'centroids' fall within 1 degree of the input geometry
+    gf_neon = (
+        gpd.sjoin_nearest(gf_neon, intersects, max_distance=1)
+        .drop(columns=["index_right"])
+        .reset_index(drop=True)
+    )
+
+    # Swap each 'centroid' for the flight's bbox footprint (falls back to the point geometry)
+    gf_neon["geometry"] = gf_neon.apply(
+        lambda row: get_neon_bboxes(row["product_url"], row["geometry"]), axis=1
+    )
+
+    # sjoin to make sure the geometries actually overlap with the input aoi
+    return (
+        gf_neon.sjoin(intersects).drop(columns=["index_right"]).reset_index(drop=True)
+    )
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 2d72f12..cf7e568 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -25,3 +25,8 @@ def test_opentopo_defaults():
     assert ds_noaa.alias == "noaa"
     ds_ncalm = coincident.datasets.opentopo.NCALM()
     assert ds_ncalm.alias == "ncalm"
+
+
+def test_neon_defaults():
+    ds = coincident.datasets.neon.NEON()
+    assert ds.alias == "neon"
diff --git a/tests/test_search.py b/tests/test_search.py
index 7507382..c552c3c 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -203,3 +203,17 @@
         dataset="ncalm", intersects=large_aoi, datetime=["2019-01-01", "2023-12-31"]
     )
     assert len(gf) == 6
+
+
+# NEON
+# TODO: use a smaller aoi for this search.
+# I don't want to keep adding fixtures to conftest.py, but the test takes way too
+# long with the large CO aoi, and no NEON sites overlap with grandmesa
+@network
+@pytest.mark.filterwarnings("ignore:Geometry is in a geographic CRS:UserWarning")
+def test_neon_search(large_aoi):
+    gf = coincident.search.search(
+        dataset="neon", intersects=large_aoi, datetime=["2022-01-01", "2022-12-31"]
+    )
+    assert len(gf) == 3
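
With this patch applied, the NEON path runs through the same top-level entry point as the NOAA and NCALM examples above. A minimal usage sketch, mirroring test_neon_search and the notebook's calls; it assumes `aoi` is a GeoDataFrame in EPSG:4326, like the notebook's area of interest:

    import coincident

    # "neon" is the alias registered in coincident/datasets/__init__.py
    gf_neon = coincident.search.search(
        dataset="neon",
        intersects=aoi,
        datetime=["2022-01-01", "2022-12-31"],
    )
    # geometries are bbox envelopes, or fallback point 'centroids' for
    # provisional flights that don't yet have a merged_tiles.shp footprint
    gf_neon.explore(color="deeppink")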