Commit 03e39e6 (1 parent: f732cbf)
Showing 7 changed files with 299 additions and 14 deletions.
@@ -0,0 +1,24 @@
""" | ||
NEON Discrete LiDAR Datasets | ||
https://data.neonscience.org/data-products/DP3.30024.001 | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
|
||
from coincident.datasets.general import Dataset | ||
|
||
|
||
@dataclass | ||
class NEON(Dataset): | ||
"""Essential metadata for NEON LiDAR""" | ||
|
||
has_stac_api: bool = False | ||
search: str = "http://data.neonscience.org/api/v0/sites" | ||
start: str = "2005-06-01" | ||
end: str | None = None | ||
type: str = "lidar" | ||
alias: str = "neon" | ||
provider: str = "neon" |
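For context, a minimal sketch (not part of the commit) of how this dataclass might be instantiated; the import path is an assumption, and Dataset is assumed to simply carry these fields as defaults:

from coincident.datasets.neon import NEON  # assumed module path for the file above

ds = NEON()
print(ds.alias)   # "neon"
print(ds.search)  # "http://data.neonscience.org/api/v0/sites"
print(ds.end)     # None: the collection is ongoing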
@@ -0,0 +1,234 @@
""" | ||
Searching for NEON data with spatial and temporal filters is actually quite difficult. | ||
I couldn't find any STAC catalogs or code to make this easier | ||
https://data.neonscience.org/data-products/DP3.30024.001 | ||
https://www.neonscience.org/resources/learning-hub/tutorials/neon-api-intro-requests-py | ||
The process goes as follows: | ||
- Use requests to get some basic metadata for all NEON sites that contain discrete return LiDAR point cloud data | ||
- Here, we grab the site code, site name, available months, and product URLs | ||
- Note that grabbing the specific collection time for each flight requires | ||
a lot more key-value pair digging and i do not believe is worth the effort | ||
- We grab the 'centroid' (top left of the center-most tile of footprint) here as well | ||
this is because there is no easy way to search on an input geometry and we can | ||
use the 'centroid' to make that process easier | ||
- Subset the resulting gdf on the user-input start and end times | ||
- Subset that gdf by only including sites that have 'centroids' within 1 degree (~111km) | ||
we do this because grabbing the actual footprint geometry is time-consuming. | ||
the only way to do this dynamically, at least barring the use of | ||
more unnecessary and unique llibraries/packages from what i found, | ||
is by making a GET request for each site and then reading in the footprint | ||
hosted on storage.googleapis.com | ||
- Grab the footprint geometry for each site | ||
These are hosted on storage.googleapis.com in the form of a shapefile | ||
that has all of the complex tile footprints as their own row geometries | ||
""" | ||

from __future__ import annotations

import warnings

import geopandas as gpd
import pandas as pd
import pyogrio
import requests
from pandas import Timestamp
from shapely.geometry import Point

warnings.filterwarnings(
    "ignore", message=".*Geometry is in a geographic CRS.*", category=UserWarning
)

def build_neon_point_gf(sites_url: str) -> gpd.GeoDataFrame:
    """
    Build a GeoDataFrame containing NEON site information, including product URLs
    and available months.

    Parameters
    ----------
    sites_url : str
        The URL to the NEON API for retrieving site data.

    Returns
    -------
    gpd.GeoDataFrame
        A GeoDataFrame with site information and 'centroid' geometries. Note that these
        centroids are the top left of the center-most tile for each NEON flight. Also
        note that there is no easy way to pull the flight footprint geometry from NEON.
    """
    sites_request = requests.get(sites_url, timeout=30)  # explicit timeout so a stalled request doesn't hang
    sites_json = sites_request.json()

    # One row per available month for every site that carries DP3.30024.001 (discrete LiDAR)
    df_neon = pd.concat(
        [
            pd.DataFrame(
                {
                    "id": site["siteCode"],
                    "title": site["siteName"],
                    "start_datetime": dict_lidar["availableMonths"],
                    "end_datetime": dict_lidar["availableMonths"],
                    "product_url": dict_lidar["availableDataUrls"],
                    "latitude": site["siteLatitude"],
                    "longitude": site["siteLongitude"],
                }
            )
            for site in sites_json["data"]
            for dict_lidar in [
                next(
                    (
                        item
                        for item in site["dataProducts"]
                        if item["dataProductCode"] == "DP3.30024.001"
                    ),
                    None,
                )
            ]
            if dict_lidar is not None
        ],
        ignore_index=True,
    )

    df_neon["geometry"] = df_neon.apply(
        lambda row: Point(row["longitude"], row["latitude"]), axis=1
    )
    gf_neon = gpd.GeoDataFrame(df_neon, geometry="geometry", crs="EPSG:4326")

    return gf_neon.drop(columns=["latitude", "longitude"])

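# For reference, the comprehension above assumes a /sites response shaped roughly like
# the following (a sketch inferred from the keys used above, not a verified schema;
# the values shown are placeholders):
#
# {
#   "data": [
#     {
#       "siteCode": "ABBY",
#       "siteName": "Abby Road NEON",
#       "siteLatitude": 45.76,
#       "siteLongitude": -122.33,
#       "dataProducts": [
#         {
#           "dataProductCode": "DP3.30024.001",
#           "availableMonths": ["2017-06", "2018-07"],
#           "availableDataUrls": ["https://data.neonscience.org/api/v0/data/..."]
#         }
#       ]
#     }
#   ]
# }
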
def temporal_filter_neon(
    gf_neon: gpd.GeoDataFrame, search_start: Timestamp, search_end: Timestamp
) -> gpd.GeoDataFrame:
    """
    Filter NEON GeoDataFrame by temporal range.

    Parameters
    ----------
    gf_neon : gpd.GeoDataFrame
        The GeoDataFrame containing NEON site data from build_neon_point_gf().
    search_start : pd.Timestamp
        The start of the time range to filter by.
    search_end : pd.Timestamp
        The end of the time range to filter by.

    Returns
    -------
    gpd.GeoDataFrame
        A filtered GeoDataFrame based on the temporal range.
    """
    # Available months look like 'YYYY-MM'; append '-01' to get a comparable datetime
    month_start = gf_neon["start_datetime"].apply(lambda x: pd.to_datetime(x + "-01"))
    return gf_neon[
        (month_start >= search_start) & (month_start <= search_end)
    ].reset_index(drop=True)

def get_neon_bboxes(url: str | None, fallback_geometry: gpd.GeoSeries) -> gpd.GeoSeries:
    """
    Fetch and return bounding boxes for NEON data products.

    Parameters
    ----------
    url : str | None
        The URL to the NEON product data.
    fallback_geometry : gpd.GeoSeries
        A fallback geometry to return if the footprint cannot be retrieved.

    Returns
    -------
    gpd.GeoSeries
        The bounding box geometry for the NEON data product.
    """
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return fallback_geometry
    data = response.json().get("data", {})
    files = data.get("files", [])
    # Find the file that ends with 'merged_tiles.shp' dynamically.
    # I really can't think of a better, more efficient way to do this after dissecting
    # the NEON API. The merged tiles shapefiles have similar filepaths, but we're missing
    # some extra metadata that contains another ID needed to hardcode those paths
    # effectively (see 'D02' in the commented link below).
    shp_file_url = next(
        (f["url"] for f in files if f["name"].endswith("merged_tiles.shp")), None
    )

    try:
        df_tiles = pyogrio.read_dataframe(shp_file_url).to_crs(4326)
    except Exception:
        # TODO: fix provisional footprint grab
        # The provisional (usually synonymous with new) NEON flights typically don't have
        # merged_tiles.shp. I tried to iterate over all the different tiles and grab
        # bboxes, but it was too inefficient. We'll just return the point geometry for now.
        return fallback_geometry

    try:
        return df_tiles.union_all().envelope
    except Exception:
        # See the link below for an example where make_valid() is necessary:
        # https://storage.googleapis.com/neon-aop-products/2019/FullSite/D02/2019_SCBI_3/Metadata/DiscreteLidar/TileBoundary/shps/2019_SCBI_3_merged_tiles.shp
        return df_tiles.geometry.make_valid().union_all().envelope

def search_bboxes(
    intersects: gpd.GeoDataFrame | gpd.GeoSeries,
    search_start: Timestamp,
    search_end: Timestamp,
) -> gpd.GeoDataFrame:
    """
    Perform a search for NEON metadata and respective bbox footprints. Note that this
    search will take a while if you specify a large AOI or a large time range.

    Parameters
    ----------
    intersects : gpd.GeoDataFrame | gpd.GeoSeries
        The geometry to restrict the search.
    search_start : pd.Timestamp
        The start of the time range to filter by.
    search_end : pd.Timestamp
        The end of the time range to filter by.

    Returns
    -------
    gpd.GeoDataFrame
        A GeoDataFrame with NEON metadata and respective bbox footprints.
    """
    if search_start is None and search_end is None:
        search_start = pd.Timestamp("2005-06-01")  # June 2005: first NEON dataset
        search_end = pd.Timestamp.today()  # default to today's date
        # Note that the above will make the search take a very long time
        # if you're searching a larger area
    if intersects is None:
        # Define a geometry covering the globe (the sjoins below need a GeoDataFrame with a CRS)
        intersects = gpd.GeoDataFrame(
            geometry=[Point(-180, -90).buffer(360)], crs="EPSG:4326"
        )
    elif isinstance(intersects, gpd.GeoSeries):
        # sjoin/sjoin_nearest expect a GeoDataFrame on the right-hand side
        intersects = gpd.GeoDataFrame(geometry=intersects)

    sites_url = "http://data.neonscience.org/api/v0/sites"
    # Build NEON point GeoDataFrame
    gf_neon = build_neon_point_gf(sites_url)

    # Apply temporal filter
    gf_neon = temporal_filter_neon(gf_neon, search_start, search_end)

    # Keep sites whose footprint 'centroids' fall within 1 degree (~111 km) of the input geometry
    gf_neon = (
        gpd.sjoin_nearest(gf_neon, intersects, max_distance=1)
        .drop(columns=["index_right"])
        .reset_index(drop=True)
    )

    # Replace the point geometries with the actual bbox footprints
    gf_neon["geometry"] = gf_neon.apply(
        lambda row: get_neon_bboxes(row["product_url"], row["geometry"]), axis=1
    )

    # sjoin to make sure the footprints actually overlap with the input AOI
    return (
        gf_neon.sjoin(intersects).drop(columns=["index_right"]).reset_index(drop=True)
    )
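For reference, a minimal end-to-end sketch of calling search_bboxes (not part of the commit; the AOI coordinates and date range below are arbitrary placeholders):

import geopandas as gpd
import pandas as pd
from shapely.geometry import box

aoi = gpd.GeoDataFrame(geometry=[box(-105.3, 40.0, -105.1, 40.2)], crs="EPSG:4326")
gf = search_bboxes(
    intersects=aoi,
    search_start=pd.Timestamp("2018-01-01"),
    search_end=pd.Timestamp("2021-12-31"),
)
print(gf[["id", "title", "start_datetime", "product_url"]].head())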
Review comment on 03e39e6:
@scottyhq not sure how you want to proceed with this... I'm not sure it's currently possible to search this catalog on demand with spatial and temporal search parameters in an efficient manner. Would having a lookup table for footprint bboxes, or links to hosted footprint bboxes, be something to include? We could update that table every couple of months or so with a script. Anyway, thanks for dealing with these messy commits :) I'm trying to understand how to get the CI / formatting tests to cooperate better, for your sake (and mine).
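A hypothetical sketch of the refresh script suggested above (the module import path, output file, and cadence are all assumptions, not part of this PR):

from coincident.search.neon_api import search_bboxes  # assumed import path

# Full-catalog search: None/None falls back to the whole NEON record, so this is slow
gf = search_bboxes(intersects=None, search_start=None, search_end=None)
gf.to_parquet("neon_footprints.parquet")  # re-run every couple of months (e.g., a CI cron job)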
Review comment on 03e39e6:
Overall, this is looking really great. I suspect you're right that for some of these providers it'll be better to host a separate index externally rather than jumping through hoops with non-STAC APIs (or the lack thereof). In any case, I'll try this out and we can merge what's working and always revisit the approach later.
Review comment on 03e39e6:
FWIW, the NEON site lidar coverage polygons should be consistent over time. If I'm not mistaken, most of them are small squares, like 10x10 km, potentially with consistent tile extent/dimensions. Though there may be step changes as the instruments/platforms used for acquisition evolved over the past decade.
Review comment on 03e39e6:
Yes, I haven't seen a NEON site greater than a 15x15 km box (the 1-degree sjoin_nearest is a placeholder, since I didn't actually confirm 'global' catalog stats). There are some sites that have changed coverage (e.g., site ABBY increased the size of its footprint for the reasons you mentioned, and has consistently had the same footprint size since).