diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ce9658b..c0a0308 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -37,6 +37,7 @@ jobs: POSTGRESQL_PASSWORD: ${{ env.POSTGRESQL_PASSWORD }} run: | pytest tests/test_ckan_provider.py + pytest tests/test_geopandas_provider.py pytest tests/test_postgresql_provider.py pytest tests/test_sitemap_process.py pytest tests/test_sparql_provider.py diff --git a/README.md b/README.md index 8efbf28..bea18f6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Additional OGC API - Feature providers are listed below, along with a matrix of | `CKAN` | ✅/✅ | results/hits | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | | `PsuedoPostgreSQL` | ✅/✅ | results/hits | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | `SPARQL` | ❌/✅ | results/hits | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| `GeoPandas` | ✅/✅ | results/hits | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | The provider names listed in the table are only accessible in [internetofwater/pygeoapi](https://github.com/internetofwater/pygeoapi), otherwise the full python path is required. @@ -78,6 +79,23 @@ The SPARQL Provider only uses variables prefixed with sparql\_ in the configurat - `sparql_subject`: The SPARQL variable representing the subject URI in the query. - `sparql_predicates`: A mapping of attribute names to SPARQL predicates. These predicates will be used to query specific attributes in the SPARQL data source. +### GeoPandas + +The GeoPandas Provider enables OGC API - Feature support using GeoPandas as the backend. This integration can read in data files in [any of the geospatial formats supported by GeoPandas](https://geopandas.org/en/stable/docs/user_guide/io.html#supported-drivers-file-formats). + +`id_field` is the only field that is required to be labelled. 
+
+```yaml
+  providers:
+    - type: feature
+      name: pygeoapi_plugins.provider.geopandas_.GeoPandasProvider
+      # Example data
+      data: 'https://www.hydroshare.org/resource/3295a17b4cc24d34bd6a5c5aaf753c50/data/contents/hu02.gpkg'
+      id_field: id
+```
+
+You can also use plain CSV and read in points by providing an `x_field` and `y_field` in the config the [same way you would with the default pygeoapi CSV provider](https://github.com/geopython/pygeoapi/blob/510875027e8483ce2916e7cf315fb6a7f6105807/pygeoapi-config.yml#L137).
+
 ## OGC API - Processes
 
 Additional OGC API - Process are listed below
diff --git a/pygeoapi_plugins/provider/geopandas_.py b/pygeoapi_plugins/provider/geopandas_.py
new file mode 100644
index 0000000..b5b0b3f
--- /dev/null
+++ b/pygeoapi_plugins/provider/geopandas_.py
@@ -0,0 +1,537 @@
+# =================================================================
+#
+# Authors: Colton Loftus
+#
+# Copyright (c) 2024 Colton Loftus
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# ================================================================= + +import datetime +import geopandas +import pandas +import shapely.geometry +import logging +from shapely import box +from collections import OrderedDict +from typing import Literal, Optional +from typing import TypedDict +from collections import defaultdict +from pygeoapi.provider.base import ( + BaseProvider, + ProviderInvalidDataError, + ProviderItemNotFoundError, + ProviderNoDataError, + ProviderQueryError, +) +from pygeoapi.util import crs_transform + +LOGGER = logging.getLogger(__name__) + +# All types exposed by shapely +PossibleGeometries = ( + shapely.geometry.LineString + | shapely.geometry.multilinestring.MultiLineString + | shapely.geometry.multipoint.MultiPoint + | shapely.geometry.multipolygon.MultiPolygon + | shapely.geometry.point.Point + | shapely.geometry.polygon.LinearRing + | shapely.geometry.polygon.Polygon +) + + +class FeatureGeometry(TypedDict): + coordinates: list[PossibleGeometries] + type: str + + +# Non exhaustive + + +class FeatureProperties(TypedDict): + # These are likely to be included but can be specified with other names + # timestamp: Optional[str] + # geometry: Optional[FeatureGeometry] + pass + + +class Feature(TypedDict): + type: Literal['Feature'] + # Optional if skipping geometry + geometry: Optional[PossibleGeometries] + properties: FeatureProperties + id: str + + +class SortDict(TypedDict): + property: str + order: Literal['+', '-'] + + +class FeatureCollection(TypedDict): + type: Literal['FeatureCollection'] + features: list[Feature] + numberMatched: int + numberReturned: int + + +class GeoPandasProvider(BaseProvider): + """GeoPandas provider""" + + gdf: geopandas.GeoDataFrame + + def 
_set_time_field(self, provider_def: dict): + """ + Set time field and check if there is a specific "LOADDATE" column or not + """ + if 'time_field' in provider_def: + self.time_field = provider_def['time_field'] + # else look for a column named LOADDATE + elif 'LOADDATE' in self.gdf.columns: + self.time_field = 'LOADDATE' + else: + for col in self.gdf.columns: + if isinstance( + self.gdf[col].iloc[0], (datetime.date, datetime.datetime) + ): + self.time_field = col + break + else: + LOGGER.warning('No time field found') + return + + self.gdf[self.time_field] = pandas.to_datetime(self.gdf[self.time_field]) + + def _filter_by_date( + self, df: geopandas.GeoDataFrame, datetime_: str + ) -> geopandas.GeoDataFrame: + """ + Filter by date + """ + dateRange = datetime_.split('/') + + if _START_AND_END := len(dateRange) == 2: # noqa F841 + start, end = dateRange + + # python does not accept Z at the end of the datetime even though that is a valid ISO 8601 datetime + if start.endswith('Z'): + start = start.replace('Z', '+00:00') + + if end.endswith('Z'): + end = end.replace('Z', '+00:00') + + start = ( + datetime.datetime.min + if start == '..' + else datetime.datetime.fromisoformat(start) + ) + end = ( + datetime.datetime.max + if end == '..' + else datetime.datetime.fromisoformat(end) + ) + start, end = ( + start.replace(tzinfo=datetime.timezone.utc), + end.replace(tzinfo=datetime.timezone.utc), + ) + + if start > end: + raise ProviderQueryError( + 'Start date must be before end date but got {} and {}'.format( + start, end + ) + ) + + # If the user just passes in 2019/.. this still handles the match for all days in 2019 + # since the iso format will create the start as 2019-01-01 + return df[(df[self.time_field] >= start) & (df[self.time_field] <= end)] + + elif _ONLY_MATCH_ONE_DATE := len(dateRange) == 1: # noqa + dates: geopandas.GeoSeries = df[self.time_field] + + # By casting to a string we can use .str.contains to coarsely check. 
+ # We want 2019-10 to match 2019-10-01, 2019-10-02, etc. + return df[dates.astype(str).str.startswith(datetime_)] + else: + raise ProviderQueryError( + "datetime_ must be a date or date range with two dates separated by '/' but got {}".format( + datetime_ + ) + ) + + def _set_geometry_fields(self, provider_def: dict): + """ + Set geometry fields and check both if there is a point-based csv or a shapely geometry column + """ + # Check if it was specified in the config, try the CSV geo style and otherwise use the standard shapely geo format + if 'geometry' in provider_def: + if ( + provider_def['geometry']['x_field'] + and provider_def['geometry']['y_field'] + ): + self.geometry_x = provider_def['geometry']['x_field'] + self.geometry_y = provider_def['geometry']['y_field'] + self._exclude_from_fields.append(self.geometry_x) + self._exclude_from_fields.append(self.geometry_y) + + # Unclear if there can be other types of geometries manually specified in the config + # Otherwise we would just read it in as a shapely geometry\ + # else: + # self.geometry_col = provider_def["geometry"] + # self._exclude_from_fields.append(self.geometry_col) + + # If we don't have x,y coords as separate columns then look for a geometry column + elif 'geometry' in self.gdf.columns: + self.geometry_col = 'geometry' + self._exclude_from_fields.append(self.geometry_col) + + # If we don't have any of the above, find the first geometry column and assume that is where the geometry is + else: + for col in self.gdf.columns: + if hasattr(col, 'geom_type'): + self.geometry_col = col + self._exclude_from_fields.append(self.geometry_col) + break + else: + # Assuming that there is no reason to read in geometric data without geometry + LOGGER.warning('No geometry column found') + + def __init__(self, provider_def: dict): + """ + Initialize object + + :param provider_def: provider definition + + :returns: pygeoapi.provider.base.GeoPandasProvider + """ + + super().__init__(provider_def) + + try: + 
self.gdf = geopandas.read_file(provider_def['data'])
+        except FileNotFoundError as ex:
+            raise ProviderNoDataError(
+                f'Tried to read GeoDataFrame: {ex} but it does not exist'
+            )
+        except Exception as ex:
+            raise ProviderInvalidDataError(f'Failed to read GeoDataFrame: {ex}')
+
+        # These fields should not be returned in the property list for a query
+        self._exclude_from_fields: list[str] = []
+
+        self._set_time_field(provider_def)
+        self._set_geometry_fields(provider_def)
+
+        self.gdf[self.id_field] = self.gdf[self.id_field].astype(str)
+
+        # Without below, the CSV reads std_id as an object dtype
+        # And fails the CSV provider tests. Maybe a way to do this better
+        # that is more generalizable?
+        if 'stn_id' in self.gdf.columns:
+            self.gdf['stn_id'] = self.gdf['stn_id'].astype('int64')
+        if 'value' in self.gdf.columns:
+            self.gdf['value'] = self.gdf['value'].astype('float64')
+
+        self._exclude_from_properties: list[str] = self._exclude_from_fields + [
+            self.id_field
+        ]
+
+        self.fields = self.get_fields()
+
+    def get_fields(self) -> dict[str, any]:
+        """
+        Get provider field information (names, types)
+
+        Example response: {'field1': {'type': 'string'}, 'field2': {'type': 'number'}}
+
+        :returns: dict of field names and their associated JSON Schema types
+        """
+        if len(self.gdf) == 0:
+            raise ProviderNoDataError('No data found to get fields from')
+
+        field_mapper = {
+            col: self.gdf[col].dtype.name
+            for col in self.gdf.columns
+            if col not in self._exclude_from_fields
+        }
+
+        # Pandas has different names for types than the pygeoapi spec expects
+        pandas_dtypes_to_ours = {
+            'float64': 'number',
+            'int64': 'integer',
+            'object': 'string',
+        }
+
+        pandas_default = defaultdict(lambda: 'string')
+        pandas_default.update(pandas_dtypes_to_ours)
+
+        our_types_names = {
+            k: {'type': pandas_default[v]} for k, v in field_mapper.items()
+        }
+
+        return our_types_names
+
+    @crs_transform
+    def query(
+        self,
+        offset=0,
+        limit=10,
+        resulttype: Literal['results', 'hits'] = 'results',
+        
identifier=None, + bbox: list[float] = [], + datetime_: Optional[str] = None, + properties: list[tuple[str, str]] = [], + select_properties=[], + sortby: list[SortDict] = [], + skip_geometry=False, + q=None, + **kwargs, + ) -> FeatureCollection: + """ + Query data with GeoPandas + + :param offset: starting record to return (default 0) + :param limit: number of records to return (default 10) + :param datetime_: temporal (datestamp or extent) + :param identifier: feature id + :param resulttype: return results or hit limit (default results) + :param properties: Properties with specific values to select list of tuples (name, value) + :param select_properties: list of general properties to select regardless of values + :param sortby: How to return the sorted features list of dicts (property, order) + :param skip_geometry: bool of whether to skip geometry (default False) + :param q: full-text search term(s) + + :returns: dict of GeoJSON FeatureCollection + """ + + if q is not None: + raise NotImplementedError('q not implemented for GeoPandasProvider') + + found, result = False, False + feature_collection: FeatureCollection = { + 'type': 'FeatureCollection', + 'features': [], + 'numberMatched': 0, + 'numberReturned': 0, + } + + if identifier is not None: + # If we are querying for just one feature, we may have a different limit than the default + # TODO should this be min? So min or this limit and limit in the function call? 
+ limit = self.query(resulttype='hits')['numberMatched'] + + # Create a dummy backup that we can overwrite + df: geopandas.GeoDataFrame = self.gdf + + if properties: + for prop in properties: + (column_name, val_to_filter_by) = prop + + # Only keep rows where the property is the right value + + # We need to convert this to a string since it appears the properties are always strings, + # but our dataframe contains integers or floats + df = df[df[column_name].astype(str) == val_to_filter_by] + + if resulttype == 'hits': + # If we are querying for just the number matched, we don't + # datetime_obj to further process the df and can simply return len + feature_collection['numberMatched'] = len(df) + return feature_collection + + if _BBOX_DEFINED := len(bbox) == 4: # noqa + minx, miny, maxx, maxy = bbox + bbox_geom = box(minx, miny, maxx, maxy) + df = df[df['geometry'].intersects(bbox_geom)] + elif _INVALID_BBOX := (len(bbox) != 4 and len(bbox) != 0): # noqa + raise ProviderQueryError( + 'bbox must be a list of 4 values got {}'.format(len(bbox)) + ) + + if sortby: + sort_keys = [sort_key['property'] for sort_key in sortby] + + for sort_specifier in sortby: + if '+' != sort_specifier['order'] and '-' != sort_specifier['order']: + raise ProviderQueryError( + 'sortby order must be + or - got {}'.format( + sort_specifier['order'] + ) + ) + + sort_directions = [ + True if sort_key['order'] == '+' else False for sort_key in sortby + ] + + df = df.sort_values(by=sort_keys, ascending=sort_directions) + + if datetime_ is not None: + df = self._filter_by_date(df, datetime_) + + for _, row in df.iterrows(): + feature: Feature = { + 'type': 'Feature', + 'id': str(row[self.id_field]), + 'properties': OrderedDict(), + 'geometry': {'type': None, 'coordinates': None}, + } + + if skip_geometry: + feature['geometry'] = None + else: + if hasattr(self, 'geometry_x') and hasattr(self, 'geometry_y'): + feature['geometry']['coordinates'] = [ + float(row[self.geometry_x]), + 
float(row[self.geometry_y]),
+                    ]
+                    feature['geometry']['type'] = 'Point'
+                elif hasattr(self, 'geometry_col'):
+                    feature['geometry']['coordinates'] = shapely.to_geojson(
+                        row[self.geometry_col]
+                    )
+                    feature['geometry']['type'] = row[self.geometry_col].geom_type
+                else:
+                    raise ProviderQueryError(
+                        'The config passed in does not specify which geometry column to use'
+                    )
+
+            for key, value in row.items():
+                properties_to_keep = set(self.properties) | (set(select_properties))
+
+                # If no properties are specified to filter by, we have a no-op filter
+                KEEP_ALL = len(properties_to_keep) == 0
+
+                if KEEP_ALL or key in properties_to_keep:
+                    if key not in self._exclude_from_properties:
+                        feature['properties'][key] = value
+
+            if identifier and feature['id'] == identifier:  # GeoJSON features store the identifier under 'id', not self.id_field
+                found = True
+                result = feature
+
+            feature_collection['features'].append(feature)
+        feature_collection['numberMatched'] = len(feature_collection['features'])
+
+        if identifier:
+            return None if not found else result
+
+        feature_collection['features'] = feature_collection['features'][
+            offset : offset + limit
+        ]
+        feature_collection['numberReturned'] = len(feature_collection['features'])
+
+        # After we have used the timestamps for querying we need to convert them back to strings
+        for feature in feature_collection['features']:
+            if self.time_field in feature['properties']:
+                feature['properties'][self.time_field] = str(
+                    feature['properties'][self.time_field]
+                )
+
+        return feature_collection
+
+    @crs_transform
+    def get(self, identifier: str, **kwargs):
+        """
+        query the provider by id
+
+        :param identifier: feature id
+
+        :returns: dict of single GeoJSON feature
+        """
+        # Compare as strings so non-string id columns still match the identifier
+        res: geopandas.GeoSeries = self.gdf[
+            self.gdf[self.id_field].astype(str) == identifier
+        ].squeeze(axis=0)
+        if res.empty:
+            err = f'item {identifier} not found'
+            LOGGER.error(err)
+            raise ProviderItemNotFoundError(err)
+
+        feature: Feature = {}
+        feature['type'] = 'Feature'
+        feature['id'] = res[self.id_field]
+        
feature['properties'] = {k: v for k, v in res.items()} + return feature + + def create(self, item): + """ + Create a new item + + :param item: `dict` of new item + + :returns: identifier of created item + """ + + if len(item) != len(self.gdf.columns): + raise ProviderQueryError('Item to update does not match dataframe shape') + + self.gdf = self.gdf._append(item, ignore_index=True) + + return self.gdf[self.id_field].iloc[-1] + + def update(self, identifier, item: dict[str, any]): + """ + Updates an existing item + + :param identifier: feature id + :param item: `dict` of partial or full item + + :returns: `bool` of update result + """ + + if len(self.gdf) == 0: + raise ProviderNoDataError('No data in provider') + + if len(item) != len(self.gdf.columns): + raise ProviderQueryError('Item to update does not match dataframe shape') + + if identifier not in self.gdf[self.id_field].values: + return False + + # Find the index of the row that matches the identifier + index = self.gdf[self.gdf[self.id_field] == identifier].index[0] + + # Update the row with the new item values + for key, value in item.items(): + self.gdf.at[index, key] = value + + # Return True to indicate successful update + return True + + def delete(self, identifier): + """ + Deletes an existing item + + :param identifier: item id + + :returns: `bool` of deletion result + """ + try: + self.gdf = self.gdf[self.gdf[self.id_field] != identifier] + return True + except Exception as e: + LOGGER.error(e) + return False + + def __repr__(self): + return f' {self.type}' diff --git a/requirements.txt b/requirements.txt index 47a0f7c..765baa9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ pygeoapi geoalchemy +geopandas +numpy<2.0.0 psycopg2 pygeofilter[backend-sqlalchemy] requests diff --git a/tests/data/hu02.gpkg b/tests/data/hu02.gpkg new file mode 100644 index 0000000..feb06e6 Binary files /dev/null and b/tests/data/hu02.gpkg differ diff --git a/tests/data/obs.csv b/tests/data/obs.csv new 
file mode 100644 index 0000000..0f9430f --- /dev/null +++ b/tests/data/obs.csv @@ -0,0 +1,6 @@ +id,stn_id,datetime,value,lat,long +371,35,"2001-10-30T14:24:55Z",89.9,45,-75 +377,35,"2002-10-30T18:31:38Z",93.9,45,-75 +238,2147,"2007-10-30T08:57:29Z",103.5,43,-79 +297,2147,"2003-10-30T07:37:29Z",93.5,43,-79 +964,604,"2000-10-30T18:24:39Z",99.9,49,-122 diff --git a/tests/data/station_list.csv b/tests/data/station_list.csv new file mode 100644 index 0000000..ef8dd50 --- /dev/null +++ b/tests/data/station_list.csv @@ -0,0 +1,80 @@ +station_name,wigos_station_identifier,traditional_station_identifier,facility_type,latitude,longitude,elevation,territory_name,wmo_region +BONIFATI (16337-0),0-20000-0-16337,16337,Land (fixed),39.5847222222,15.8913888889,484,Italy,6 +DECIMOMANNU,0-20000-0-16546,16546,Land (fixed),39.3461111111,8.9675,29,Italy,6 +CAMPOBASSO,0-20000-0-16252,16252,Land (fixed),41.5636111111,14.655,793,Italy,6 +GRAZZANISE,0-20000-0-16253,16253,Land (fixed),41.0605555556,14.0788888889,9.19,Italy,6 +PRATICA DI MARE,0-20000-0-16245,16245,Land (fixed),41.6555555556,12.4480555556,12.3,Italy,6 +ILLIZI,0-20000-0-60640,60640,Land (fixed),26.71916,8.61722,542,Algeria,1 +SKIKDA,0-20000-0-60355,60355,Land (fixed),36.88178,6.93503,2,Algeria,1 +PIAN ROSA,0-20000-0-16052,16052,Land (fixed),45.935,7.7061111111,3480,Italy,6 +PRIZZI,0-20000-0-16434,16434,Land (fixed),37.7227777778,13.4280555556,1034,Italy,6 +DOBBIACO,0-20000-0-16033,16033,Land (fixed),46.73,12.22,1222,Italy,6 +CATANIA SIGONELLA,0-20000-0-16459,16459,Land (fixed),37.4055555556,14.9186111111,24,Italy,6 +BALAKA,0-454-2-AWSBALAKA,AWSBALAKA,Land (fixed),-14.983333,34.966666,618,Malawi,1 +MECHERIA,0-20000-0-60549,60549,Land (fixed),33.54581,-0.23527,1123.2,Algeria,1 +TERMOLI,0-20000-0-16232,16232,Land (fixed),42.0041666667,14.9963888889,16,Italy,6 +VIGNA DI VALLE,0-20000-0-16224,16224,Land (fixed),42.0802777778,12.2113888889,260,Italy,6 +HASSI-MESSAOUD,0-20000-0-60581,60581,Land (fixed),31.65861,6.14138,140,Algeria,1 
+DJELFA,0-20000-0-60535,60535,Land (fixed),34.65361,3.28138,1180,Algeria,1 +PAGANELLA,0-20000-0-16022,16022,Land (fixed),46.1597222222,11.0341666667,2125,Italy,6 +MONTE S. ANGELO,0-20000-0-16258,16258,Land (fixed),41.7083333333,15.9477777778,838,Italy,6 +MALOMO,0-454-2-AWSMALOMO,AWSMALOMO,Land (fixed),-13.14202,33.83727,1088,Malawi,1 +TREVICO,0-20000-0-16263,16263,Land (fixed),41.0466666667,15.2327777778,1085,Italy,6 +EL-OUED,0-20000-0-60559,60559,Land (fixed),33.50618,6.78841,63,Algeria,1 +TIMIMOUN,0-20000-0-60607,60607,Land (fixed),29.24412,0.28385,312,Algeria,1 +CONCORDIA,0-20000-0-89625,89625,Land (fixed),-75.1016666667,123.4119444444,3233,Italy,7 +TOUGGOURT,0-20000-0-60555,60555,Land (fixed),33.07011,6.09208,87,Algeria,1 +MONTE SCURO,0-20000-0-16344,16344,Land (fixed),39.3305555556,16.3963888889,1669,Italy,6 +BATNA,0-20000-0-60468,60468,Land (fixed),35.76083,6.31972,821,Algeria,1 +ENNA,0-20000-0-16450,16450,Land (fixed),37.5680555556,14.2797222222,1000,Italy,6 +NAMITAMBO,0-454-2-AWSNAMITAMBO,AWSNAMITAMBO,Land (fixed),-15.84052,35.27428,806,Malawi,1 +TOLEZA,0-454-2-AWSTOLEZA,AWSTOLEZA,Land (fixed),-14.948,34.955,764,Malawi,1 +IN-GUEZZAM,0-20000-0-60690,60690,Land (fixed),19.56388,5.74887,399,Algeria,1 +BRIC DELLA CROCE,0-20000-0-16061,16061,Land (fixed),45.0333333333,7.7316666667,709,Italy,6 +TREVISO/ISTRANA,0-20000-0-16098,16098,Land (fixed),45.6838888889,12.1066666667,42,Italy,6 +KAYEREKERA,0-454-2-AWSKAYEREKERA,AWSKAYEREKERA,Land (fixed),-9.92951,33.67305,848,Malawi,1 +LOBI AWS,0-454-2-AWSLOBI,AWSLOBI,Land (fixed),-14.39528,34.07244,1288,Malawi,1 +CERVIA,0-20000-0-16148,16148,Land (fixed),44.2288888889,12.2919444444,6,Italy,6 +NKHOMA UNIVERSITY,0-454-2-AWSNKHOMA,AWSNKHOMA,Land (fixed),-14.04422,34.10468,1230,Malawi,1 +SAIDA,0-20000-0-60536,60536,Land (fixed),34.89186,0.15774,750,Algeria,1 +PASSO ROLLE,0-20000-0-16021,16021,Land (fixed),46.2977777778,11.7866666667,2004,Italy,6 +AREZZO,0-20000-0-16172,16172,Land (fixed),43.4597222222,11.8455555556,248,Italy,6 
+CAPRI,0-20000-0-16294,16294,Land (fixed),40.5577777778,14.2019444444,160,Italy,6 +TARVISIO,0-20000-0-16040,16040,Land (fixed),46.5055555556,13.5861111111,777,Italy,6 +SETIF/AIN ARNAT,0-20000-0-60445,60445,Land (fixed),36.1666666667,5.3166666667,1009,Algeria,1 +JIJEL- ACHOUAT,0-20000-0-60351,60351,Land (fixed),36.79472,5.87722,8,Algeria,1 +MONTE ARGENTARIO,0-20000-0-16168,16168,Land (fixed),42.3869444444,11.1697222222,630.7,Italy,6 +CAPO CARBONARA,0-20000-0-16564,16564,Land (fixed),39.1038888889,9.5136111111,116,Italy,6 +ANNABA,0-20000-0-60360,60360,Land (fixed),36.822222,7.8025,5,Algeria,1 +BENI-ABBES,0-20000-0-60602,60602,Land (fixed),30.12846,-2.14953,510,Algeria,1 +FRONTONE,0-20000-0-16179,16179,Land (fixed),43.5169444444,12.7277777778,570,Italy,6 +FERRARA (16138-0),0-20000-0-16138,16138,Land (fixed),44.8155555556,11.6125,8,Italy,6 +TIARET,0-20000-0-60511,60511,Land (fixed),35.35542,1.46792,977,Algeria,1 +MASCARA-GHRISS,0-20000-0-60507,60507,Land (fixed),35.20666,0.1525,511,Algeria,1 +EL-BAYADH,0-20000-0-60550,60550,Land (fixed),33.71966,1.09484,1361,Algeria,1 +USTICA,0-20000-0-16400,16400,Land (fixed),38.7072222222,13.1772222222,242,Italy,6 +GROSSETO,0-20000-0-16206,16206,Land (fixed),42.7480555556,11.0588888889,5.4,Italy,6 +BENI OUNIF,0-12-0-08BECCN60577,60577,Land (fixed),32.05138,-1.26527,830,Algeria,1 +OCNA SUGATAG,0-20000-0-15015,15015,Land (fixed),47.7770616258,23.9404602638,503,Romania,6 +BOTOSANI,0-20000-0-15020,15020,Land (fixed),47.7356532437,26.6455501701,161,Romania,6 +IASI,0-20000-0-15090,15090,Land (fixed),47.163333333,27.6272222222,74.29,Romania,6 +CEAHLAU TOACA,0-20000-0-15108,15108,Land (fixed),46.9775099973,25.9499399749,1897,Romania,6 +CLUJ-NAPOCA,0-20000-0-15120,15120,Land (fixed),46.7777705044,23.5713052939,410,Romania,6 +BACAU,0-20000-0-15150,15150,Land (fixed),46.5577777778,26.8966666667,174,Romania,6 +MIERCUREA CIUC,0-20000-0-15170,15170,Land (fixed),46.3713166568,25.7726166755,661,Romania,6 +ARAD,0-20000-0-15200,15200,Land 
(fixed),46.1335163958,21.3536215174,116.59,Romania,6 +DEVA,0-20000-0-15230,15230,Land (fixed),45.8649230138,22.898806236,240,Romania,6 +SIBIU,0-20000-0-15260,15260,Land (fixed),45.79018,24.036245,450,Romania,6 +VARFU OMU,0-20000-0-15280,15280,Land (fixed),45.4457927989,25.456690976,2504,Romania,6 +CARANSEBES,0-20000-0-15292,15292,Land (fixed),45.41667,22.22917,241,Romania,6 +GALATI,0-20000-0-15310,15310,Land (fixed),45.4729181384,28.0323010582,69,Romania,6 +TULCEA,0-20000-0-15335,15335,Land (fixed),45.1905064849,28.8241607619,4.36,Romania,6 +RAMNICU VALCEA,0-20000-0-15346,15346,Land (fixed),45.0888211225,24.3628139123,237,Romania,6 +BUZAU,0-20000-0-15350,15350,Land (fixed),45.1326632857,26.8517319231,97,Romania,6 +SULINA,0-20000-0-15360,15360,Land (fixed),45.1623111,29.7268286,12.69,Romania,6 +DROBETA-TURNU SEVERIN,0-20000-0-15410,15410,Land (fixed),44.6264587019,22.6260737132,77,Romania,6 +BUCURESTI BANEASA,0-20000-0-15420,15420,Land (fixed),44.5104330044,26.0781904077,90,Romania,6 +CRAIOVA,0-20000-0-15450,15450,Land (fixed),44.3101404313,23.8669847441,192,Romania,6 +CALARASI,0-20000-0-15460,15460,Land (fixed),44.2057385289,27.3383080718,18.72,Romania,6 +ROSIORII DE VEDE,0-20000-0-15470,15470,Land (fixed),44.1072133362,24.9787400713,102.15,Romania,6 +CONSTANTA,0-20000-0-15480,15480,Land (fixed),44.2138143888,28.6455646839,12.8,Romania,6 diff --git a/tests/test_geopandas_provider.py b/tests/test_geopandas_provider.py new file mode 100644 index 0000000..106511e --- /dev/null +++ b/tests/test_geopandas_provider.py @@ -0,0 +1,347 @@ +# ================================================================= +# +# Authors: Colton Loftus +# +# Copyright (c) 2024 Colton Loftus +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, 
sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# ================================================================= + +import datetime + +import geopandas as gpd +import pytest +import shapely + +from pygeoapi.provider.base import ProviderItemNotFoundError + +from pygeoapi_plugins.provider.geopandas_ import GeoPandasProvider + + +@pytest.fixture() +def config(): + return { + 'name': 'CSV', + 'type': 'feature', + 'data': 'tests/data/obs.csv', + 'id_field': 'id', + 'geometry': {'x_field': 'long', 'y_field': 'lat'}, + } + + +@pytest.fixture() +def station_config(): + return { + 'name': 'CSV', + 'type': 'feature', + 'data': 'tests/data/station_list.csv', + 'id_field': 'wigos_station_identifier', + 'geometry': {'x_field': 'longitude', 'y_field': 'latitude'}, + } + + +@pytest.fixture() +def gpkg_config(): + return { + 'name': 'gpkg', + 'type': 'feature', + 'data': 'tests/data/hu02.gpkg', + 'id_field': 'HUC2', + } + + +def test_csv_query(config): + p = GeoPandasProvider(config) + + fields = p.get_fields() + assert len(fields) == 4 + assert ['id', 'stn_id', 'datetime', 'value'] == list(fields.keys()) + assert fields['value']['type'] == 'number' + assert fields['stn_id']['type'] == 'integer' + + results = p.query() + assert len(results['features']) 
== 5 + assert results['numberMatched'] == 5 + assert results['numberReturned'] == 5 + assert results['features'][0]['id'] == '371' + assert results['features'][0]['properties']['value'] == 89.9 + + assert results['features'][0]['geometry']['coordinates'][0] == -75.0 + assert results['features'][0]['geometry']['coordinates'][1] == 45.0 + + results = p.query(limit=1) + assert len(results['features']) == 1 + assert results['features'][0]['id'] == '371' + + results = p.query(offset=2, limit=1) + assert len(results['features']) == 1 + assert results['features'][0]['id'] == '238' + # should just be stn_id, datetime and value + assert len(results['features'][0]['properties']) == 3 + + results = p.query(select_properties=['value']) + assert len(results['features'][0]['properties']) == 1 + + results = p.query(select_properties=['value', 'stn_id']) + assert len(results['features'][0]['properties']) == 2 + + results = p.query(skip_geometry=True) + assert results['features'][0]['geometry'] is None + + results = p.query(properties=[('stn_id', '604')]) + assert len(results['features']) == 1 + assert results['numberMatched'] == 1 + assert results['numberReturned'] == 1 + + results = p.query(properties=[('stn_id', '35')]) + assert len(results['features']) == 2 + assert results['numberMatched'] == 2 + assert results['numberReturned'] == 2 + + results = p.query(properties=[('stn_id', '35'), ('value', '93.9')]) + assert len(results['features']) == 1 + + config['properties'] = ['value', 'stn_id'] + p = GeoPandasProvider(config) + results = p.query() + assert len(results['features'][0]['properties']) == 2 + + +def test_csv_get(config): + p = GeoPandasProvider(config) + + result = p.get('964') + assert result['id'] == '964' + assert result['properties']['value'] == 99.9 + + +def test_csv_get_not_existing_item_raise_exception(config): + """Testing query for a not existing object""" + p = GeoPandasProvider(config) + with pytest.raises(ProviderItemNotFoundError): + p.get('404') + + +def 
def test_csv_get_station(station_config):
    """Query and get single stations from a CSV-backed provider."""
    p = GeoPandasProvider(station_config)

    results = p.query(limit=20)
    assert len(results['features']) == 20
    assert results['numberMatched'] == 79
    assert results['numberReturned'] == 20

    result = p.get('0-20000-0-16337')
    assert result['properties']['station_name'] == 'BONIFATI (16337-0)'

    result = p.get('0-454-2-AWSNAMITAMBO')
    assert result['properties']['station_name'] == 'NAMITAMBO'


# Make sure the way we are filtering the dataframe works in general
# outside of the provider
def test_intersection():
    """Sanity-check shapely/GeoPandas intersection filtering directly."""
    gdf = gpd.read_file('tests/data/hu02.gpkg')
    gdf = gdf[gdf['HUC2'] == '01']

    minx, miny, maxx, maxy = -70.5, 43.0, -70.0, 43.3
    polygon = shapely.geometry.Polygon(
        [(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny)]
    )
    box = shapely.box(minx, miny, maxx, maxy)
    huc_range: shapely.geometry.MultiPolygon = gdf['geometry'].iloc[0]

    assert isinstance(huc_range, shapely.geometry.MultiPolygon)
    assert isinstance(polygon, shapely.geometry.Polygon)
    # Bare asserts instead of `== True` comparisons (no noqa needed)
    assert shapely.intersects(polygon, huc_range)
    assert shapely.intersects(box, huc_range)

    gdf = gpd.read_file('tests/data/hu02.gpkg')
    box = shapely.box(minx, miny, maxx, maxy)
    gdf = gdf[gdf['geometry'].intersects(box)]

    assert len(gdf) == 1


def test_gpkg_bbox_query(gpkg_config):
    """Exercise limit/offset, skip_geometry, property filters and bbox."""
    p = GeoPandasProvider(gpkg_config)

    results = p.query(limit=1)
    assert len(results['features']) == 1

    results = p.query(offset=1, limit=1)
    assert len(results['features']) == 1

    results = p.query(skip_geometry=True)
    assert results['features'][0]['geometry'] is None

    results = p.query(properties=[('uri', 'https://geoconnex.us/ref/hu02/07')])
    assert len(results['features']) == 1
    assert (
        results['features'][0]['properties']['uri']
        == 'https://geoconnex.us/ref/hu02/07'
    )

    # A degenerate bbox combined with a property filter should match nothing
    results = p.query(
        bbox=(0, 0, 0, 0), properties=[('uri', 'https://geoconnex.us/ref/hu02/07')]
    )
    assert len(results['features']) == 0

    # Should intersect with New England
    results = p.query(bbox=(-70.5, 43.0, -70.0, 43.3))
    assert len(results['features']) == 1
    assert results['features'][0]['id'] == '01'

    # Should intersect with Midatlantic and New England
    results = p.query(bbox=(-74.881, 40.566, -71.249, 41.27))
    assert len(results['features']) == 2
    assert results['features'][0]['id'] == '01'
    assert results['features'][1]['id'] == '02'


def test_gpkg_date_query(gpkg_config):
    """Exercise datetime filtering: exact dates, open and closed ranges."""
    p = GeoPandasProvider(gpkg_config)

    results = p.query(datetime_='2019-10-10')
    assert len(results['features']) == 1
    assert (
        results['features'][0]['properties']['LOADDATE'] == '2019-10-10 20:08:56+00:00'
    )

    # Open-ended ranges entirely before/after the data should match nothing
    results = p.query(datetime_='../1900-09-18T17:34:02.666+00:00')
    assert len(results['features']) == 0

    results = p.query(datetime_='2900-09-18/..')
    assert len(results['features']) == 0

    results = p.query(datetime_='2016-09-22')
    assert len(results['features']) == 1

    results = p.query(datetime_='2016-09-22/2016-11-23')
    assert len(results['features']) == 2
    assert (
        results['features'][0]['properties']['LOADDATE'] == '2016-10-11 21:37:03+00:00'
    )
    assert (
        results['features'][1]['properties']['LOADDATE'] == '2016-09-22 06:01:28+00:00'
    )

    results = p.query(datetime_='2000-01-01T00:00:00Z/2016-11-23')
    assert len(results['features']) == 2
    assert (
        results['features'][0]['properties']['LOADDATE'] == '2016-10-11 21:37:03+00:00'
    )
    assert (
        results['features'][1]['properties']['LOADDATE'] == '2016-09-22 06:01:28+00:00'
    )

    # A bare year should match everything loaded within that year
    results = p.query(datetime_='2016')
    assert len(results['features']) == 2
    assert (
        results['features'][0]['properties']['LOADDATE'] == '2016-10-11 21:37:03+00:00'
    )
    assert (
        results['features'][1]['properties']['LOADDATE'] == '2016-09-22 06:01:28+00:00'
    )


def test_gpkg_sort_query(gpkg_config):
    """Exercise single- and multi-key sorting, including tie-breaking."""
    p = GeoPandasProvider(gpkg_config)

    results = p.query(sortby=[{'property': 'LOADDATE', 'order': '-'}])
    # Sort by descending so we expect the newest date first
    assert (
        results['features'][0]['properties']['LOADDATE'] == '2019-10-31 16:20:07+00:00'
    )

    # Create a dummy row in order to test breaking ties
    dummy_row = {
        'uri': '_',
        'NAME': 'AAAAAAA_THIS_KEY_SHOULD_BE_SORTED_TO_BE_FIRST',
        'gnis_url': '_',
        'GNIS_ID': '_',
        'HUC2': '_',
        # Tie for the latest date in the dataset
        'LOADDATE': datetime.datetime.fromisoformat('2019-10-31T16:20:07+00:00'),
        'geometry': shapely.box(0, 0, 0, 0),
    }

    # Append via public .loc API; the GeoPackage read yields a RangeIndex,
    # so this is equivalent to the removed DataFrame.append(ignore_index=True)
    # without relying on the private pandas _append method.
    p.gdf.loc[len(p.gdf)] = dummy_row
    assert len(p.gdf) == 23

    results = p.query(
        sortby=[
            {'property': 'LOADDATE', 'order': '-'},
            {'property': 'NAME', 'order': '+'},
        ]
    )
    assert (
        results['features'][0]['properties']['NAME']
        == 'AAAAAAA_THIS_KEY_SHOULD_BE_SORTED_TO_BE_FIRST'
    )


def test_transaction(gpkg_config):
    """Exercise the full create/update/get/delete transaction cycle."""
    p = GeoPandasProvider(gpkg_config)

    dummy_row = {
        'uri': '_',
        'NAME': '_',
        'gnis_url': '_',
        'GNIS_ID': '',
        'HUC2': '1111',
        'LOADDATE': datetime.datetime.fromisoformat('2019-10-31T16:20:07+00:00'),
        'geometry': shapely.box(0, 0, 0, 0),
    }

    # Renamed from `id` to avoid shadowing the builtin
    feature_id = p.create(dummy_row)

    assert feature_id == '1111'

    assert len(p.gdf) == 23

    dummy_row_updated = {
        'uri': '_',
        'NAME': 'TEST_NAME',
        'gnis_url': '_',
        'GNIS_ID': '',
        'HUC2': '1111',
        'LOADDATE': datetime.datetime.fromisoformat('2019-10-31T16:20:08+00:00'),
        'geometry': shapely.box(0, 0, 0, 0),
    }

    success = p.update(dummy_row_updated['HUC2'], dummy_row_updated)

    # Update must replace in place, not grow the dataframe
    assert len(p.gdf) == 23

    assert success

    res = p.get('1111')

    assert res['properties']['NAME'] == 'TEST_NAME'

    success = p.delete('1111')

    assert len(p.gdf) == 22
    assert success

    with pytest.raises(ProviderItemNotFoundError):
        res = p.get('1111')