Validate geojson #9
import math
from typing import Dict, List, Optional, Union

import geopandas as gpd
from shapely import geometry
from shapely.geometry import box
def lint_geojson(
    geojson_path: str,
    reference_extent: Optional[Union[gpd.GeoDataFrame, Dict]] = None,
    tolerance: float = 0.01,
    max_buffer: float = 1000
) -> Dict[str, List[str]]:
    """
    Lint a GeoJSON file for common issues, focusing on Dutch geographic data.

    This function performs comprehensive validation of GeoJSON files, including
    checks for valid geometries, coordinate reference systems (CRS), extent
    constraints, and data quality. It specifically focuses on Dutch geographic
    standards and coordinate systems.

    Parameters
    ----------
    geojson_path : str
        Path to the GeoJSON file to be validated.
    reference_extent : {GeoDataFrame, dict, None}, optional
        Reference extent to check against. If dict, must contain 'minx', 'miny',
        'maxx', 'maxy' keys. If GeoDataFrame, its total bounds will be used.
        The default is None.
    tolerance : float, optional
        Tolerance for coordinate precision checks in meters.
        The default is 0.01.
    max_buffer : float, optional
        Maximum allowed buffer distance in meters between the data extent and
        the reference extent. The default is 1000.

    Returns
    -------
    dict
        Dictionary containing validation results with the following keys:

        - 'critical' : list
            List of critical errors that need immediate attention
        - 'warnings' : list
            List of potential issues that should be reviewed
        - 'info' : list
            General information about the dataset

    Notes
    -----
    The function performs the following checks:

    1. CRS validation
       - Verifies presence of CRS information
       - Checks for Dutch projection systems (EPSG:28992 or EPSG:7415)
    2. Geometry validation
       - Checks for invalid geometries
       - Identifies null or empty geometries
    3. Extent validation
       - Ensures all geometries fall within the reference extent
       - Verifies buffer distances don't exceed max_buffer
    4. Coordinate precision
       - Checks for excessive coordinate precision

    The reference extent constraint ensures that the extent is not too loose,
    with a maximum allowed buffer of max_buffer meters in any direction.

    Examples
    --------
    >>> # Using a dictionary-based reference extent
    >>> ref_extent = {
    ...     'minx': 120000,
    ...     'miny': 480000,
    ...     'maxx': 122000,
    ...     'maxy': 482000
    ... }
    >>> results = lint_geojson("path/to/file.geojson",
    ...                        reference_extent=ref_extent,
    ...                        max_buffer=1000)
    >>> for level, messages in results.items():
    ...     print(f"{level}:", messages)

    >>> # Using a GeoDataFrame as reference extent
    >>> import geopandas as gpd
    >>> ref_gdf = gpd.read_file("reference.geojson")
    >>> results = lint_geojson("path/to/file.geojson",
    ...                        reference_extent=ref_gdf,
    ...                        max_buffer=500)

    See Also
    --------
    geopandas.read_file : Function used to read the GeoJSON file.
    shapely.geometry.box : Function used to create bounding boxes.
    """
    errors = {
        "critical": [],
        "warnings": [],
        "info": []
    }

    try:
        # Read the GeoJSON
        gdf = gpd.read_file(geojson_path)
    except Exception as e:
        errors["critical"].append(f"Failed to read GeoJSON: {str(e)}")
        return errors

    # Check 1: Valid CRS
    if gdf.crs is None:
        errors["critical"].append("Missing CRS information")
    else:
        # Check if using Dutch projection systems
        dutch_crs = ["EPSG:28992", "EPSG:7415"]  # Amersfoort / RD New and Amersfoort / RD New + NAP height
        if str(gdf.crs) not in dutch_crs:
            errors["warnings"].append(
                f"Non-Dutch CRS detected: {gdf.crs}. "
                f"Consider using Amersfoort / RD New (EPSG:28992)"
            )

    # Check 2: Valid geometries
    invalid_geoms = gdf[~gdf.geometry.is_valid]
    if not invalid_geoms.empty:
        errors["critical"].append(
            f"Found {len(invalid_geoms)} invalid geometries at indices: "
            f"{invalid_geoms.index.tolist()}"
        )

    # Check 3: Enhanced extent validation
    if reference_extent is not None:
        # Get the actual data extent
        data_bounds = gdf.total_bounds
        data_extent = box(*data_bounds)

        # Convert the reference extent to a box
        if isinstance(reference_extent, dict):
            ref_box = box(
                reference_extent['minx'],
                reference_extent['miny'],
                reference_extent['maxx'],
                reference_extent['maxy']
            )
            ref_gdf = gpd.GeoDataFrame(geometry=[ref_box], crs=gdf.crs)
        else:
            ref_gdf = reference_extent
            ref_box = box(*ref_gdf.total_bounds)

        # Ensure same CRS for comparison
        if ref_gdf.crs != gdf.crs:
            ref_gdf = ref_gdf.to_crs(gdf.crs)
            ref_box = box(*ref_gdf.total_bounds)

        # Check if any geometries fall outside the reference extent
        outside_extent = gdf[~gdf.geometry.within(ref_box)]
        if not outside_extent.empty:
            errors["critical"].append(
                f"Found {len(outside_extent)} geometries outside reference extent "
                f"at indices: {outside_extent.index.tolist()}"
            )

        # Check if the reference extent is too large compared to the data extent
        ref_bounds = ref_gdf.total_bounds

        # Calculate buffer distances on each side
        buffer_distances = {
            'west': abs(ref_bounds[0] - data_bounds[0]),
            'south': abs(ref_bounds[1] - data_bounds[1]),
            'east': abs(ref_bounds[2] - data_bounds[2]),
            'north': abs(ref_bounds[3] - data_bounds[3])
        }

        # Check each buffer distance
        excessive_buffers = {
            direction: distance
            for direction, distance in buffer_distances.items()
            if distance > max_buffer
        }

        if excessive_buffers:
            errors["critical"].append(
                "Reference extent is too loose. Maximum allowed buffer is "
                f"{max_buffer}m, but found excessive buffers:\n" +
                "\n".join([
                    f" - {direction}: {distance:.1f}m"
                    for direction, distance in excessive_buffers.items()
                ])
            )

        # Add extent information
        errors["info"].append(
            "Extent information:\n"
            f" Data extent: {tuple(data_bounds)}\n"
            f" Reference extent: {tuple(ref_bounds)}\n"
            "Buffer distances:\n" +
            "\n".join([
                f" - {direction}: {distance:.1f}m"
                for direction, distance in buffer_distances.items()
            ])
        )
    # Check 4: Coordinate precision
    def check_coord_precision(geom):
        """
        Check if geometry coordinates exceed the specified precision.

        Parameters
        ----------
        geom : shapely.geometry
            Geometry to check.

        Returns
        -------
        bool
            True if precision is within tolerance, False otherwise.
        """
        if isinstance(geom, (geometry.Point, geometry.LineString, geometry.LinearRing)):
            coords = geom.coords
        elif isinstance(geom, geometry.Polygon):
            coords = geom.exterior.coords
        else:
            return True

        # Number of decimal places allowed by the tolerance (e.g. 0.01 m -> 2 decimals)
        decimals = int(-math.log10(tolerance))
        for x, y in coords:
            # Flag coordinates that carry more decimals than the tolerance allows
            if abs(round(x, decimals) - x) > 1e-9 or abs(round(y, decimals) - y) > 1e-9:
                return False
        return True

    imprecise_geoms = gdf[~gdf.geometry.apply(check_coord_precision)]
    if not imprecise_geoms.empty:
        errors["warnings"].append(
            f"Found {len(imprecise_geoms)} geometries with excessive "
            f"coordinate precision at indices: {imprecise_geoms.index.tolist()}"
        )
    # Check 5: Null geometries
    null_geoms = gdf[gdf.geometry.isna()]
    if not null_geoms.empty:
        errors["critical"].append(
            f"Found {len(null_geoms)} null geometries at indices: "
            f"{null_geoms.index.tolist()}"
        )

    # Check 6: Empty geometries
    empty_geoms = gdf[gdf.geometry.is_empty]
    if not empty_geoms.empty:
        errors["warnings"].append(
            f"Found {len(empty_geoms)} empty geometries at indices: "
            f"{empty_geoms.index.tolist()}"
        )

    # Check 7: Basic attribute checks
    if gdf.empty:
        errors["critical"].append("GeoJSON contains no features")
    else:
        errors["info"].append(f"Total features: {len(gdf)}")
        errors["info"].append(f"Column names: {list(gdf.columns)}")

    return errors
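
If a file trips the non-Dutch CRS warning above, reprojecting it with geopandas before re-running the linter is straightforward. A minimal sketch (the file names are placeholders, not files from this repo):

import geopandas as gpd

# Reproject to Amersfoort / RD New (EPSG:28992) and write a new GeoJSON
gdf = gpd.read_file("input.geojson")
gdf.to_crs("EPSG:28992").to_file("input_rd.geojson", driver="GeoJSON")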
Double post
import warnings
from pathlib import Path
from typing import Dict, List, Tuple

import pytest

from geojson_linter import lint_geojson
# Collection configuration
COLLECTIONS = [
    {
        'folder': 'path/to/a',
        'extent': {
            'minx': 120000,
            'miny': 480000,
            'maxx': 122000,
            'maxy': 482000
        }
    },
    {
        'folder': 'path/to/b',
        'extent': {
            'minx': 85000,
            'miny': 435000,
            'maxx': 87000,
            'maxy': 437000
        }
    }
]
def get_collection_files() -> List[Tuple[Path, Dict]]:
    """Get all GeoJSON files and their corresponding extents from the collections."""
    test_cases = []
    for collection in COLLECTIONS:
        folder = Path(collection['folder'])
        geojson_files = list(folder.glob('*.geojson')) + list(folder.glob('*.json'))
        for file_path in geojson_files:
            test_cases.append((file_path, collection['extent']))
    return test_cases
@pytest.mark.parametrize('geojson_file,extent', get_collection_files())
def test_geojson_file(geojson_file: Path, extent: Dict):
    """Test an individual GeoJSON file against its collection's extent."""
    results = lint_geojson(
        str(geojson_file),
        reference_extent=extent,
        max_buffer=1000
    )

    if results['warnings']:
        # pytest has no pytest.warn(); emit a standard warning so it shows up in the test report
        warnings.warn(UserWarning(
            f"Warnings in {geojson_file.name}:\n" +
            "\n".join(results['warnings'])
        ))

    assert not results['critical'], \
        f"Critical errors in {geojson_file.name}:\n" + "\n".join(results['critical'])

Only test the most recent versions of the mockup data
Use geopandas to validate geojsons:
- Provide a hatch command to format the geojsons
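
For the formatting request, one option is a small helper that round-trips every GeoJSON through geopandas and writes it back, which could then be registered as a hatch script (for example under [tool.hatch.envs.default.scripts] in pyproject.toml) and invoked with something like hatch run format-geojsons. A minimal sketch; the script path, function name, and folder argument are illustrative:

# scripts/format_geojsons.py (hypothetical location)
import sys
from pathlib import Path

import geopandas as gpd


def format_geojsons(folder: str) -> None:
    """Read every GeoJSON in `folder` with geopandas and write it back, normalising its formatting."""
    for path in Path(folder).glob("*.geojson"):
        gdf = gpd.read_file(path)
        gdf.to_file(path, driver="GeoJSON")


if __name__ == "__main__":
    format_geojsons(sys.argv[1])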