
Validate geojson #9

Open

bdestombe opened this issue Nov 20, 2024 · 3 comments

@bdestombe (Member)

Use geopandas to validate the GeoJSONs:

  • geometries are valid
  • coordinates are in RD (Amersfoort / RD New)
  • the extent in repository.yml is sufficiently accurate

Provide a hatch command to format the GeoJSONs (a possible sketch is shown below).
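
A minimal sketch of what that formatter could look like, assuming it lives in a small script (e.g. scripts/format_geojson.py) that an entry under [tool.hatch.envs.<env>.scripts] in pyproject.toml points at. The script name, the centimetre grid size, and the in-place rewrite are assumptions, not an agreed design:

import sys
from pathlib import Path

import geopandas as gpd
import shapely


def format_geojson(path: Path, grid_size: float = 0.01) -> None:
    """Reproject to RD New and snap coordinates to a fixed precision grid."""
    gdf = gpd.read_file(path)
    # Reproject to Amersfoort / RD New if the file uses another CRS.
    if gdf.crs is not None and gdf.crs.to_epsg() != 28992:
        gdf = gdf.to_crs(epsg=28992)
    # Snap coordinates to the grid (requires shapely >= 2.0); 0.01 m keeps
    # centimetre precision, matching the linter's default tolerance.
    gdf.geometry = gdf.geometry.apply(lambda g: shapely.set_precision(g, grid_size))
    gdf.to_file(path, driver="GeoJSON")


if __name__ == "__main__":
    for arg in sys.argv[1:]:
        format_geojson(Path(arg))

A matching hatch scripts entry could then expose this as something like hatch run format-geojson <files>; the exact wiring is left open here.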

@bdestombe (Member Author)

import math
from typing import Dict, List, Optional, Union

import geopandas as gpd
from shapely import geometry
from shapely.geometry import box

def lint_geojson(
    geojson_path: str,
    reference_extent: Optional[Union[gpd.GeoDataFrame, Dict]] = None,
    tolerance: float = 0.01,
    max_buffer: float = 1000
) -> Dict[str, List[str]]:
    """
    Lint a GeoJSON file for common issues, focusing on Dutch geographic data.

    This function performs comprehensive validation of GeoJSON files, including checks
    for valid geometries, coordinate reference systems (CRS), extent constraints,
    and data quality. It specifically focuses on Dutch geographic standards and
    coordinate systems.

    Parameters
    ----------
    geojson_path : str
        Path to the GeoJSON file to be validated.
    reference_extent : {GeoDataFrame, dict, None}, optional
        Reference extent to check against. If dict, must contain 'minx', 'miny',
        'maxx', 'maxy' keys. If GeoDataFrame, total bounds will be used.
        The default is None.
    tolerance : float, optional
        Tolerance for coordinate precision checks in meters.
        The default is 0.01.
    max_buffer : float, optional
        Maximum allowed buffer distance in meters between data extent and
        reference extent. The default is 1000.

    Returns
    -------
    dict
        Dictionary containing validation results with the following keys:
        - 'critical' : list
            List of critical errors that need immediate attention
        - 'warnings' : list
            List of potential issues that should be reviewed
        - 'info' : list
            General information about the dataset

    Notes
    -----
    The function performs the following checks:
    1. CRS validation
        - Verifies presence of CRS information
        - Checks for Dutch projection systems (EPSG:28992 or EPSG:7415)
    2. Geometry validation
        - Checks for invalid geometries
        - Identifies null or empty geometries
    3. Extent validation
        - Ensures all geometries fall within reference extent
        - Verifies buffer distances don't exceed max_buffer
    4. Coordinate precision
        - Checks for excessive coordinate precision

    The reference extent constraint ensures that the extent is not too loose,
    with a maximum allowed buffer of max_buffer meters in any direction.

    Examples
    --------
    >>> # Using dictionary-based reference extent
    >>> ref_extent = {
    ...     'minx': 120000,
    ...     'miny': 480000,
    ...     'maxx': 122000,
    ...     'maxy': 482000
    ... }
    >>> results = lint_geojson("path/to/file.geojson",
    ...                       reference_extent=ref_extent,
    ...                       max_buffer=1000)
    >>> for level, messages in results.items():
    ...     print(f"{level}:", messages)

    >>> # Using GeoDataFrame as reference extent
    >>> import geopandas as gpd
    >>> ref_gdf = gpd.read_file("reference.geojson")
    >>> results = lint_geojson("path/to/file.geojson",
    ...                       reference_extent=ref_gdf,
    ...                       max_buffer=500)

    See Also
    --------
    geopandas.read_file : Function used to read the GeoJSON file
    shapely.geometry.box : Function used to create bounding boxes
    """
    errors = {
        "critical": [],
        "warnings": [],
        "info": []
    }
    
    try:
        # Read the GeoJSON
        gdf = gpd.read_file(geojson_path)
    except Exception as e:
        errors["critical"].append(f"Failed to read GeoJSON: {str(e)}")
        return errors

    # Check 1: Valid CRS
    if gdf.crs is None:
        errors["critical"].append("Missing CRS information")
    else:
        # Check if using Dutch projection systems
        dutch_crs = ["EPSG:28992", "EPSG:7415"]  # Amersfoort / RD New and ETRS89 / RD New
        if str(gdf.crs) not in dutch_crs:
            errors["warnings"].append(
                f"Non-Dutch CRS detected: {gdf.crs}. "
                f"Consider using Amersfoort/RD New (EPSG:28992)"
            )

    # Check 2: Valid geometries
    invalid_geoms = gdf[~gdf.geometry.is_valid]
    if not invalid_geoms.empty:
        errors["critical"].append(
            f"Found {len(invalid_geoms)} invalid geometries at indices: "
            f"{invalid_geoms.index.tolist()}"
        )

    # Check 3: Enhanced extent validation
    if reference_extent is not None:
        # Get the actual data extent
        data_bounds = gdf.total_bounds
        data_extent = box(*data_bounds)

        # Convert reference extent to a box
        if isinstance(reference_extent, dict):
            ref_box = box(
                reference_extent['minx'], 
                reference_extent['miny'],
                reference_extent['maxx'], 
                reference_extent['maxy']
            )
            ref_gdf = gpd.GeoDataFrame(geometry=[ref_box], crs=gdf.crs)
        else:
            ref_gdf = reference_extent
            ref_box = box(*ref_gdf.total_bounds)

        # Ensure same CRS for comparison
        if ref_gdf.crs != gdf.crs:
            ref_gdf = ref_gdf.to_crs(gdf.crs)
            ref_box = box(*ref_gdf.total_bounds)

        # Check if any geometries fall outside reference extent
        outside_extent = gdf[~gdf.geometry.within(ref_box)]
        if not outside_extent.empty:
            errors["critical"].append(
                f"Found {len(outside_extent)} geometries outside reference extent "
                f"at indices: {outside_extent.index.tolist()}"
            )

        # Check if reference extent is too large compared to data extent
        ref_bounds = ref_gdf.total_bounds
        
        # Calculate buffer distances on each side
        buffer_distances = {
            'west': abs(ref_bounds[0] - data_bounds[0]),
            'south': abs(ref_bounds[1] - data_bounds[1]),
            'east': abs(ref_bounds[2] - data_bounds[2]),
            'north': abs(ref_bounds[3] - data_bounds[3])
        }

        # Check each buffer distance
        excessive_buffers = {
            direction: distance 
            for direction, distance in buffer_distances.items() 
            if distance > max_buffer
        }

        if excessive_buffers:
            errors["critical"].append(
                "Reference extent is too loose. Maximum allowed buffer is "
                f"{max_buffer}m, but found excessive buffers:\n" + 
                "\n".join([
                    f"  - {direction}: {distance:.1f}m"
                    for direction, distance in excessive_buffers.items()
                ])
            )

        # Add extent information
        errors["info"].append(
            "Extent information:\n"
            f"  Data extent: {tuple(data_bounds)}\n"
            f"  Reference extent: {tuple(ref_bounds)}\n"
            "Buffer distances:\n" + 
            "\n".join([
                f"  - {direction}: {distance:.1f}m"
                for direction, distance in buffer_distances.items()
            ])
        )

    # Check 4: Coordinate precision
    def check_coord_precision(geom):
        """
        Check if geometry coordinates exceed specified precision.
        
        Parameters
        ----------
        geom : shapely.geometry
            Geometry to check
            
        Returns
        -------
        bool
            True if precision is within tolerance, False otherwise
        """
        if isinstance(geom, (geometry.Point, geometry.LineString, geometry.LinearRing)):
            coords = geom.coords
        elif isinstance(geom, geometry.Polygon):
            coords = geom.exterior.coords
        else:
            # Multi-part geometries (MultiPolygon, MultiLineString, ...) are
            # not checked here.
            return True

        # A coordinate has excess precision when rounding it to the number of
        # decimals implied by `tolerance` (0.01 -> 2 decimals) changes its value.
        ndigits = max(int(round(-math.log10(tolerance))), 0)
        for x, y, *_ in coords:
            if abs(round(x, ndigits) - x) > 1e-9 or abs(round(y, ndigits) - y) > 1e-9:
                return False
        return True

    imprecise_geoms = gdf[~gdf.geometry.apply(check_coord_precision)]
    if not imprecise_geoms.empty:
        errors["warnings"].append(
            f"Found {len(imprecise_geoms)} geometries with excessive "
            f"coordinate precision at indices: {imprecise_geoms.index.tolist()}"
        )

    # Check 5: Null geometries
    null_geoms = gdf[gdf.geometry.isna()]
    if not null_geoms.empty:
        errors["critical"].append(
            f"Found {len(null_geoms)} null geometries at indices: "
            f"{null_geoms.index.tolist()}"
        )

    # Check 6: Empty geometries
    empty_geoms = gdf[gdf.geometry.is_empty]
    if not empty_geoms.empty:
        errors["warnings"].append(
            f"Found {len(empty_geoms)} empty geometries at indices: "
            f"{empty_geoms.index.tolist()}"
        )

    # Check 7: Basic attribute checks
    if gdf.empty:
        errors["critical"].append("GeoJSON contains no features")
    else:
        errors["info"].append(f"Total features: {len(gdf)}")
        errors["info"].append(f"Column names: {list(gdf.columns)}")

    return errors
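
To connect the linter to the repository.yml bullet above, a small driver could read that file and run lint_geojson over every GeoJSON in a collection. This is only a sketch: the assumption that repository.yml holds an extent mapping with minx/miny/maxx/maxy keys (and that PyYAML is available) may not match the actual schema.

from pathlib import Path

import yaml  # PyYAML


def lint_repository(repo_dir: str, max_buffer: float = 1000) -> dict:
    """Run lint_geojson on every GeoJSON in a folder against its repository.yml extent."""
    repo = Path(repo_dir)
    with open(repo / "repository.yml") as f:
        config = yaml.safe_load(f)

    # Assumed layout: repository.yml contains
    #   extent: {minx: ..., miny: ..., maxx: ..., maxy: ...}
    reference_extent = config["extent"]

    results = {}
    for geojson_file in sorted(repo.glob("*.geojson")):
        results[geojson_file.name] = lint_geojson(
            str(geojson_file),
            reference_extent=reference_extent,
            max_buffer=max_buffer,
        )
    return results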

@bdestombe (Member Author) commented Nov 20, 2024

Double post

@bdestombe (Member Author) commented Nov 20, 2024

import warnings
from pathlib import Path
from typing import Dict, List, Tuple

import pytest

from geojson_linter import lint_geojson

# Collection configuration
COLLECTIONS = [
    {
        'folder': 'path/to/a',
        'extent': {
            'minx': 120000,
            'miny': 480000,
            'maxx': 122000,
            'maxy': 482000
        }
    },
    {
        'folder': 'path/to/b',
        'extent': {
            'minx': 85000,
            'miny': 435000,
            'maxx': 87000,
            'maxy': 437000
        }
    }
]

def get_collection_files() -> List[Tuple[Path, Dict]]:
    """Get all GeoJSON files and their corresponding extents from collections."""
    test_cases = []
    
    for collection in COLLECTIONS:
        folder = Path(collection['folder'])
        geojson_files = list(folder.glob('*.geojson')) + list(folder.glob('*.json'))
        
        for file_path in geojson_files:
            test_cases.append((file_path, collection['extent']))
            
    return test_cases

@pytest.mark.parametrize('geojson_file,extent', get_collection_files())
def test_geojson_file(geojson_file: Path, extent: Dict):
    """Test individual GeoJSON file against its collection's extent."""
    results = lint_geojson(
        str(geojson_file),
        reference_extent=extent,
        max_buffer=1000
    )

    if results['warnings']:
        # pytest has no `pytest.warn`; emit a plain warning so it shows up in
        # the test summary without failing the test.
        warnings.warn(
            f"Warnings in {geojson_file.name}:\n" + "\n".join(results['warnings']),
            UserWarning,
        )

    assert not results['critical'], \
        f"Critical errors in {geojson_file.name}:\n" + "\n".join(results['critical'])

Only test the most recent versions of the mock-up data (one way to filter for those is sketched below).
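
One reading of that requirement, assuming the mock-up files carry a version suffix in their file names (e.g. layers_v2.geojson); both the naming convention and the helper below are assumptions:

import re
from pathlib import Path
from typing import Dict, List, Tuple


def get_latest_collection_files() -> List[Tuple[Path, Dict]]:
    """Like get_collection_files, but keep only the highest _v<N> per file stem."""
    latest: Dict[Tuple[str, str], Tuple[int, Path, Dict]] = {}

    for collection in COLLECTIONS:
        folder = Path(collection['folder'])
        for file_path in list(folder.glob('*.geojson')) + list(folder.glob('*.json')):
            # Split "layers_v2" into stem "layers" and version 2; unversioned
            # files get version 0.
            match = re.match(r'(?P<stem>.+)_v(?P<version>\d+)$', file_path.stem)
            stem = match.group('stem') if match else file_path.stem
            version = int(match.group('version')) if match else 0
            key = (collection['folder'], stem)
            if key not in latest or version > latest[key][0]:
                latest[key] = (version, file_path, collection['extent'])

    return [(path, extent) for _, path, extent in latest.values()]

The parametrize decorator would then point at get_latest_collection_files() instead of get_collection_files().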

bdestombe added this to NHFLO on Nov 21, 2024