diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index d3be77f9..96742de2 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -4,7 +4,7 @@ Thank you for considering contributing to `SWMManywhere`. ## Bugs -Please create a new issues (insert link when published) +Please [create a new issue](https://github.com/ImperialCollegeLondon/SWMManywhere/issues/new) if you may have found a bug. Please describe the bug and instructions on recreating it (including OS and Python version). Label the issue with `bug`. @@ -14,12 +14,12 @@ Python version). Label the issue with `bug`. Our intention with `SWMManywhere` is that a high level of customisation to suit your needs may be achieved by adding new `graphfcns` or new `metrics`, see below. Other new behaviour may be tagged with `enhancement`, though please -check existing issues (insert link when published) +check [existing issues](https://github.com/ImperialCollegeLondon/SWMManywhere/issues) to see if something similar already exists. ### Take a graph give a graph: `graphfcns` -All transformations that take place do so on graph functions, you can change +All transformations that take place do so on graph functions; you can change the order in which these are executed and add new ones. If you want a `graphfcn` that does a new thing, please create an issue to discuss with the label `graphfcn`. If a single new `graphfcn` is not sufficient to capture the @@ -38,7 +38,7 @@ create a new `metric`, please create an issue to discuss with the label ## Installation for development To install `SWMManywhere` in development mode, first you will need a virtual -environment. Here we use a `conda` environment which let us use the version of +environment. Here we use a `conda` environment which lets us use the version of python we want to use, but you can use any other tool you are familiar with. Just make sure you use a version of Python compatible with SWMManywhere. 
@@ -59,19 +59,18 @@ cd swmmanywhere We use [`pip-tools`](https://pip-tools.readthedocs.io/en/latest/) to ensure consistency in the development process, ensuring all people contributing to -`SWMManywhere` uses the same versions for all the dependencies, which minimiese +`SWMManywhere` use the same versions for all the dependencies, which minimises the conflicts. To install the development dependencies and then `SWMManywhere` -in development mode run: +in development mode, run: ```bash -pip install -r dev-requirements.txt -pip install -e . +pip install -e .[dev,doc] ``` ## Quality assurance and linting `SWMManywhere` uses a collection of tools that ensure that a specific code -style and formatting is follow throughout the software. The tools we used for +style and formatting is followed throughout the software. The tools we use for that are [`ruff`](https://docs.astral.sh/ruff/), [`markdownlint`](https://github.com/igorshubovych/markdownlint-cli), [`mypy`](https://github.com/pre-commit/mirrors-mypy), @@ -123,7 +122,7 @@ coverage run -m pytest coverage report ``` -And generate a new coverage html for the documentation with +And generate a new coverage html site for the documentation with ```bash coverage html @@ -131,15 +130,8 @@ coverage html ## Changing dependencies -As the development process moves forward you find you need to add a new -dependency, just add it to the relevant section of the `pyproject.toml` file -and recompile requirements: - -```bash -pip-compile -pip-compile --extra dev -o dev-requirements.txt pyproject.toml -pip-compile --extra doc -o doc-requirements.txt pyproject.toml -``` +As the development process moves forward, you may find you need to add a new +dependency. Just add it to the relevant section of the `pyproject.toml` file. 
Read the [`pip-tools` documentation](https://pip-tools.readthedocs.io/en/latest/) for diff --git a/src/swmmanywhere/prepare_data.py b/src/swmmanywhere/prepare_data.py index 31d33887..e98359a0 100644 --- a/src/swmmanywhere/prepare_data.py +++ b/src/swmmanywhere/prepare_data.py @@ -19,6 +19,11 @@ import rioxarray.merge as rxr_merge import xarray as xr from geopy.geocoders import Nominatim +from pyarrow import RecordBatchReader +from pyarrow.compute import field +from pyarrow.dataset import dataset +from pyarrow.fs import S3FileSystem +from pyarrow.parquet import ParquetWriter from swmmanywhere.logging import logger from swmmanywhere.utilities import yaml_load @@ -59,6 +64,50 @@ def get_country(x: float, y: float) -> dict[int, str]: return {2: iso_country_code, 3: data.get(iso_country_code, "")} +def _record_batch_reader(bbox: tuple[float, float, float, float]) -> RecordBatchReader: + """Get a pyarrow batch reader for buildings in this bounding box.""" + s3_region = "us-west-2" + version = "2024-07-22.0" + path = f"overturemaps-{s3_region}/release/{version}/theme=buildings/type=building/" + xmin, ymin, xmax, ymax = bbox + ds_filter = ( + (field("bbox", "xmin") < xmax) + & (field("bbox", "xmax") > xmin) + & (field("bbox", "ymin") < ymax) + & (field("bbox", "ymax") > ymin) + ) + + ds = dataset(path, filesystem=S3FileSystem(anonymous=True, region=s3_region)) + batches = ds.to_batches(filter=ds_filter) + non_empty_batches = (b for b in batches if b.num_rows > 0) + + geoarrow_schema = ds.schema.set( + ds.schema.get_field_index("geometry"), + ds.schema.field("geometry").with_metadata( + {b"ARROW:extension:name": b"geoarrow.wkb"} + ), + ) + return RecordBatchReader.from_batches(geoarrow_schema, non_empty_batches) + + +def download_buildings_bbox( + file_address: Path, bbox: tuple[float, float, float, float] +) -> None: + """Retrieve building data in bbox from Overture Maps to file. + + This function is based on + `overturemaps-py <https://github.com/OvertureMaps/overturemaps-py>`__. 
+ + Args: + bbox (tuple): Bounding box coordinates (xmin, ymin, xmax, ymax) + file_address (Path): File address to save the downloaded data. + """ + reader = _record_batch_reader(bbox) + with ParquetWriter(file_address, reader.schema) as writer: + for batch in reader: + writer.write_batch(batch) + + def download_buildings(file_address: Path, x: float, y: float) -> int: """Download buildings data based on coordinates and save to a file. diff --git a/src/swmmanywhere/preprocessing.py b/src/swmmanywhere/preprocessing.py index 97af0a64..72766569 100644 --- a/src/swmmanywhere/preprocessing.py +++ b/src/swmmanywhere/preprocessing.py @@ -78,23 +78,14 @@ def prepare_elevation( def prepare_building( bbox: tuple[float, float, float, float], addresses: FilePaths, target_crs: str ): - """Download, trim and reproject building data.""" + """Download and reproject building data.""" if addresses.bbox_paths.building.exists(): return - if not addresses.project_paths.national_building.exists(): - logger.info( - f"""downloading buildings to - {addresses.project_paths.national_building}""" - ) - prepare_data.download_buildings( - addresses.project_paths.national_building, bbox[0], bbox[1] - ) - - logger.info(f"trimming buildings to {addresses.bbox_paths.building}") - national_buildings = gpd.read_parquet(addresses.project_paths.national_building) - buildings = national_buildings.cx[bbox[0] : bbox[2], bbox[1] : bbox[3]] # type: ignore + logger.info(f"downloading buildings to {addresses.bbox_paths.building}") + prepare_data.download_buildings_bbox(addresses.bbox_paths.building, bbox) + buildings = gpd.read_parquet(addresses.bbox_paths.building) buildings = buildings.to_crs(target_crs) write_df(buildings, addresses.bbox_paths.building) diff --git a/tests/test_prepare_data.py b/tests/test_prepare_data.py index d51ab15b..08429e56 100644 --- a/tests/test_prepare_data.py +++ b/tests/test_prepare_data.py @@ -73,6 +73,26 @@ def test_building_downloader_download(): assert gdf.shape[0] > 0 
+@pytest.mark.downloads +def test_building_bbox_downloader_download(): + """Check buildings are downloaded for a bounding box.""" + # Small bounding box (xmin, ymin, xmax, ymax) in London, UK + bbox = (-0.17929, 51.49638, -0.17383, 51.49846) + with tempfile.TemporaryDirectory() as temp_dir: + temp_fid = Path(temp_dir) / "temp.parquet" + # Download + downloaders.download_buildings_bbox(temp_fid, bbox) + + # Check file exists + assert temp_fid.exists(), "Buildings data file not found after download." + + # Load data + gdf = gpd.read_parquet(temp_fid) + + # Make sure has some rows + assert gdf.shape[0] > 0 + + @pytest.mark.downloads def test_street_downloader_download(): """Check streets are downloaded and a specific point in the graph."""