Commit 6a86fb1: Merge branch 'main' into joss-paper
barneydobson authored Oct 24, 2024
2 parents 0e5d443 + b5779c1 commit 6a86fb1
Showing 4 changed files with 84 additions and 32 deletions.
30 changes: 11 additions & 19 deletions docs/CONTRIBUTING.md
@@ -4,7 +4,7 @@ Thank you for considering contributing to `SWMManywhere`.

## Bugs

-Please create a new issues (insert link when published)
+Please [create a new issue](https://github.com/ImperialCollegeLondon/SWMManywhere/issues/new)
if you may have found a bug.
Please describe the bug and instructions on recreating it (including OS and
Python version). Label the issue with `bug`.
@@ -14,12 +14,12 @@ Python version). Label the issue with `bug`.
Our intention with `SWMManywhere` is that a high level of customisation to suit
your needs may be achieved by adding new `graphfcns` or new `metrics`, see
below. Other new behaviour may be tagged with `enhancement`, though please
-check existing issues (insert link when published)
+check [existing issues](https://github.com/ImperialCollegeLondon/SWMManywhere/issues)
to see if something similar already exists.

### Take a graph give a graph: `graphfcns`

-All transformations that take place do so on graph functions, you can change
+All transformations that take place do so on graph functions; you can change
the order in which these are executed and add new ones. If you want a
`graphfcn` that does a new thing, please create an issue to discuss with the
label `graphfcn`. If a single new `graphfcn` is not sufficient to capture the
@@ -38,7 +38,7 @@ create a new `metric`, please create an issue to discuss with the label
## Installation for development

To install `SWMManywhere` in development mode, first you will need a virtual
-environment. Here we use a `conda` environment which let us use the version of
+environment. Here we use a `conda` environment which lets us use the version of
python we want to use, but you can use any other tool you are familiar with.
Just make sure you use a version of Python compatible with SWMManywhere.

@@ -59,19 +59,18 @@ cd swmmanywhere

We use [`pip-tools`](https://pip-tools.readthedocs.io/en/latest/) to ensure
consistency in the development process, ensuring all people contributing to
-`SWMManywhere` uses the same versions for all the dependencies, which minimiese
+`SWMManywhere` use the same versions for all the dependencies, which minimises
the conflicts. To install the development dependencies and then `SWMManywhere`
-in development mode run:
+in development mode, run:

```bash
-pip install -r dev-requirements.txt
-pip install -e .
+pip install -e .[dev,doc]
```

## Quality assurance and linting

`SWMManywhere` uses a collection of tools that ensure that a specific code
-style and formatting is follow throughout the software. The tools we used for
+style and formatting is followed throughout the software. The tools we use for
that are [`ruff`](https://docs.astral.sh/ruff/),
[`markdownlint`](https://github.com/igorshubovych/markdownlint-cli),
[`mypy`](https://github.com/pre-commit/mirrors-mypy),
@@ -123,23 +122,16 @@ coverage run -m pytest
coverage report
```

-And generate a new coverage html for the documentation with
+And generate a new coverage html site for the documentation with

```bash
coverage html
```

## Changing dependencies

-As the development process moves forward you find you need to add a new
-dependency, just add it to the relevant section of the `pyproject.toml` file
-and recompile requirements:
-
-```bash
-pip-compile
-pip-compile --extra dev -o dev-requirements.txt pyproject.toml
-pip-compile --extra doc -o doc-requirements.txt pyproject.toml
-```
+As the development process moves forward, you may find you need to add a new
+dependency. Just add it to the relevant section of the `pyproject.toml` file.

Read the
[`pip-tools` documentation](https://pip-tools.readthedocs.io/en/latest/) for
49 changes: 49 additions & 0 deletions src/swmmanywhere/prepare_data.py
@@ -19,6 +19,11 @@
import rioxarray.merge as rxr_merge
import xarray as xr
from geopy.geocoders import Nominatim
from pyarrow import RecordBatchReader
from pyarrow.compute import field
from pyarrow.dataset import dataset
from pyarrow.fs import S3FileSystem
from pyarrow.parquet import ParquetWriter

from swmmanywhere.logging import logger
from swmmanywhere.utilities import yaml_load
@@ -59,6 +64,50 @@ def get_country(x: float, y: float) -> dict[int, str]:
    return {2: iso_country_code, 3: data.get(iso_country_code, "")}


def _record_batch_reader(bbox: tuple[float, float, float, float]) -> RecordBatchReader:
    """Get a pyarrow batch reader for this bounding box and s3 path."""
    s3_region = "us-west-2"
    version = "2024-07-22.0"
    path = f"overturemaps-{s3_region}/release/{version}/theme=buildings/type=building/"
    xmin, ymin, xmax, ymax = bbox
    ds_filter = (
        (field("bbox", "xmin") < xmax)
        & (field("bbox", "xmax") > xmin)
        & (field("bbox", "ymin") < ymax)
        & (field("bbox", "ymax") > ymin)
    )

    ds = dataset(path, filesystem=S3FileSystem(anonymous=True, region=s3_region))
    batches = ds.to_batches(filter=ds_filter)
    non_empty_batches = (b for b in batches if b.num_rows > 0)

    geoarrow_schema = ds.schema.set(
        ds.schema.get_field_index("geometry"),
        ds.schema.field("geometry").with_metadata(
            {b"ARROW:extension:name": b"geoarrow.wkb"}
        ),
    )
    return RecordBatchReader.from_batches(geoarrow_schema, non_empty_batches)
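The `ds_filter` above is the usual axis-aligned bounding-box intersection test, pushed down so the Overture Maps scan only touches rows whose building bboxes overlap the query. A minimal pure-Python sketch of the same predicate (not part of the commit; the function name is illustrative):

```python
# Sketch only: the same overlap predicate as ds_filter, written in plain Python.
# Two boxes (xmin, ymin, xmax, ymax) intersect when each one starts before the
# other ends on both axes.
def bboxes_overlap(
    a: tuple[float, float, float, float],
    b: tuple[float, float, float, float],
) -> bool:
    axmin, aymin, axmax, aymax = a
    bxmin, bymin, bxmax, bymax = b
    return axmin < bxmax and axmax > bxmin and aymin < bymax and aymax > bymin
```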


def download_buildings_bbox(
    file_address: Path, bbox: tuple[float, float, float, float]
) -> None:
    """Retrieve building data in bbox from Overture Maps to file.

    This function is based on
    `overturemaps-py <https://github.com/OvertureMaps/overturemaps-py>`__.

    Args:
        bbox (tuple): Bounding box coordinates (xmin, ymin, xmax, ymax)
        file_address (Path): File address to save the downloaded data.
    """
    reader = _record_batch_reader(bbox)
    with ParquetWriter(file_address, reader.schema) as writer:
        for batch in reader:
            writer.write_batch(batch)
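A hedged usage sketch, not part of the commit: the bounding box and output path below are hypothetical. Because record batches are streamed straight into the `ParquetWriter`, memory stays bounded, and the `geoarrow.wkb` metadata attached above lets `geopandas` read the geometry back, as the new test further down does.

```python
from pathlib import Path

import geopandas as gpd

from swmmanywhere import prepare_data

# Hypothetical small extract around central London; any (xmin, ymin, xmax, ymax)
# bounding box in EPSG:4326 coordinates works.
bbox = (-0.18, 51.49, -0.17, 51.50)
out = Path("buildings.parquet")

prepare_data.download_buildings_bbox(out, bbox)

# Read the footprints back; geometry is decoded via the geoarrow.wkb metadata.
buildings = gpd.read_parquet(out)
print(len(buildings), "building footprints downloaded")
```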


def download_buildings(file_address: Path, x: float, y: float) -> int:
"""Download buildings data based on coordinates and save to a file.
17 changes: 4 additions & 13 deletions src/swmmanywhere/preprocessing.py
@@ -78,23 +78,14 @@ def prepare_elevation(
def prepare_building(
    bbox: tuple[float, float, float, float], addresses: FilePaths, target_crs: str
):
-    """Download, trim and reproject building data."""
+    """Download and reproject building data."""
    if addresses.bbox_paths.building.exists():
        return

-    if not addresses.project_paths.national_building.exists():
-        logger.info(
-            f"""downloading buildings to
-            {addresses.project_paths.national_building}"""
-        )
-        prepare_data.download_buildings(
-            addresses.project_paths.national_building, bbox[0], bbox[1]
-        )
-
-    logger.info(f"trimming buildings to {addresses.bbox_paths.building}")
-    national_buildings = gpd.read_parquet(addresses.project_paths.national_building)
-    buildings = national_buildings.cx[bbox[0] : bbox[2], bbox[1] : bbox[3]]  # type: ignore
+    logger.info(f"downloading buildings to {addresses.bbox_paths.building}")
+    prepare_data.download_buildings_bbox(addresses.bbox_paths.building, bbox)

+    buildings = gpd.read_parquet(addresses.bbox_paths.building)
    buildings = buildings.to_crs(target_crs)
    write_df(buildings, addresses.bbox_paths.building)

20 changes: 20 additions & 0 deletions tests/test_prepare_data.py
@@ -73,6 +73,26 @@ def test_building_downloader_download():
        assert gdf.shape[0] > 0


@pytest.mark.downloads
def test_building_bbox_downloader_download():
    """Check buildings are downloaded."""
    # Small bounding box in central London (xmin, ymin, xmax, ymax)
    bbox = (-0.17929, 51.49638, -0.17383, 51.49846)
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_fid = Path(temp_dir) / "temp.parquet"
        # Download
        downloaders.download_buildings_bbox(temp_fid, bbox)

        # Check file exists
        assert temp_fid.exists(), "Buildings data file not found after download."

        # Load data
        gdf = gpd.read_parquet(temp_fid)

        # Make sure has some rows
        assert gdf.shape[0] > 0


@pytest.mark.downloads
def test_street_downloader_download():
"""Check streets are downloaded and a specific point in the graph."""
