Skip to content

Commit

Permalink
feat(models): Add GFS Model repository
Browse files Browse the repository at this point in the history
  • Loading branch information
devsjc committed Nov 22, 2024
1 parent f16637f commit dbbcf57
Show file tree
Hide file tree
Showing 24 changed files with 1,025 additions and 456 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/branch_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

- name: Lint package
run: uv run ruff check --output-format=github .
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

# Run unittests
# * Produce JUnit XML report
Expand Down Expand Up @@ -113,7 +113,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

- name: Build documentation
run: uv run pydoctor
Expand Down Expand Up @@ -174,3 +174,4 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
platforms: linux/amd64,linux/arm64
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache

4 changes: 2 additions & 2 deletions .github/workflows/tagged_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync
run: uv sync --no-dev

# Building the wheel dynamically assigns the version according to git
# * The setuptools_git_versioning package reads the git tags and assigns the version
Expand All @@ -118,4 +118,4 @@ jobs:
uses: pypa/[email protected]
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
password: ${{ secrets.PYPI_API_TOKEN }}
112 changes: 87 additions & 25 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,20 +1,85 @@
# Build a virtualenv using miniconda
# * Conda creates a completely isolated environment,
# including all required shared libraries, enabling
# just putting the virtualenv into a distroless image
# without having to faff around with linking all
# the filelist (including for each dependency) of
# https://packages.debian.org/trixie/libpython3.12-dev, e.g.
#
# echo "Copying symlinked python binary into venv" && \
# cp --remove-destination /usr/local/bin/python3.12 /venv/bin/python && \
# echo "Copying libpython package into venv" && \
# cp -r /usr/local/lib/* /venv/lib/ && \
# cp -r /usr/local/include/python3.12/* /venv/include/ && \
# mkdir -p /venv/lib/aarch64-linux-gnu/ && \
# cp -r /usr/lib/aarch64-linux-gnu/* /venv/lib/aarch64-linux-gnu/ && \
# mkdir -p /venv/include/aarch64-linux-gnu/ && \
# cp -r /usr/include/aarch64-linux-gnu/* /venv/include/aarch64-linux-gnu/ && \
# POTENTIAL FOR SMALLER CONTAINERFILE IF THIS CAN BE GOT WORKING


# # --- Base Python image -----------------------------------------------------------------
# FROM python:3.12-bookworm AS python-base
#
# # --- Distroless Container creation -----------------------------------------------------
# FROM gcr.io/distroless/cc-debian12 AS python-distroless
#
# ARG CHIPSET_ARCH=aarch64-linux-gnu
#
# # Copy the python installation from the base image
# COPY --from=python-base /usr/local/lib/ /usr/local/lib/
# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python
# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache
#
# # Add common compiled libraries
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libz.so.1 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi.so.8 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libbz2.so.1.0 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libm.so.6 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libc.so.6 /lib/${CHIPSET_ARCH}/
#
# # Don't generate .pyc, enable tracebacks
# ENV LANG=C.UTF-8 \
# LC_ALL=C.UTF-8 \
# PYTHONDONTWRITEBYTECODE=1 \
# PYTHONFAULTHANDLER=1
#
# # Check python installation works
# COPY --from=python-base /bin/rm /bin/rm
# COPY --from=python-base /bin/sh /bin/sh
# RUN python --version
# RUN rm /bin/sh /bin/rm
#
# # --- Virtualenv builder image ----------------------------------------------------------
# FROM python-base AS build-venv
# COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
#
# ENV UV_LINK_MODE=copy \
# UV_COMPILE_BYTECODE=1 \
# UV_PYTHON_DOWNLOADS=never \
# UV_PYTHON=python3.12 \
# UV_NO_CACHE=1 \
# CFLAGS="-g0 -Wl,--strip-all" \
# VENV=/.venv
#
# COPY pyproject.toml ./
#
# # Synchronize DEPENDENCIES without the application itself.
# # This layer is cached until uv.lock or pyproject.toml change.
# # Delete any unwanted parts of the installed packages to reduce size
# RUN uv venv ${VENV} && \
# echo "Installing dependencies into ${VENV}" && \
# mkdir src && \
# du -h ${VENV}/lib/python3.12/site-packages && \
# uv sync --no-dev --no-install-project && \
# echo "Copying libpython package into ${VENV}" && \
# cp --remove-destination /usr/local/bin/python3.12 ${VENV}/bin/python && \
# cp /usr/local/lib/libpython3.12.so.1.0 ${VENV}/lib/ && \
# echo "Optimizing site-packages" && \
# rm -r ${VENV}/lib/python3.12/site-packages/**/tests && \
# du -h ${VENV}/lib/python3.12/site-packages | sort -h | tail -n 4
#
# COPY . /src
# RUN uv pip install --no-deps /src && ls /.venv/bin
#
# # --- Distroless App image --------------------------------------------------------------
# FROM python-distroless
#
# COPY --from=build-venv /.venv /venv
#
# ENV RAWDIR=/work/raw \
# ZARRDIR=/work/data \
# ECCODES_DEFINITION_PATH=.venv/share/eccodes/definitions
#
# ENTRYPOINT ["/venv/bin/nwp-consumer-cli"]
# VOLUME /work
# STOPSIGNAL SIGINT


# WORKING CONTAINERFILE

FROM quay.io/condaforge/miniforge3:latest AS build-venv

Expand All @@ -30,25 +95,22 @@ COPY pyproject.toml /_lock/
# Synchronize DEPENDENCIES without the application itself.
# This layer is cached until uv.lock or pyproject.toml change.
# Delete any unwanted parts of the installed packages to reduce size
RUN --mount=type=cache,target=/root/.cache \
apt-get update && apt-get install build-essential -y && \
RUN apt-get -qq update && apt-get -qq -y install gcc && \
echo "Creating virtualenv at /venv" && \
conda create -qy -p /venv python=3.12 numcodecs
RUN which gcc
conda create --quiet --yes -p /venv python=3.12 numcodecs eccodes
RUN echo "Installing dependencies into /venv" && \
cd /_lock && \
mkdir src && \
uv sync --no-dev --no-install-project && \
echo "Optimizing /venv site-packages" && \
rm -r /venv/lib/python3.12/site-packages/**/tests && \
rm -r /venv/lib/python3.12/site-packages/**/_*cache*

rm -r /venv/lib/python3.12/site-packages/**/_*cache* && \
rm -r /venv/share/eccodes/definitions/bufr

# Then install the application itself
# * Delete the test and cache folders from installed packages to reduce size
COPY . /src
RUN --mount=type=cache,target=/root/.cache \
uv pip install --no-deps --python=$UV_PROJECT_ENVIRONMENT /src
RUN uv pip install --no-deps --python=$UV_PROJECT_ENVIRONMENT /src

# Copy the virtualenv into a distroless image
# * These are small images that only contain the runtime dependencies
Expand Down
27 changes: 12 additions & 15 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,39 @@ authors = [
classifiers = ["Programming Language :: Python :: 3"]
dependencies = [
"dask == 2024.8.1",
"eccodes == 2.38.1",
"eccodes == 2.38.3",
"ecmwf-api-client == 1.6.3",
"cfgrib == 0.9.14.0",
"cfgrib == 0.9.14.1",
"dagster-pipes == 1.8.5",
"joblib == 1.4.2",
"numpy == 2.1.0",
"ocf-blosc2 == 0.0.11",
"psutil == 6.0.0",
"requests == 2.32.3",
"returns == 0.23.0",
"s3fs == 2024.9.0",
"xarray == 2024.9.0",
"zarr == 2.18.2"
"zarr == 2.18.3"
]

[project.optional-dependencies]
test = [
[dependency-groups]
dev = [
# Testing
"botocore == 1.33.7", # Required for moto, prevents installing the whole of boto3
"flask == 3.0.0",
"flask-cors == 4.0.0",
"moto[s3,server] == 4.2.11",
"unittest-xml-reporting == 3.2.0",
"hypothesis == 6.115.3",
]
lint = [
# Linting
"returns[compatible-mypy]",
"ruff == 0.6.9",
"pandas-stubs",
"types-psutil",
"types-pytz",
"types-pyyaml",
]
docs = [
# Docs
"pydoctor >= 24.3.0",
]
dev = [
"nwp-consumer[test,lint,docs]",
]
lsp = [
# IDE support
"python-lsp-server",
"pylsp-mypy",
"python-lsp-ruff",
Expand Down Expand Up @@ -102,6 +97,8 @@ plugins = [
# If they are ever made, remove from here!
module = [
"cfgrib",
"botocore.session",
"botocore.client",
"joblib",
"ocf_blosc2",
"s3fs",
Expand Down
3 changes: 3 additions & 0 deletions src/nwp_consumer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
+-------------------------------+-------------------------------------+---------------------------------------------+
| MODEL_REPOSITORY | The model repository to use. | ceda-metoffice-global |
+-------------------------------+-------------------------------------+---------------------------------------------+
| CONCURRENCY | Whether to use concurrency. | True |
+-------------------------------+-------------------------------------+---------------------------------------------+
Development Documentation
Expand Down Expand Up @@ -149,6 +151,7 @@
"gribapi",
"aiobotocore",
"s3fs",
"fsspec",
"asyncio",
"botocore",
"cfgrib",
Expand Down
9 changes: 4 additions & 5 deletions src/nwp_consumer/cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ def parse_env() -> Adaptors:
"""Parse from the environment."""
model_repository_adaptor: type[ports.ModelRepository]
match os.getenv("MODEL_REPOSITORY"):
case None:
log.error("MODEL_REPOSITORY is not set in environment.")
sys.exit(1)
case None | "gfs":
model_repository_adaptor = repositories.NOAAS3ModelRepository
case "ceda":
model_repository_adaptor = repositories.CedaMetOfficeGlobalModelRepository
case "ecmwf-realtime-s3":
model_repository_adaptor = repositories.CEDAFTPModelRepository
case "ecmwf-realtime":
model_repository_adaptor = repositories.ECMWFRealTimeS3ModelRepository
case _ as model:
log.error(f"Unknown model: {model}")
Expand Down
64 changes: 59 additions & 5 deletions src/nwp_consumer/internal/entities/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@

import dataclasses
import datetime as dt
import json
from importlib.metadata import PackageNotFoundError, version

import dask.array
import numpy as np
import pandas as pd
import pytz
Expand All @@ -46,6 +49,11 @@

from .parameters import Parameter

# Resolve the installed package version for embedding in output metadata
# (used e.g. in the "produced_by" attribute of generated DataArrays).
# importlib.metadata.version only knows about installed distributions, so
# fall back to a placeholder when running from an uninstalled source tree.
try:
    __version__ = version("nwp-consumer")
except PackageNotFoundError:
    __version__ = "v?"


@dataclasses.dataclass(slots=True)
class NWPDimensionCoordinateMap:
Expand All @@ -69,7 +77,7 @@ class NWPDimensionCoordinateMap:
"""The forecast step times.
This corresponds to the horizon of the values, which is the time
difference between the forecast initialisation time and the target
difference between the forecast initialization time and the target
time at which the forecast data is valid.
"""
variable: list[Parameter]
Expand Down Expand Up @@ -207,7 +215,7 @@ def to_pandas(self) -> dict[str, pd.Index]: # type: ignore
This is useful for interoperability with xarray, which prefers to define
DataArray coordinates using a dict of pandas Index objects.
For the most part, the conversion consists of a straighforward cast
For the most part, the conversion consists of a straightforward cast
to a pandas Index object. However, there are some caveats involving
the time-centric dimensions:
Expand Down Expand Up @@ -367,11 +375,57 @@ def default_chunking(self) -> dict[str, int]:
that wants to cover the entire dimension should have a size equal to the
dimension length.
It defaults to a single chunk per init time and step, and a single chunk
for each entire other dimension.
It defaults to a single chunk per init time and step, and 8 chunks
for each entire other dimension. These are purposefully small, to ensure
that when performing parallel writes, chunk boundaries are not crossed.
"""
out_dict: dict[str, int] = {
"init_time": 1,
"step": 1,
} | {dim: len(getattr(self, dim)) for dim in self.dims if dim not in ["init_time", "step"]}
} | {
dim: len(getattr(self, dim)) // 8 if len(getattr(self, dim)) > 8 else 1
for dim in self.dims
if dim not in ["init_time", "step"]
}

return out_dict


def as_zeroed_dataarray(self, name: str) -> xr.DataArray:
    """Express the coordinates as an xarray DataArray.

    Data is populated with zeros and a default chunking scheme is applied.
    The zero values themselves are placeholders: only the shape, chunks,
    and coordinates of the resulting array are meaningful downstream.

    Args:
        name: The name of the DataArray.

    Returns:
        A zero-filled, dask-backed DataArray spanning the coordinate space,
        carrying "produced_by" and "variables" metadata attributes.

    See Also:
        - https://docs.xarray.dev/en/stable/user-guide/io.html#distributed-writes
    """
    # Hoist the chunking map out of the comprehension: previously
    # self.default_chunking() was re-evaluated once per dimension.
    chunking: dict[str, int] = self.default_chunking()
    # Create a dask array of zeros with the shape of the dataset
    # * The values of this are ignored, only the shape and chunks are used
    dummy_values = dask.array.zeros(  # type: ignore
        shape=list(self.shapemap.values()),
        chunks=tuple(chunking[k] for k in self.shapemap),
    )
    attrs: dict[str, str] = {
        "produced_by": "".join((
            f"nwp-consumer {__version__} at ",
            f"{dt.datetime.now(tz=dt.UTC).strftime('%Y-%m-%d %H:%M')}",
        )),
        "variables": json.dumps({
            p.value: {
                "description": p.metadata().description,
                "units": p.metadata().units,
            } for p in self.variable
        }),
    }
    # Create a DataArray object with the given coordinates and dummy values
    da: xr.DataArray = xr.DataArray(
        name=name,
        data=dummy_values,
        coords=self.to_pandas(),
        attrs=attrs,
    )
    return da

Loading

0 comments on commit dbbcf57

Please sign in to comment.