Skip to content

Commit

Permalink
feat(models): Add GFS Model repository
Browse files Browse the repository at this point in the history
  • Loading branch information
devsjc committed Nov 22, 2024
1 parent f16637f commit dbbcf57
Show file tree
Hide file tree
Showing 24 changed files with 1,025 additions and 456 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/branch_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

- name: Lint package
run: uv run ruff check --output-format=github .
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

# Run unittests
# * Produce JUnit XML report
Expand Down Expand Up @@ -113,7 +113,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync --extra=dev
run: uv sync

- name: Build documentation
run: uv run pydoctor
Expand Down Expand Up @@ -174,3 +174,4 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
platforms: linux/amd64,linux/arm64
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache

4 changes: 2 additions & 2 deletions .github/workflows/tagged_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:
python-version-file: "pyproject.toml"

- name: Install editable package and required dependencies
run: uv sync
run: uv sync --no-dev

# Building the wheel dynamically assigns the version according to git
# * The setuptools_git_versioning package reads the git tags and assigns the version
Expand All @@ -118,4 +118,4 @@ jobs:
uses: pypa/[email protected]
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
password: ${{ secrets.PYPI_API_TOKEN }}
112 changes: 87 additions & 25 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,20 +1,85 @@
# Build a virtualenv using miniconda
# * Conda creates a completely isolated environment,
# including all required shared libraries, enabling
# just putting the virtualenv into a distroless image
# without having to faff around with linking all
# the filelist (including for each dependency) of
# https://packages.debian.org/trixie/libpython3.12-dev, e.g.
#
# echo "Copying symlinked python binary into venv" && \
# cp --remove-destination /usr/local/bin/python3.12 /venv/bin/python && \
# echo "Copying libpython package into venv" && \
# cp -r /usr/local/lib/* /venv/lib/ && \
# cp -r /usr/local/include/python3.12/* /venv/include/ && \
# mkdir -p /venv/lib/aarch64-linux-gnu/ && \
# cp -r /usr/lib/aarch64-linux-gnu/* /venv/lib/aarch64-linux-gnu/ && \
# mkdir -p /venv/include/aarch64-linux-gnu/ && \
# cp -r /usr/include/aarch64-linux-gnu/* /venv/include/aarch64-linux-gnu/ && \
# POTENTIAL FOR SMALLER CONTAINERFILE IF THIS CAN BE GOT WORKING


# # --- Base Python image -----------------------------------------------------------------
# FROM python:3.12-bookworm AS python-base
#
# # --- Distroless Container creation -----------------------------------------------------
# FROM gcr.io/distroless/cc-debian12 AS python-distroless
#
# ARG CHIPSET_ARCH=aarch64-linux-gnu
#
# # Copy the python installation from the base image
# COPY --from=python-base /usr/local/lib/ /usr/local/lib/
# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python
# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache
#
# # Add common compiled libraries
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libz.so.1 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi.so.8 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libbz2.so.1.0 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libm.so.6 /lib/${CHIPSET_ARCH}/
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libc.so.6 /lib/${CHIPSET_ARCH}/
#
# # Don't generate .pyc, enable tracebacks
# ENV LANG=C.UTF-8 \
# LC_ALL=C.UTF-8 \
# PYTHONDONTWRITEBYTECODE=1 \
# PYTHONFAULTHANDLER=1
#
# # Check python installation works
# COPY --from=python-base /bin/rm /bin/rm
# COPY --from=python-base /bin/sh /bin/sh
# RUN python --version
# RUN rm /bin/sh /bin/rm
#
# # --- Virtualenv builder image ----------------------------------------------------------
# FROM python-base AS build-venv
# COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
#
# ENV UV_LINK_MODE=copy \
# UV_COMPILE_BYTECODE=1 \
# UV_PYTHON_DOWNLOADS=never \
# UV_PYTHON=python3.12 \
# UV_NO_CACHE=1 \
# CFLAGS="-g0 -Wl,--strip-all" \
# VENV=/.venv
#
# COPY pyproject.toml ./
#
# # Synchronize DEPENDENCIES without the application itself.
# # This layer is cached until uv.lock or pyproject.toml change.
# # Delete any unwanted parts of the installed packages to reduce size
# RUN uv venv ${VENV} && \
# echo "Installing dependencies into ${VENV}" && \
# mkdir src && \
# du -h ${VENV}/lib/python3.12/site-packages && \
# uv sync --no-dev --no-install-project && \
# echo "Copying libpython package into ${VENV}" && \
# cp --remove-destination /usr/local/bin/python3.12 ${VENV}/bin/python && \
# cp /usr/local/lib/libpython3.12.so.1.0 ${VENV}/lib/ && \
# echo "Optimizing site-packages" && \
# rm -r ${VENV}/lib/python3.12/site-packages/**/tests && \
# du -h ${VENV}/lib/python3.12/site-packages | sort -h | tail -n 4
#
# COPY . /src
# RUN uv pip install --no-deps /src && ls /.venv/bin
#
# # --- Distroless App image --------------------------------------------------------------
# FROM python-distroless
#
# COPY --from=build-venv /.venv /venv
#
# ENV RAWDIR=/work/raw \
# ZARRDIR=/work/data \
# ECCODES_DEFINITION_PATH=.venv/share/eccodes/definitions
#
# ENTRYPOINT ["/venv/bin/nwp-consumer-cli"]
# VOLUME /work
# STOPSIGNAL SIGINT


# WORKING CONTAINERFILE

FROM quay.io/condaforge/miniforge3:latest AS build-venv

Expand All @@ -30,25 +95,22 @@ COPY pyproject.toml /_lock/
# Synchronize DEPENDENCIES without the application itself.
# This layer is cached until uv.lock or pyproject.toml change.
# Delete any unwanted parts of the installed packages to reduce size
RUN --mount=type=cache,target=/root/.cache \
apt-get update && apt-get install build-essential -y && \
RUN apt-get -qq update && apt-get -qq -y install gcc && \
echo "Creating virtualenv at /venv" && \
conda create -qy -p /venv python=3.12 numcodecs
RUN which gcc
conda create --quiet --yes -p /venv python=3.12 numcodecs eccodes
RUN echo "Installing dependencies into /venv" && \
cd /_lock && \
mkdir src && \
uv sync --no-dev --no-install-project && \
echo "Optimizing /venv site-packages" && \
rm -r /venv/lib/python3.12/site-packages/**/tests && \
rm -r /venv/lib/python3.12/site-packages/**/_*cache*

rm -r /venv/lib/python3.12/site-packages/**/_*cache* && \
rm -r /venv/share/eccodes/definitions/bufr

# Then install the application itself
# * Delete the test and cache folders from installed packages to reduce size
COPY . /src
RUN --mount=type=cache,target=/root/.cache \
uv pip install --no-deps --python=$UV_PROJECT_ENVIRONMENT /src
RUN uv pip install --no-deps --python=$UV_PROJECT_ENVIRONMENT /src

# Copy the virtualenv into a distroless image
# * These are small images that only contain the runtime dependencies
Expand Down
27 changes: 12 additions & 15 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,39 @@ authors = [
classifiers = ["Programming Language :: Python :: 3"]
dependencies = [
"dask == 2024.8.1",
"eccodes == 2.38.1",
"eccodes == 2.38.3",
"ecmwf-api-client == 1.6.3",
"cfgrib == 0.9.14.0",
"cfgrib == 0.9.14.1",
"dagster-pipes == 1.8.5",
"joblib == 1.4.2",
"numpy == 2.1.0",
"ocf-blosc2 == 0.0.11",
"psutil == 6.0.0",
"requests == 2.32.3",
"returns == 0.23.0",
"s3fs == 2024.9.0",
"xarray == 2024.9.0",
"zarr == 2.18.2"
"zarr == 2.18.3"
]

[project.optional-dependencies]
test = [
[dependency-groups]
dev = [
# Testing
"botocore == 1.33.7", # Required for moto, prevents installing the whole of boto3
"flask == 3.0.0",
"flask-cors == 4.0.0",
"moto[s3,server] == 4.2.11",
"unittest-xml-reporting == 3.2.0",
"hypothesis == 6.115.3",
]
lint = [
# Linting
"returns[compatible-mypy]",
"ruff == 0.6.9",
"pandas-stubs",
"types-psutil",
"types-pytz",
"types-pyyaml",
]
docs = [
# Docs
"pydoctor >= 24.3.0",
]
dev = [
"nwp-consumer[test,lint,docs]",
]
lsp = [
# IDE support
"python-lsp-server",
"pylsp-mypy",
"python-lsp-ruff",
Expand Down Expand Up @@ -102,6 +97,8 @@ plugins = [
# If they are ever made, remove from here!
module = [
"cfgrib",
"botocore.session",
"botocore.client",
"joblib",
"ocf_blosc2",
"s3fs",
Expand Down
3 changes: 3 additions & 0 deletions src/nwp_consumer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
+-------------------------------+-------------------------------------+---------------------------------------------+
| MODEL_REPOSITORY | The model repository to use. | ceda-metoffice-global |
+-------------------------------+-------------------------------------+---------------------------------------------+
| CONCURRENCY | Whether to use concurrency. | True |
+-------------------------------+-------------------------------------+---------------------------------------------+
Development Documentation
Expand Down Expand Up @@ -149,6 +151,7 @@
"gribapi",
"aiobotocore",
"s3fs",
"fsspec",
"asyncio",
"botocore",
"cfgrib",
Expand Down
9 changes: 4 additions & 5 deletions src/nwp_consumer/cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ def parse_env() -> Adaptors:
"""Parse from the environment."""
model_repository_adaptor: type[ports.ModelRepository]
match os.getenv("MODEL_REPOSITORY"):
case None:
log.error("MODEL_REPOSITORY is not set in environment.")
sys.exit(1)
case None | "gfs":
model_repository_adaptor = repositories.NOAAS3ModelRepository
case "ceda":
model_repository_adaptor = repositories.CedaMetOfficeGlobalModelRepository
case "ecmwf-realtime-s3":
model_repository_adaptor = repositories.CEDAFTPModelRepository
case "ecmwf-realtime":
model_repository_adaptor = repositories.ECMWFRealTimeS3ModelRepository
case _ as model:
log.error(f"Unknown model: {model}")
Expand Down
64 changes: 59 additions & 5 deletions src/nwp_consumer/internal/entities/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@

import dataclasses
import datetime as dt
import json
from importlib.metadata import PackageNotFoundError, version

import dask.array
import numpy as np
import pandas as pd
import pytz
Expand All @@ -46,6 +49,11 @@

from .parameters import Parameter

# Resolve the installed package version for embedding in output metadata
# (used e.g. in the "produced_by" attribute of generated DataArrays).
# importlib.metadata.version only knows about installed distributions, so
# fall back to a placeholder when running from an uninstalled source tree.
try:
    __version__ = version("nwp-consumer")
except PackageNotFoundError:
    __version__ = "v?"


@dataclasses.dataclass(slots=True)
class NWPDimensionCoordinateMap:
Expand All @@ -69,7 +77,7 @@ class NWPDimensionCoordinateMap:
"""The forecast step times.
This corresponds to the horizon of the values, which is the time
difference between the forecast initialisation time and the target
difference between the forecast initialization time and the target
time at which the forecast data is valid.
"""
variable: list[Parameter]
Expand Down Expand Up @@ -207,7 +215,7 @@ def to_pandas(self) -> dict[str, pd.Index]: # type: ignore
This is useful for interoperability with xarray, which prefers to define
DataArray coordinates using a dict of pandas Index objects.
For the most part, the conversion consists of a straighforward cast
For the most part, the conversion consists of a straightforward cast
to a pandas Index object. However, there are some caveats involving
the time-centric dimensions:
Expand Down Expand Up @@ -367,11 +375,57 @@ def default_chunking(self) -> dict[str, int]:
that wants to cover the entire dimension should have a size equal to the
dimension length.
It defaults to a single chunk per init time and step, and a single chunk
for each entire other dimension.
It defaults to a single chunk per init time and step, and 8 chunks
for each entire other dimension. These are purposefully small, to ensure
that when performing parallel writes, chunk boundaries are not crossed.
"""
out_dict: dict[str, int] = {
"init_time": 1,
"step": 1,
} | {dim: len(getattr(self, dim)) for dim in self.dims if dim not in ["init_time", "step"]}
} | {
dim: len(getattr(self, dim)) // 8 if len(getattr(self, dim)) > 8 else 1
for dim in self.dims
if dim not in ["init_time", "step"]
}

return out_dict


def as_zeroed_dataarray(self, name: str) -> xr.DataArray:
    """Express the coordinates as an xarray DataArray.

    Data is populated with zeros and a default chunking scheme is applied.
    The zero values themselves are placeholders: only the shape, chunks,
    and coordinates of the resulting array are meaningful downstream.

    Args:
        name: The name of the DataArray.

    Returns:
        A zero-filled, dask-backed DataArray spanning the coordinate space,
        carrying "produced_by" and "variables" metadata attributes.

    See Also:
        - https://docs.xarray.dev/en/stable/user-guide/io.html#distributed-writes
    """
    # Hoist the chunking map out of the comprehension: previously
    # self.default_chunking() was re-evaluated once per dimension.
    chunking: dict[str, int] = self.default_chunking()
    # Create a dask array of zeros with the shape of the dataset
    # * The values of this are ignored, only the shape and chunks are used
    dummy_values = dask.array.zeros(  # type: ignore
        shape=list(self.shapemap.values()),
        chunks=tuple(chunking[k] for k in self.shapemap),
    )
    attrs: dict[str, str] = {
        "produced_by": "".join((
            f"nwp-consumer {__version__} at ",
            f"{dt.datetime.now(tz=dt.UTC).strftime('%Y-%m-%d %H:%M')}",
        )),
        "variables": json.dumps({
            p.value: {
                "description": p.metadata().description,
                "units": p.metadata().units,
            } for p in self.variable
        }),
    }
    # Create a DataArray object with the given coordinates and dummy values
    da: xr.DataArray = xr.DataArray(
        name=name,
        data=dummy_values,
        coords=self.to_pandas(),
        attrs=attrs,
    )
    return da

Loading

0 comments on commit dbbcf57

Please sign in to comment.