diff --git a/.github/workflows/branch_ci.yml b/.github/workflows/branch_ci.yml new file mode 100644 index 00000000..1cea47cc --- /dev/null +++ b/.github/workflows/branch_ci.yml @@ -0,0 +1,184 @@ +# Workflow that runs on pushes to non-default branches + +name: Non-Default Branch Push CI (Python) + +on: + push: + branches-ignore: ['main'] + paths-ignore: ['README.md'] + +# Specify concurrency such that only one workflow can run at a time +# * Different workflow files are not affected +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +# Registry for storing Container images +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +# Ensure the GitHub token can remove packages +permissions: + packages: write + + +jobs: + + # Job to run a linter and typechecker against the codebase + lint-typecheck: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync + + - name: Lint package + run: uv run ruff check --output-format=github . + + - name: Typecheck package + run: uv run mypy . + # TODO: GitHub output when https://github.com/python/mypy/pull/17771 merged + + # Job to run unittests + # * Produces a JUnit XML report that can be displayed in the GitHub UI + test-unit: + runs-on: ubuntu-latest + needs: lint-typecheck + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync + + # Run unittests + # * Produce JUnit XML report + - name: Run unit tests + run: uv run python -m xmlrunner discover -s src/nwp_consumer -p "test_*.py" --output-file ut-report.xml + + # Create test summary to be visualised on the job summary screen on GitHub + # * Runs even if previous steps fail + - name: Create test summary + uses: test-summary/action@v2 + with: + paths: "*t-report.xml" + show: "fail, skip" + if: always() + + # Define a job that builds the documentation + # * Surfaces the documentation as an artifact + build-docs: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync + + # Pydoctor is ran to find any linking errors in the docs + - name: Build documentation + run: | + uv run pydoctor --html-output=tmpdocs -W -q + PDOC_ALLOW_EXEC=1 uv run pdoc -o docs \ + --docformat=google \ + --logo="https://cdn.prod.website-files.com/62d92550f6774db58d441cca/6324a2038936ecda71599a8b_OCF_Logo_black_trans.png" \ + src/nwp_consumer + + - name: Upload documentation + uses: actions/upload-artifact@v4 + with: + name: docs + path: docs + + # Job for building container image + # * Builds and pushes an OCI Container image to the registry defined in the environment variables + # * Only runs if 
test and lint jobs pass + build-container: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + needs: ["lint-typecheck", "test-unit"] + + steps: + # Do a non-shallow clone of the repo to ensure tags are present + # * This allows setuptools-git-versioning to automatically set the version + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Tag the built image according to the event type + # The event is a branch commit, so use the commit sha + - name: Extract metadata (tags, labels) for Container + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: type=ref,event=branch + + # Build and push the Container image to the registry + # * Creates a multiplatform-aware image + # * Pulls build cache from the registry + - name: Build and push container image + uses: docker/build-push-action@v6 + with: + context: . + file: Containerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64 + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 50b219c4..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,294 +0,0 @@ -name: Python CI - -on: - push: - branches: [] - paths-ignore: - - 'README.md' - tags: - - 'v*' - pull_request: - branches: [] - paths-ignore: - - 'README.md' - workflow_dispatch: - -# Specify concurrency such that only one workflow can run at a time -# * Different workflow files are not affected -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -# Registry for storing Container images -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -# Ensure the GitHub token can remove packages -permissions: - packages: write - - -jobs: - - # Define a dependencies job that runs on all branches and PRs - # * Installs dependencies and caches them - build-venv: - runs-on: ubuntu-latest - container: quay.io/condaforge/miniforge3:latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Restore cached virtualenv, if available - # * The pyproject.toml hash is part of the cache key, invalidating - # the cache if the file changes - - name: Restore cached virtualenv - id: restore-cache - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml') }} - - # Should mirror the build-venv stage in the Containerfile - - name: Build venv - run: | - apt -qq update && apt -qq install -y build-essential - conda create -p ./venv python=3.12 - ./venv/bin/python -m pip install --upgrade -q pip wheel setuptools - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Should mirror the build-reqs stage in the Containerfile - # * Except this installs the dev dependencies as well - - name: Install all dependencies - run: | - conda install -p ./venv -q -y eccodes zarr - ./venv/bin/python -m pip install -q .[dev] --no-binary=nwp-consumer - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Cache the virtualenv for future runs - - name: 
Cache virtualenv - uses: actions/cache/save@v3 - with: - path: ./venv - key: ${{ steps.restore-cache.outputs.cache-primary-key }} - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Define a unittest job that runs on all branches and PRs - test-unit: - runs-on: ubuntu-latest - container: quay.io/condaforge/miniforge3:latest - needs: build-venv - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Restore cached virtualenv - - name: Restore cached virtualenv - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml') }} - - - name: Install package - run: ./venv/bin/python -m pip install -q . - - # Run unittests - # * Produce JUnit XML report - - name: Run unit tests - env: - ECCODES_DEFINITION_PATH: ${{ github.workspace }}/venv/share/eccodes/definitions - run: ./venv/bin/python -m xmlrunner discover -s src/nwp_consumer -p "test_*.py" --output-file ut-report.xml - - # Create test summary to be visualised on the job summary screen on GitHub - # * Runs even if previous steps fail - - name: Create test summary - uses: test-summary/action@v2 - with: - paths: "*t-report.xml" - show: "fail, skip" - if: always() - - # Define an autotagger job that runs on merge requests - tag: - runs-on: ubuntu-latest - needs: test-unit - if: | - github.event_name == 'pull_request' && - github.event.action == 'closed' && - github.event.pull_request.merged == true - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Bump version and push tag - uses: RueLaLa/auto-tagger@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PR_NUMBER: ${{ github.event.number }} - - - # Define an integration test job that runs only on version on main tags - test-integration: - runs-on: ubuntu-latest - container: quay.io/condaforge/miniforge3:latest - needs: build-venv - if: | - github.event_name == 'workflow_dispatch' || - (contains(github.ref, 'refs/tags/v') && github.event_name == 'push') - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Restore cached virtualenv - - name: Restore cached virtualenv - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml') }} - - - name: Install package - run: ./venv/bin/python -m pip install -q . 
- - # Run integration tests - # * Requires secrets to be set in the repository settings - # * Produce JUnit XML report - - name: Run integration tests - env: - LOGLEVEL: "debug" - RAW_DIR: "/tmp/raw" - ZARR_DIR: "/tmp/zarr" - CEDA_FTP_PASS: ${{ secrets.CEDA_FTP_PASS }} - CEDA_FTP_USER: ${{ secrets.CEDA_FTP_USER }} - METOFFICE_API_KEY: ${{ secrets.METOFFICE_API_KEY }} - METOFFICE_ORDER_ID: ${{ secrets.METOFFICE_ORDER_ID }} - ECMWF_API_KEY: ${{ secrets.ECMWF_API_KEY }} - ECMWF_API_EMAIL: ${{ secrets.ECMWF_API_EMAIL }} - ECMWF_API_URL: ${{ secrets.ECMWF_API_URL }} - run: ./venv/bin/python -m xmlrunner discover -s src/test_integration -p "test_*.py" --output-file it-report.xml - - # Create test summary to be visualised on the job summary screen on GitHub - # * Runs even if previous steps fail - - name: Create test summary - uses: test-summary/action@v2 - with: - paths: "*t-report.xml" - show: "fail, skip" - if: always() - - # Define a "build-container" job that runs on branch commits only - # * Builds and pushes an OCI Container image to the registry defined in the environment variables - # * Only runs if test job passes - build-container: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - needs: test-unit - if: github.event_name != 'pull_request' - - steps: - # Do a non-shallow clone of the repo to ensure tags are present - # * This allows setuptools-git-versioning to automatically set the version - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Buildx - uses: docker/setup-buildx-action@v2 - - - name: Log in to the Container registry - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Tag the built image according to the event type - # * If the event is a valid version tag, use the tag name - # * If the event is a branch commit, use the commit sha - - name: Extract metadata (tags, labels) for Container - id: meta - uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=ref,event=branch - type=semver,pattern={{version}} - - # Build and push the Container image to the registry - # * Creates a multiplatform-aware image - # * Semantic versioning is handled via the metadata action - # * The image layers are cached between action runs with the following strategy - # * - On push to main, also push build cache - # * - On push to other branches, only pull build cache - - name: Build and push Container image and cache - uses: docker/build-push-action@v4 - if: github.event_name == 'push' && github.ref == 'refs/heads/main' - with: - context: . - file: Containerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - platforms: linux/amd64,linux/arm64 - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max - - name: Build and push container image - uses: docker/build-push-action@v4 - if: github.event_name != 'push' || github.ref != 'refs/heads/main' - with: - context: . 
- file: Containerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - platforms: linux/amd64,linux/arm64 - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache - - # Define a "build-wheel" job that runs on version tags - # * Only runs if integration test job passes - build-wheel: - runs-on: ubuntu-latest - needs: test-integration - if: contains(github.ref, 'refs/tags/v') && github.event_name == 'push' - - steps: - # Do a non-shallow clone of the repo to ensure tags are present - # * This allows setuptools-git-versioning to automatically set the version - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - # Restore cached virtualenv - - name: Restore cached virtualenv - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml') }} - - # Building the wheel dynamically assigns the version according to git - # * The setuptools_git_versioning package reads the git tags and assigns the version - # * The version is then used in the wheel filename and made available in the package - # * setuptools_git_versioning is configured in pyproject.toml - - name: Build wheel - run: ./venv/bin/python -m pip wheel . --no-deps --wheel-dir dist - - - name: Upload wheel - uses: actions/upload-artifact@v3 - with: - name: wheel - path: dist/*.whl - - - name: Publish wheel - uses: pypa/gh-action-pypi-publish@v1.8.10 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml new file mode 100644 index 00000000..3ab9fe2a --- /dev/null +++ b/.github/workflows/main_ci.yml @@ -0,0 +1,39 @@ +# Workflow that runs on closed PRs to the default branch + +name: Default Branch PR Merged CI (Python) + +on: + pull_request: + types: ["closed"] + branches: ["main"] + +# Specify concurrency such that only one workflow can run at a time +# * Different workflow files are not affected +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + + +jobs: + + # Define an autotagger job that creates tags on changes to master + # Use #major #minor in merge commit messages to bump version beyond patch + # See https://github.com/RueLaLa/auto-tagger?tab=readme-ov-file#usage + tag: + runs-on: ubuntu-latest + if: | + github.event_name == 'pull_request' && + github.event.action == 'closed' && + github.event.pull_request.merged == true + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Bump version and push tag + uses: RueLaLa/auto-tagger@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_NUMBER: ${{ github.event.number }} + diff --git a/.github/workflows/tagged_ci.yml b/.github/workflows/tagged_ci.yml new file mode 100644 index 00000000..ac0953cf --- /dev/null +++ b/.github/workflows/tagged_ci.yml @@ -0,0 +1,119 @@ +# Workflow that runs on new SemVer tags on the default branch + +name: Default Branch SemVer Tagged CI (Python) + +on: + push: + branches: ['main'] + tags: ['v[0-9]+.[0-9]+.[0-9]+'] + paths-ignore: ['README.md'] + +# Specify concurrency such that only one workflow can run at a time +# * Different workflow files are not affected +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +# Registry for storing Container images +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + + +jobs: + + # Job for building container image + # * 
Builds and pushes an OCI Container image to the registry defined in the environment variables + build-container: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + # Do a non-shallow clone of the repo to ensure tags are present + # * This allows setuptools-git-versioning to automatically set the version + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Tag the built image according to the event type + # The event is a semver release, so use the version + - name: Extract metadata (tags, labels) for Container + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: type=semver,pattern={{version}} + + # Build and push the Container image to the registry + # * Creates a multiplatform-aware image + # * Pulls build cache from the registry and pushes new cache back + - name: Build and push container image + uses: docker/build-push-action@v6 + with: + context: . + file: Containerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max + + + # Job to build and publish the package on PyPi as a wheel + build-wheel: + runs-on: ubuntu-latest + + steps: + # Do a non-shallow clone of the repo to ensure tags are present + # * This allows setuptools-git-versioning to automatically set the version + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync --no-dev + + # Building the wheel dynamically assigns the version according to git + # * The setuptools_git_versioning package reads the git tags and assigns the version + # * The version is then used in the wheel filename and made available in the package + # * setuptools_git_versioning is configured in pyproject.toml + - name: Build wheel + run: uv pip wheel . --no-deps --wheel-dir dist + + - name: Upload wheel + uses: actions/upload-artifact@v4 + with: + name: wheel + path: dist/*.whl + + - name: Publish wheel + uses: pypa/gh-action-pypi-publish@v1.10 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 0557b4ea..a4a290d9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.pyc # C extensions *.so @@ -24,18 +25,9 @@ share/python-wheels/ *.egg-info/ .installed.cfg *.egg +*.egg-info MANIFEST -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into test_integration. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - # Unit test / coverage reports htmlcov/ .tox/ @@ -51,69 +43,17 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy +_trial_temp/ +**/_trial_temp.lock # Sphinx documentation docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# PDM -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py +docs/ # Environments -.env .venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site +uv.lock +.env # mypy .mypy_cache/ @@ -123,21 +63,12 @@ dmypy.json # ruff .ruff_cache/ -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - # Cython debug symbols cython_debug/ -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ +# Code Editors +.idea +.fleet # Downloaded files downloads @@ -146,10 +77,6 @@ zarr /testing **.idx -# S3 mocking -s3: - # MacOS .DS_Store **/*.swp - diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index ecc26730..00000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,75 +0,0 @@ -# Architecture - -This document describes the high level architecture of the nwp-consumer project. - -## Birds-eye view - -```mermaid -flowchart - subgraph "Hexagonal Architecture" - - subgraph "NWP Consumer" - subgraph "Ports" - portFI(FetcherInterface) --- core - core --- portSI(StorageInterface) - - subgraph "Core" - core{{Domain Logic}} - end - end - end - - subgraph "Driving Adaptors" - i1{ICON} --implements--> portFI - i2{ECMWF} --implements--> portFI - i3{MetOffice} --implements--> portFI - end - - subgraph "Driven Adaptors" - portSI --- o1{S3} - portSI --- o2{Huggingface} - portSI --- o3{LocalFS} - end - - end -``` - -At the top level, the consumer downloads raw NWP data, processes it to zarr, and saves it to a storage backend. - -It is built following the hexagonal architecture pattern. -This pattern is used to separate the core business logic from the driving and driven adaptors. -The core business logic is the `service` module, which contains the domain logic. -This logic is agnostic to the driving and driven actors, -instead relying on abstract classes as the ports to interact with them. - - -## Entry Points - -`src/nwp_consumer/cmd/main.py` contains the main function which runs the consumer. - -`src/nwp_consumer/internal/service/consumer.py` contains the `NWPConsumer` class, -the methods of which are the business use cases of the consumer. - -`StorageInterface` and `FetcherInterface` classes define the ports used by driving and driven actors. - -`src/nwp_consumer/internal/inputs` contains the adaptors for the driving actors. - -`src/nwp_consumer/internal/outputs` contains the adaptors for the driven actors. 
- -## Core - -The core business logic is contained in the `service` module. -According to the hexagonal pattern, the core logic is agnostic to the driving and driven actors. -As such, there is an internal data representation of the NWP data that the core logic acts upon. -Due to the multidimensional data of the NWP data, it is hard to define a schema for this. - -Internal data is stored an xarray dataset. -This dataset effectively acts as an array of `DataArrays` for each parameter or variable. -It should have the following dimensions and coordinates: - -- `time` dimension -- `step` dimension -- `latitude` or `x` dimension -- `longitude` or `y` dimension - -Parameters should be stored as DataArrays in the dataset. \ No newline at end of file diff --git a/Containerfile b/Containerfile index eea07b30..9c4cd8ae 100644 --- a/Containerfile +++ b/Containerfile @@ -1,36 +1,153 @@ -# Build a virtualenv using miniconda -# * Install required compilation tools for wheels via apt -# * Install required non-python binaries via conda +# POTENTIAL FOR SMALLER CONTAINERFILE IF THIS CAN BE GOT WORKING + + +# # --- Base Python image --------------------------------------------------------------- +# FROM python:3.12-bookworm AS python-base +# +# --- Builder image creation ------------------------------------------------------------- +# FROM python-base AS builder +# +# Setup non-root user +# ARG USER=monty +# RUN groupadd ${USER} && useradd -m ${USER} -g ${USER} +# USER ${USER} +# ENV PATH="/home/${USER}/.local/bin:${PATH}" +# +# WORKDIR /home/${USER} +# +# Don't generate .pyc, enable tracebacks +# ENV LANG=C.UTF-8 \ +# LC_ALL=C.UTF-8 \ +# PYTHONDONTWRITEBYTECODE=1 \ +# PYTHONFAULTHANDLER=1 +# +# # COPY --from=ghcr.io/astral-sh/uv:python3.12-bookworm --chown=1000:1000 /usr/local/bin/uv /home/${USER}/.local/bin/uv +# COPY --from=ghcr.io/astral-sh/uv:python3.12-bookworm /usr/local/bin/uv /usr/local/bin/uv +# +# RUN uv --version +# +# # --- Distroless Container creation ----------------------------------------------------- +# FROM gcr.io/distroless/cc-debian12 AS python-distroless +# +# ARG CHIPSET_ARCH=aarch64-linux-gnu +# +# # Copy the python installation from the base image +# COPY --from=python-base /usr/local/lib/ /usr/local/lib/ +# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python +# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache +# +# # Add common compiled libraries +# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libz.so.1 /usr/lib/${CHIPSET_ARCH}/ +# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi* /usr/lib/${CHIPSET_ARCH}/ +# # COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libbz2.so.1.0 /usr/lib/${CHIPSET_ARCH}/ +# # COPY --from=python-base /lib/${CHIPSET_ARCH}/libm.so.6 /lib/${CHIPSET_ARCH}/ +# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libc.so.6 /usr/lib/${CHIPSET_ARCH}/ +# +# Create non root user +# ARG USER=monty +# COPY --from=python-base /bin/echo /bin/echo +# COPY --from=python-base /bin/rm /bin/rm +# COPY --from=python-base /bin/sh /bin/sh +# +# RUN echo "${USER}:x:1000:${USER}" >> /etc/group +# RUN echo "${USER}:x:1001:" >> /etc/group +# RUN echo "${USER}:x:1000:1001::/home/${USER}:" >> /etc/passwd +# +# Check python installation works +# RUN python --version +# RUN rm /bin/sh /bin/echo /bin/rm +# +# Don't generate .pyc, enable tracebacks +# ENV LANG=C.UTF-8 \ +# LC_ALL=C.UTF-8 \ +# PYTHONDONTWRITEBYTECODE=1 \ +# PYTHONFAULTHANDLER=1 +# +# # --- Build the application ------------------------------------------------------------- +# FROM builder 
AS build-app +# +# WORKDIR /app +# +# # Install dependencies using system python +# ENV UV_LINK_MODE=copy \ +# UV_COMPILE_BYTECODE=1 \ +# UV_PYTHON_DOWNLOADS=never \ +# UV_NO_CACHE=1 \ +# CFLAGS="-g0 -Wl,--strip-all" +# +# # Synchronize DEPENDENCIES without the application itself. +# # This layer is cached until pyproject.toml changes. +# # Delete any unwanted parts of the installed packages to reduce size +# RUN --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ +# echo "Installing dependencies" && \ +# mkdir src && \ +# uv sync --no-dev --no-install-project && uv run python -m eccodes selfcheck +# # echo "Optimizing site-packages" && \ +# # rm -r .venv/.local/lib/python3.12/site-packages/**/tests && \ +# # du -h .venv/.local/lib/python3.12/site-packages | sort -h | tail -n 4 +# +# COPY . . +# +# RUN python -m eccodes selfcheck +# +# # --- Distroless App image -------------------------------------------------------------- +# FROM python-distroless +# +# COPY --from=build-app /usr/local /usr/local +# +# ENV RAWDIR=/work/raw \ +# ZARRDIR=/work/data +# +# ENTRYPOINT ["nwp-consumer-cli"] +# VOLUME /work +# STOPSIGNAL SIGINT + + +# WORKING CONTAINERFILE + + FROM quay.io/condaforge/miniforge3:latest AS build-venv -RUN apt -qq update && apt -qq install -y build-essential -RUN conda create -p /venv python=3.12 -RUN /venv/bin/pip install --upgrade -q pip wheel setuptools -# Install packages into the virtualenv as a separate step -# * Only re-execute this step when the requirements files change -FROM build-venv AS build-reqs -WORKDIR /app -COPY pyproject.toml pyproject.toml -RUN conda install -p /venv -q -y eccodes zarr -RUN /venv/bin/pip install -q . --no-cache-dir --no-binary=nwp-consumer - -# Build binary for the package -# * The package is versioned via setuptools_git_versioning -# hence the .git directory is required -# * The README.md is required for the long description -FROM build-reqs AS build-app -COPY src src -COPY .git .git -COPY README.md README.md -RUN /venv/bin/pip install . -RUN rm -r /venv/share/eccodes/definitions/bufr -RUN rm -r /venv/lib/python3.12/site-packages/pandas/tests +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +ENV UV_LINK_MODE=copy \ + UV_COMPILE_BYTECODE=1 \ + UV_PYTHON_DOWNLOADS=never \ + UV_LINK_MODE=copy \ + UV_PYTHON=python3.12 \ + UV_PROJECT_ENVIRONMENT=/venv +COPY pyproject.toml /_lock/ + +# Synchronize DEPENDENCIES without the application itself. +# This layer is cached until uv.lock or pyproject.toml change. +# Delete any unwanted parts of the installed packages to reduce size +RUN apt-get -qq update && apt-get -qq -y install gcc && \ + echo "Creating virtualenv at /venv" && \ + conda create --quiet --yes -p /venv python=3.12 eccodes +RUN echo "Installing dependencies into /venv" && \ + cd /_lock && \ + mkdir src && \ + uv sync --no-dev --no-install-project && \ + echo "Optimizing /venv site-packages" && \ + rm -r /venv/lib/python3.12/site-packages/**/tests && \ + rm -r /venv/lib/python3.12/site-packages/**/_*cache* && \ + rm -r /venv/share/eccodes/definitions/bufr + +# Then install the application itself +# * Delete the test and cache folders from installed packages to reduce size +COPY . 
/src +RUN uv pip install --no-deps --python=$UV_PROJECT_ENVIRONMENT /src # Copy the virtualenv into a distroless image # * These are small images that only contain the runtime dependencies FROM gcr.io/distroless/python3-debian11 WORKDIR /app -COPY --from=build-app /venv /venv -HEALTHCHECK CMD ["/venv/bin/nwp-consumer", "check"] -ENTRYPOINT ["/venv/bin/nwp-consumer"] -VOLUME /tmp/nwpc +COPY --from=build-venv /venv /venv + +ENV RAWDIR=/work/raw \ + ZARRDIR=/work/data \ + ECCODES_DEFINITION_PATH=/venv/share/eccodes/definitions + +ENTRYPOINT ["/venv/bin/nwp-consumer-cli"] +VOLUME /work +STOPSIGNAL SIGINT diff --git a/README.md b/README.md index 6b7835df..4c15d748 100644 --- a/README.md +++ b/README.md @@ -1,281 +1,131 @@ -

-NWP CONSUMER -
-
-Microservice for consuming NWP data. -

- -
- [badges: GitHub Workflow Status (with branch), GitHub tag (latest SemVer pre-release), PyPI version]
- -
- -A microservice for multi-source consumption of NWP data, storing it in a common format. Built with inspiration -from the [Hexagonal Architecture](https://alistair.cockburn.us/hexagonal-architecture) pattern, the nwp-consumer is -currently packaged with adapters for pulling and converting `.grib` data from: - -- [MetOffice Atmospheric API](https://gridded-data-ui.cda.api.metoffice.gov.uk) -- [CEDA Atmospheric Archive](https://catalogue.ceda.ac.uk) -- [ECMWF MARS API](https://apps.ecmwf.int/mars-catalogue) -- [DWD's ICON Model from the Opendata API](https://opendata.dwd.de) -- [CMC's GDPS Model from the Opendata API](https://dd.weather.gc.ca/) -- [NOAA's GFS Model from AWS Open Data](https://noaa-gfs-bdp-pds.s3.amazonaws.com) -- [NOAA's GFS Model from NCAR's Archive](https://rda.ucar.edu/datasets/ds084.1/) - -Similarly, the service can write to multiple sinks: - -- Local filesystem -- [AWS S3](https://aws.amazon.com/s3/) -- [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) - -Its modular nature enables straightforward extension to alternate future sources. - -## Running the service - -The service uses environment variables to configure sources and sinks in accordance with -the [Twelve-Factor App methodology](https://12factor.net/config). -The program will inform you of missing env vars when using an adaptor, but you can also check the -[config](src/nwp_consumer/internal/config/config.py) for the given module, or use the `env` command. - -### Using Docker - -This service is designed to be run as a Docker container. The `Containerfile` is the Dockerfile for the service. -It is recommended to run it this way due to the dependency on external non-python binaries, which at the moment -cannot be easily distributed in a PyPi package. To run, pull the latest version from `ghcr.io` via: - -```shell -$ docker run \ - -v /path/to/datadir:/data \ - -e ENV_VAR= \ - ghcr.io/openclimatefix/nwp-consumer:latest -``` - -### Using the Python Package - -Ensure the [external dependencies](#external-dependencies) are installed. Then, do one of the following: +# NWP Consumer -Either +**Download and convert weather data for use in ML pipelines** -- Install from [PyPI](https://pypi.org/project/nwp-consumer) with - ```shell - $ pip install nwp-consumer - ``` +[![tags badge](https://img.shields.io/github/v/tag/openclimatefix/nwp-consumer?include_prereleases&sort=semver&color=7BCDF3)](https://github.com/openclimatefix/nwp-consumer/tags) +[![pypi badge](https://img.shields.io/pypi/v/nwp-consumer?&color=086788)](https://pypi.org/project/nwp-consumer) +[![documentation badge](https://img.shields.io/badge/docs-latest-333333)](https://openclimatefix.github.io/nwp-consumer/) +[![contributors badge](https://img.shields.io/github/contributors/openclimatefix/nwp-consumer?color=FFFFFF)](https://github.com/openclimatefix/nwp-consumer/graphs/contributors) +[![workflows badge](https://img.shields.io/github/actions/workflow/status/openclimatefix/nwp-consumer/branch_ci.yml?branch=main&color=FFD053)](https://github.com/openclimatefix/nwp-consumer/actions/workflows/ci.yml) +[![ease of contribution: easy](https://img.shields.io/badge/ease%20of%20contribution:%20easy-32bd50)](https://github.com/openclimatefix/ocf-meta-repo?tab=readme-ov-file#overview-of-ocfs-nowcasting-repositories) -*or* +Some renewables, such as solar and wind, generate power according to the weather conditions. +Any forecasting therefore requires predictions of how these conditions will change. 
+Many meteorological organisations provide Numerical Weather Prediction (NWP) data, +which can then used for model training and inference. -- Clone the repository and install the package via - ```shell - $ git clone git@github.com:openclimatefix/nwp-consumer.git - $ cd nwp-consumer - $ pip install . - ``` +This data is often very large and can come in various formats. +Furthermore, these formats are not necessarily suitable for training, +so may require preprocessing and conversion. -Then run the service via +This package aims to streamline the collection and processing of this NWP data. -```shell -$ ENV_VAR= nwp-consumer -``` +> [!Note] +> This is *not* built to replace tools such as [Herbie](https://github.com/blaylockbk/Herbie). +> It is built to produce data specific to the needs of Open Climate Fix's models, +> so things like the output format and the variable selection are hard-coded. +> If you need a more configurable cli-driven tool, consider using herbie instead. -### CLI +## Installation -Whether running via Docker or the Python package, available commands can be found with the command `help` or the -`--help` flag. For example: +Install from PyPi using pip: -```shell -$ nwp-consumer --help -# or -$ docker run ghcr.io/openclimatefix/nwp-consumer:latest --help +```bash +$ pip install nwp-consumer ``` -## Ubiquitous Language - -The following terms are used throughout the codebase and documentation. They are defined here to avoid ambiguity. - -- ***InitTime*** - The time at which a forecast is initialised. For example, a forecast initialised at 12:00 on 1st -January. +Or use the container image: -- ***TargetTime*** - The time at which a predicted value is valid. For example, a forecast with InitTime 12:00 on 1st -January predicts that the temperature at TargetTime 12:00 on 2nd January at position x will be 10 degrees. - - -## Repository structure - -Produced using [exa](https://github.com/ogham/exa): -```shell -$ exa --tree --git-ignore -F -I "*init*|test*.*" +```bash +$ docker pull ghcr.io/openclimatefix/nwp-consumer ``` -```yml -./ -├── Containerfile # The Dockerfile for the service -├── pyproject.toml # The build configuration for the service -├── README.md -└── src/ - ├── nwp_consumer/ # The main library package - │ ├── cmd/ - │ │ └── main.py # The entrypoint to the service - │ └── internal/ # Packages internal to the service. 
Like the 'lib' folder - │ ├── config/ - │ │ └── config.py # Contains the configuration specification for running the service - │ ├── inputs/ # Holds subpackages for each incoming data source - │ │ ├── ceda/ - │ │ │ ├── _models.py - │ │ │ ├── client.py # Contains the client and functions to map CEDA data to the service model - │ │ │ └── README.md # Info about the CEDA data source - │ │ └── metoffice/ - │ │ ├── _models.py - │ │ ├── client.py # # Contains the client and functions to map MetOffice data to the service model - │ │ └── README.md # Info about the MetOffice data source - │ ├── models.py # Describes the internal data models for the service - │ ├── outputs/ # Holds subpackages for each data sink - │ │ ├── localfs/ - │ │ │ └── client.py # Contains the client for storing data on the local filesystem - │ │ └── s3/ - │ │ └── client.py # Contains the client for storing data on S3 - │ └── service/ # Contains the business logic and use-cases of the application - │ └── service.py # Defines the service class for the application, whose methods are the use-cases - └── test_integration/ -``` - -`nwp-consumer` is structured following principles from the hexagonal architecture pattern. In brief, this means a clear -separation between the application's business logic - it's **Core** - and the **Actors** that are external to it. In -this package, the core of the service is in `internal/service/` and the actors are in `internal/inputs/` and -`internal/outputs/`. The service logic has no knowledge of the external actors, instead defining interfaces that the -actors must implement. These are found in `internal/models.py`. The actors are then responsible for implementing these -interfaces, and are *dependency-injected* in at runtime. This allows the service to be easily tested and extended. See -[further reading](#further-reading) for more information. - -## Local development +## Example usage -Clone the repository, and create and activate a new python virtualenv for it. `cd` to the repository root. +**To download the latest available day of GFS data:*** -Install the [External](#external-dependencies) and [Python](#python-requirements) dependencies as shown in the sections -below. - -### Taskfile - -This repository bundles often used commands into a [taskfile](./taskfile.yml) for convenience. To use these commands, ensure -[go-task](https://taskfile.dev/) is installed, easily done via [homebrew](https://taskfile.dev/installation). - -You can then see the available tasks using - -```shell -$ task -l +```bash +$ nwp-consumer consume ``` -### External dependencies - -The `cfgrib` python library depends on the ECMWF *cfgrib* binary, which is a wrapper around the ECMWF *ecCodes* library. -One of these must be installed on the system and accessible as a shared library. +**To create an archive of a month of GFS data:** -On a MacOS with HomeBrew use +> [!Note] +> This will download several gigabytes of data to your home partition. +> Make sure you have plenty of free space (and time!) -```shell -$ brew install eccodes +```bash +$ nwp-consumer archive --year 2024 --month 1 ``` -Or if you manage binary packages with *Conda* use +## Documentation -```shell -$ conda install -c conda-forge cfgrib -``` +Documentation is generated via [pdoc](https://pdoc.dev/docs/pdoc.html). 
+To build the documentation, run the following command in the repository root: -As an alternative you may install the official source distribution -by following the instructions at -https://confluence.ecmwf.int/display/ECC/ecCodes+installation - -You may run a simple selfcheck command to ensure that your system is set up correctly: - -```shell -$ python -m selfcheck -Found: ecCodes v2.27.0. -Your system is ready. +```bash +$ PDOC_ALLOW_EXEC=1 python -m pdoc -o docs --docformat=google src/nwp_consumer ``` -### Python requirements +> [!Note] +> The `PDOC_ALLOW_EXEC=1` environment variable is required due to a facet +> of the `ocf_blosc2` library, which imports itself automatically and hence +> necessitates execution to be enabled. -Install the required python dependencies and make it editable with +## FAQ -```shell -$ pip install -e . -``` +### How do I authenticate with model repositories that require accounts? -or use the taskfile +Authentication, and model repository selection, is handled via environment variables. +Choose a repository via the `MODEL_REPOSITORY` environment variable. Required environment +variables can be found in the repository's metadata function. Missing variables will be +warned about at runtime. -```shell -$ task install -``` +### How do I use an S3 bucket for created stores? -This looks for requirements specified in the `pyproject.toml` file. +The `ZARRDIR` environment variable can be set to an S3 url +(ex: `s3://some-bucket-name/some-prefix`). Valid credentials for accessing the bucket +must be discoverable in the environment as per +[Botocore's documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) -Note that these are the bare dependencies for running the application. If you want to run tests, -you need the development dependencies as well, which can be installed via +### How do I change what variables are pulled? -```shell -$ pip install -e ".[dev]" -``` +With difficulty! This package pulls data specifically tailored to Open Climate Fix's needs, +and as such, the data it pulls (and the schema that data is surfaced with) +is a fixed part of the package. A large part of the value proposition of this consumer is +that the data it produces is consistent and comparable between different sources, so pull +requests to the effect of adding or changing this for a specific model are unlikely to be +approved. -or +However, desired changes can be made via cloning the repo and making the relevant +parameter modifications to the model's expected coordinates in it's metadata for the desired model +repository. -```shell -$ task install-dev -``` - -
- Where is the requirements.txt file? - -There is no `requirements.txt` file. Instead, the project uses setuptool's pyproject.toml integration to specify -dependencies. This is a new feature of setuptools and pip, and is the -[recommended way](https://packaging.python.org/en/latest/tutorials/packaging-projects/) to specify dependencies. -See [the setuptools guide](https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html) and -[the PEP621 specification](https://packaging.python.org/en/latest/specifications/declaring-project-metadata) -for more information, as well as [Further Reading](#further-reading). -
+## Development + +This project uses [MyPy](https://mypy.readthedocs.io/en/stable/) for static type checking +and [Ruff](https://docs.astral.sh/ruff/) for linting. +Installing the development dependencies makes them available in your virtual environment. -### Running tests +Use them via: -Ensure you have installed the [Python requirements](#python-requirements) and the -[External dependencies](#external-dependencies). - -Run the unit tests with - -```shell -$ python -m unittest discover -s src/nwp_consumer -p "test_*.py" +```bash +$ python -m mypy . +$ python -m ruff check . ``` -or +Be sure to do this periodically while developing to catch any errors early +and prevent headaches with the CI pipeline. It may seem like a hassle at first, +but it prevents accidental creation of a whole suite of bugs. -```shell -$ task test-unit -``` - -and the integration tests with +### Running the test suite -```shell -$ python -m unittest discover -s test_integration -p "test_*.py" -``` +Run the unittests with: -or - -```shell -$ task test-integration +```bash +$ python -m unittest discover -s src/nwp_consumer -p "test_*.py" ``` -See [further reading](#further-reading) for more information on the `src` directory structure. - ---- - ## Further reading On packaging a python project using setuptools and pyproject.toml: @@ -301,7 +151,24 @@ src and flat layouts. ## Contributing and community -- See the [OCF Organisation Repo](https://github.com/openclimatefix) for details on contributing. -- Find out more about OCF in the [Meta Repo](https://github.com/openclimatefix/ocf-meta-repo). -- Follow OCF on [Twitter](https://twitter.com/OpenClimateFix). -- Check out the OCF blog at https://openclimatefix.org/blog for updates. +[![issues badge](https://img.shields.io/github/issues/openclimatefix/ocf-template?color=FFAC5F)](https://github.com/openclimatefix/ocf-template/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) + +- PR's are welcome! 
See the [Organisation Profile](https://github.com/openclimatefix) for details on contributing +- Find out about our other projects in the [OCF Meta Repo](https://github.com/openclimatefix/ocf-meta-repo) +- Check out the [OCF blog](https://openclimatefix.org/blog) for updates +- Follow OCF on [LinkedIn](https://uk.linkedin.com/company/open-climate-fix) + + + + + + + + + + +--- + +*Part of the [Open Climate Fix](https://github.com/orgs/openclimatefix/people) community.* + +[![OCF Logo](https://cdn.prod.website-files.com/62d92550f6774db58d441cca/6324a2038936ecda71599a8b_OCF_Logo_black_trans.png)](https://openclimatefix.org) diff --git a/pyproject.toml b/pyproject.toml index 8d98a8b0..b028e637 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ # --- PROJECT CONFIGURATION --- # - [build-system] -requires = ["setuptools>=67", "wheel", "setuptools-git-versioning>=1.13.3"] +requires = ["setuptools>=67", "wheel", "setuptools-git-versioning>=2.0,<3"] build-backend = "setuptools.build_meta" # Metadata (see https://peps.python.org/pep-0621/) @@ -11,77 +10,113 @@ name = "nwp-consumer" dynamic = ["version"] # Set automtically using git: https://setuptools-git-versioning.readthedocs.io/en/stable/ description = "Tool for aggregating raw NWP files into .zarr files" readme = {file = "README.md", content-type = "text/markdown"} -requires-python = ">=3.11.0" +requires-python = ">=3.12,<3.13" license = {text = "MIT License"} authors = [ { name = "Sol Cotton", email = "sol@openclimatefix.org"} ] classifiers = ["Programming Language :: Python :: 3"] dependencies = [ - "cfgrib == 0.9.13.0", - "dask[distributed] == 2024.6.2", - "docopt == 0.6.2", + "dask == 2024.8.1", + "eccodes == 2.38.3", "ecmwf-api-client == 1.6.3", - "fsspec[s3] == 2024.6.1", - "huggingface-hub == 0.23.4", - "marshmallow == 3.21.3", - "marshmallow-dataclass == 8.7.0", - "numpy == 2.0.0", - "ocf-blosc2 == 0.0.4", + "cfgrib == 0.9.14.1", + "dagster-pipes == 1.8.5", + "joblib == 1.4.2", + "numpy == 2.1.0", + "ocf-blosc2 == 0.0.11", "psutil == 6.0.0", - "pyproj == 3.6.1", - "requests == 2.32.3", - "structlog == 24.2.0", - "urllib3 == 2.0.7", - "xarray == 2024.6.0", - "zarr == 2.18.2", - "sentry-sdk == 2.1.1" + "returns == 0.23.0", + "s3fs == 2024.9.0", + "xarray == 2024.9.0", + "zarr == 2.18.3" ] -[project.optional-dependencies] -test = [ +[dependency-groups] +dev = [ + # Testing "botocore == 1.33.7", # Required for moto, prevents installing the whole of boto3 "flask == 3.0.0", "flask-cors == 4.0.0", "moto[s3,server] == 4.2.11", "unittest-xml-reporting == 3.2.0", -] -lint = [ - "mypy == 1.7.1", - "python-lsp-server == 1.9.0", - "ruff == 0.1.7", -] -dev = [ - "nwp-consumer[test,lint]" + "hypothesis == 6.115.3", + # Linting + "returns[compatible-mypy]", + "ruff == 0.6.9", + "pandas-stubs", + "types-psutil", + "types-pytz", + "types-pyyaml", + # Docs + "pydoctor >= 24.3.0", + "pdoc >= 15.0.0", + # IDE support + "python-lsp-server", + "pylsp-mypy", + "python-lsp-ruff", ] [project.scripts] -nwp-consumer = "nwp_consumer.cmd.main:main" # Set the entrypoint for the CLI +nwp-consumer-cli = "nwp_consumer.cmd.main:run_cli" [project.urls] repository = "https://github.com/openclimatefix/nwp-consumer" -[tool.setuptools] -include-package-data = false +[tool.setuptools.packages.find] +where = ["src"] +exclude = ["test*"] + +[tool.setuptools.package-data] +"*" = ["py.typed", "*.pyi"] [tool.setuptools-git-versioning] enabled = true # --- LINTING AND TYPING CONFIGURATION --- # +[tool.basedpyright] +include = ["src"] +exclude = ["**/__pycache__"] 
+pythonVersion = "3.12" + # MyPy configuration # * See https://mypy.readthedocs.io/en/stable/index.html [tool.mypy] python_version = "3.12" +strict = true +warn_unreachable = true warn_return_any = true disallow_untyped_defs = true plugins = [ - 'numpy.typing.mypy_plugin' + "returns.contrib.mypy.returns_plugin", + "numpy.typing.mypy_plugin", ] +[[tool.mypy.overrides]] +# Ignore missing imports for libraries that don't have them. +# If they are ever made, remove from here! +module = [ + "cfgrib", + "botocore.session", + "botocore.client", + "joblib", + "ocf_blosc2", + "s3fs", + "zarr", +] +ignore_missing_imports = true + # Ruff configuration # * See https://beta.ruff.rs/docs/ [tool.ruff] +line-length = 100 +indent-width = 4 +exclude = ["__init__.py"] + +[tool.ruff.lint] +fixable = ["ALL"] +ignore = ["ANN101", "ANN102"] select = [ "F", # pyflakes "E", # pycodestyle @@ -102,22 +137,30 @@ select = [ "D", # pydocstyle "RUF", # ruff-specific rules ] -line-length = 100 -indent-width = 4 -# Use pydocstyle Google convention -# See https://www.pydocstyle.org/en/stable/error_codes.html -ignore = [ - "D203", "D213", "D215", "D400", "D401", "D404", "D406", - "D407", "D408", "D409", "D413", - "ANN101", -] -exclude = ["__init__.py"] -fixable = ["ALL"] + +[tool.ruff.lint.per-file-ignores] +"test*" = ["D", "ANN"] + +[tool.ruff.lint.pydocstyle] +convention = "google" [tool.ruff.format] quote-style = "double" indent-style = "space" line-ending = "auto" +docstring-code-format = true +docstring-code-line-length = 100 + +# --- DOCUMENTATION CONFIGURATION --- # + +[tool.pydoctor] +add-package = ["src/nwp_consumer"] +project-base-dir = "src/nwp_consumer" +docformat = "google" +html-output = "docs" +theme = "classic" +warnings-as-errors = true +privacy = [ + "HIDDEN:**.test_*", +] -[tool.ruff.per-file-ignores] -"test*" = ["D", "ANN"] diff --git a/src/nwp_consumer/__init__.py b/src/nwp_consumer/__init__.py index 1d5d4c91..b212e68e 100644 --- a/src/nwp_consumer/__init__.py +++ b/src/nwp_consumer/__init__.py @@ -1,88 +1,163 @@ -"""Logging configuration for the application.""" +"""NWP Consumer. 
-import logging -import os -import sys +Usage Documentation +=================== -import psutil -import structlog +Configuration +------------- -# Ignore modules' emitted logs -for name in ( - "boto", - "elasticsearch", - "urllib3", - "cfgrib", - "xarray", - "ecmwfapi", - "api", - "multiprocessing", -): - logging.getLogger(name).setLevel(logging.ERROR) - -# Set the log level -LOGLEVEL = os.getenv("LOGLEVEL", "INFO").upper() -_nameToLevel = { - "CRITICAL": logging.CRITICAL, - "FATAL": logging.FATAL, - "ERROR": logging.ERROR, - "WARN": logging.WARNING, - "WARNING": logging.WARNING, - "INFO": logging.INFO, - "DEBUG": logging.DEBUG, - "NOTSET": logging.NOTSET, -} - - -class UsageProfiler: - """Add CPU and RAM usage to the log event.""" - - def __call__( - self, - logger: structlog.types.WrappedLogger, # noqa: ARG002 - name: str, # noqa: ARG002 - event_dict: structlog.types.EventDict, - ) -> structlog.types.EventDict: - """Override the default structlog processor to add CPU and RAM usage to the log event.""" - event_dict["cpu"] = psutil.cpu_percent(1) - event_dict["ram"] = psutil.virtual_memory().used / 1024 / 1024 - return event_dict - -shared_processors = [ - structlog.stdlib.PositionalArgumentsFormatter(), - structlog.processors.CallsiteParameterAdder( - [ - structlog.processors.CallsiteParameter.FILENAME, - structlog.processors.CallsiteParameter.PATHNAME, - structlog.processors.CallsiteParameter.LINENO, - ], - ), - structlog.stdlib.add_log_level, - structlog.processors.TimeStamper(fmt="iso"), - structlog.processors.StackInfoRenderer(), - UsageProfiler(), -] - -if sys.stderr.isatty(): - # Pretty printing when we run in a terminal session. - # Automatically prints pretty tracebacks when "rich" is installed - processors = [ - *shared_processors, - structlog.dev.ConsoleRenderer(), - ] +The following environment variables can be used to configure the application: + +.. code-block:: none + + | Key | Description | Default | + |---------------------------|-------------------------------------|---------------------------------------------| + | LOGLEVEL | The logging level for the app. | INFO | + |---------------------------|-------------------------------------|---------------------------------------------| + | RAWDIR | The working directory for the app. | ~/.local/cache/nwp///raw | + | | Can be a local path or an S3 URI. | | + |---------------------------|-------------------------------------|---------------------------------------------| + | ZARRDIR | The output directory for the app. | ~/.local/cache/nwp///data | + | | Can be a local path or an S3 URI. | | + |---------------------------|-------------------------------------|---------------------------------------------| + | NOTIFICATION_REPOSITORY | The notification repository to use. | stdout | + |---------------------------|-------------------------------------|---------------------------------------------| + | MODEL_REPOSITORY | The model repository to use. | ceda-metoffice-global | + |---------------------------|-------------------------------------|---------------------------------------------| + | CONCURRENCY | Whether to use concurrency. | True | + |---------------------------|-------------------------------------|---------------------------------------------| + +There is also specific configuration variables for some model repositories. +Refer to their documentation for more information: `nwp_consumer.internal.repositories`. 
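To make the table concrete: configuration is read from environment variables with the
defaults listed above. The snippet below is an illustrative sketch only; it mirrors the
documented settings rather than reproducing the package's actual configuration code::

    import os

    # Illustrative only: the settings from the table above, with their
    # documented defaults. The real handling lives inside nwp_consumer itself.
    loglevel = os.getenv("LOGLEVEL", "INFO")
    model_repository = os.getenv("MODEL_REPOSITORY", "ceda-metoffice-global")
    notification_repository = os.getenv("NOTIFICATION_REPOSITORY", "stdout")
    use_concurrency = os.getenv("CONCURRENCY", "True").lower() in ("true", "1", "yes")

    print(f"model={model_repository}, notifications={notification_repository}")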
+ + +Development Documentation +========================= + +Getting started for development +------------------------------- + +In order to work on the project, first clone the repository. +Then, create a virtual environment and install the dependencies +using an editable pip installation:: + + $ git clone git@github.com:openclimatefix/nwp-consumer.git + $ cd nwp-consumer + $ python -m venv ./venv + $ source ./venv/bin/activate + $ pip install -e .[dev] + +.. note:: ZSH users may have to escape the square brackets in the last command. + +This enables the use of the 'nwp-consumer-cli' command in the virtualenv, which +runs the `nwp_consumer.cmd.main.run_cli` entrypoint. The editable installation +ensures that changes to the code are immediately reflected while using the command. + + +Project structure +----------------- + +The code is structured following principles from the `Hexagonal Architecture`_ pattern. +In brief, this means a clear separation between +the application's business logic - it's *core* - and the *actors* that are external to it. + +The core of the services is split into three main components: + +- `nwp_consumer.internal.entities` - The domain classes that define the structure of the data + that the services works with, and the business logic they contain. +- `nwp_consumer.internal.ports` - The interfaces that define how the services interact with external actors. +- `nwp_consumer.internal.services` - The business logic that defines how the service functions. + +Alongside these core components are the actors, which adhere to the interfaces defined in the +ports module. Actors come in two flavours, *driving* and *driven*. +Driven actors are sources and sinks of data, such as databases and message queues, +while driving actors are methods of interacting with the core, such as a command-line interface +or REST server. + +This application currently has the following defined actors: +- `nwp_consumer.internal.repositories.model_repositories` (driven) - The sources of NWP data. +- `nwp_consumer.internal.repositories.notification_repositories` (driven) - The sinks of notification data. +- `nwp_consumer.internal.handlers.cli` (driving) - The command-line interface for the services. + +The actors are then responsible for implementing the abstract ports, +and are *dependency-injected* in at runtime. This allows the services to be easily tested +and extended. See 'further reading' for more information. + +Head into `nwp_consumer.internal` to see the details of each of these components. + +Where do I go to...? +-------------------- + +- **...modify the business logic?** Check out the `internal.services` module. +- **...add a new source of NWP data?** Implement a new repository in `internal.repositories.model_repositories`. +- **...modify the command line interface?** Check out `internal.handlers.cli`. 
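As a rough sketch of the port/adapter split described above (every name below is invented
purely for illustration; the real interfaces live in `nwp_consumer.internal.ports` and the
real adapters in `nwp_consumer.internal.repositories`)::

    from abc import ABC, abstractmethod

    class ExampleRawPort(ABC):
        """Illustrative port: a driven actor that supplies raw NWP files."""

        @abstractmethod
        def fetch(self, init_time: str) -> list[str]:
            """Return paths to raw files for the given init time."""

    class ExampleLocalAdapter(ExampleRawPort):
        """Illustrative adapter satisfying the port."""

        def fetch(self, init_time: str) -> list[str]:
            return [f"/tmp/raw/{init_time}/data.grib"]

    def consume(port: ExampleRawPort, init_time: str) -> list[str]:
        """Illustrative core service: depends on the port, never on an adapter."""
        return port.fetch(init_time)

    # The concrete adapter is injected at runtime, which keeps the core logic
    # easy to test with fakes and easy to extend with new data sources.
    print(consume(ExampleLocalAdapter(), "2024-01-01T00"))

The point of the pattern is that the core function never imports a concrete adapter, so
swapping model repositories (or substituting a stub in tests) does not touch the business
logic.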
+ +Further reading +=============== + +On packaging a python project using setuptools and pyproject.toml: + +- The official PyPA packaging guide: https://packaging.python.org/ +- A step-by-step practical guide on the *godatadriven* blog: + https://godatadriven.com/blog/a-practical-guide-to-setuptools-and-pyproject-toml/ +- The pyproject.toml metadata specification: + https://packaging.python.org/en/latest/specifications/declaring-project-metadata + +On hexagonal architecture: + +- A concrete example using Python: + https://medium.com/towards-data-engineering/a-concrete-example-of-the-hexagonal-architecture-in-python-d821213c6fb9 +- An overview of the fundamentals incorporating Typescript: + https://medium.com/ssense-tech/hexagonal-architecture-there-are-always-two-sides-to-every-story-bc0780ed7d9c + +- Another example using Go: + https://medium.com/@matiasvarela/hexagonal-architecture-in-go-cfd4e436faa3 + +On the directory structure: + +- The official PyPA discussion on src and flat layouts" + https://packaging.python.org/en/latest/discussions/src-layout-vs-flat-layout/ + +.. _Hexagonal Architecture: https://alistair.cockburn.us/hexagonal-architecture/ +""" + +import logging +import sys +import os + +if sys.stdout.isatty(): + # Simple logging for terminals + _formatstr="%(levelname)s [%(name)s] | %(message)s" else: - # Print JSON when we run, e.g., in a Docker container. - # Also print structured tracebacks. - processors = [ - *shared_processors, - structlog.processors.EventRenamer("message", replace_by="_event"), - structlog.processors.dict_tracebacks, - structlog.processors.JSONRenderer(sort_keys=True), - ] - -# Add required processors and formatters to structlog -structlog.configure( - wrapper_class=structlog.make_filtering_bound_logger(_nameToLevel[LOGLEVEL]), - processors=processors, + # JSON logging for containers + _formatstr="".join(( + "{", + '"message": "%(message)s", ', + '"severity": "%(levelname)s", "timestamp": "%(asctime)s.%(msecs)03dZ", ', + '"logging.googleapis.com/labels": {"python_logger": "%(name)s"}, ', + '"logging.googleapis.com/sourceLocation": ', + '{"file": "%(filename)s", "line": %(lineno)d, "function": "%(funcName)s"}', + "}", + )) + +_loglevel: int | str = logging.getLevelName(os.getenv("LOGLEVEL", "INFO").upper()) +logging.basicConfig( + level=logging.INFO if isinstance(_loglevel, str) else _loglevel, + stream=sys.stdout, + format=_formatstr, + datefmt="%Y-%m-%dT%H:%M:%S", ) + +for logger in [ + "numcodecs", + "numexpr", + "gribapi", + "aiobotocore", + "s3fs", + "fsspec", + "asyncio", + "botocore", + "cfgrib", +]: + logging.getLogger(logger).setLevel(logging.WARNING) diff --git a/src/nwp_consumer/cmd/__init__.py b/src/nwp_consumer/cmd/__init__.py index e69de29b..7b4ead3a 100644 --- a/src/nwp_consumer/cmd/__init__.py +++ b/src/nwp_consumer/cmd/__init__.py @@ -0,0 +1 @@ +"""Entrypoints for the nwp_consumer application.""" \ No newline at end of file diff --git a/src/nwp_consumer/cmd/main.py b/src/nwp_consumer/cmd/main.py index 182e2dc8..e308d5fa 100644 --- a/src/nwp_consumer/cmd/main.py +++ b/src/nwp_consumer/cmd/main.py @@ -1,259 +1,75 @@ -"""nwp-consumer. 
+"""Entrypoints to the nwp-consumer service.""" -Usage: - nwp-consumer download --source=SOURCE [--sink=SINK --from=FROM --to=TO --rdir=RDIR --zdir=ZDIR --rsink=RSINK --no-rename-vars --no-variable-dim --create-latest] - nwp-consumer convert --source=SOURCE [--sink=SINK --from=FROM --to=TO --rdir=RDIR --zdir=ZDIR --rsink=RSINK --no-rename-vars --no-variable-dim --create-latest] - nwp-consumer consume --source=SOURCE [--sink=SINK --from=FROM --to=TO --rdir=RDIR --zdir=ZDIR --rsink=RSINK --no-rename-vars --no-variable-dim --create-latest] - nwp-consumer env (--source=SOURCE | --sink=SINK) - nwp-consumer check [--sink=SINK] [--rdir=RDIR] [--zdir=ZDIR] - nwp-consumer (-h | --help) - nwp-consumer --version - -Commands: - download Download raw data from source to raw sink - convert Convert raw data present in raw sink to zarr sink - consume Download and convert raw data from source to sink - check Perform a healthcheck on the service - env Print the unset environment variables required by the source/sink - -Options: - --from=FROM Start datetime in YYYY-MM-DDTHH:MM or YYYY-MM-DD format [default: today]. - --to=TO End datetime in YYYY-MM-DD or YYYY-MM-DDTHH:MM format. - --source=SOURCE Data source (ceda/metoffice/ecmwf-mars/ecmwf-s3/icon/cmc/gfs). - --sink=SINK Data sink (local/s3/huggingface) [default: local]. - --rsink=RSINK Data sink for raw data, if different (local/s3/huggingface) [default: SINK]. - --rdir=RDIR Directory of raw data store [default: /tmp/raw]. - --zdir=ZDIR Directory of zarr data store [default: /tmp/zarr]. - --create-latest Create a zarr of the dataset with the latest init time [default: False]. - --no-rename-vars Don't rename parameters to standard names. - --no-variable-dim Don't stack data variables into a single dimension. - -Generic Options: - --version Show version. - -h, --help Show this screen. - -v, --verbose Enable verbose logging [default: False]. -""" - -import contextlib -import datetime as dt -import importlib.metadata -import pathlib -import shutil -import sys -from distutils.util import strtobool - -import dask -import dask.distributed -import sentry_sdk -import structlog +import logging import os -from docopt import docopt - -from nwp_consumer import internal -from nwp_consumer.internal import config -from nwp_consumer.internal.service import NWPConsumerService - -__version__ = "local" - -with contextlib.suppress(importlib.metadata.PackageNotFoundError): - __version__ = importlib.metadata.version("package-name") - -log = structlog.getLogger() - -#sentry -sentry_sdk.init( - dsn=os.getenv("SENTRY_DSN"), - environment=os.getenv("ENVIRONMENT", "local"), - traces_sample_rate=1 -) - -sentry_sdk.set_tag("app_name", "nwp_consumer") -sentry_sdk.set_tag("version", __version__) - - -def run(argv: list[str]) -> tuple[list[pathlib.Path], list[pathlib.Path]]: - """Run the CLI. - - Args: - argv: The command line arguments. - - Returns: - A tuple of lists of raw and processed files. 
- """ - # --- Map environment variables to service configuration --- ## Configure dask - - dask.config.set({"array.slicing.split_large_chunks": True}) - if config.ConsumerEnv().DASK_SCHEDULER_ADDRESS != "": - # Connect to the dask scheduler if the address is set - # * This becomes the default client for all dask operations - client = dask.distributed.Client( - address=config.ConsumerEnv().DASK_SCHEDULER_ADDRESS, - ) - log.info( - event="Connected to dask scheduler", - address=config.ConsumerEnv().DASK_SCHEDULER_ADDRESS, - ) - - # --- Run the service with the desired command --- # - arguments = docopt(__doc__, argv=argv, version=__version__) - - # Logic for the env command - if arguments["env"]: - parse_actor(source=arguments["--source"], sink=arguments["--sink"]).print_env() - return [], [] - - # Create the service using the fetcher and storer - fetcher = parse_actor(arguments["--source"], None)().configure_fetcher() - storer = parse_actor(None, arguments["--sink"])().configure_storer() - if arguments["--rsink"] == "SINK": - rawstorer = storer - else: - rawstorer = parse_actor(None, arguments["--rsink"])().configure_storer() - - service = NWPConsumerService( - fetcher=fetcher, - storer=storer, - rawstorer=rawstorer, - zarrdir=arguments["--zdir"], - rawdir=arguments["--rdir"], - rename_vars=not arguments["--no-rename-vars"], - variable_dim=not arguments["--no-variable-dim"], +import sys +from typing import NamedTuple + +from nwp_consumer.internal import handlers, ports, repositories, services + +log = logging.getLogger("nwp-consumer") + +class Adaptors(NamedTuple): + """Adaptors for the CLI.""" + model_repository: type[ports.ModelRepository] + notification_repository: type[ports.NotificationRepository] + +def parse_env() -> Adaptors: + """Parse from the environment.""" + model_repository_adaptor: type[ports.ModelRepository] + match os.getenv("MODEL_REPOSITORY"): + # Default to NOAA S3 as it is freely accessible + case None | "gfs": + model_repository_adaptor = \ + repositories.model_repositories.NOAAS3ModelRepository + case "ceda": + model_repository_adaptor = \ + repositories.model_repositories.CEDAFTPModelRepository + case "ecmwf-realtime": + model_repository_adaptor = \ + repositories.model_repositories.ECMWFRealTimeS3ModelRepository + case "metoffice-datahub": + model_repository_adaptor = \ + repositories.model_repositories.MetOfficeDatahubModelRepository + case _ as mr: + log.error( + f"Unknown model repository '{mr}'. 
Expected one of " + f"['gfs', 'ceda', 'ecmwf-realtime', 'metoffice-datahub']", + ) + sys.exit(1) + + notification_repository_adaptor: type[ports.NotificationRepository] + match os.getenv("NOTIFICATION_REPOSITORY", "stdout"): + case "stdout": + notification_repository_adaptor = \ + repositories.notification_repositories.StdoutNotificationRepository + case "dagster-pipes": + notification_repository_adaptor = \ + repositories.notification_repositories.DagsterPipesNotificationRepository + case _ as notification: + log.error(f"Unknown notification repository: {notification}") + sys.exit(1) + + return Adaptors( + model_repository=model_repository_adaptor, + notification_repository=notification_repository_adaptor, ) - # Logic for the "check" command - if arguments["check"]: - _ = service.Check() - return [], [] - - # Process the from and to arguments - start, end = _parse_from_to(arguments["--from"], arguments["--to"]) - - # Logic for the other commands - log.info("nwp-consumer service starting", version=__version__, arguments=arguments) - rawFiles: list[pathlib.Path] = [] - processedFiles: list[pathlib.Path] = [] - - if arguments["download"]: - rawFiles = service.DownloadRawDataset(start=start, end=end) - - if arguments["convert"]: - processedFiles = service.ConvertRawDatasetToZarr(start=start, end=end) - - if arguments["consume"]: - service.Check() - rawFiles = service.DownloadRawDataset(start=start, end=end) - processedFiles = service.ConvertRawDatasetToZarr(start=start, end=end) - - if arguments["--create-latest"]: - processedFiles += service.CreateLatestZarr() - - return rawFiles, processedFiles - - -def main() -> None: - """Entry point for the nwp-consumer CLI.""" - erred = False - - programStartTime = dt.datetime.now(tz=dt.UTC) - try: - files: tuple[list[pathlib.Path], list[pathlib.Path]] = run(argv=sys.argv[1:]) - log.info( - event="processed files", - raw_files=len(files[0]), - processed_files=len(files[1]), - ) - except Exception as e: - log.error("encountered error running nwp-consumer", error=str(e), exc_info=True) - erred = True - finally: - clearableCache: list[pathlib.Path] = list(internal.CACHE_DIR.glob("*")) - for p in clearableCache: - if p.exists() and p.is_dir(): - shutil.rmtree(p) - if p.is_file(): - p.unlink(missing_ok=True) - elapsedTime = dt.datetime.now(tz=dt.UTC) - programStartTime - log.info(event="nwp-consumer finished", elapsed_time=str(elapsedTime), version=__version__) - if erred: - exit(1) - - -def _parse_from_to(fr: str, to: str | None) -> tuple[dt.datetime, dt.datetime]: - """Process the from and to arguments.""" - # Modify the default "today" argument to today's date - if fr == "today": - fr = dt.datetime.now(tz=dt.UTC).strftime("%Y-%m-%d") - # Modify the "latest" argument to the most recent 6 hour interval - if fr == "latest": - now = dt.datetime.now(tz=dt.UTC) - fr = now.replace(hour=(now.hour // 6) * 6, minute=0).strftime("%Y-%m-%dT%H:%M") - # If --from specifies a date, and --to is not set, set --to to the next day - if len(fr) == 10 and to is None: - to = ( - dt.datetime.strptime( - fr, - "%Y-%m-%d", - ).replace(tzinfo=dt.UTC) - + dt.timedelta(days=1) - ).strftime("%Y-%m-%d") - # Otherwise, --from specifies a datetime, - # so if --to is not set, set --to to the same time - if to is None: - to = fr - # If --from and --to are missing time information, assume midnight - if len(fr) == 10: - fr += "T00:00" - if len(to) == 10: - to += "T00:00" - # Process to datetime objects - start: dt.datetime = dt.datetime.strptime( - fr, - "%Y-%m-%dT%H:%M", - 
).replace(tzinfo=dt.UTC) - end: dt.datetime = dt.datetime.strptime( - to, - "%Y-%m-%dT%H:%M", - ).replace(tzinfo=dt.UTC) - - if end < start: - raise ValueError("argument '--from' cannot specify date prior to '--to'") - - return start, end - - -def parse_actor(source: str | None, sink: str | None) -> type[config.EnvParser]: - """Parse the actor argument into a class that can parse environment variables.""" - SOURCE_ENV_MAP: dict[str, type[config.EnvParser]] = { - "ceda": config.CEDAEnv, - "metoffice": config.MetOfficeEnv, - "ecmwf-mars": config.ECMWFMARSEnv, - "ecmwf-s3": config.ECMWFS3Env, - "icon": config.ICONEnv, - "cmc": config.CMCEnv, - "gfs": config.GFSEnv, - } - SINK_ENV_MAP: dict[str, type[config.EnvParser]] = { - "local": config.LocalEnv, - "s3": config.S3Env, - "huggingface": config.HuggingFaceEnv, - } - - if source: - try: - return SOURCE_ENV_MAP[source] - except KeyError as e: - raise ValueError( - f"Unknown source {source}. Expected one of {list(SOURCE_ENV_MAP.keys())}", - ) from e - if sink: - try: - return SINK_ENV_MAP[sink] - except KeyError as e: - raise ValueError( - f"Unknown sink {sink}. Expected one of {list(SINK_ENV_MAP.keys())}", - ) from e - raise ValueError("Either source or sink must be specified") +def run_cli() -> None: + """Entrypoint for the CLI handler.""" + # TODO: InfoUseCase + adaptors = parse_env() + c = handlers.CLIHandler( + consumer_usecase=services.ConsumerService( + model_repository=adaptors.model_repository, + notification_repository=adaptors.notification_repository, + ), + archiver_usecase=services.ArchiverService( + model_repository=adaptors.model_repository, + notification_repository=adaptors.notification_repository, + ), + ) + returncode: int = c.run() + sys.exit(returncode) -if __name__ == "__main__": - main() diff --git a/src/nwp_consumer/cmd/test_main.py b/src/nwp_consumer/cmd/test_main.py deleted file mode 100644 index 73ee7683..00000000 --- a/src/nwp_consumer/cmd/test_main.py +++ /dev/null @@ -1,56 +0,0 @@ -import datetime as dt -import os -import unittest -from unittest import mock - -from nwp_consumer.internal import FetcherInterface - -from .main import _parse_from_to - - - -class TestParseFromTo(unittest.TestCase): - def test_today(self) -> None: - # Test that today is processed correctly - start, end = _parse_from_to("today", None) - self.assertEqual( - start, - dt.datetime.now(tz=dt.UTC).replace(hour=0, minute=0, second=0, microsecond=0), - ) - self.assertEqual( - end, - dt.datetime.now(tz=dt.UTC).replace(hour=0, minute=0, second=0, microsecond=0) - + dt.timedelta(days=1), - ) - - def test_from_date(self) -> None: - # Test that a date is processed correctly - start, end = _parse_from_to("2021-01-01", None) - self.assertEqual(start, dt.datetime(2021, 1, 1, tzinfo=dt.UTC)) - self.assertEqual(end, dt.datetime(2021, 1, 2, tzinfo=dt.UTC)) - - def test_from_datetime(self) -> None: - # Test that a datetime is processed correctly - start, end = _parse_from_to("2021-01-01T12:00", None) - self.assertEqual(start, dt.datetime(2021, 1, 1, 12, 0, tzinfo=dt.UTC)) - self.assertEqual(end, dt.datetime(2021, 1, 1, 12, 0, tzinfo=dt.UTC)) - - def test_from_datetime_to_date(self) -> None: - # Test that a datetime is processed correctly - start, end = _parse_from_to("2021-01-01T12:00", "2021-01-02") - self.assertEqual(start, dt.datetime(2021, 1, 1, 12, 0, tzinfo=dt.UTC)) - self.assertEqual(end, dt.datetime(2021, 1, 2, 0, tzinfo=dt.UTC)) - - def test_from_datetime_to_datetime(self) -> None: - # Test that a datetime is processed correctly - start, end = 
_parse_from_to("2021-01-01T12:00", "2021-01-02T12:00") - self.assertEqual(start, dt.datetime(2021, 1, 1, 12, 0, tzinfo=dt.UTC)) - self.assertEqual(end, dt.datetime(2021, 1, 2, 12, 0, tzinfo=dt.UTC)) - - def test_invalid_datetime(self) -> None: - # Test that an invalid datetime is processed correctly - with self.assertRaises(ValueError): - _parse_from_to("2021-01-01T12:00:00", None) - - with self.assertRaises(ValueError): - _parse_from_to("2021010100", None) diff --git a/src/nwp_consumer/internal/__init__.py b/src/nwp_consumer/internal/__init__.py index b7be9270..1a135697 100644 --- a/src/nwp_consumer/internal/__init__.py +++ b/src/nwp_consumer/internal/__init__.py @@ -1,37 +1,18 @@ -"""The internal package contains code not intended for external import.""" +"""Internal workings of the services. -__all__ = [ - "OCFParameter", - "FetcherInterface", - "StorageInterface", - "FileInfoModel", - "CACHE_DIR", - "CACHE_DIR_RAW", - "CACHE_DIR_ZARR", - "IT_FULLPATH_ZARR", - "IT_FOLDER_STRUCTURE_RAW", - "IT_FOLDER_GLOBSTR_RAW", - "IT_FOLDER_STRUCTURE_ZARR", - "IT_FOLDER_GLOBSTR_ZARR", - "rawCachePath", - "zarrCachePath", -] +Why have an internal package? +----------------------------- -from .cache import ( - CACHE_DIR, - CACHE_DIR_RAW, - CACHE_DIR_ZARR, - IT_FOLDER_GLOBSTR_RAW, - IT_FOLDER_GLOBSTR_ZARR, - IT_FOLDER_STRUCTURE_RAW, - IT_FOLDER_STRUCTURE_ZARR, - IT_FULLPATH_ZARR, - rawCachePath, - zarrCachePath, -) -from .models import ( - FetcherInterface, - FileInfoModel, - OCFParameter, - StorageInterface, -) +This package is meant to be run as a services, either via a binary or a container image. +However, the code can still be used as a library, and a user could import the modules +from this package and use them in their own code. + +The "internal" package signifies that the modules within are not meant to be used, +or are not guaranteed to be stable, for external users. This helps to discourage casual +dependence in other services. Any functionality looking to be re-used should either +become a shared library or simply be copied from the source code. +""" + +from . import entities, handlers, ports, repositories, services + +__all__ = ["entities", "ports", "handlers", "repositories", "services"] diff --git a/src/nwp_consumer/internal/cache.py b/src/nwp_consumer/internal/cache.py deleted file mode 100644 index 4bdfd34b..00000000 --- a/src/nwp_consumer/internal/cache.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Defines the cache for the application. - -Many sources of data do not give any option for accessing their files -via e.g. a BytesIO object. Were this the case, we could use a generic -local filesystem adaptor to handle all incoming data. Since it isn't, -and instead often a pre-existing file object is required to push data -into, a cache is required to store the data temporarily. - -The cache is a simple directory structure that stores files in a -hierarchical format; with the top level directory being the source of -the data, followed by a subdirectory for the type of data (raw or -zarr), then further subdirectories according to the init time -associated with the file. - -Driven actors are then responsible for mapping the cached data to the -desired storage location. 
- -Example: -|--- /tmp/nwpc -| |--- source1 -| | |--- raw -| | | |--- 2021 -| | | |--- 01 -| | | |--- 01 -| | | |--- 0000 -| | | |--- parameter1.grib -| | | |--- parameter2.grib -| | | |--- 1200 -| | | |--- parameter1.grib -| | | |--- parameter2.grib -| | |--- zarr -| | |--- 2021 -| | |--- 01 -| | |--- 01 -| | |--- 20210101T0000.zarr.zip -| | |--- 20210101T1200.zarr.zip -""" - -import datetime as dt -import pathlib - -# --- Constants --- # - -# Define the location of the consumer's cache directory -CACHE_DIR = pathlib.Path("/tmp/nwpc") # noqa: S108 -CACHE_DIR_RAW = CACHE_DIR / "raw" -CACHE_DIR_ZARR = CACHE_DIR / "zarr" - -# Define the datetime format strings for creating a folder -# structure from a datetime object for raw and zarr files -IT_FOLDER_STRUCTURE_RAW = "%Y/%m/%d/%H%M" -IT_FOLDER_GLOBSTR_RAW = "*/*/*/*" -IT_FOLDER_STRUCTURE_ZARR = "%Y/%m/%d" -IT_FOLDER_GLOBSTR_ZARR = "*/*/*" - -# Define the datetime format string for a zarr filename -IT_FILENAME_ZARR = "%Y%m%dT%H%M.zarr" -IT_FULLPATH_ZARR = f"{IT_FOLDER_STRUCTURE_ZARR}/{IT_FILENAME_ZARR}" - -# --- Functions --- # - - -def rawCachePath(it: dt.datetime, filename: str) -> pathlib.Path: - """Create a filepath to cache a raw file. - - Args: - it: The initialisation time of the file to cache. - filename: The name of the file (including extension). - - Returns: - The path to the cached file. - """ - # Build the directory structure according to the file's datetime - parent: pathlib.Path = CACHE_DIR_RAW / it.strftime(IT_FOLDER_STRUCTURE_RAW) - parent.mkdir(parents=True, exist_ok=True) - return parent / filename - - -def zarrCachePath(it: dt.datetime) -> pathlib.Path: - """Create a filepath to cache a zarr file. - - Args: - it: The initialisation time of the file to cache. - - Returns: - The path to the cache file. - """ - # Build the directory structure according to the file's datetime - parent: pathlib.Path = CACHE_DIR_ZARR / it.strftime(IT_FOLDER_STRUCTURE_ZARR) - parent.mkdir(parents=True, exist_ok=True) - return parent / it.strftime(IT_FILENAME_ZARR) diff --git a/src/nwp_consumer/internal/config/__init__.py b/src/nwp_consumer/internal/config/__init__.py deleted file mode 100644 index 84b9e414..00000000 --- a/src/nwp_consumer/internal/config/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Configuration for the service.""" - -__all__ = [ - "EnvParser", - "CEDAEnv", - "ConsumerEnv", - "CMCEnv", - "ECMWFMARSEnv", - "ECMWFS3Env", - "ICONEnv", - "GFSEnv", - "HuggingFaceEnv", - "MetOfficeEnv", - "S3Env", - "LocalEnv", -] - -from .env import ( - CEDAEnv, - CMCEnv, - ConsumerEnv, - ECMWFMARSEnv, - ECMWFS3Env, - EnvParser, - GFSEnv, - HuggingFaceEnv, - ICONEnv, - LocalEnv, - MetOfficeEnv, - S3Env, -) diff --git a/src/nwp_consumer/internal/config/env.py b/src/nwp_consumer/internal/config/env.py deleted file mode 100644 index 2a7e80ad..00000000 --- a/src/nwp_consumer/internal/config/env.py +++ /dev/null @@ -1,248 +0,0 @@ -"""Config struct for application running.""" -import os -from distutils.util import strtobool -from typing import get_type_hints - -import structlog - -from nwp_consumer import internal -from nwp_consumer.internal import inputs, outputs - -log = structlog.getLogger() - - -class EnvParser: - """Mixin to parse environment variables into class fields. - - Whilst this could be done with Pydantic, it's nice to avoid the - extra dependency if possible, and pydantic would be overkill for - this small use case. - """ - - def __init__(self) -> None: - """Parse environment variables into class fields. 
- - If the class field is upper case, parse it into the indicated - type from the environment. Required fields are those set in - the child class without a default value. - - Examples: - >>> MyEnv(EnvParser): - >>> REQUIRED_ENV_VAR: str - >>> OPTIONAL_ENV_VAR: str = "default value" - >>> ignored_var: str = "ignored" - """ - for field, t in get_type_hints(self).items(): - # Skip item if not upper case - if not field.isupper(): - continue - - # Log Error if required field not supplied - default_value = getattr(self, field, None) - match (default_value, os.environ.get(field)): - case (None, None): - # No default value, and field not in env - raise OSError(f"Required field {field} not supplied") - case (_, None): - # A default value is set and field not in env - pass - case (_, _): - # Field is in env - env_value: str | bool = os.environ[field] - # Handle bools seperately as bool("False") == True - if t == bool: - env_value = bool(strtobool(os.environ[field])) - # Cast to desired type - self.__setattr__(field, t(env_value)) - - @classmethod - def print_env(cls) -> None: - """Print the required environment variables.""" - message: str = f"Environment variables for {cls.__class__.__name__}:\n" - for field, _ in get_type_hints(cls).items(): - if not field.isupper(): - continue - default_value = getattr(cls, field, None) - message += f"\t{field}{'(default: ' + default_value + ')' if default_value else ''}\n" - log.info(message) - - def configure_fetcher(self) -> internal.FetcherInterface: - """Configure the associated fetcher.""" - raise NotImplementedError( - "Fetcher not implemented for this environment. Check the available inputs.", - ) - - def configure_storer(self) -> internal.StorageInterface: - """Configure the associated storer.""" - raise NotImplementedError( - "Storer not implemented for this environment. 
Check the available outputs.", - ) - - -# --- Configuration environment variables --- # - - -class ConsumerEnv(EnvParser): - """Config for Consumer.""" - - DASK_SCHEDULER_ADDRESS: str = "" - - -# --- Inputs environment variables --- # - - -class CEDAEnv(EnvParser): - """Config for CEDA FTP server.""" - - CEDA_FTP_USER: str - CEDA_FTP_PASS: str - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.ceda.Client(ftpUsername=self.CEDA_FTP_USER, ftpPassword=self.CEDA_FTP_PASS) - - -class MetOfficeEnv(EnvParser): - """Config for Met Office API.""" - - METOFFICE_ORDER_ID: str - METOFFICE_API_KEY: str - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.metoffice.Client( - apiKey=self.METOFFICE_API_KEY, - orderID=self.METOFFICE_ORDER_ID, - ) - - -class ECMWFMARSEnv(EnvParser): - """Config for ECMWF MARS API.""" - - ECMWF_API_KEY: str - ECMWF_API_URL: str - ECMWF_API_EMAIL: str - ECMWF_AREA: str = "uk" - ECMWF_HOURS: int = 48 - ECMWF_PARAMETER_GROUP: str = "default" - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.ecmwf.MARSClient( - area=self.ECMWF_AREA, - hours=self.ECMWF_HOURS, - param_group=self.ECMWF_PARAMETER_GROUP, - ) - - -class ECMWFS3Env(EnvParser): - """Config for ECMWF S3.""" - - ECMWF_AWS_S3_BUCKET: str - ECMWF_AWS_ACCESS_KEY: str = "" - ECMWF_AWS_ACCESS_SECRET: str = "" - ECMWF_AWS_REGION: str - ECMWF_AREA: str = "uk" - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.ecmwf.S3Client( - bucket=self.ECMWF_AWS_S3_BUCKET, - area=self.ECMWF_AREA, - region=self.ECMWF_AWS_REGION, - key=self.ECMWF_AWS_ACCESS_KEY, - secret=self.ECMWF_AWS_ACCESS_SECRET, - ) - - -class ICONEnv(EnvParser): - """Config for ICON API.""" - - ICON_MODEL: str = "europe" - ICON_HOURS: int = 48 - ICON_PARAMETER_GROUP: str = "default" - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.icon.Client( - model=self.ICON_MODEL, - hours=self.ICON_HOURS, - param_group=self.ICON_PARAMETER_GROUP, - ) - - -class CMCEnv(EnvParser): - """Config for CMC API.""" - - CMC_MODEL: str = "gdps" - CMC_HOURS: int = 240 - CMC_PARAMETER_GROUP: str = "full" - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.cmc.Client( - model=self.CMC_MODEL, - hours=self.CMC_HOURS, - param_group=self.CMC_PARAMETER_GROUP, - ) - -class GFSEnv(EnvParser): - """Config for GFS API.""" - - GFS_MODEL: str = "global" - GFS_HOURS: int = 48 - GFS_PARAMETER_GROUP: str = "default" - - def configure_fetcher(self) -> internal.FetcherInterface: - """Overrides the corresponding method in the parent class.""" - return inputs.noaa.AWSClient( - model=self.GFS_MODEL, - param_group=self.GFS_PARAMETER_GROUP, - hours=self.GFS_HOURS, - ) - - -# --- Outputs environment variables --- # - - -class LocalEnv(EnvParser): - """Config for local storage.""" - - # Required for EnvParser to believe it's a valid class - dummy_field: str = "" - - def configure_storer(self) -> internal.StorageInterface: - """Overrides the corresponding method in the parent class.""" - return outputs.localfs.Client() - - -class S3Env(EnvParser): - """Config for S3.""" - - 
AWS_S3_BUCKET: str - AWS_ACCESS_KEY: str = "" - AWS_ACCESS_SECRET: str = "" - AWS_REGION: str - - def configure_storer(self) -> internal.StorageInterface: - """Overrides the corresponding method in the parent class.""" - return outputs.s3.Client( - bucket=self.AWS_S3_BUCKET, - region=self.AWS_REGION, - key=self.AWS_ACCESS_KEY, - secret=self.AWS_ACCESS_SECRET, - ) - - -class HuggingFaceEnv(EnvParser): - """Config for HuggingFace API.""" - - HUGGINGFACE_TOKEN: str - HUGGINGFACE_REPO_ID: str - - def configure_storer(self) -> internal.StorageInterface: - """Overrides the corresponding method in the parent class.""" - return outputs.huggingface.Client( - token=self.HUGGINGFACE_TOKEN, - repoID=self.HUGGINGFACE_REPO_ID, - ) diff --git a/src/nwp_consumer/internal/config/test_env.py b/src/nwp_consumer/internal/config/test_env.py deleted file mode 100644 index fc720140..00000000 --- a/src/nwp_consumer/internal/config/test_env.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Tests for the config module.""" - -import unittest.mock - -from .env import EnvParser, ICONEnv - - -class TestConfig(EnvParser): - """Test config class.""" - - REQUIRED_STR: str - REQUIRED_BOOL: bool - REQUIRED_INT: int - OPTIONAL_STR: str = "default" - OPTIONAL_BOOL: bool = True - OPTIONAL_INT: int = 4 - - -class Test_EnvParser(unittest.TestCase): - """Tests for the _EnvParseMixin class.""" - - @unittest.mock.patch.dict( - "os.environ", - { - "REQUIRED_STR": "required_str", - "REQUIRED_BOOL": "false", - "REQUIRED_INT": "5", - }, - ) - def test_parsesEnvVars(self) -> None: - tc = TestConfig() - - self.assertEqual("required_str", tc.REQUIRED_STR) - self.assertFalse(tc.REQUIRED_BOOL) - self.assertEqual(5, tc.REQUIRED_INT) - self.assertEqual("default", tc.OPTIONAL_STR) - self.assertTrue(tc.OPTIONAL_BOOL) - self.assertEqual(4, tc.OPTIONAL_INT) - - @unittest.mock.patch.dict( - "os.environ", - { - "REQUIRED_STR": "required_str", - "REQUIRED_BOOL": "not a bool", - "REQUIRED_INT": "5.7", - }, - ) - def test_errorsIfCantCastType(self) -> None: - with self.assertRaises(ValueError): - TestConfig() - - def test_errorsIfRequiredFieldNotSet(self) -> None: - with self.assertRaises(OSError): - TestConfig() - - @unittest.mock.patch.dict( - "os.environ", {"ICON_HOURS": "3", "ICON_PARAMETER_GROUP": "basic"} - ) - def test_parsesIconConfig(self) -> None: - tc = ICONEnv() - - self.assertEqual(3, tc.ICON_HOURS) - self.assertEqual("basic", tc.ICON_PARAMETER_GROUP) diff --git a/src/nwp_consumer/internal/entities/__init__.py b/src/nwp_consumer/internal/entities/__init__.py new file mode 100644 index 00000000..9b71b7fc --- /dev/null +++ b/src/nwp_consumer/internal/entities/__init__.py @@ -0,0 +1,39 @@ +"""Struct definitions for domain entities. + +These define data objects and behaviours that are used in the services core. + +Domain Entities +--------------- + +Entities are the core building blocks of the domain layer. They are the +representations of the business objects that are manipulated by the application. + +By using domain entities in the core, it is ensured that the business logic is +separated from the technical details of the application. + +A domain entity may have associated methods that define its behaviour, but it +should not contain any logic that is specific to a particular implementation. 
+""" + +from .repometadata import ModelRepositoryMetadata, ModelMetadata +from .tensorstore import ParameterScanResult, TensorStore +from .postprocess import PostProcessOptions, CodecOptions +from .notification import PerformanceMetadata, StoreCreatedNotification, StoreAppendedNotification +from .parameters import Parameter +from .coordinates import NWPDimensionCoordinateMap +from .performance import PerformanceMonitor + +__all__ = [ + "ModelRepositoryMetadata", + "ModelMetadata", + "ParameterScanResult", + "TensorStore", + "PostProcessOptions", + "CodecOptions", + "PerformanceMetadata", + "StoreCreatedNotification", + "StoreAppendedNotification", + "Parameter", + "NWPDimensionCoordinateMap", + "PerformanceMonitor", +] diff --git a/src/nwp_consumer/internal/entities/coordinates.py b/src/nwp_consumer/internal/entities/coordinates.py new file mode 100644 index 00000000..8e192d09 --- /dev/null +++ b/src/nwp_consumer/internal/entities/coordinates.py @@ -0,0 +1,431 @@ +"""Domain entities describing dimensional coordinates. + +Multidimensional data +--------------------- + +Tensor datasets are the primary data structure used in the consumer, which are +characterised by their multidimensional nature. To map data points in a tensor +back to selectable, indexable points along the dimensions of the tensor, a +mapping is required between the integer ticks along the dimension axes and the +values those ticks represent. + +For instance, consider a 2D tensor containing x, y data of the lap number vs lap +time of a runner running around a racetrack. The point (2, 4) in the tensor +would represent the runner's time at lap 2. In this instance the indexes are +2 and 4, but to get back to the values they represent, a mapping of the +dimension indices to coordinate values must be consulted, for instance: + +x index: [0, 1, 2, 3, 4] +x value: [lap 1, lap 2, lap 3, lap 4, lap 5] + +y index: [0, 1, 2, 3, 4] +y value: [0 seconds, 30 seconds, 60 seconds, 90 seconds, 120 seconds] + +Now by consulting the mapping we can see that the point (2, 4) in the tensor +represents that the runners time at lap three was 60 seconds. + + +This formalisation is useful also in the reverse case: inserting data into a +tensor according to its dimension coordinate values, and not its indexes. +This is the primary use case for these maps in this service. + +It is far more likely that for incoming data the coordinate values along the +dimension axes are known, as opposed to the indexes they represent. This mapping +then enables insertion that data into the correct regions of the tensor, which is +a key part of parallel writing. +""" + +import dataclasses +import datetime as dt +import json +from importlib.metadata import PackageNotFoundError, version + +import dask.array +import numpy as np +import pandas as pd +import pytz +import xarray as xr +from returns.result import Failure, ResultE, Success + +from .parameters import Parameter + +try: + __version__ = version("nwp-consumer") +except PackageNotFoundError: + __version__ = "v?" + + +@dataclasses.dataclass(slots=True) +class NWPDimensionCoordinateMap: + """Container for dimensions names and their coordinate index values. + + Each field in the container is a dimension label, and the corresponding + value is a list of the coordinate values for each index along the dimension. + + All NWP data has an associated init time, step, and variable, + so these dimensions names are required. 
Spatial coordinates however + differ between providers and their grids, so the known spatial dimensions + are optional (but one of them should be present!). + + For instance, most models produce data on a latitude/longitude grid, + but others use alternative projections resulting in x/y/ grids instead. + """ + + init_time: list[dt.datetime] + """The init times of the forecast values.""" + step: list[int] + """The forecast step times. + + This corresponds to the horizon of the values, which is the time + difference between the forecast initialization time and the target + time at which the forecast data is valid. + """ + variable: list[Parameter] + """The variables in the forecast data.""" + latitude: list[float] | None = None + """The latitude coordinates of the forecast grid in degrees. + + TODO: Should go +ve to -ve, e.g. for global, should be 90 to -90. + """ + longitude: list[float] | None = None + """The longitude coordinates of the forecast grid in degrees. + + TODO: Should go -ve to +ve, e.g. for global, should be -180 to 180. + """ + + @property + def dims(self) -> list[str]: + """Get instantiated dimensions. + + Ignores any dimensions that do not have a corresponding coordinate + index value list. + """ + return [f.name for f in dataclasses.fields(self) if getattr(self, f.name) is not None] + + @property + def shapemap(self) -> dict[str, int]: + """Mapping of dimension names to lengths; the 'shape' of the coordinates. + + This is the length of each dimension in the map, + which can be thought of as the number of ticks along each dimension + axis. + """ + return {dim: len(getattr(self, dim)) for dim in self.dims} + + @classmethod + def from_pandas( + cls, + pd_indexes: dict[str, pd.Index], # type: ignore + ) -> ResultE["NWPDimensionCoordinateMap"]: + """Create a new NWPDimensionCoordinateMap from a dictionary of pandas Index objects. + + This is useful for interoperability with xarray, which prefers to define + DataArray coordinates using a dict pandas Index objects. + + To extract the coordinate values from an xarray DataArray, + there is the "indexes" property on an xarray Coordinates object: + + Example: + >>> > idxs = xr_data.coords.indexes + >>> > NWPDimensionCoordinateMap.from_pandas(idxs) + >>> { + >>> "init_time": [dt.datetime(2021, 1, 1, 0, 0)], + >>> "step": [1, 2], + >>> "variable": [Parameter.TEMPERATURE_SL], + >>> "latitude": [90, 80, 70], + >>> "longitude": [45, 50, 55], + >>> } + + See Also: + `NWPDimensionCoordinateMap.to_pandas` for the reverse operation. + """ + if not all(key in pd_indexes for key in ["init_time", "step", "variable"]): + return Failure(KeyError( + f"Cannot create {cls.__class__.__name__} instance from pandas indexes " + "as required keys 'init_time', 'step', and 'variable' are not all present. 
" + f"Got: '{list(pd_indexes.keys())}'", + )) + if not all(len(pd_indexes[key].to_list()) > 0 for key in ["init_time", "step", "variable"]): + return Failure(ValueError( + f"Cannot create {cls.__class__.__name__} instance from pandas indexes " + "as the 'init_time', 'step', and 'variable' dimensions must have " + "at least one coordinate value.", + )) + input_parameter_set: set[str] = set(pd_indexes["variable"].to_list()) + known_parameter_set: set[str] = {str(p) for p in Parameter} + if not input_parameter_set.issubset(known_parameter_set): + return Failure(ValueError( + f"Cannot create {cls.__class__.__name__} instance from pandas indexes " + "as the 'variable' dimension contains unknown parameters: ", + f"'{list(input_parameter_set.difference(known_parameter_set))}'. " + "Ensure the parameter names match the names of the standard parameter set " + "defined by the `entities.Parameter` Enum.", + )) + if not all(key in [f.name for f in dataclasses.fields(cls)] for key in pd_indexes): + unknown_keys: list[str] = list( + set(pd_indexes.keys()).difference([f.name for f in dataclasses.fields(cls)]), + ) + return Failure(KeyError( + f"Cannot create {cls.__class__.__name__} instance from pandas indexes " + f"as unknown index/dimension keys were encountered: {unknown_keys}.", + )) + # TODO: Ensure correct ordering of lat/long? + + # Convert the pandas Index objects to lists of the appropriate types + return Success( + cls( + # NOTE: The timezone information is stripped from the datetime objects + # as numpy cannot handle timezone-aware datetime objects. As such, it + # must be added back in when converting to a datetime object. + init_time=[ + ts.to_pydatetime().replace(tzinfo=dt.UTC) + for ts in pd_indexes["init_time"].to_list() + ], + step=[np.timedelta64(ts, "h").astype(int) for ts in pd_indexes["step"].to_list()], + # NOTE: This list comprehension can be done safely, as above we have + # already performed a check on the pandas variable names being a subset + # of the `Parameter` enum value names. + variable=[ + Parameter(pdp) + for pdp in pd_indexes["variable"].to_list() + ], + # NOTE: For latitude and longitude values, we round to 4 decimal places + # to avoid floating point precision issues when comparing values. + # It is important to note that this places a limit on the precision + # of the latitude and longitude values that can be stored in the map. + # 4 decimal places corresponds to a precision of ~11m at the equator. + latitude=[float(f"{lat:.4f}") for lat in pd_indexes["latitude"].to_list()] + if "latitude" in pd_indexes + else None, + longitude=[float(f"{lon:.4f}") for lon in pd_indexes["longitude"].to_list()] + if "longitude" in pd_indexes + else None, + ), + ) + + @classmethod + def from_xarray(cls, xarray_obj: xr.DataArray | xr.Dataset) \ + -> ResultE["NWPDimensionCoordinateMap"]: + """Create a new NWPDimensionCoordinateMap from an XArray DataArray or Dataset object.""" + return cls.from_pandas(xarray_obj.coords.indexes) # type: ignore + + def to_pandas(self) -> dict[str, pd.Index]: # type: ignore + """Convert the coordinate map to a dictionary of pandas Index objects. + + This is useful for interoperability with xarray, which prefers to define + DataArray coordinates using a dict pandas Index objects. + + For the most part, the conversion consists of a straightforward cast + to a pandas Index object. 
However, there are some caveats involving + the time-centric dimensions: + + - XArray will complain if any of the numpy time types have any precision + other than nanoseconds, so care is taken to convert all time types to + np.timedelta64['ns'] or np.datetime64['ns'] as appropriate. + - Similarly, numpy can't handle timezone-aware datetime objects, so + any timezone information is stripped before conversion. + + See Also: + `NWPDimensionCoordinateMap.from_pandas` for the reverse operation. + + """ + out_dict: dict[str, pd.Index] = { # type: ignore + "init_time": pd.Index( + [ + np.datetime64(t.astimezone(pytz.utc).replace(tzinfo=None), "ns") + for t in self.init_time + ], + ), + "step": pd.Index([np.timedelta64(np.timedelta64(h, "h"), "ns") for h in self.step]), + "variable": pd.Index([p.value for p in self.variable]), + } | { + dim: pd.Index(getattr(self, dim)) + for dim in self.dims + if dim not in ["init_time", "step", "variable"] + } + return out_dict + + def determine_region( + self, + inner: "NWPDimensionCoordinateMap", + ) -> ResultE[dict[str, slice]]: + """Return the index slices of inner mapping relative to the outer map. + + The caller is the "outer" dimension mapping, which the "inner" should be + a subset of. A number of requirements must be met for this operation to be + successful: + + - The inner must be a subset of the outer mapping along all dimensions + (i.e. all coordinate values in the inner must be present in the outer + for each dimension). + - The inner's coordinate values must be contiguous along each dimension + of the outer. + - The inner must be of the same dimension map type as the outer map + (i.e. must have exactly the same dimension labels). + + The returned dictionary of slices defines the region of the base map covered + by the instances dimension mapping. + + Note that xarray does have its own implementation of this: the "region='auto'" + argument to the "to_zarr" method performs a similar function. This is + reimplemented in this package partly to ensure consistency of behaviour, + partly to enable more descriptive logging in failure states, and partly to + enable extending the functionality. + + Args: + inner: The dimension coordinate dictionary of the smaller dataset. + + Examples: + Getting the inner map slices relative to the outer map: + + >>> outer = NWPCoordinateMap( + >>> init_time=[dt.datetime(2021, 1, 1, 0, 0)], + >>> step=list(range(48)), + >>> variable=["temperature_sl", "downward_shortwave_radiation_flux_gl"], + >>> latitude=[68.0, 69.0, 70.0], + >>> longitude=[-10.0, -9.0, -8.0] + >>> ), + >>> inner = outer.copy() + >>> # Modify the step of the inner to only cover half the outer + >>> inner["step"] = list(range(24)) + >>> + >>> outer.determine_region(inner) + Success({ + "init_time": slice(0, 1), + "step": slice(0, 24), # Notice the slice is inclusive of the last index + "variable": slice(0, 2), + "latitude": slice(0, 3), + "longitude": slice(0, 3), + }) + + Returns: + Dictionary mapping the slices defining the indexes of the coordinates in + the outer dataset that correspond to the coordinates of the inner + """ + # Ensure the inner and outer maps have the same rank and dimension labels + if inner.dims != self.dims: + return Failure( + KeyError( + "Cannot find slices in non-matching coordinate mappings: " + "both objects must have identical dimensions (rank and labels)." 
+ f"Got: {inner.dims} and {self.dims}.", + ), + ) + + # Ensure the inner map is entirely contained within the outer map + slices = {} + for inner_dim_label in inner.dims: + inner_dim_coords = getattr(inner, inner_dim_label) + outer_dim_coords = getattr(self, inner_dim_label) + if len(inner_dim_coords) > len(outer_dim_coords): + return Failure( + ValueError( + f"Coordinate values for dimension '{inner_dim_label}' in the inner map " + "exceed the number of coordinate values in the outer map. " + f"Got: {len(inner_dim_coords)} (> {len(outer_dim_coords)}) " + f"coordinate values.", + ), + ) + if not set(inner_dim_coords).issubset(set(outer_dim_coords)): + diff_coords = list(set(inner_dim_coords).difference(set(outer_dim_coords))) + first_diff_index: int = inner_dim_coords.index(diff_coords[0]) + return Failure( + ValueError( + f"Coordinate values for dimension '{inner_dim_label}' not all present " + "within outer dimension map. The inner map must be entirely contained " + "within the outer map along every dimension. " + f"Got: {len(diff_coords)}/{len(outer_dim_coords)} differing values. " + f"First differing value: '{diff_coords[0]}' (inner[{first_diff_index}]) != " + f"'{outer_dim_coords[first_diff_index]}' (outer[{first_diff_index}]).", + ), + ) + + # Ensure the inner map's coordinate values are contiguous in the outer map. + # * First, get the index of the corresponding value in the outer map for each + # coordinate value in the inner map: + outer_dim_indices = sorted( + [outer_dim_coords.index(c) for c in inner_dim_coords], + ) + contiguous_index_run = list(range(outer_dim_indices[0], outer_dim_indices[-1] + 1)) + if outer_dim_indices != contiguous_index_run: + idxs = np.argwhere(np.gradient(outer_dim_indices) > 1).flatten() + # TODO: Sometimes, providers send their data in multiple files, the area + # TODO: of which might loop around the edges of the grid. In this case, it would + # TODO: be useful to determine if the run is non-contiguous only in that it wraps + # TODO: around that boundary, and in that case, split it and write it in two goes. + return Failure( + ValueError( + f"Coordinate values for dimension '{inner_dim_label}' do not correspond " + f"with a contiguous index set in the outer dimension map. " + f"Non-contiguous values '{[outer_dim_coords[i] for i in idxs]} " + f"(index {[outer_dim_indices[i] for i in idxs]})' " + f"adjacent in dimension coordinates.", + ), + ) + + slices[inner_dim_label] = slice(outer_dim_indices[0], outer_dim_indices[-1] + 1) + + return Success(slices) + + def default_chunking(self) -> dict[str, int]: + """The expected chunk sizes for each dimension. + + A dictionary mapping of dimension labels to the size of a chunk along that + dimension. Note that the number is chunk size, not chunk number, so a chunk + that wants to cover the entire dimension should have a size equal to the + dimension length. + + It defaults to a single chunk per init time and step, and 8 chunks + for each entire other dimension. These are purposefully small, to ensure + that when perfomring parallel writes, chunk boundaries are not crossed. + """ + out_dict: dict[str, int] = { + "init_time": 1, + "step": 1, + } | { + dim: len(getattr(self, dim)) // 8 if len(getattr(self, dim)) > 8 else 1 + for dim in self.dims + if dim not in ["init_time", "step"] + } + + return out_dict + + + def as_zeroed_dataarray(self, name: str) -> xr.DataArray: + """Express the coordinates as an xarray DataArray. + + Data is populated with zeros and a default chunking scheme is applied. 
+ + Args: + name: The name of the DataArray. + + See Also: + - https://docs.xarray.dev/en/stable/user-guide/io.html#distributed-writes + """ + # Create a dask array of zeros with the shape of the dataset + # * The values of this are ignored, only the shape and chunks are used + dummy_values = dask.array.zeros( # type: ignore + shape=list(self.shapemap.values()), + chunks=tuple([self.default_chunking()[k] for k in self.shapemap]), + ) + attrs: dict[str, str] = { + "produced_by": "".join(( + f"nwp-consumer {__version__} at ", + f"{dt.datetime.now(tz=dt.UTC).strftime('%Y-%m-%d %H:%M')}", + )), + "variables": json.dumps({ + p.value: { + "description": p.metadata().description, + "units": p.metadata().units, + } for p in self.variable + }), + } + # Create a DataArray object with the given coordinates and dummy values + da: xr.DataArray = xr.DataArray( + name=name, + data=dummy_values, + coords=self.to_pandas(), + attrs=attrs, + ) + return da + diff --git a/src/nwp_consumer/internal/entities/notification.py b/src/nwp_consumer/internal/entities/notification.py new file mode 100644 index 00000000..f0af7f6f --- /dev/null +++ b/src/nwp_consumer/internal/entities/notification.py @@ -0,0 +1,55 @@ +"""Domain entities for service notifications. + +Upon completion of the processing of a request, the service can +deliver a notification of the result to a notification repository. +This module defines the structure of the notification messages. +""" + +import dataclasses + + +@dataclasses.dataclass +class PerformanceMetadata: + """Metadata for a service operation.""" + + duration_seconds: float + """The duration of the operation in seconds.""" + + memory_mb: float + """The memory usage of the operation in megabytes.""" + + +@dataclasses.dataclass(slots=True) +class StoreCreatedNotification: + """A notification of successful store creation.""" + + filename: str + """The name of the store created, including extension.""" + + size_mb: int + """The size of the store in megabytes.""" + + performance: PerformanceMetadata + """Metadata for the operation.""" + + def __str__(self) -> str: + """Return a string representation of the notification.""" + return "".join(( + f"Store created: {self.filename} ({self.size_mb} MB) in ", + f"{self.performance.duration_seconds} secs ", + f"(using {self.performance.memory_mb} MB RAM)", + )) + + +@dataclasses.dataclass(slots=True) +class StoreAppendedNotification: + """A notification of successful append to a store.""" + + filename: str + """The name of the store appended to, including extension.""" + + size_mb: int + """The size of the store in megabytes.""" + + performance: PerformanceMetadata + """Metadata for the operation.""" diff --git a/src/nwp_consumer/internal/entities/parameters.py b/src/nwp_consumer/internal/entities/parameters.py new file mode 100644 index 00000000..2c20ea68 --- /dev/null +++ b/src/nwp_consumer/internal/entities/parameters.py @@ -0,0 +1,357 @@ +"""Domain entities for NWP parameters. + +NWP forecasts have to forecast something, and that something is the value +of one or more parameters. Often referred to as variables (or +channels once stored in a tensor), these parameters are physical, measurable +quantities that are forecasted by the model. + +Variables are forecasted at different levels in the atmosphere. A common +level of interest is called "screen level (sl)". This corresponds to 1.5-2m +above the Earth's surface, which is the height of many measuring stations. 
+ +Variables also have units, which are the physical quantities that the +variable is measured in. For example, temperature is measured in degrees +Celsius (C), and wind speed is measured in meters per second (m/s). +Some variables just occupy the range [0, 1], such as cloud cover. +This unit is referred to as the Unit Interval (UI). + +See Also: + - https://datahub.metoffice.gov.uk/docs/glossary + - https://codes.ecmwf.int/grib/param-db +""" + +import dataclasses +import logging +from enum import StrEnum, auto + +import xarray as xr +from returns.result import Failure, ResultE, Success + +log = logging.getLogger("nwp-consumer") + + +@dataclasses.dataclass(slots=True) +class ParameterLimits: + """Class containing information about the limits of a parameter.""" + + upper: float + """The upper limit on the parameter value. + + Not an absolute maximum, but rather the maximum value that + the parameter can resonably be expected to take. + + As an example, the maximum distance that can be seen horizontally + is 4.5km at sea level, so the upper limit for a visibility + parameter should be ~4500m. + """ + + lower: float + """The lower limit on the parameter value. + + Not an absolute minimum, but rather the minimum value that + the parameter can reasonably be expected to take. + + As an example, the minimum temperature on Earth is -89C, + so the lower limit for a temperature parameter should be ~ -90C. + """ + + threshold: float = 0.05 + """How carefully to enforce the limits. + + The threshold defines the number of values that can be outside + the limits in a given dataset before the data is considered invalid. + Denoted as a ratio (#outside/#total). + """ + + +@dataclasses.dataclass(slots=True, frozen=True) +class ParameterData: + """Class containing information about a parameter.""" + + name: str + """The name of the parameter as appears in produced datasets.""" + + description: str + """A brief description of the parameter.""" + + units: str + """The units of the parameter.""" + + limits: ParameterLimits + """Reasonable physical limits for the parameter. + + Used in sanity and validity checking the database values. + """ + + alternate_shortnames: list[str] = dataclasses.field(default_factory=list) + """Alternate names for the parameter found in the wild.""" + + def __str__(self) -> str: + """String representation of the parameter.""" + return self.name + + +class Parameter(StrEnum): + """Parameters of interest to OCF. + + Inheriting from StrEnum and using ``auto()`` makes the values + of the enums equal to the lowercased enum name. 
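For instance (an illustrative value, using a member defined below):

    >>> Parameter.TEMPERATURE_SL.value
    'temperature_sl'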
+ + See Also: + - https://docs.python.org/3/library/enum.html#enum.StrEnum + """ + + TEMPERATURE_SL = auto() + DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL = auto() + DOWNWARD_LONGWAVE_RADIATION_FLUX_GL = auto() + RELATIVE_HUMIDITY_SL = auto() + VISIBILITY_SL = auto() + WIND_U_COMPONENT_10m = auto() + WIND_V_COMPONENT_10m = auto() + WIND_U_COMPONENT_100m = auto() + WIND_V_COMPONENT_100m = auto() + WIND_U_COMPONENT_200m = auto() + WIND_V_COMPONENT_200m = auto() + SNOW_DEPTH_GL = auto() + CLOUD_COVER_HIGH = auto() + CLOUD_COVER_MEDIUM = auto() + CLOUD_COVER_LOW = auto() + CLOUD_COVER_TOTAL = auto() + TOTAL_PRECIPITATION_RATE_GL = auto() + DOWNWARD_ULTRAVIOLET_RADIATION_FLUX_GL = auto() + DIRECT_SHORTWAVE_RADIATION_FLUX_GL = auto() + + def metadata(self) -> ParameterData: + """Get the metadata for the parameter.""" + match self.name: + case self.TEMPERATURE_SL.name: + return ParameterData( + name=str(self), + description="Temperature at screen level", + units="C", + limits=ParameterLimits(upper=60, lower=-90), + alternate_shortnames=["t", "t2m"], + ) + case self.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL.name: + return ParameterData( + name=str(self), + description="Downward shortwave radiation flux at ground level. " + "Defined as the mean amount of solar radiation " + "incident on the surface expected over the next hour." + "This is made up of both direct and diffuse radiation.", + units="W/m^2", + limits=ParameterLimits(upper=1500, lower=0), + alternate_shortnames=["swavr", "ssrd", "dswrf", "sdswrf"], + ) + case self.DOWNWARD_LONGWAVE_RADIATION_FLUX_GL.name: + return ParameterData( + name=str(self), + description="Downward longwave radiation flux at ground level. " + "Defined as the mean amount of thermal radiation " + "incident on the surface expected over the next hour.", + units="W/m^2", + limits=ParameterLimits(upper=500, lower=0), + alternate_shortnames=["strd", "dlwrf", "sdlwrf"], + ) + case self.RELATIVE_HUMIDITY_SL.name: + return ParameterData( + name=str(self), + description="Relative humidity at screen level. " + "Defined as the ratio of partial pressure of water vapour " + "to the equilibrium vapour pressure of water", + units="%", + limits=ParameterLimits(upper=100, lower=0), + alternate_shortnames=["r", "r2"], + ) + case self.VISIBILITY_SL.name: + return ParameterData( + name=str(self), + description="Visibility at screen level. " + "Defined as the distance at which an object can be seen " + "horizontally in daylight conditions.", + units="m", + limits=ParameterLimits(upper=4500, lower=0), + alternate_shortnames=["vis"], + ) + case self.WIND_U_COMPONENT_10m.name: + return ParameterData( + name=str(self), + description="U component of wind at 10m above ground level. " + "Defined as the horizontal speed of " + "the wind in the eastward direction.", + units="m/s", + limits=ParameterLimits(upper=100, lower=-100), + alternate_shortnames=["u10"], + ) + case self.WIND_V_COMPONENT_10m.name: + return ParameterData( + name=str(self), + description="V component of wind at 10m above ground level. " + "Defined as the horizontal speed of " + "the wind in the northward direction.", + units="m/s", + # Non-tornadic winds are usually < 100m/s + limits=ParameterLimits(upper=100, lower=-100), + alternate_shortnames=["v10"], + ) + case self.WIND_U_COMPONENT_100m.name: + return ParameterData( + name=str(self), + description="U component of wind at 100m above ground level. 
" + "Defined as the horizontal speed of " + "the wind in the eastward direction.", + units="m/s", + limits=ParameterLimits(upper=100, lower=-100), + alternate_shortnames=["u100"], + ) + case self.WIND_V_COMPONENT_100m.name: + return ParameterData( + name=str(self), + description="V component of wind at 100m above ground level. " + "Defined as the horizontal speed of " + "the wind in the northward direction.", + units="m/s", + limits=ParameterLimits(upper=100, lower=-100), + alternate_shortnames=["v100"], + ) + case self.WIND_U_COMPONENT_200m.name: + return ParameterData( + name=str(self), + description="U component of wind at 200m above ground level. " + "Defined as the horizontal speed of " + "the wind in the eastward direction.", + units="m/s", + limits=ParameterLimits(upper=150, lower=-150), + alternate_shortnames=["u200"], + ) + case self.WIND_V_COMPONENT_200m.name: + return ParameterData( + name=str(self), + description="V component of wind at 200m above ground level. " + "Defined as the horizontal speed of " + "the wind in the northward direction.", + units="m/s", + limits=ParameterLimits(upper=150, lower=-150), + alternate_shortnames=["v200"], + ) + case self.SNOW_DEPTH_GL.name: + return ParameterData( + name=str(self), + description="Depth of snow on the ground.", + units="m", + limits=ParameterLimits(upper=12, lower=0), + alternate_shortnames=["sd", "sdwe"], + ) + case self.CLOUD_COVER_HIGH.name: + return ParameterData( + name=str(self), + description="Fraction of grid square covered by high-level cloud. " + "Defined as the ratio of " + "the area of the grid square covered by high-level (>6km) cloud " + "to the square's total area.", + units="UI", + limits=ParameterLimits(upper=1, lower=0), + alternate_shortnames=["hcc"], + ) + case self.CLOUD_COVER_MEDIUM.name: + return ParameterData( + name=str(self), + description="Fraction of grid square covered by medium-level cloud. " + "Defined as the ratio of " + "the area of the grid square covered by medium-level (2-6km) cloud " + "to the square's total area.", + units="UI", + limits=ParameterLimits(upper=1, lower=0), + alternate_shortnames=["mcc"], + ) + case self.CLOUD_COVER_LOW.name: + return ParameterData( + name=str(self), + description="Fraction of grid square covered by low-level cloud. " + "Defined as the ratio of " + "the area of the grid square covered by low-level (<2km) cloud " + "to the square's total area.", + units="UI", + limits=ParameterLimits(upper=1, lower=0), + alternate_shortnames=["lcc"], + ) + case self.CLOUD_COVER_TOTAL.name: + return ParameterData( + name=str(self), + description="Fraction of grid square covered by any cloud. " + "Defined as the ratio of " + "the area of the grid square covered by any cloud " + "to the square's total area.", + units="UI", + limits=ParameterLimits(upper=1, lower=0), + alternate_shortnames=["tcc", "clt"], + ) + case self.TOTAL_PRECIPITATION_RATE_GL.name: + return ParameterData( + name=str(self), + description="Total precipitation rate at ground level. " + "Defined as the rate at which liquid is deposited on the ground " + "including rain, snow, and hail.", + units="kg/m^2/s", + limits=ParameterLimits(upper=0.2, lower=0), + alternate_shortnames=["prate", "tprate"], + ) + case self.DOWNWARD_ULTRAVIOLET_RADIATION_FLUX_GL.name: + return ParameterData( + name=str(self), + description="Downward ultraviolet radiation flux at ground level. 
" + "Defined as the mean amount of " + "ultraviolet radiation incident on the surface " + "expected over the next hour.", + units="W/m^2", + limits=ParameterLimits(upper=1000, lower=0), + alternate_shortnames=["uvb"], + ) + case self.DIRECT_SHORTWAVE_RADIATION_FLUX_GL.name: + return ParameterData( + name=str(self), + description="Direct shortwave radiation flux at ground level. " + "Defined as the mean amount of " + "unscattered solar radiation incident on" + "a surface plane perpendicular to the direction of the sun " + "expected over the next hour.", + units="W/m^2", + limits=ParameterLimits(upper=1000, lower=0), + alternate_shortnames=["dsrp"], + ) + case _: + # Shouldn't happen thanks to the test case in test_parameters.py + raise ValueError(f"Unknown parameter: {self}") + + def try_from_alternate(name: str) -> ResultE["Parameter"]: + """Map an alternate name to a parameter.""" + for p in Parameter: + if name in p.metadata().alternate_shortnames: + return Success(p) + return Failure(ValueError(f"Unknown shortname: {name}")) + + @staticmethod + def rename_else_drop_ds_vars( + ds: xr.Dataset, allowed_parameters: list["Parameter"], + ) -> xr.Dataset: + """Rename variables to match expected names, dropping invalid ones. + + Returns a dataset with all variables in it renamed to a known `entities.Parameter` + name, if a matching parameter exists, and it is an allowed parameter. Otherwise, + the variable is dropped from the dataset. + + Args: + ds: The xarray dataset to rename. + allowed_parameters: The list of parameters allowed in the resultant dataset. + """ + for var in ds.data_vars: + param_result = Parameter.try_from_alternate(str(var)) + match param_result: + case Success(p): + if p in allowed_parameters: + ds = ds.rename_vars({var: p.value}) + continue + log.debug("Dropping invalid parameter '%s' from dataset", var) + ds = ds.drop_vars(str(var)) + return ds + diff --git a/src/nwp_consumer/internal/entities/performance.py b/src/nwp_consumer/internal/entities/performance.py new file mode 100644 index 00000000..4b36fe27 --- /dev/null +++ b/src/nwp_consumer/internal/entities/performance.py @@ -0,0 +1,84 @@ +"""Class for tracking memory usage in a separate thread. + +Adapted from the Joblib documentation: +https://joblib.readthedocs.io/en/stable/auto_examples/parallel_generator.html#memorymonitor-helper +""" + +import time +from threading import Thread +from types import TracebackType + +import psutil + + +class PerformanceMonitor(Thread): + """Monitor the memory usage in MB in a separate thread. + + Note that this class is good enough to highlight the memory profile of + Parallel in this example, but is not a general purpose profiler fit for + all cases. 
+ """ + + thread: Thread + memory_buffer: list[int] + cpu_buffer: list[float] + start_time: float + end_time: float + stop: bool = True + + def __enter__(self) -> None: + """Start the monitor.""" + super().__init__() + self.stop = False + self.memory_buffer: list[int] = [] + self.cpu_buffer: list[float] = [] + self.start_time = time.time() + self.start() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + """Stop the performance monitor, saving the results.""" + self.stop = True + self.end_time = time.time() + super().join(timeout=30) + + def get_usage(self) -> tuple[int, float]: + """Get usage of a process and its children.""" + p = psutil.Process() + # CPU usage of process and its children + cpu: float = p.cpu_percent() + # Memory usage does not reflect child processes + # * Manually add the memory usage of child processes + memory: int = p.memory_info().rss + for c in p.children(): + memory += c.memory_info().rss + return memory, cpu + + def get_runtime(self) -> int: + """Get the runtime of the thread in seconds.""" + return int(self.end_time - self.start_time) + + def run(self) -> None: + """Run the thread.""" + memory_start, cpu_start = self.get_usage() + while not self.stop: + new_memory, new_cpu = self.get_usage() + # Memory is just a total, so get the delta + self.memory_buffer.append(new_memory - memory_start) + # CPU is calculated by psutil against the base CPU, + # so no need to get a delta + self.cpu_buffer.append(new_cpu) + time.sleep(0.2) + + def max_memory_mb(self) -> float: + """Get the maximum memory usage during the thread's runtime.""" + return max(self.memory_buffer) / 1e6 + + def max_cpu_percent(self) -> float: + """Get the maximum CPU usage during the thread's runtime.""" + return max(self.cpu_buffer) + diff --git a/src/nwp_consumer/internal/entities/postprocess.py b/src/nwp_consumer/internal/entities/postprocess.py new file mode 100644 index 00000000..b17cd90d --- /dev/null +++ b/src/nwp_consumer/internal/entities/postprocess.py @@ -0,0 +1,68 @@ +"""Domain entities for post-processing.""" + +import dataclasses +from codecs import Codec +from enum import Enum + +import ocf_blosc2 + + +class CodecOptions(Codec, Enum): + """Options for compression codecs.""" + UNSET = None + OCF_BLOSC2 = ocf_blosc2.Blosc2(clevel=5) + """Use the OCF Blosc2 codec. + + See Also: + - https://pypi.org/project/ocf-blosc2/ + """ + + def __bool__(self) -> bool: + """Boolean indicating whether a codec is set.""" + return self != CodecOptions.UNSET + + +@dataclasses.dataclass(slots=True) +class PostProcessOptions: + """Options for post-processing NWP data. + + The defaults for any option should be the null value, + i.e. nothing occurs by default. + """ + + validate: bool = False + """Whether to validate the data. + + Note that for the moment, this is a very memory-intensive operation. + Turn on only if there exists RAM to spare! + """ + + codec: CodecOptions = CodecOptions.UNSET + """Whether to compress the data with a non-standard codec. + + By default, Zarr writes chunks compressed using the `Blosc compressor + `_. 
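+
+    For example, opting in to the OCF Blosc2 codec might look like the
+    following sketch (field names as defined in this module):
+
+    >>> options = PostProcessOptions(codec=CodecOptions.OCF_BLOSC2)
+    >>> bool(options.codec)
+    True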
+
+    """
+
+    plot: bool = False
+    """Whether to save a plot of the data."""
+
+
+    def requires_rewrite(self) -> bool:
+        """Boolean indicating whether the specified options necessitate a rewrite."""
+        return any(
+            [
+                self.codec,
+            ],
+        )
+
+    def requires_postprocessing(self) -> bool:
+        """Boolean indicating whether the specified options necessitate post-processing."""
+        return any(
+            [
+                self.validate,
+                self.codec,
+                self.plot,
+            ],
+        )
+
diff --git a/src/nwp_consumer/internal/entities/repometadata.py b/src/nwp_consumer/internal/entities/repometadata.py
new file mode 100644
index 00000000..3e5abdd8
--- /dev/null
+++ b/src/nwp_consumer/internal/entities/repometadata.py
@@ -0,0 +1,169 @@
+"""Domain classes for repository metadata.
+
+Sources of NWP data have attributes both pertaining to and apart from
+the data they deliver. This module defines classes for metadata that
+tracks relevant information about the model repository and the data
+it provides. This might be helpful in determining the quality of the
+data, defining pipelines for processing, or establishing the availability
+for a live service.
+
+In this instance, the `ModelMetadata` refers to information pertaining
+to the model used to generate the data itself, whilst the
+`ModelRepositoryMetadata` refers to information about the repository
+where NWP data produced by the model resides.
+"""
+
+import dataclasses
+import datetime as dt
+import os
+
+import pandas as pd
+
+from .coordinates import NWPDimensionCoordinateMap
+from .postprocess import PostProcessOptions
+
+
+@dataclasses.dataclass(slots=True)
+class ModelMetadata:
+    """Metadata for an NWP model."""
+
+    name: str
+    """The name of the model.
+
+    Used to name the tensor in the zarr store.
+    """
+
+    resolution: str
+    """The resolution of the model with units."""
+
+    expected_coordinates: NWPDimensionCoordinateMap
+    """The expected dimension coordinate mapping.
+
+    This is a dictionary mapping dimension labels to their coordinate values,
+    for a single init time dataset, e.g.
+
+    >>> {
+    >>>     "init_time": [dt.datetime(2021, 1, 1, 0, 0), ...],
+    >>>     "step": [1, 2, ...],
+    >>>     "latitude": [90, 89.75, 89.5, ...],
+    >>>     "longitude": [180, 179, ...],
+    >>> }
+
+    To work this out, it can be useful to use the 'grib_ls' tool from eccodes:
+
+    >>> grib_ls -n geography -wcount=13 raw_file.grib
+
+    Which prints grid data from the grib file.
+    """
+
+    def __str__(self) -> str:
+        """Return a pretty-printed string representation of the metadata."""
+        pretty: str = "".join((
+            "Model:",
+            f"\n\t{self.name} ({self.resolution} resolution)",
+            "\tCoordinates:",
+            "\n".join(
+                f"\t\t{dim}: {vals}"
+                if len(vals) < 5
+                else f"\t\t{dim}: {vals[:3]} ... {vals[-3:]}"
+                for dim, vals in self.expected_coordinates.__dict__.items()
+            ),
+        ))
+        return pretty
+
+
+@dataclasses.dataclass(slots=True)
+class ModelRepositoryMetadata:
+    """Metadata for an NWP Model repository."""
+
+    name: str
+    """The name of the model repository."""
+
+    is_archive: bool
+    """Whether the repository is a complete archival set.
+
+    Archival datasets are able to backfill old data.
+    Non-archival datasets only provide a limited window of data.
+    """
+
+    is_order_based: bool
+    """Whether the repository is order-based.
+
+    This means parameters cannot be chosen freely,
+    but rather are defined by pre-selected agreements with the provider.
+    """
+
+    running_hours: list[int]
+    """The running hours of the model.
+
+    Most NWP models are run at fixed intervals throughout the day."""
+
+    delay_minutes: int
+    """The approximate model delay in minutes.
+
+    This delay is the time between the running of the model and the time
+    at which the data is actually available."""
+
+    required_env: list[str]
+    """Environment variables required for usage."""
+
+    optional_env: dict[str, str]
+    """Optional environment variables."""
+
+    max_connections: int
+    """The maximum number of simultaneous connections allowed to the model repository.
+
+    This determines the maximum level of concurrency that can be achieved when
+    downloading data from the repository.
+    """
+
+    postprocess_options: PostProcessOptions
+    """Options for post-processing the data."""
+
+    def determine_latest_it_from(self, t: dt.datetime) -> dt.datetime:
+        """Determine the latest available initialization time from a given time.
+
+        Args:
+            t: The time from which to determine the latest initialization time.
+
+        Returns:
+            The latest available initialization time prior to the given time.
+        """
+        it = t.replace(minute=0, second=0, microsecond=0) \
+            - dt.timedelta(minutes=self.delay_minutes)
+        while it.hour not in self.running_hours:
+            it -= dt.timedelta(hours=1)
+
+        return it
+
+    def month_its(self, year: int, month: int) -> list[dt.datetime]:
+        """Generate all init times for a given month."""
+        days = pd.Period(f"{year}-{month}").days_in_month
+        its: list[dt.datetime] = []
+        for day in range(1, days + 1):
+            for hour in self.running_hours:
+                its.append(dt.datetime(year, month, day, hour, tzinfo=dt.UTC))
+        return its
+
+    def missing_required_envs(self) -> list[str]:
+        """Get a list of unset required environment variables.
+
+        Returns:
+            A list of missing environment variables.
+        """
+        return [var for var in self.required_env if var not in os.environ]
+
+    def __str__(self) -> str:
+        """Return a pretty-printed string representation of the metadata."""
+        pretty: str = "".join((
+            "Model Repository: ",
+            f"\n\t{self.name} ({'archive' if self.is_archive else 'live/rolling'} dataset.)",
+            f"\n\truns at: {self.running_hours} hours ",
+            f"(available after {self.delay_minutes} minute delay)",
+            "\nEnvironment variables:",
+            "\n\tRequired:",
+            "\n".join(f"\t\t{var}" for var in self.required_env),
+            "\n\tOptional:",
+            "\n".join(f"\t\t{var}={val}" for var, val in self.optional_env.items()),
+        ))
+        return pretty
diff --git a/src/nwp_consumer/internal/entities/tensorstore.py b/src/nwp_consumer/internal/entities/tensorstore.py
new file mode 100644
index 00000000..b71e3baf
--- /dev/null
+++ b/src/nwp_consumer/internal/entities/tensorstore.py
@@ -0,0 +1,501 @@
+"""Domain classes for store metadata.
+
+Converted data is stored in Zarr stores, which are chunked datastores
+enabling subselection across any dimension of data, provided it is
+chunked appropriately.
+
+This module provides a class for storing metadata about a Zarr store.
+
+TODO: 2024-11-20 This module wants refactoring into smaller testable components.
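+
+As a rough sketch of the underlying xarray/Zarr pattern this module builds on
+(illustrative only; the path and the dataset ``ds`` are placeholders):
+
+>>> # Write the metadata alone, creating an empty, region-writable store
+>>> ds.to_zarr("/tmp/example.zarr", compute=False)
+>>> # Later, fill a region of that store with a matching subset of the data
+>>> ds.isel(init_time=slice(0, 1)).to_zarr(
+...     "/tmp/example.zarr", region={"init_time": slice(0, 1)},
+... )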
+""" + +import abc +import dataclasses +import datetime as dt +import logging +import os +import pathlib +import shutil +from collections.abc import MutableMapping +from typing import Any + +import pandas as pd +import xarray as xr +import zarr +from returns.result import Failure, ResultE, Success + +from .coordinates import NWPDimensionCoordinateMap +from .parameters import Parameter +from .postprocess import PostProcessOptions + +log = logging.getLogger("nwp-consumer") + + +@dataclasses.dataclass(slots=True) +class ParameterScanResult: + """Container for the results of a scan of a parameter's values.""" + + mean: float + """The mean value of the parameter's data.""" + is_valid: bool + """Whether the parameter's data values are valid. + + This is determined according to the parameter's limits and threshold. + See `entities.parameters.Parameter`. + """ + has_nulls: bool + """Whether the parameter's data contains null values.""" + + +@dataclasses.dataclass(slots=True) +class TensorStore(abc.ABC): + """Store class for multidimensional data. + + This class is used to store data in a Zarr store. + Each store instance has defined coordinates for the data, + and is capable of handling parallel, region-based updates. + """ + + name: str + """Identifier for the store and the data within.""" + + path: str + """The path to the store.""" + + coordinate_map: NWPDimensionCoordinateMap + """The coordinates of the store.""" + + size_kb: int + """The size of the store in kilobytes.""" + + encoding: dict[str, Any] + """The encoding passed to Zarr whilst writing.""" + + @classmethod + def initialize_empty_store( + cls, + model: str, + repository: str, + coords: NWPDimensionCoordinateMap, + ) -> ResultE["TensorStore"]: + """Initialize a store for a given init time. + + This method writes a blank dataarray to disk based on the input coordinates, + which define the dimension labels and tick values of the output dataset object. + + .. note: If a store already exists at the expected path, + it is checked for consistency with the input coordinates and used if valid. + + The dataarray is 'blank' because it is written via:: + + dataarray.to_zarr(".zarr", compute=False) + + which writes the metadata alone. + The utility of this is to enable region-based writing of new data to the store, + which further allows for parallel write processes. + + There is a gotcha: regional writes can never be done in parallel to the same chunk, + so writes must always be done at the chunk level or higher (as a chunk is an + individual file in the store). To this effect, chunks are chosen to cover as small + a unit of data as could reasonably be expected to be provided by an NWP source: + + - Raw data files may not contain the full grid of data, hence a chunk of size equal + to a quarter the length of the grid dimension (lat/lon/x/y axes) is used. + - Raw data files may contain as little as one step for a single parameter, so a chunk + size of 1 is used along the step dimension. + - As above for the init_time dimension. + + Args: + model: The name of the model providing the tensor data. + This is also used as the name of the tensor. + repository: The name of the repository providing the tensor data. + coords: The coordinates of the store. + + Returns: + An indicator of a successful store write containing the number of bytes written. 
+ + See Also: + - https://docs.xarray.dev/en/stable/user-guide/io.html#appending-to-existing-zarr-stores + - https://docs.xarray.dev/en/stable/user-guide/io.html#distributed-writes + + Returns: + A new instance of the TensorStore class. + """ + if not isinstance(coords.init_time, list) or len(coords.init_time) == 0: + return Failure( + ValueError( + "Cannot initialize store with 'init_time' dimension coordinates not " + "specified via a populated list. Check instantiation of " + "NWPDimensionCoordinateMap passed to this function. " + f"Got: {coords.init_time} (not a list, or empty).", + ), + ) + + zarrdir = os.getenv("ZARRDIR", f"~/.local/cache/nwp/{repository}/{model}/data") + store: zarr.storage.Store + path: str + filename: str = TensorStore.gen_store_filename(coords=coords) + try: + if zarrdir.startswith("s3"): + store_result = cls._create_zarrstore_s3(zarrdir, filename) + store, path = store_result.unwrap() # Can do this as exceptions are caught + else: + path = pathlib.Path("/".join((zarrdir, filename))).expanduser().as_posix() + store = zarr.storage.DirectoryStore(path) + except Exception as e: + return Failure(OSError( + f"Unable to create Directory Store at dir '{zarrdir}'. " + "Ensure ZARRDIR environment variable is specified correctly. " + f"Error context: {e}", + )) + + # Write the coordinates to a skeleton Zarr store + # * 'compute=False' enables only saving metadata + # * 'mode="w-"' fails if it finds an existing store + da: xr.DataArray = coords.as_zeroed_dataarray(name=model) + encoding = { + model: {"write_empty_chunks": False}, + "init_time": {"units": "nanoseconds since 1970-01-01"}, + "step": {"units": "hours"}, + } + try: + _ = da.to_zarr( + store=store, + compute=False, + mode="w-", + consolidated=True, + encoding=encoding, + ) + log.info("Created blank zarr store at '%s'", path) + # Ensure the store is readable + store_da = xr.open_dataarray(store, engine="zarr") + except zarr.errors.ContainsGroupError: + store_da = xr.open_dataarray(store, engine="zarr") + if store_da.name != da.name: # TODO: Also check for equality of coordinates + return Failure(OSError( + f"Existing store at '{path}' is for a different model. " + "Delete the existing store or move it to a new location, " + "or choose a new location for the new store via ZARRDIR.", + )) + log.info(f"Using existing store at '{path}'") + return Success( + cls( + name=model, + path=path, + coordinate_map=coords, + size_kb=store_da.nbytes // 1024, + encoding=encoding, + ), + ) + except Exception as e: + return Failure( + OSError( + f"Failed writing blank store to '{path}': {e}", + ), + ) + + # Check the resultant array's coordinates can be converted back + coordinate_map_result = NWPDimensionCoordinateMap.from_xarray(store_da) + if isinstance(coordinate_map_result, Failure): + return Failure( + OSError( + f"Error reading back coordinates of initialized store " + f"from path '{path}' (possible corruption): {coordinate_map_result}", + ), + ) + + return Success( + cls( + name=model, + path=path, + coordinate_map=coordinate_map_result.unwrap(), + size_kb=0, + encoding=encoding, + ), + ) + + #def from_existing_store( + # model: str, + # repository: str, + # expected_coords: NWPDimensionCoordinateMap, + #) -> ResultE["TensorStore"]: + # """Create a TensorStore instance from an existing store.""" + # pass # TODO + + # for dim in store_da.dims: + # if dim not in da.dims: + # return Failure( + # ValueError( + # "Cannot use existing store due to mismatched coordinates. 
" + # f"Dimension '{dim}' in existing store not found in new store. " + # "Use 'overwrite_existing=True' or move the existing store at " + # f"'{store}' to a new location. ", + # ), + # ) + # if not np.array_equal(store_da.coords[dim].values, da.coords[dim].values): + # return Failure( + # ValueError( + # "Cannot use existing store due to mismatched coordinates. " + # f"Dimension '{dim}' in existing store has different coordinate " + # "values from specified. " + # "Use 'overwrite_existing=True' or move the existing store at " + # f"'{store}' to a new location.", + # ), + # ) + + # --- Business logic methods --- # + def write_to_region( + self, + da: xr.DataArray, + region: dict[str, slice] | None = None, + ) -> ResultE[int]: + """Write partial data to the store. + + The optional region is a dictionary which maps dimension labels to slices. + These define the region in the store to write to. + + If the region dict is empty or not provided, the region is determined + via the 'determine_region' method. + + Args: + da: The data to write to the store. + region: The region to write to. + + Returns: + An indicator of a successful store write containing the number of bytes written. + """ + # Attempt to determine the region if missing + if region is None or region == {}: + region_result = NWPDimensionCoordinateMap.from_xarray(da).bind( + self.coordinate_map.determine_region, + ) + if isinstance(region_result, Failure): + return Failure(region_result.failure()) + region = region_result.unwrap() + + # Perform the regional write + try: + da.to_zarr(store=self.path, region=region, consolidated=True) + except Exception as e: + return Failure( + OSError( + f"Error writing to region of store: {e}", + ), + ) + + # Calculate the number of bytes written + nbytes: int = da.nbytes + del da + self.size_kb += nbytes // 1024 + return Success(nbytes) + + def validate_store(self) -> ResultE[bool]: + """Validate the store. + + This method checks the store for the presence of all expected parameters. + + Returns: + A bool indicating the result of the validation. + """ + store_da: xr.DataArray = xr.open_dataarray(self.path, engine="zarr") + # Consistency check on the coordinates of the store + coords_result = NWPDimensionCoordinateMap.from_xarray(store_da) + match coords_result: + case Failure(e): + return Failure(e) + case Success(coords): + if coords != self.coordinate_map: + return Failure(ValueError( + "Coordinate consistency check failed: " + "Store coordinates do not match expected coordinates. " + f"Expected: {self.coordinate_map}. Got: {coords}.", + )) + + # Validity check on the parameters of the store + for param in self.coordinate_map.variable: + scan_result: ResultE[ParameterScanResult] = self.scan_parameter_values(p=param) + match scan_result: + case Failure(e): + return Failure(e) + case Success(scan): + log.debug(f"Scanned parameter {param.name}: {scan.__repr__()}") + if not scan.is_valid or scan.has_nulls: + return Success(False) + + return Success(True) + + def delete_store(self) -> ResultE[None]: + """Delete the store.""" + if self.path.startswith("s3://"): + import s3fs + try: + fs = s3fs.S3FileSystem( + anon=False, + client_kwargs={ + "region_name": os.getenv("AWS_REGION", "eu-west-1"), + "endpoint_url": os.getenv("AWS_ENDPOINT_URL", None), + }, + ) + fs.rm(self.path, recursive=True) + except Exception as e: + return Failure(OSError( + f"Unable to delete S3 store at path '{self.path}'." + "Ensure AWS credentials are correct and discoverable by botocore. 
" + f"Error context: {e}", + )) + else: + try: + shutil.rmtree(self.path) + except Exception as e: + return Failure(OSError( + f"Unable to delete store at path '{self.path}'. " + f"Error context: {e}", + )) + log.info("Deleted zarr store at '%s'", self.path) + return Success(None) + + def scan_parameter_values(self, p: Parameter) -> ResultE[ParameterScanResult]: + """Scan the values of a parameter in the store. + + Extracts data from the values of the given parameter in the store. + This reads the data from the store, so note that this can be an + expensive operation for large datasets. + + Args: + p: The name of the parameter to scan. + + Returns: + A ParameterScanResult object. + """ + if p not in self.coordinate_map.variable: + return Failure(KeyError( + "Parameter scan failed: " + f"Cannot validate unknown parameter: {p.name}. " + "Ensure the parameter has been renamed to match the entities " + "parameters defined in `entities.parameters` if desired, or " + "add the parameter to the entities parameters if it is new. " + f"Store parameters: {[p.name for p in self.coordinate_map.variable]}.", + )) + store_da: xr.DataArray = xr.open_dataarray(self.path, engine="zarr") + + # Calculating the mean of a dataarray returns another dataarray, so it + # must be converted to a numpy array via `values`. Even though it is a + # single number in this case, type checkers don't know that, so the + # second call to `mean()` helps to reassure them its a float. + mean = store_da.mean().values.mean() + + return Success( + ParameterScanResult( + mean=mean, + is_valid=True, + has_nulls=False, + ), + ) + + def postprocess(self, options: PostProcessOptions) -> ResultE[str]: + """Post-process the store. + + This creates a new store, as many of the postprocess options require + modifications to the underlying file structure of the store. + """ + # TODO: Implement postprocessing options + if options.requires_postprocessing(): + log.info("Applying postprocessing options to store %s", self.name) + + if options.validate: + log.warning("Validation not yet implemented in efficient manner. Skipping option.") + + log.debug("Postprocessing complete for store %s", self.name) + return Success(self.path) + + else: + return Success(self.path) + + def update_attrs(self, attrs: dict[str, str]) -> ResultE[str]: + """Update the attributes of the store. + + This method updates the attributes of the store with the given dictionary. + """ + group: zarr.Group = zarr.open_group(self.path) + group.attrs.update(attrs) + zarr.consolidate_metadata(self.path) + return Success(self.path) + + def missing_times(self) -> ResultE[list[dt.datetime]]: + """Find the missing init_time in the store. + + A "missing init_time" is determined by the values corresponding + to the first two coordinate values of each dimension: if all are + NaN or None values then the time is considered missing. 
+ """ + try: + store_da: xr.DataArray = xr.open_dataarray(self.path, engine="zarr") + except Exception as e: + return Failure(OSError( + "Cannot determine missing times in store due to " + f"error reading '{self.path}': {e}", + )) + missing_times: list[dt.datetime] = [] + for it in store_da.coords["init_time"].values: + if store_da.sel(init_time=it).isel({ + d: slice(0, 2) for d in self.coordinate_map.dims + if d != "init_time" + }).isnull().all().values: + missing_times.append(pd.Timestamp(it).to_pydatetime().replace(tzinfo=dt.UTC)) + return Success(missing_times) + + @staticmethod + def _create_zarrstore_s3(s3_folder: str, filename: str) \ + -> ResultE[tuple[MutableMapping, str]]: # type: ignore + """Create a mutable mapping to an S3 store. + + Authentication with S3 is done via botocore's credential discovery. + + Returns: + A tuple containing the store mapping and the path to the store, + in a result object indicating success or failure. + + See Also: + - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials + """ + import s3fs + if not s3_folder.startswith("s3://"): + return Failure(ValueError( + "S3 folder path must start with 's3://'. " + f"Got: {s3_folder}", + )) + log.debug("Attempting AWS connection using credential discovery") + try: + fs = s3fs.S3FileSystem( + anon=False, + client_kwargs={ + "region_name": os.getenv("AWS_REGION", "eu-west-1"), + "endpoint_url": os.getenv("AWS_ENDPOINT_URL", None), + }, + ) + path = s3_folder + "/" + filename + fs.mkdirs(path=path, exist_ok=True) + store = s3fs.mapping.S3Map(path, fs, check=False, create=True) + except Exception as e: + return Failure(OSError( + f"Unable to create file mapping for path '{path}'. " + "Ensure ZARRDIR environment variable is specified correctly, " + "and AWS credentials are discoverable by botocore. " + f"Error context: {e}", + )) + return Success((store, path)) + + @staticmethod + def gen_store_filename(coords: NWPDimensionCoordinateMap) -> str: + """Create a filename for the store. + + If the store only covers a single init_time, the filename is the init time. + Else, if it covers multiple init_times, the filename is the range of init times. + The extension is '.zarr'. 
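+
+        For example (illustrative values):
+
+        - a single init time of 2024-01-01 00:00 becomes "2024010100.zarr"
+        - init times spanning 2024-01-01 00:00 to 2024-01-02 12:00 become
+          "2024010100-2024010212.zarr"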
+ """ + store_range: str = coords.init_time[0].strftime("%Y%m%d%H") + if len(coords.init_time) > 1: + store_range = f"{coords.init_time[0]:%Y%m%d%H}-{coords.init_time[-1]:%Y%m%d%H}" + + return store_range + ".zarr" + diff --git a/src/nwp_consumer/internal/entities/test_coordinates.py b/src/nwp_consumer/internal/entities/test_coordinates.py new file mode 100644 index 00000000..0f2b10c0 --- /dev/null +++ b/src/nwp_consumer/internal/entities/test_coordinates.py @@ -0,0 +1,253 @@ +import dataclasses +import datetime as dt +import unittest + +import pandas as pd +from returns.result import Failure, Success + +from .coordinates import NWPDimensionCoordinateMap +from .parameters import Parameter + + +class TestCoordinates(unittest.TestCase): + """Test the business methods of the NWPDimensionCoordinateMap class.""" + + def test_determine_region(self) -> None: + + @dataclasses.dataclass + class TestCase: + name: str + inner: NWPDimensionCoordinateMap + expected_slices: dict[str, slice] + should_error: bool + + outer: NWPDimensionCoordinateMap = NWPDimensionCoordinateMap( + init_time=[dt.datetime(2021, 1, 1, i, tzinfo=dt.UTC) for i in range(0, 9, 3)], + step=list(range(12)), + variable=[ + Parameter.TEMPERATURE_SL, + Parameter.CLOUD_COVER_HIGH, + Parameter.TOTAL_PRECIPITATION_RATE_GL, + ], + latitude=[60.0, 61.0, 62.0], + longitude=[10.0, 11.0, 12.0], + ) + + tests = [ + TestCase( + name="basic_subset", + inner=NWPDimensionCoordinateMap( + init_time=outer.init_time[:1], + step=outer.step[:6], + variable=outer.variable, + latitude=outer.latitude, + longitude=outer.longitude, + ), + expected_slices={ + "init_time": slice(0, 1), + "step": slice(0, 6), + "variable": slice(0, 3), + "latitude": slice(0, 3), + "longitude": slice(0, 3), + }, + should_error=False, + ), + TestCase( + name="subset_with_multiple_span", + inner=NWPDimensionCoordinateMap( + init_time=[ + dt.datetime(2021, 1, 1, i, tzinfo=dt.UTC) + for i in [3, 6] + ], + step=outer.step, + variable=outer.variable, + latitude=outer.latitude, + longitude=outer.longitude, + ), + expected_slices={ + "init_time": slice(1, 3), + "step": slice(0, 12), + "variable": slice(0, 3), + "latitude": slice(0, 3), + "longitude": slice(0, 3), + }, + should_error=False, + ), + TestCase( + name="subset_with_non_contiguous_values", + inner=NWPDimensionCoordinateMap( + init_time=outer.init_time[:1], + step=list(range(1, 6, 2)), + variable=outer.variable, + latitude=[60.0, 63.0], + longitude=outer.longitude, + ), + expected_slices={}, + should_error=True, + ), + TestCase( + name="not_a_subset", + inner=NWPDimensionCoordinateMap( + init_time=outer.init_time[:1], + step=[15], + variable=outer.variable, + latitude=[12, 13, 14, 15], + longitude=outer.longitude, + ), + expected_slices={}, + should_error=True, + ), + TestCase( + name="different_dimensions", + inner=NWPDimensionCoordinateMap( + init_time=outer.init_time[:1], + step=[15], + variable=outer.variable, + ), + expected_slices={}, + should_error=True, + ), + ] + + for t in tests: + with self.subTest(name=t.name): + result = outer.determine_region(inner=t.inner) + if t.should_error: + self.assertTrue( + isinstance(result, Failure), + msg=f"{t.name}: Expected error to be returned.", + ) + else: + self.assertEqual(result, Success(t.expected_slices)) + + def test_to_pandas(self) -> None: + + @dataclasses.dataclass + class TestCase: + name: str + coords: NWPDimensionCoordinateMap + expected_indexes: dict[str, pd.Index] # type: ignore + + tests = [ + TestCase( + name="valid_data", + coords=NWPDimensionCoordinateMap( + 
init_time=[dt.datetime(2021, 1, 1, i, tzinfo=dt.UTC) for i in range(0, 9, 3)], + step=list(range(12)), + variable=[ + Parameter.TEMPERATURE_SL, + Parameter.CLOUD_COVER_HIGH, + Parameter.TOTAL_PRECIPITATION_RATE_GL, + ], + latitude=[60.0, 61.0, 62.0], + longitude=[10.0, 11.0, 12.0], + ), + expected_indexes={ + "init_time": pd.to_datetime([ + "2021-01-01T00:00:00Z", + "2021-01-01T03:00:00Z", + "2021-01-01T06:00:00Z", + ]), + "step": pd.Index([hour * 60 * 60 * 1000000000 for hour in range(12)]), + "variable": pd.Index([ + Parameter.TEMPERATURE_SL.value, + Parameter.CLOUD_COVER_HIGH.value, + Parameter.TOTAL_PRECIPITATION_RATE_GL.value, + ]), + "latitude": pd.Index([60.0, 61.0, 62.0]), + "longitude": pd.Index([10.0, 11.0, 12.0]), + }, + ), + ] + + for t in tests: + with self.subTest(name=t.name): + result = t.coords.to_pandas() + self.assertEqual(result["init_time"].dtype, "datetime64[ns]") + self.assertListEqual(list(result.keys()), list(t.expected_indexes.keys())) + for key in result: + self.assertListEqual( + result[key].values.tolist(), + t.expected_indexes[key].values.tolist()) + + def test_from_pandas(self) -> None: + @dataclasses.dataclass + class TestCase: + name: str + data: dict[str, pd.Index] # type: ignore + expected_coordinates: NWPDimensionCoordinateMap | None + should_error: bool + + tests = [ + TestCase( + name="valid_data", + data={ + "init_time": pd.to_datetime(["2021-01-01T00:00:00Z", "2021-01-01T03:00:00Z"]), + "step": pd.to_timedelta(["0 days", "3 days"]), + "variable": pd.Index(["temperature_sl", "cloud_cover_high"]), + "latitude": pd.Index([60.0, 61.0]), + "longitude": pd.Index([10.0, 11.0]), + }, + expected_coordinates=NWPDimensionCoordinateMap( + init_time=[ + dt.datetime(2021, 1, 1, 0, tzinfo=dt.UTC), + dt.datetime(2021, 1, 1, 3, tzinfo=dt.UTC)], + step=[0, 72], + variable=[Parameter.TEMPERATURE_SL, Parameter.CLOUD_COVER_HIGH], + latitude=[60.0, 61.0], + longitude=[10.0, 11.0], + ), + should_error=False, + ), + TestCase( + name="missing_required_keys", + data={ + "init_time": pd.to_datetime(["2021-01-01T00:00:00Z", "2021-01-01T03:00:00Z"]), + "step": pd.to_timedelta(["0 days", "3 days"]), + "latitude": pd.Index([60.0, 61.0]), + "longitude": pd.Index([10.0, 11.0]), + }, + expected_coordinates=None, + should_error=True, + ), + TestCase( + name="unknown_parameter", + data={ + "init_time": pd.to_datetime(["2021-01-01T00:00:00Z", "2021-01-01T03:00:00Z"]), + "step": pd.to_timedelta(["0 hours", "1 hours", "2 hours", "3 hours"]), + "variable": pd.Index(["temperature_sl", "not_a_variable"]), + "latitude": pd.Index([60.0, 61.0]), + "longitude": pd.Index([10.0, 11.0], dtype="int64"), + }, + expected_coordinates=None, + should_error=True, + ), + TestCase( + name="unknown_keys", + data={ + "init_time": pd.to_datetime(["2021-01-01T00:00:00Z", "2021-01-01T03:00:00Z"]), + "step": pd.to_timedelta(["0 days", "3 days"]), + "variable": pd.Index(["temperature_sl", "cloud_cover_high"]), + "latitude": pd.Index([60.0, 61.0]), + "longitude": pd.Index([10.0, 11.0]), + "unknown": pd.Index(["unknown"]), + }, + expected_coordinates=None, + should_error=True, + ), + ] + + for t in tests: + with self.subTest(name=t.name): + result = NWPDimensionCoordinateMap.from_pandas(t.data) + if t.should_error: + self.assertTrue( + isinstance(result, Failure), + msg=f"{t.name}: Expected error to be returned.", + ) + else: + self.assertEqual(result, Success(t.expected_coordinates)) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/nwp_consumer/internal/entities/test_parameters.py 
b/src/nwp_consumer/internal/entities/test_parameters.py new file mode 100644 index 00000000..54b1bf97 --- /dev/null +++ b/src/nwp_consumer/internal/entities/test_parameters.py @@ -0,0 +1,68 @@ +import unittest + +import numpy as np +import xarray as xr +from hypothesis import given +from hypothesis import strategies as st +from returns.pipeline import is_successful + +from .parameters import Parameter + + +class TestParameters(unittest.TestCase): + """Test the business methods of the Parameters class.""" + + @given(st.sampled_from(Parameter)) + def test_metadata(self, p: Parameter) -> None: + """Test the metadata method.""" + metadata = p.metadata() + self.assertEqual(metadata.name, p.value) + + @given(st.sampled_from([s for p in Parameter for s in p.metadata().alternate_shortnames])) + def test_try_from_shortname(self, shortname: str) -> None: + """Test the try_from_shortname method.""" + p = Parameter.try_from_alternate(shortname) + self.assertTrue(is_successful(p)) + + p = Parameter.try_from_alternate("invalid") + self.assertFalse(is_successful(p)) + + @given( + st.sampled_from([s for p in Parameter for s in p.metadata().alternate_shortnames]), + st.sampled_from(Parameter), + ) + def test_rename_else_drop_ds_vars(self, shortname: str, parameter: Parameter) -> None: + """Test the rename_else_drop_ds_vars method.""" + allowed_parameters: list[Parameter] = [parameter] + + ds = xr.Dataset( + data_vars={ + shortname: ( + ("init_time", "step", "latitude", "longitude"), np.random.rand(1, 12, 15, 15), + ), + "unknown-parameter": ( + ("init_time", "step", "latitude", "longitude"), np.random.rand(1, 12, 15, 15), + ), + }, + coords={ + "init_time": np.array([0]), + "step": np.array(range(12)), + "latitude": np.array(range(15)), + "longitude": np.array(range(15)), + }, + ) + + ds = Parameter.rename_else_drop_ds_vars( + ds, + allowed_parameters=allowed_parameters, + ) + + if shortname in parameter.metadata().alternate_shortnames: + self.assertTrue(len(list(ds.data_vars)) == 1) + self.assertEqual(next(iter(ds.data_vars)), str(parameter)) + else: + self.assertTrue(len(list(ds.data_vars)) == 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/nwp_consumer/internal/entities/test_postprocess.py b/src/nwp_consumer/internal/entities/test_postprocess.py new file mode 100644 index 00000000..42414b4e --- /dev/null +++ b/src/nwp_consumer/internal/entities/test_postprocess.py @@ -0,0 +1,29 @@ +import unittest + +from .postprocess import PostProcessOptions + + +class TestPostProcessOptions(unittest.TestCase): + """Test the business methods of the PostProcessOptions class.""" + + def test_requires_postprocessing(self) -> None: + """Test that an empty initialization means no postprocessing.""" + test_class = PostProcessOptions() + + self.assertFalse( + test_class.requires_postprocessing(), + msg="Empty class should not require postprocessing.", + ) + + def test_requires_rewrite(self) -> None: + """Test that an empty initialization means no rewriting.""" + test_class = PostProcessOptions() + + self.assertFalse( + test_class.requires_rewrite(), + msg="Empty class should not require rewriting.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/nwp_consumer/internal/entities/test_repometadata.py b/src/nwp_consumer/internal/entities/test_repometadata.py new file mode 100644 index 00000000..e2287015 --- /dev/null +++ b/src/nwp_consumer/internal/entities/test_repometadata.py @@ -0,0 +1,53 @@ +import dataclasses +import datetime as dt +import unittest + +from .postprocess import 
PostProcessOptions +from .repometadata import ModelRepositoryMetadata + + +class TestModelRepositoryMetadata(unittest.TestCase): + """Test the business methods of the ModelRepositoryMetadata class.""" + + metadata: ModelRepositoryMetadata = ModelRepositoryMetadata( + name="test", + is_archive=False, + is_order_based=False, + running_hours=[0, 6, 12, 18], + delay_minutes=60, + required_env=["TEST"], + optional_env={"TEST": "test"}, + max_connections=1, + postprocess_options=PostProcessOptions(), + ) + + def test_determine_latest_it_from(self) -> None: + """Test the determine_latest_it_from method.""" + + @dataclasses.dataclass + class TestCase: + name: str + t: dt.datetime + expected: dt.datetime + + tests = [ + TestCase( + name="rolls_back_inter-day", + t=dt.datetime(2021, 1, 2, 0, tzinfo=dt.UTC), + expected=dt.datetime(2021, 1, 1, 18, tzinfo=dt.UTC), + ), + TestCase( + name="rolls_back_intra-day", + t=dt.datetime(2021, 1, 1, 5, tzinfo=dt.UTC), + expected=dt.datetime(2021, 1, 1, 0, tzinfo=dt.UTC), + ), + ] + + for test in tests: + with self.subTest(name=test.name): + result = self.metadata.determine_latest_it_from(test.t) + self.assertEqual(result, test.expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/nwp_consumer/internal/entities/test_tensorstore.py b/src/nwp_consumer/internal/entities/test_tensorstore.py new file mode 100644 index 00000000..69b8a835 --- /dev/null +++ b/src/nwp_consumer/internal/entities/test_tensorstore.py @@ -0,0 +1,213 @@ +import contextlib +import dataclasses +import datetime as dt +import logging +import os +import unittest +from collections.abc import Generator +from types import TracebackType +from unittest.mock import patch + +import numpy as np +import xarray as xr +from botocore.client import BaseClient as BotocoreClient +from botocore.session import Session +from moto.server import ThreadedMotoServer +from returns.pipeline import is_successful +from returns.result import Success + +from .coordinates import NWPDimensionCoordinateMap +from .parameters import Parameter +from .postprocess import PostProcessOptions +from .tensorstore import TensorStore + +logging.getLogger("werkzeug").setLevel(logging.ERROR) + + +class MockS3Bucket: + + client: BotocoreClient + server: ThreadedMotoServer + bucket: str = "test-bucket" + + def __enter__(self) -> None: + """Create a mock S3 server and bucket.""" + self.server = ThreadedMotoServer() + self.server.start() + + session = Session() + self.client = session.create_client( + service_name="s3", + region_name="us-east-1", + endpoint_url="http://localhost:5000", + aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"], + aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], + ) + + self.client.create_bucket( + Bucket=self.bucket, + ) + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + response = self.client.list_objects_v2( + Bucket=self.bucket, + ) + """Delete all objects in the bucket and stop the server.""" + if "Contents" in response: + for obj in response["Contents"]: + self.client.delete_object( + Bucket=self.bucket, + Key=obj["Key"], + ) + self.server.stop() + + +class TestTensorStore(unittest.TestCase): + """Test the business methods of the TensorStore class.""" + + @contextlib.contextmanager + def store(self, year: int) -> Generator[TensorStore, None, None]: + """Create an instance of the TensorStore class.""" + + test_coords: NWPDimensionCoordinateMap = NWPDimensionCoordinateMap( + init_time=[ + 
dt.datetime(year, 1, 1, h, tzinfo=dt.UTC) + for h in [0, 6, 12, 18] + ], + step=[1, 2, 3, 4], + variable=[Parameter.TEMPERATURE_SL], + latitude=np.linspace(90, -90, 12).tolist(), + longitude=np.linspace(0, 360, 18).tolist(), + ) + + init_result = TensorStore.initialize_empty_store( + model="test_da", + repository="dummy_repository", + coords=test_coords, + ) + self.assertIsInstance(init_result, Success, msg=init_result) + store = init_result.unwrap() + yield store + store.delete_store() + + @patch.dict(os.environ, { + "AWS_ENDPOINT_URL": "http://localhost:5000", + "AWS_ACCESS_KEY_ID": "test-key", + "AWS_SECRET_ACCESS_KEY": "test-secret", + "ZARRDIR": "s3://test-bucket/data", + }, clear=True) + def test_initialize_and_delete_s3(self) -> None: + """Test the initialize_empty_store method.""" + + with MockS3Bucket(), self.store(year=2022) as ts: + delete_result = ts.delete_store() + self.assertIsInstance(delete_result, Success, msg=delete_result) + + def test_write_to_region(self) -> None: + """Test the write_to_region method.""" + with self.store(year=2022) as ts: + test_da: xr.DataArray = xr.DataArray( + name="test_da", + data=np.ones( + shape=list(ts.coordinate_map.shapemap.values()), + ), + coords=ts.coordinate_map.to_pandas(), + ) + + # Write each init time and step one at a time + for it in test_da.coords["init_time"].values: + for step in test_da.coords["step"].values: + write_result = ts.write_to_region( + da=test_da.where( + test_da["init_time"] == it, drop=True, + ).where(test_da["step"] == step, drop=True), + ) + self.assertIsInstance(write_result, Success, msg=write_result) + + def test_postprocess(self) -> None: + """Test the postprocess method.""" + + @dataclasses.dataclass + class TestCase: + name: str + options: PostProcessOptions + should_error: bool + + tests: list[TestCase] = [ + TestCase( + name="empty_options", + options=PostProcessOptions(), + should_error=False, + ), + ] + + with self.store(year=1971) as ts: + for t in tests: + with self.subTest(name=t.name): + result = ts.postprocess(t.options) + if t.should_error: + self.assertTrue( + isinstance(result, Exception), + msg="Expected error to be returned.", + ) + else: + self.assertTrue(is_successful(result)) + + def test_missing_times(self) -> None: + """Test the missing_times method.""" + + @dataclasses.dataclass + class TestCase: + name: str + times_to_write: list[dt.datetime] + expected: list[dt.datetime] + + with self.store(year=2024) as ts: + tests: list[TestCase] = [ + TestCase( + name="all_missing_times", + times_to_write=[], + expected=ts.coordinate_map.init_time, + ), + TestCase( + name="some_missing_times", + times_to_write=[ts.coordinate_map.init_time[0], ts.coordinate_map.init_time[2]], + expected=[ts.coordinate_map.init_time[1], ts.coordinate_map.init_time[3]], + ), + TestCase( + name="no_missing_times", + times_to_write=ts.coordinate_map.init_time, + expected=[], + ), + ] + + for t in tests: + with self.subTest(name=t.name): + for i in t.times_to_write: + write_result = ts.write_to_region( + da=xr.DataArray( + name="test_da", + data=np.ones( + shape=[ + 1 if k == "init_time" else v + for k, v in ts.coordinate_map.shapemap.items() + ], + ), + coords=ts.coordinate_map.to_pandas() | { + "init_time": [np.datetime64(i.replace(tzinfo=None), "ns")], + }, + ), + ) + write_result.unwrap() + result = ts.missing_times() + missing_times = result.unwrap() + self.assertListEqual(missing_times, t.expected) + +if __name__ == "__main__": + unittest.main() + diff --git a/src/nwp_consumer/internal/handlers/__init__.py 
b/src/nwp_consumer/internal/handlers/__init__.py new file mode 100644 index 00000000..088523fd --- /dev/null +++ b/src/nwp_consumer/internal/handlers/__init__.py @@ -0,0 +1,31 @@ +"""Implementation of adaptors for driving actors. + +Driving actors +-------------- + +A driving actor is an external component that initiates interaction +with the core logic. Also referred to as *primary* actors, a driving +actor represents an entrypoint that uses the core driving ports +(see `nwp_consumer.internal.ports.services`) in its implementation. +In this manner, it *handles* whatever input it receives and *drives* +the core logic to perform the necessary operations, hence the module +name. + +Examples of driving or primary actors include: + +- a REST services receiving requests +- a CLI tool processing user input + +This module +----------- + +This module contains implementations for the following driving actors: + +- Command-line interface (CLI) - `nwp_consumer.internal.handlers.cli` +""" + +from .cli import CLIHandler + +__all__ = [ + "CLIHandler" +] diff --git a/src/nwp_consumer/internal/handlers/cli.py b/src/nwp_consumer/internal/handlers/cli.py new file mode 100644 index 00000000..e69956fa --- /dev/null +++ b/src/nwp_consumer/internal/handlers/cli.py @@ -0,0 +1,115 @@ +"""Adaptor for the CLI driving actor.""" + +import argparse +import datetime as dt +import logging + +from returns.result import Failure, Success + +from nwp_consumer.internal import ports + +log = logging.getLogger("nwp-consumer") + + +class CLIHandler: + """CLI driving actor.""" + + def __init__( + self, + consumer_usecase: ports.ConsumeUseCase, + archiver_usecase: ports.ArchiveUseCase, + ) -> None: + """Create a new instance.""" + self._consumer_usecase = consumer_usecase + self._archiver_usecase = archiver_usecase + + + @property + def parser(self) -> argparse.ArgumentParser: + """Return the CLI argument parser.""" + parser = argparse.ArgumentParser(description="NWP Consumer CLI") + subparsers = parser.add_subparsers(dest="command") + + consume_command = subparsers.add_parser( + "consume", + help="Consume NWP data for a single init time", + ) + consume_command.add_argument( + "--init-time", "-i", + help="Initialization time of the forecast (YYYY-MM-DDTHH). " + "Omit to pull the latest available forecast.", + type=dt.datetime.fromisoformat, + required=False, + ) + + archive_command = subparsers.add_parser( + "archive", + help="Archive NWP data for a given month", + ) + archive_command.add_argument( + "--year", "-y", + help="Year to archive", + type=int, + required=True, + ) + archive_command.add_argument( + "--month", "-m", + help="Month to archive", + type=int, + required=True, + ) + + info_command = subparsers.add_parser("info", help="Show model repository info") + info_options = info_command.add_mutually_exclusive_group() + info_options.add_argument( + "--model", + help="Show information about the selected model repository.", + action="store_true", + ) + info_options.add_argument( + "--parameters", + help="Show information about all available parameters.", + action="store_true", + ) + + return parser + + def run(self) -> int: + """Run the CLI handler. + + Returns the appropriate exit code. 
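+
+        Example invocations handled here (illustrative; assumes a console
+        script entrypoint wired to this handler, referred to as
+        ``nwp-consumer``)::
+
+            nwp-consumer consume --init-time 2021-01-01T00
+            nwp-consumer archive --year 2021 --month 1
+            nwp-consumer info --parameters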
+ """ + args = self.parser.parse_args() + match args.command: + case "consume": + result = self._consumer_usecase.consume(it=args.init_time) + + match result: + case Failure(e): + log.error(f"Failed to consume NWP data: {e}") + return 1 + case Success(path): + log.info(f"Successfully consumed NWP data to '{path}'") + return 0 + + case "archive": + result = self._archiver_usecase.archive(year=args.year, month=args.month) + + match result: + case Failure(e): + log.error(f"Failed to archive NWP data: {e}") + return 1 + case Success(path): + log.info(f"Successfully archived NWP data to '{path}'") + return 0 + + case "info": + log.error("Info command is coming soon! :)") + return 0 + + case _: + log.error(f"Unknown command: {args.command}") + self.parser.print_help() + return 1 + + return 0 diff --git a/src/nwp_consumer/internal/inputs/__init__.py b/src/nwp_consumer/internal/inputs/__init__.py deleted file mode 100644 index b8d4905f..00000000 --- a/src/nwp_consumer/internal/inputs/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Available inputs to source data from.""" - -__all__ = [ - "ceda", - "metoffice", - "ecmwf", - "icon", - "cmc", - "meteofrance", - "noaa", -] - -from . import ( - ceda, - cmc, - ecmwf, - icon, - meteofrance, - metoffice, - noaa, -) - diff --git a/src/nwp_consumer/internal/inputs/ceda/README.md b/src/nwp_consumer/internal/inputs/ceda/README.md deleted file mode 100644 index 04f28d3c..00000000 --- a/src/nwp_consumer/internal/inputs/ceda/README.md +++ /dev/null @@ -1,273 +0,0 @@ -# CEDA - ---- - -## Data - -See -- https://artefacts.ceda.ac.uk/formats/grib/ -- https://dap.ceda.ac.uk/badc/ukmo-nwp/doc/NWP_UKV_Information.pdf - -Investigate files via eccodes: - -```shell -$ conda install -c conda-forge eccodes -``` - -More info on eccodes: https://confluence.ecmwf.int/display/ECC/grib_ls - -For example: - -```shell -$ grib_ls -n parameter -w stepRange=1 201901010000_u1096_ng_umqv_Wholesale1.grib -``` - -## Files - -Sourced from https://zenodo.org/record/7357056. There are two files per -`init_time` (model run time) that contain surface-level parameters of interest. - -The contents of those files differs somewhat from what is presented in the above -document - -#### Un-split File 1 `yyyymmddhhmm_u1096_ng_umqv_Wholesale1.grib` - -Full domain, 35 time steps and the following surface level parameters. - -| paramId | shortName | units | name | -|---------|-----------|----------------|-------------------------| -| 130 | t | K | Temperature | -| 3017 | dpt | K | Dew point temperature | -| 3020 | vis | m | Visibility | -| 157 | r | % | Relative humidity | -| 260074 | prmsl | Pa | Pressure reduced to MSL | -| 207 | 10si | m s**-1 | 10 metre wind speed | -| 260260 | 10wdir | Degree true | 10 metre wind direction | -| 3059 | prate | kg m**-2 s**-1 | Precipitation rate | -| | unknown | unknown | unknown | - -View via pasting the output of the following to this -[online table converter](https://tableconvert.com/json-to-markdown): - -```shell -$ grib_ls -n parameter -w stepRange=0 -j 201901010000_u1096_ng_umqv_Wholesale1.grib -``` - -When loading this file in using *cfgrib*, it loads in 5 distinct xarray datasets. - -
- Wholesale1 Datasets - - --- Dataset 1 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 12:00:00 - heightAboveGround float64 1.0 - valid_time (step) datetime64[ns] 2019-01-01 ... 2019-01-02T12:00:00 - Dimensions without coordinates: values - Data variables: - t (step, values) float32 ... (1.5m temperature) - r (step, values) float32 ... (1.5m relative humidity) - dpt (step, values) float32 ... (1.5m dew point) - vis (step, values) float32 ... (1.5m visibility) - - --- Dataset 2 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 12:00:00 - heightAboveGround float64 10.0 - valid_time (step) datetime64[ns] 2019-01-01 ... 2019-01-02T12:00:00 - Dimensions without coordinates: values - Data variables: - si10 (step, values) float32 ... (10m wind speed) - wdir10 (step, values) float32 ... (10m wind direction) - - --- Dataset 3 --- - Dataset 3 - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - meanSea float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - prmsl (step, values) float32 ... (mean sea level pressure) - - --- Dataset 4 --- - Dimensions: (step: 36, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 01:00:00 02:00:00 ... 1 days 12:00:00 - surface float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - unknown (step, values) float32 ... (?) - - --- Dataset 5 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - surface float64 0.0 - valid_time (step) datetime64[ns] 2019-01-01 ... 2019-01-02T12:00:00 - Dimensions without coordinates: values - Data variables: - unknown (step, values) float32 ... (?) - prate (step, values) float32 ... (total precipitation rate) - -
- -#### Un-split File 2 `yyyymmddhhmm_u1096_ng_umqv_Wholesale2.grib` - -Full domain, 35 time steps and the following surface level parameters: - -| centre | paramId | shortName | units | name | -|--------|---------|-----------|---------|------------------------------------| -| egrr | | unknown | unknown | unknown | -| egrr | 3073 | lcc | % | Low cloud cover | -| egrr | 3074 | mcc | % | Medium cloud cover | -| egrr | 3075 | hcc | % | High cloud cover | -| egrr | | unknown | unknown | unknown | -| egrr | 228046 | hcct | m | Height of convective cloud top | -| egrr | 3073 | lcc | % | Low cloud cover | -| egrr | 260107 | cdcb | m | Cloud base | -| egrr | 3066 | sde | m | Snow depth | -| egrr | 260087 | dswrf | W m**-2 | Downward short-wave radiation flux | -| egrr | 260097 | dlwrf | W m**-2 | Downward long-wave radiation flux | -| egrr | | unknown | unknown | unknown | -| egrr | 3008 | h | m | Geometrical height | - -View via pasting the output of the following to this -[online table converter](https://tableconvert.com/json-to-markdown): - -```shell -$ grib_ls -n parameter -w stepRange=0 -j 201901010000_u1096_ng_umqv_Wholesale2.grib -``` - -A programmatic equivalent of these `grib_ls` listings, using the eccodes Python bindings, is sketched after the dataset listing below. - -When loading this file into xarray using *cfgrib*, it comes in 7 distinct -datasets. These datasets only contain 11 of the 13 parameters specified -above, with two of the 11 being unknown variables. - -
- Wholesale2 Datasets - - --- Dataset 1 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - atmosphere float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - unknown (step, values) float32 ... (?) - - --- Dataset 2 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - cloudBase float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - cdcb (step, values) float32 ... (convective cloud base height) - - --- Dataset 3 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 12:00:00 - heightAboveGroundLayer float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - lcc (step, values) float32 ... (low cloud amount) - - --- Dataset 4 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 12:00:00 - heightAboveGroundLayer float64 1.524e+03 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - mcc (step, values) float32 ... (medium cloud amount) - - --- Dataset 5 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 12:00:00 - heightAboveGroundLayer float64 4.572e+03 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - hcc (step, values) float32 ... (high cloud amount) - - --- Dataset 6 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - surface float64 0.0 - valid_time (step) datetime64[ns] 2019-01-01 ... 2019-01-02T12:00:00 - Dimensions without coordinates: values - Data variables: - unknown (step, values) float32 ... - sde (step, values) float32 ... (snow depth water equivalent) - hcct (step, values) float32 ... (height of convective cloud top) - dswrf (step, values) float32 ... (downward short-wave radiation flux) - dlwrf (step, values) float32 ... (downward long-wave radiation flux) - - --- Dataset 7 --- - Dimensions: (step: 37, values: 385792) - Coordinates: - time datetime64[ns] 2019-01-01 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 1 days 12:00:00 - level float64 0.0 - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: values - Data variables: - h (step, values) float32 ... (geometrical height) - -
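The `grib_ls` inspections above can also be done programmatically. A minimal sketch, assuming the *python-eccodes* bindings are installed alongside the eccodes library and using the example Wholesale2 file named above, prints the same parameter columns for the step-0 messages:

```python
import eccodes

# Illustrative filename taken from the grib_ls example above; adjust as needed.
with open("201901010000_u1096_ng_umqv_Wholesale2.grib", "rb") as f:
    while True:
        gid = eccodes.codes_grib_new_from_file(f)
        if gid is None:
            # No more messages in the file
            break
        try:
            # Mirror `grib_ls -n parameter -w stepRange=0`
            if eccodes.codes_get(gid, "stepRange", ktype=str) == "0":
                print(
                    eccodes.codes_get(gid, "centre"),
                    eccodes.codes_get(gid, "paramId"),
                    eccodes.codes_get(gid, "shortName"),
                    eccodes.codes_get(gid, "units"),
                    eccodes.codes_get(gid, "name"),
                )
        finally:
            eccodes.codes_release(gid)
```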
- - -## Geography - - -The geography namespace of the files returns the following information: - -```shell -grib_ls -n geography -w shortName=t,stepRange=0 -j 201901010000_u1096_ng_umqv_Wholesale1.grib -``` - - -| Name | Value | -|------------------------------------|---------------------| -| Ni | 548 | -| Nj | 704 | -| latitudeOfReferencePointInDegrees | 4.9e-05 | -| longitudeOfReferencePointInDegrees | -2e-06 | -| m | 0 | -| XRInMetres | 400000 | -| YRInMetres | -100000 | -| iScansNegatively | 0 | -| jScansPositively | 1 | -| jPointsAreConsecutive | 0 | -| DiInMetres | 2000 | -| DjInMetres | 2000 | -| X1InGridLengths | -238000 | -| Y1InGridLengths | 1.222e+06 | -| X2InGridLengths | 856000 | -| Y2InGridLengths | -184000 | -| gridType | transverse_mercator | -| bitmapPresent | 1 | -| bitmap | 255... | - diff --git a/src/nwp_consumer/internal/inputs/ceda/__init__.py b/src/nwp_consumer/internal/inputs/ceda/__init__.py deleted file mode 100644 index 74f4c648..00000000 --- a/src/nwp_consumer/internal/inputs/ceda/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ['Client'] - -from .client import Client diff --git a/src/nwp_consumer/internal/inputs/ceda/_models.py b/src/nwp_consumer/internal/inputs/ceda/_models.py deleted file mode 100644 index 86d8f7df..00000000 --- a/src/nwp_consumer/internal/inputs/ceda/_models.py +++ /dev/null @@ -1,58 +0,0 @@ -import datetime as dt -from typing import ClassVar - -from marshmallow import EXCLUDE, Schema, fields -from marshmallow_dataclass import dataclass - -import nwp_consumer.internal as internal - - -@dataclass -class CEDAFileInfo(internal.FileInfoModel): - """Schema of the items section of the response from the CEDA API.""" - - class Meta: - unknown = EXCLUDE - - name: str - - Schema: ClassVar[type[Schema]] = Schema # To prevent confusing type checkers - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class. - - The init time is found the first part of the file name for CEDA files, - e.g. 
202201010000_u1096_ng_umqv_Wholesale1.grib - """ - return dt.datetime.strptime(self.name.split("_")[0], "%Y%m%d%H%M").replace( - tzinfo=dt.UTC, - ) - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self.name - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"badc/ukmo-nwp/data/ukv-grib/{self.it():%Y/%m/%d}/{self.name}" - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() - - -@dataclass -class CEDAResponse: - """Schema of the response from the CEDA API.""" - - class Meta: - unknown = EXCLUDE - - path: str - items: list[CEDAFileInfo] = fields.List(fields.Nested(CEDAFileInfo.Schema())) - - Schema: ClassVar[type[Schema]] = Schema # To prevent confusing type checkers diff --git a/src/nwp_consumer/internal/inputs/ceda/client.py b/src/nwp_consumer/internal/inputs/ceda/client.py deleted file mode 100644 index d190953a..00000000 --- a/src/nwp_consumer/internal/inputs/ceda/client.py +++ /dev/null @@ -1,327 +0,0 @@ -"""Client adapting CEDA API to internal Fetcher port.""" - -import datetime as dt -import pathlib -import typing -import urllib.parse -import urllib.request - -import cfgrib -import numpy as np -import requests -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._models import CEDAFileInfo, CEDAResponse - -log = structlog.getLogger() - -# Defines parameters in CEDA that are not available from MetOffice -PARAMETER_IGNORE_LIST: typing.Sequence[str] = ( - "unknown", - "h", - "hcct", - "cdcb", - "dpt", - "prmsl", - "cbh", -) - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "x", "y") - -# Defines the mapping from CEDA parameter names to OCF parameter names - - - -class Client(internal.FetcherInterface): - """Implements a client to fetch data from CEDA.""" - - # CEDA FTP Username - __username: str - # CEDA FTP Password - __password: str - # FTP url for CEDA data - __ftpBase: str - - def __init__(self, ftpUsername: str, ftpPassword: str) -> None: - """Create a new CEDAClient. - - Exposes a client for CEDA's FTP server that conforms to the FetcherInterface. - - Args: - ftpUsername: The username to use to connect to the CEDA FTP server. - ftpPassword: The password to use to connect to the CEDA FTP server. - """ - self.__username: str = urllib.parse.quote(ftpUsername) - self.__password: str = urllib.parse.quote(ftpPassword) - self.__ftpBase: str = f"ftp://{self.__username}:{self.__password}@ftp.ceda.ac.uk" - - def datasetName(self) -> str: - """Overrides corresponding parent method.""" - return "UKV" - - def getInitHours(self) -> list[int]: - """Overrides corresponding parent method.""" - return [0, 3, 6, 9, 12, 15, 18, 21] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: - """Overrides corresponding parent method.""" - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - # Fetch info for all files available on the input date - # * CEDA has a HTTPS JSON API for this purpose - response: requests.Response = requests.request( - method="GET", - url=f"https://data.ceda.ac.uk/badc/ukmo-nwp/data/ukv-grib/{it:%Y/%m/%d}?json", - ) - - if response.status_code == 404: - # No data available for this init time. 
Fail soft - log.warn( - event="no data available for init time", - init_time=f"{it:%Y/%m/%d %H:%M}", - url=response.url, - ) - return [] - if not response.ok: - # Something else has gone wrong. Fail hard - log.warn( - event="error response from filelist endpoint", - url=response.url, - response=response.json(), - ) - return [] - - # Map the response to a CEDAResponse object to ensure it looks as expected - try: - responseObj: CEDAResponse = CEDAResponse.Schema().load(response.json()) - except Exception as e: - log.warn( - event="response from ceda does not match expected schema", - error=e, - response=response.json(), - ) - return [] - - # Filter the files for the desired init time - wantedFiles: list[CEDAFileInfo] = [ - fileInfo for fileInfo in responseObj.items if _isWantedFile(fi=fileInfo, dit=it) - ] - - return wantedFiles - - def downloadToCache( - self, *, fi: internal.FileInfoModel, - ) -> pathlib.Path: - """Overrides corresponding parent method.""" - if self.__password == "" or self.__username == "": - log.error(event="all ceda credentials not provided") - return pathlib.Path() - - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - url: str = f"{self.__ftpBase}/{fi.filepath()}" - try: - response = urllib.request.urlopen(url=url) - except Exception as e: - log.warn( - event="error calling url for file", - url=fi.filepath(), - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - # Stream the filedata into a cached file - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with cfp.open("wb") as f: - for chunk in iter(lambda: response.read(16 * 1024), b""): - f.write(chunk) - f.flush() - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - """Overrides corresponding parent method.""" - if p.suffix != ".grib": - log.warn(event="cannot map non-grib file to dataset", filepath=p.as_posix()) - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Check the file has the right name - if not any(setname in p.name.lower() for setname in [ - "wholesale1.grib", "wholesale2.grib", "wholesale1t54.grib", "wholesale2t54.grib", - ]): - log.debug( - event="skipping file as it does not match expected name", - filepath=p.as_posix(), - ) - return xr.Dataset() - - # Load the wholesale file as a list of datasets - # * cfgrib loads multiple hypercubes for a single multi-parameter grib file - # * Can also set backend_kwargs={"indexpath": ""}, to avoid the index file - try: - datasets: list[xr.Dataset] = cfgrib.open_datasets( - path=p.as_posix(), - chunks={"time": 1, "step": -1, "variable": -1, "x": "auto", "y": "auto"}, - backend_kwargs={"indexpath": ""}, - ) - except Exception as e: - log.warn(event="error converting raw file to dataset", filepath=p.as_posix(), error=e) - return xr.Dataset() - - for i, ds in enumerate(datasets): - # Ensure the temperature is defined at 1 meter above ground level - # * In the early NWPs (definitely in the 2016-03-22 NWPs): - # - `heightAboveGround` only has one entry ("1" meter above ground) - # - `heightAboveGround` isn't set as a dimension for `t`. - # * In later NWPs, 'heightAboveGround' has 2 values (0, 1) and is a dimension for `t`. 
- if "t" in ds and "heightAboveGround" in ds["t"].dims: - ds = ds.sel(heightAboveGround=1) - - # Snow depth is in `m` from CEDA, but OCF expects `kg m-2`. - # * A scaling factor of 1000 converts between the two. - # * See "Snow Depth" entry in https://gridded-data-ui.cda.api.metoffice.gov.uk/glossary - if "sde" in ds: - ds = ds.assign(sde=ds["sde"] * 1000) - - # Delete unnecessary data variables - for var_name in PARAMETER_IGNORE_LIST: - if var_name in ds: - del ds[var_name] - - # Delete unwanted coordinates - ds = ds.drop_vars( - names=[c for c in ds.coords if c not in COORDINATE_ALLOW_LIST], - errors="ignore", - ) - - # Put the modified dataset back in the list - datasets[i] = ds - - # Merge the datasets back into one - wholesaleDataset = xr.merge( - objects=datasets, - compat="override", - combine_attrs="drop_conflicts", - ) - - del datasets - - # Add in x and y coordinates - try: - wholesaleDataset = _reshapeTo2DGrid(ds=wholesaleDataset) - except Exception as e: - log.warn(event="error reshaping to 2D grid", filepath=p.as_posix(), error=e) - return xr.Dataset() - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - wholesaleDataset = ( - wholesaleDataset.rename({"time": "init_time"}) - .expand_dims("init_time") - .transpose("init_time", "step", "y", "x") - .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - "y": len(wholesaleDataset.y) // 2, - "x": len(wholesaleDataset.x) // 2, - }, - ) - ) - return wholesaleDataset - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides corresponding parent method.""" - return { - "10wdir": internal.OCFParameter.WindDirectionFromWhichBlowingSurfaceAdjustedAGL, - "10si": internal.OCFParameter.WindSpeedSurfaceAdjustedAGL, - "prate": internal.OCFParameter.RainPrecipitationRate, - "r": internal.OCFParameter.RelativeHumidityAGL, - "t": internal.OCFParameter.TemperatureAGL, - "vis": internal.OCFParameter.VisibilityAGL, - "dswrf": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "dlwrf": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "hcc": internal.OCFParameter.HighCloudCover, - "mcc": internal.OCFParameter.MediumCloudCover, - "lcc": internal.OCFParameter.LowCloudCover, - "sde": internal.OCFParameter.SnowDepthWaterEquivalent, - } - - -def _isWantedFile(*, fi: CEDAFileInfo, dit: dt.datetime) -> bool: - """Check if the input FileInfo corresponds to a wanted GRIB file. - - :param fi: The File Info object describing the file to check - :param dit: The desired init time - """ - if fi.it().date() != dit.date() or fi.it().time() != dit.time(): - return False - # False if item doesn't correspond to Wholesale1 or Wholesale2 files up to 54 time steps - if not any(setname in fi.filename() for setname in ["Wholesale1.grib", "Wholesale2.grib", "Wholesale1T54.grib", "Wholesale2T54.grib"]): - return False - - return True - - -def _reshapeTo2DGrid(*, ds: xr.Dataset) -> xr.Dataset: - """Convert 1D into 2D array. - - In the grib files, the pixel values are in a flat 1D array (indexed by the `values` dimension). - The ordering of the pixels in the grib are left to right, bottom to top. - - This function replaces the `values` dimension with an `x` and `y` dimension, - and, for each step, reshapes the images to be 2D. 
- - :param ds: The dataset to reshape - """ - # Adapted from https://stackoverflow.com/a/62667154 and - # https://github.com/SciTools/iris-grib/issues/140#issuecomment-1398634288 - - # Define geographical domain for UKV. Taken from page 4 of https://zenodo.org/record/7357056 - dx = dy = 2000 - maxY = 1223000 - minY = -185000 - minX = -239000 - maxX = 857000 - # * Note that the UKV NWPs y is top-to-bottom, hence step is negative. - northing = np.arange(start=maxY, stop=minY, step=-dy, dtype=np.int32) - easting = np.arange(start=minX, stop=maxX, step=dx, dtype=np.int32) - - if ds.sizes["values"] != len(northing) * len(easting): - raise ValueError( - f"dataset has {ds.sizes['values']} values, " - f"but expected {len(northing) * len(easting)}", - ) - - # Create new coordinates, - # which give the `x` and `y` position for each position in the `values` dimension: - ds = ds.assign_coords( - { - "x": ("values", np.tile(easting, reps=len(northing))), - "y": ("values", np.repeat(northing, repeats=len(easting))), - }, - ) - - # Now set `values` to be a MultiIndex, indexed by `y` and `x`: - ds = ds.set_index(values=("y", "x")) - - # Now unstack. This gets rid of the `values` dimension and indexes - # the data variables using `y` and `x`. - return ds.unstack("values") diff --git a/src/nwp_consumer/internal/inputs/ceda/test_client.py b/src/nwp_consumer/internal/inputs/ceda/test_client.py deleted file mode 100644 index 503b6378..00000000 --- a/src/nwp_consumer/internal/inputs/ceda/test_client.py +++ /dev/null @@ -1,132 +0,0 @@ -import datetime as dt -import pathlib -import unittest.mock - -import numpy as np -import xarray as xr - -from ._models import CEDAFileInfo -from .client import ( - Client, - _isWantedFile, - _reshapeTo2DGrid, -) - -# --------- Test setup --------- # - -testClient = Client(ftpPassword="", ftpUsername="") - - -# --------- Client methods --------- # - -class TestClient_ListRawFilesForInitTime(unittest.TestCase): - - def test_listsFilesCorrectly(self) -> None: - pass - - -class TestClient_FetchRawFileBytes(unittest.TestCase): - - def test_fetchesFileCorrectly(self) -> None: - pass - - -class TestClient_MapCachedRaw(unittest.TestCase): - - def test_convertsWholesale1FileCorrectly(self) -> None: - wholesalePath: pathlib.Path = pathlib.Path(__file__).parent / "test_wholesale1.grib" - - out = testClient.mapCachedRaw(p=wholesalePath) - - # Ensure the dimensions have the right sizes - self.assertDictEqual( - {"init_time": 1, "step": 4, "y": 704, "x": 548}, - dict(out.sizes.items()), - ) - # Ensure the correct variables are in the variable dimension - self.assertCountEqual( - ["prate", "r", "si10", "t", "vis", "wdir10"], - list(out.data_vars.keys()), - ) - - @unittest.skip("Broken on github ci") - def test_convertsWholesale2FileCorrectly(self) -> None: - wholesalePath: pathlib.Path = pathlib.Path(__file__).parent / "test_wholesale2.grib" - - out = testClient.mapCachedRaw(p=wholesalePath) - - # Ensure the dimensions have the right sizes - self.assertDictEqual( - {"init_time": 1, "step": 4, "y": 704, "x": 548}, - dict(out.sizes.items()), - ) - # Ensure the correct variables are in the variable dimension - self.assertCountEqual( - ["dlwrf", "dswrf", "hcc", "lcc", "mcc", "sde"], - list(out.data_vars.keys()), - ) - -# --------- Static methods --------- # - -class TestIsWantedFile(unittest.TestCase): - - def test_correctlyFiltersCEDAFileInfos(self) -> None: - initTime: dt.datetime = dt.datetime( - year=2021, month=1, day=1, hour=0, minute=0, tzinfo=dt.timezone.utc, - ) - - wantedFileInfos: 
list[CEDAFileInfo] = [ - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale1.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale2.grib"), - ] - - unwantedFileInfos: list[CEDAFileInfo] = [ - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale1T54.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale2T54.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale3.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale3T54.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale4.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale5.grib"), - CEDAFileInfo(name="202101010000_u1096_ng_umqv_Wholesale5T54.grib"), - CEDAFileInfo(name="202101010300_u1096_ng_umqv_Wholesale1T120.grib"), - CEDAFileInfo(name="202101010300_u1096_ng_umqv_Wholesale1.grib"), - ] - - self.assertTrue( - all(_isWantedFile(fi=fo, dit=initTime) for fo in wantedFileInfos)) - self.assertFalse( - all(_isWantedFile(fi=fo, dit=initTime) for fo in unwantedFileInfos)) - - -class TestReshapeTo2DGrid(unittest.TestCase): - - def test_correctlyReshapesData(self) -> None: - dataset = xr.Dataset( - data_vars={ - "wdir10": (("step", "values"), np.random.rand(4, 385792)), - }, - coords={ - "step": [0, 1, 2, 3], - }, - ) - - reshapedDataset = _reshapeTo2DGrid(ds=dataset) - - self.assertEqual(548, reshapedDataset.dims["x"]) - self.assertEqual(704, reshapedDataset.dims["y"]) - - with self.assertRaises(KeyError): - _ = reshapedDataset["values"] - - def test_raisesErrorForIncorrectNumberOfValues(self) -> None: - ds1 = xr.Dataset( - data_vars={ - "wdir10": (("step", "values"), [[1, 2, 3, 4], [5, 6, 7, 8]]), - }, - coords={ - "step": [0, 1], - }, - ) - - with self.assertRaises(ValueError): - _ = _reshapeTo2DGrid(ds=ds1) diff --git a/src/nwp_consumer/internal/inputs/ceda/test_wholesale1.grib b/src/nwp_consumer/internal/inputs/ceda/test_wholesale1.grib deleted file mode 100644 index 82c48e73..00000000 Binary files a/src/nwp_consumer/internal/inputs/ceda/test_wholesale1.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/ceda/test_wholesale2.grib b/src/nwp_consumer/internal/inputs/ceda/test_wholesale2.grib deleted file mode 100644 index 1c8d302f..00000000 Binary files a/src/nwp_consumer/internal/inputs/ceda/test_wholesale2.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_CAPE_SFC_0_latlon.15x.15_2023080900_P027.grib2 b/src/nwp_consumer/internal/inputs/cmc/CMC_glb_CAPE_SFC_0_latlon.15x.15_2023080900_P027.grib2 deleted file mode 100644 index 6950eae6..00000000 Binary files a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_CAPE_SFC_0_latlon.15x.15_2023080900_P027.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_TMP_TGL_2_latlon.15x.15_2023080900_P027.grib2 b/src/nwp_consumer/internal/inputs/cmc/CMC_glb_TMP_TGL_2_latlon.15x.15_2023080900_P027.grib2 deleted file mode 100644 index 6548a192..00000000 Binary files a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_TMP_TGL_2_latlon.15x.15_2023080900_P027.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_VGRD_ISBL_200_latlon.15x.15_2023080900_P027.grib2 b/src/nwp_consumer/internal/inputs/cmc/CMC_glb_VGRD_ISBL_200_latlon.15x.15_2023080900_P027.grib2 deleted file mode 100644 index b7e53b91..00000000 Binary files a/src/nwp_consumer/internal/inputs/cmc/CMC_glb_VGRD_ISBL_200_latlon.15x.15_2023080900_P027.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/cmc/__init__.py 
b/src/nwp_consumer/internal/inputs/cmc/__init__.py deleted file mode 100644 index 5d97b9a1..00000000 --- a/src/nwp_consumer/internal/inputs/cmc/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -__all__ = ["Client"] - -from .client import Client - diff --git a/src/nwp_consumer/internal/inputs/cmc/_consts.py b/src/nwp_consumer/internal/inputs/cmc/_consts.py deleted file mode 100644 index aba6595f..00000000 --- a/src/nwp_consumer/internal/inputs/cmc/_consts.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Defines all parameters available from GDPS.""" - - -GDPS_VARIABLES = [ - "ALBDO", - "ABSV", - "CWAT", - "TSOIL", - "SOILVIC", - "SOILM", - "SFCWRO", - "CAPE", - "CIN", - "ACPCP", - "DLWRF", - "DSWRF", - "HGT", - "FPRATE", - "IPRATE", - "PCPNTYPE", - "LHTFL", - "NLWRS", - "NSWRS", - "PRATE", - "PRES", - "RH", - "SKINT", - "SDEN", - "SNOD", - "SPRATE", - "SPFH", - "TMP", - "TCDC", - "APCP", - "ULWRF", - "VVEL", - "GUST", - "UGRD", - "VGRD", -] - -GEPS_VARIABLES = [ - "CAPE", - "CIN", - "HGT", - "ICETK", - "PRES", - "PRMSL", - "PWAT", - "RH", - "SCWRO", - "SNOD", - "SPFH", - "TCDC", - "TMP", - "TSOIL", - "UGRD", - "VGRD", - "WEASD", - "WIND", - "VVEL" - ] diff --git a/src/nwp_consumer/internal/inputs/cmc/_models.py b/src/nwp_consumer/internal/inputs/cmc/_models.py deleted file mode 100644 index fa414c8b..00000000 --- a/src/nwp_consumer/internal/inputs/cmc/_models.py +++ /dev/null @@ -1,37 +0,0 @@ -import datetime as dt - -from nwp_consumer import internal - - -class CMCFileInfo(internal.FileInfoModel): - def __init__( - self, - it: dt.datetime, - filename: str, - currentURL: str, - step: int, - ) -> None: - self._it = it - self._filename = filename - self._url = currentURL - self.step = step - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._filename - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._url + "/" + self._filename - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self._it - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return [self.step] - - def variables(self) -> list: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() diff --git a/src/nwp_consumer/internal/inputs/cmc/client.py b/src/nwp_consumer/internal/inputs/cmc/client.py deleted file mode 100644 index 4e6bf1f8..00000000 --- a/src/nwp_consumer/internal/inputs/cmc/client.py +++ /dev/null @@ -1,332 +0,0 @@ -"""Implements a client to fetch GDPS/GEPS data from CMC.""" -import datetime as dt -import pathlib -import re -import typing -import urllib.request - -import requests -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._consts import GDPS_VARIABLES, GEPS_VARIABLES -from ._models import CMCFileInfo -from ... import OCFParameter - -log = structlog.getLogger() - - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "latitude", "longitude") - - -class Client(internal.FetcherInterface): - """Implements a client to fetch GDPS/GEPS data from CMC.""" - - baseurl: str # The base URL for the GDPS/GEPS model - model: str # The model to fetch data for - parameters: list[str] # The parameters to fetch - - def __init__(self, model: str, hours: int = 48, param_group: str = "default") -> None: - """Create a new GDPS Client. - - Exposes a client for GDPS and GEPS data from Canada CMC that conforms to the FetcherInterface. 
- - Args: - model: The model to fetch data for. Valid models are "gdps" and "geps". - param_group: The set of parameters to fetch. - Valid groups are "default", "full", and "basic". - """ - self.baseurl = "https://dd.weather.gc.ca" - - match model: - case "gdps": - self.baseurl += "/model_gem_global/15km/grib2/lat_lon/" - case "geps": - self.baseurl += "/ensemble/geps/grib2/raw/" - case _: - raise ValueError( - f"unknown GDPS/GEPS model {model}. Valid models are 'gdps' and 'geps'", - ) - - match (param_group, model): - case ("default", _): - self.parameters = ["t", "tclc", "dswrf", "dlwrf", "snod", "rh", "u", "v"] - case ("full", "geps"): - self.parameters = GEPS_VARIABLES - case ("full", "gdps"): - self.parameters = GDPS_VARIABLES - case ("basic", "geps"): - self.parameters = GEPS_VARIABLES[:2] - case ("basic", "gdps"): - self.parameters = GDPS_VARIABLES[:2] - case (_, _): - raise ValueError( - f"unknown parameter group {param_group}." - "Valid groups are 'default', 'full', 'basic'", - ) - - self.model = model - self.hours = hours - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"CMC_{self.model}".upper() - - def getInitHours(self) -> list[int]: # noqa: D102 - return [0, 12] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - # GDPS data is only available for today's and yesterday's date. - # If data hasn't been uploaded for that inittime yet, - # then yesterday's data will still be present on the server. - if it.date() != dt.datetime.now(dt.UTC).date(): - raise ValueError("GDPS/GEPS data is only available on today's date") - - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - files: list[internal.FileInfoModel] = [] - - # Files are split per parameter, level, and step, with a webpage per parameter - # * The webpage contains a list of files for the parameter - # * Find these files for each parameter and add them to the list - for param in self.parameters: - # The list of files for the parameter - parameterFiles: list[internal.FileInfoModel] = [] - - # Fetch CMC webpage detailing the available files for the timestep - response = requests.get(f"{self.baseurl}/{it.strftime('%H')}/000/", timeout=3) - - if response.status_code != 200: - log.warn( - event="error fetching filelisting webpage for parameter", - status=response.status_code, - url=response.url, - param=param, - inittime=it.strftime("%Y-%m-%d %H:%M"), - ) - continue - - # The webpage's HTML contains a list of tags - # * Each tag has a href, most of which point to a file) - for line in response.text.splitlines(): - # Check if the line contains a href, if not, skip it - refmatch = re.search(pattern=r'href="(.+)">', string=line) - if refmatch is None: - continue - - # The href contains the name of a file - parse this into a FileInfo object - fi: CMCFileInfo | None = None - # If downloading all variables, match all files - # * Otherwise only match single level and time invariant - fi = _parseCMCFilename( - name=refmatch.groups()[0], - baseurl=self.baseurl, - match_pl=self.parameters in ["t", "tclc", "dswrf", "dlwrf", "snod", "rh", "u", "v"], - match_hl=self.parameters in ["t", "tclc", "dswrf", "dlwrf", "snod", "rh", "u", "v"], - ) - # Ignore the file if it is not for today's date or has a step > 48 (when conforming) - if fi is None or fi.it() != it or (fi.step > self.hours and self.conform): - continue - - # Add the file to the list - parameterFiles.append(fi) - - 
log.debug( - event="listed files for parameter", - param=param, - inittime=it.strftime("%Y-%m-%d %H:%M"), - url=response.url, - numfiles=len(parameterFiles), - ) - - # Add the files for the parameter to the list of all files - files.extend(parameterFiles) - - return files - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: # noqa: D102 - if p.suffix != ".grib2": - log.warn( - event="cannot map non-grib file to dataset", - filepath=p.as_posix(), - ) - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the raw file as a dataset - try: - ds = xr.open_dataset( - p.as_posix(), - engine="cfgrib", - chunks={ - "time": 1, - "step": 1, - "latitude": "auto", - "longitude": "auto", - }, - ) - except Exception as e: - log.warn( - event="error converting raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return xr.Dataset() - # Rename variable to the value, as some have unknown as the name - if next(iter(ds.data_vars.keys())) == "unknown": - ds = ds.rename({"unknown": str(p.name).split("_")[2].lower()}) - - # Rename variables that are both pressure level and surface - if "surface" in list(ds.coords): - ds = ds.rename({"surface": "heightAboveGround"}) - - if "heightAboveGround" in list(ds.coords) and next(iter(ds.data_vars.keys())) in [ - "q", - "t", - "u", - "v", - ]: - # Rename data variable to add _surface to it so merging works later - ds = ds.rename( - {next(iter(ds.data_vars.keys())): f"{next(iter(ds.data_vars.keys()))}_surface"}, - ) - - if "isobaricInhPa" in list(ds.coords): - if "rh" in list(ds.data_vars.keys()): - ds = ds.rename({"isobaricInhPa": "isobaricInhPa_humidity"}) - if "absv" in list(ds.data_vars.keys()) or "vvel" in list(ds.data_vars.keys()): - ds = ds.rename({"isobaricInhPa": "isobaricInhPa_absv_vvel"}) - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - ds = ( - ds.rename({"time": "init_time"}) - .expand_dims("init_time") - .expand_dims("step") - .transpose("init_time", "step", ...) 
- .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - }, - ) - ) - - return ds - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - try: - response = urllib.request.urlopen(fi.filepath()) - except Exception as e: - log.warn( - event="error calling url for file", - url=fi.filepath(), - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - if response.status != 200: - log.warn( - event="error downloading file", - status=response.status, - url=fi.filepath(), - filename=fi.filename(), - ) - return pathlib.Path() - - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with open(cfp, "wb") as f: - f.write(response.read()) - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def parameterConformMap(self) -> dict[str, OCFParameter]: - """Overrides the corresponding method in the parent class.""" - # See https://eccc-msc.github.io/open-data/msc-data/nwp_gdps/readme_gdps-datamart_en/ - # for a list of CMC parameters - return { - "t": internal.OCFParameter.TemperatureAGL, - "tclc": internal.OCFParameter.TotalCloudCover, - "dswrf": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "dlwrf": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "snod": internal.OCFParameter.SnowDepthWaterEquivalent, - "rh": internal.OCFParameter.RelativeHumidityAGL, - "u": internal.OCFParameter.WindUComponentAGL, - "v": internal.OCFParameter.WindVComponentAGL, - } - - - -def _parseCMCFilename( - name: str, - baseurl: str, - match_sl: bool = True, - match_hl: bool = False, - match_pl: bool = False, -) -> CMCFileInfo | None: - """Parse a string of HTML into an CMCFileInfo object, if it contains one. - - Args: - name: The name of the file to parse - baseurl: The base URL for the GDPS model - match_sl: Whether to match single-level files - match_hl: Whether to match Height Above Ground-level files - match_pl: Whether to match pressure-level files - """ - # TODO: @Jacob even fixed, these do not match a lot of the files in the store, is that on purpose? 
# noqa: E501 - # Define the regex patterns to match the different types of file - # * Single Level GDPS: `CMC___SFC_0_latlon_YYYYMMDD_PLLL.grib2` - # * Sinle Level GEPS: `CMC_geps-raw_CIN_SFC_0_latlon0p5x0p5_2024011800_P000_allmbrs.grib2` - slRegex = r"CMC_[a-z-]{3,8}_([A-Za-z_\d]+)_SFC_0_latlon[\S]{7}_(\d{10})_P(\d{3})[\S]*.grib" - # * HeightAboveGround GDPS: `CMC_glb_ISBL_TGL_40_latlon.15x.15_2023080900_P027.grib2` - # * HeightAboveGround GEPS: `CMC_geps-raw_SPFH_TGL_2_latlon0p5x0p5_2023080900_P027_allmbrs.grib2` # noqa: E501 - hlRegex = r"CMC_[a-z-]{3,8}_([A-Za-z_\d]+)_TGL_(\d{1,4})_latlon[\S]{7}_(\d{10})_P(\d{3})[\S]*.grib" # noqa: E501 - # * Pressure Level GDPS: `CMC_glb_TMP_ISBL_500_latlon.15x.15_2023080900_P027.grib2` - # * Pressure Level GEPS: `CMC_geps-raw_TMP_ISBL_500_latlon0p5x0p5_2023080900_P027_allmbrs.grib2` - plRegex = r"CMC_[a-z-]{3,8}_([A-Za-z_\d]+)_ISBL_(\d{1,4})_latlon[\S]{7}_(\d{10})_P(\d{3})[\S]*.grib" # noqa: E501 - - itstring = paramstring = "" - stepstring = "000" - # Try to match the href to one of the regex patterns - slmatch = re.search(pattern=slRegex, string=name) - hlmatch = re.search(pattern=hlRegex, string=name) - plmatch = re.search(pattern=plRegex, string=name) - - if slmatch and match_sl: - paramstring, itstring, stepstring = slmatch.groups() - elif hlmatch and match_hl: - paramstring, levelstring, itstring, stepstring = hlmatch.groups() - elif plmatch and match_pl: - paramstring, levelstring, itstring, stepstring = plmatch.groups() - else: - return None - - it = dt.datetime.strptime(itstring, "%Y%m%d%H").replace(tzinfo=dt.UTC) - - return CMCFileInfo( - it=it, - filename=name, - currentURL=f"{baseurl}/{it.strftime('%H')}/{stepstring}/", - step=int(stepstring), - ) diff --git a/src/nwp_consumer/internal/inputs/cmc/test_client.py b/src/nwp_consumer/internal/inputs/cmc/test_client.py deleted file mode 100644 index edc7c990..00000000 --- a/src/nwp_consumer/internal/inputs/cmc/test_client.py +++ /dev/null @@ -1,78 +0,0 @@ -import datetime as dt -import pathlib -import unittest -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from ._models import CMCFileInfo - -from .client import Client, _parseCMCFilename - -testClient = Client(model="gdps") - - -class TestClient(unittest.TestCase): - def test_mapCachedRaw(self) -> None: - # Test with global file - testFilePath: pathlib.Path = ( - pathlib.Path(__file__).parent / "CMC_glb_VGRD_ISBL_200_latlon.15x.15_2023080900_P027.grib2" - ) - out = testClient.mapCachedRaw(p=testFilePath) - - # Check latitude and longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - self.assertEqual(len(out["latitude"].values), 1201) - self.assertEqual(len(out["longitude"].values), 2400) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - ("init_time", "step", "latitude", "longitude"), - ) - - # Test with europe file - testFilePath: pathlib.Path = ( - pathlib.Path(__file__).parent / "CMC_glb_CAPE_SFC_0_latlon.15x.15_2023080900_P027.grib2" - ) - out = testClient.mapCachedRaw(p=testFilePath) - - # Check latitude and longitude are present - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - self.assertEqual(len(out["latitude"].values), 1201) - self.assertEqual(len(out["longitude"].values), 2400) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - ("init_time", "step", "latitude", 
"longitude"), - ) - - - -class TestParseCMCFilename(unittest.TestCase): - baseurl = "https://dd.weather.gc.ca/model_gem_global/15km/grib2/lat_lon/" - - def test_parses(self) -> None: - tests = { - "gdps-sl": "CMC_glb_CIN_SFC_0_latlon.15x.15_2023080900_P027.grib2", - "geps-sl": "CMC_geps-raw_CIN_SFC_0_latlon0p5x0p5_2023080900_P027_allmbrs.grib2", - "gdps-hl": "CMC_glb_SPFH_TGL_40_latlon.15x.15_2023080900_P027.grib2", - "geps-hl": "CMC_geps-raw_SPFH_TGL_80_latlon0p5x0p5_2023080900_P000_allmbrs.grib2", - "gdps-pl": "CMC_glb_TMP_ISBL_300_latlon.15x.15_2023080900_P000.grib2", - "geps-pl": "CMC_geps-raw_TMP_ISBL_0500_latlon0p5x0p5_2023080900_P000_allmbrs.grib2", - } - - for k, v in tests.items(): - with self.subTest(msg=k): - out: CMCFileInfo | None = _parseCMCFilename( - name=v, - baseurl=self.baseurl, - match_hl="hl" in k, - match_pl="pl" in k, - ) - if out is None: - self.fail(f"Failed to parse filename {v}") - self.assertEqual(out.filename(), v) - self.assertEqual(out.it(), dt.datetime(2023, 8, 9, 0, tzinfo=dt.UTC)) - - diff --git a/src/nwp_consumer/internal/inputs/ecmwf/README.md b/src/nwp_consumer/internal/inputs/ecmwf/README.md deleted file mode 100644 index f00bcf34..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# ECMWF API - -## Authentication - -The ECMWF API requires the setting of a few environment variables, -or an `.ecmwfapirc` file in the user's home directory. See the PyPi entry: -https://pypi.org/project/ecmwf-api-client/, or the ECMWFMARSConfig class -in `nwp_consumer/internal/config/config.py`. The variables are - -```shell -ECMWF_API_KEY= -ECMWF_API_EMAIL= -ECMWF_API_URL= -``` - -which can be accessed via visiting [https://api.ecmwf.int/v1/key/](https://api.ecmwf.int/v1/key/). - -## MARS - -View the glossary for ECMWF MARS variables available for the operational forecast: -https://codes.ecmwf.int/grib/param-db - -View the glossary for the MARS postprocessing keywords: -https://confluence.ecmwf.int/display/UDOC/Post-processing+keywords diff --git a/src/nwp_consumer/internal/inputs/ecmwf/__init__.py b/src/nwp_consumer/internal/inputs/ecmwf/__init__.py deleted file mode 100644 index 2e777948..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -__all__ = [ - "MARSClient", - "S3Client", -] - -from .mars import MARSClient -from .s3 import S3Client diff --git a/src/nwp_consumer/internal/inputs/ecmwf/_models.py b/src/nwp_consumer/internal/inputs/ecmwf/_models.py deleted file mode 100644 index 1ed2dbe0..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/_models.py +++ /dev/null @@ -1,77 +0,0 @@ -import datetime as dt -from dataclasses import dataclass - -import nwp_consumer.internal as internal - - -@dataclass -class ECMWFMarsFileInfo(internal.FileInfoModel): - inittime: dt.datetime - area: str - params: list[str] - steplist: list[int] - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - # ECMWF does not have explicit filenames when using the MARS API - # * As such, name manually based on their inittime and area covered - # e.g. 
`ecmwf_uk_20210101T0000.grib` - return f"ecmwf_{self.area}_{self.inittime.strftime('%Y%m%dT%H%M')}.grib" - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return "" - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self.inittime - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - return self.params - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return self.steplist - - -@dataclass -class ECMWFLiveFileInfo(internal.FileInfoModel): - """Dataclass for ECMWF live data files. - - Live ECMWF files are extensionless grib files named e.g. 'A1D02200000022001001'. - The files contain data for two areas. The names contain the following information - - A1D%m%d%H%M%m'%d'%H'%M'1, where the first time is the initialisation time - and the second the target time. - """ - - fname: str - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self.fname + ".grib" - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"ecmwf/{self.fname}" - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class. - - The file name doesn't have the year in it, so we've added it. - This might be a problem around the new year. - """ - return dt.datetime.strptime( - f"{self.fname[3:10]}-{dt.datetime.now().year}", "%m%d%H%M-%Y" - ).replace( - tzinfo=dt.UTC, - ) - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() diff --git a/src/nwp_consumer/internal/inputs/ecmwf/mars.py b/src/nwp_consumer/internal/inputs/ecmwf/mars.py deleted file mode 100644 index 565a104f..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/mars.py +++ /dev/null @@ -1,395 +0,0 @@ -"""Implements a client to fetch data from ECMWF.""" - -import datetime as dt -import inspect -import os -import pathlib -import re -import tempfile -import typing - -import cfgrib -import ecmwfapi.api -import structlog -import xarray as xr -from ecmwfapi import ECMWFService - -from nwp_consumer import internal - -from ._models import ECMWFMarsFileInfo - -log = structlog.getLogger() - -# Mapping from ECMWF eccode to ECMWF short name -# * https://codes.ecmwf.int/grib/param-db/?filter=All -PARAMETER_ECMWFCODE_MAP: dict[str, str] = { - "167.128": "tas", # 2 metre temperature - "165.128": "uas", # 10 metre U-component of wind - "166.128": "vas", # 10 metre V-component of wind - "47.128": "dsrp", # Direct solar radiation - "57.128": "uvb", # Downward uv radiation at surface - "188.128": "hcc", # High cloud cover - "187.128": "mcc", # Medium cloud cover - "186.128": "lcc", # Low cloud cover - "164.128": "clt", # Total cloud cover - "169.128": "ssrd", # Surface shortwave radiation downward - "175.128": "strd", # Surface longwave radiation downward - "260048": "tprate", # Total precipitation rate - "141.128": "sd", # Snow depth, m - "246.228": "u100", # 100 metre U component of wind - "247.228": "v100", # 100 metre V component of wind - "239.228": "u200", # 200 metre U component of wind - "240.228": "v200", # 200 metre V component of wind - "20.3": "vis", # Visibility -} - -AREA_MAP: dict[str, str] = { - "uk": "62/-12/48/3", - "nw-india": "31/68/20/79", - 
"india": "35/67/6/97", - "malta": "37/13/35/15", - "eu": "E", - "global": "G", -} - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "latitude", "longitude") - - -def marsLogger(msg: str) -> None: - """Redirect log from ECMWF API to structlog. - - Keyword Arguments: - ----------------- - msg: The message to redirect. - """ - debugSubstrings: list[str] = ["Requesting", "Transfering", "efficiency", "Done"] - errorSubstrings: list[str] = ["ERROR", "FATAL"] - if any(map(msg.__contains__, debugSubstrings)): - log.debug(event=msg, caller="mars") - if any(map(msg.__contains__, errorSubstrings)): - log.warning(event=msg, caller="mars") - - -class MARSClient(internal.FetcherInterface): - """Implements a client to fetch data from ECMWF's MARS API.""" - - server: ecmwfapi.api.ECMWFService - area: str - desired_params: list[str] - - def __init__( - self, - area: str = "uk", - hours: int = 48, - param_group: str = "default", - ) -> None: - """Create a new ECMWF Mars Client. - - Exposes a client for ECMWF's MARS API that conforms to the FetcherInterface. - - Args: - area: The area to fetch data for. Can be one of: - ["uk", "nw-india", "malta", "eu", "global"] - hours: The number of hours to fetch data for. Must be less than 90. - param_group: The parameter group to fetch data for. Can be one of: - ["default", "basic"] - """ - self.server = ECMWFService(service="mars", log=marsLogger) - - if area not in AREA_MAP: - raise KeyError(f"area must be one of {list(AREA_MAP.keys())}") - self.area = area - - self.hours = hours - - match param_group: - case "basic": - log.debug(event="Initialising ECMWF Client with basic parameter group") - self.desired_params = ["167.128", "169.128"] # 2 Metre Temperature, Dswrf - case _: - self.desired_params = list(PARAMETER_ECMWFCODE_MAP.keys()) - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"ECMWF_{self.area.upper()}" - - def getInitHours(self) -> list[int]: # noqa: D102 - # MARS data of the operational archive is available at 00 and 12 UTC - return [0, 12] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - # MARS requests can only ask for data that is more than 24 hours old: see - # https://confluence.ecmwf.int/display/UDOC/MARS+access+restrictions - if it > dt.datetime.now(tz=dt.UTC) - dt.timedelta(hours=24): - raise ValueError( - "ECMWF MARS requests can only ask for data that is more than 24 hours old", - ) - return [] - - tf = tempfile.NamedTemporaryFile(suffix=".txt", delete=False) - - with open(tf.name, "w") as f: - req: str = self._buildMarsRequest( - list_only=True, - it=it, - target=tf.name, - params=self.desired_params, - steps=list(range(0, self.hours)), - ) - - log.debug(event="listing ECMWF MARS inittime data", request=req, inittime=it) - - try: - self.server.execute(req=req, target=tf.name) - except ecmwfapi.api.APIException as e: - log.warn("error listing ECMWF MARS inittime data", error=e) - return [] - - # Explicitly check that the MARS listing file is readable and non-empty - if (os.access(tf.name, os.R_OK) is False) or (os.stat(tf.name).st_size < 100): - log.warn( - event="ECMWF filelisting is empty, check error logs", - filepath=tf.name, - ) - return [] - - # Ensure only available parameters are requested by populating the - # `available_params` list according to the result of the list request - with open(tf.name) 
as f: - file_contents: str = f.read() - available_data = _parseListing(fileData=file_contents) - for parameter in self.desired_params: - if parameter not in available_data["params"]: - log.warn( - event=f"ECMWF MARS inittime data does not contain parameter {parameter}", - parameter=parameter, - inittime=it, - ) - - log.debug( - event="Listed raw files for ECMWF MARS inittime", - inittime=it, - available_params=available_data["params"], - ) - - # Clean up the temporary file - tf.close() - os.unlink(tf.name) - - return [ - ECMWFMarsFileInfo( - inittime=it, - area=self.area, - params=available_data["params"], - steplist=available_data["steps"], - ), - ] - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - - req: str = self._buildMarsRequest( - list_only=False, - it=fi.it(), - target=cfp.as_posix(), - params=fi.variables(), - steps=fi.steps(), - ) - - log.debug( - event="fetching ECMWF MARS data", - request=req, - inittime=fi.it(), - filename=fi.filename(), - ) - - try: - self.server.execute(req=req, target=cfp.as_posix()) - except ecmwfapi.api.APIException as e: - log.warn("error fetching ECMWF MARS data", error=e) - return pathlib.Path() - - if cfp.exists() is False: - log.warn("ECMWF data file does not exist", filepath=cfp.as_posix()) - return pathlib.Path() - - log.debug( - event="fetched all data from MARS", - filename=fi.filename(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - """Overrides the corresponding method in the parent class.""" - if p.suffix != ".grib": - log.warn(event="cannot map non-grib file to dataset", filepath=p.as_posix()) - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the wholesale file as a list of datasets - # * cfgrib loads multiple hypercubes for a single multi-parameter grib file - # * Can also set backend_kwargs={"indexpath": ""}, to avoid the index file - try: - datasets: list[xr.Dataset] = cfgrib.open_datasets( - path=p.as_posix(), - chunks={ - "time": 1, - "step": -1, - "longitude": "auto", - "latitude": "auto", - }, - backend_kwargs={"indexpath": ""}, - ) - except Exception as e: - log.warn(event="error converting raw file to dataset", filepath=p.as_posix(), error=e) - return xr.Dataset() - - # Merge the datasets back into one - wholesaleDataset = xr.merge( - objects=datasets, - compat="override", - combine_attrs="drop_conflicts", - ) - del datasets - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - wholesaleDataset = ( - wholesaleDataset.rename({"time": "init_time"}) - .expand_dims("init_time") - .transpose("init_time", "step", "latitude", "longitude") - .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - "latitude": len(wholesaleDataset.latitude) // 2, - "longitude": len(wholesaleDataset.longitude) // 2, - }, - ) - ) - - return wholesaleDataset - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - return { - "tas": internal.OCFParameter.TemperatureAGL, - "t2m": internal.OCFParameter.TemperatureAGL, - "uas": internal.OCFParameter.WindUComponentAGL, - "vas": internal.OCFParameter.WindVComponentAGL, - "dsrp": internal.OCFParameter.DirectSolarRadiation, - 
"uvb": internal.OCFParameter.DownwardUVRadiationAtSurface, - "hcc": internal.OCFParameter.HighCloudCover, - "mcc": internal.OCFParameter.MediumCloudCover, - "lcc": internal.OCFParameter.LowCloudCover, - "clt": internal.OCFParameter.TotalCloudCover, - "ssrd": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "strd": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "tprate": internal.OCFParameter.RainPrecipitationRate, - "sd": internal.OCFParameter.SnowDepthWaterEquivalent, - "u100": internal.OCFParameter.WindUComponent100m, - "v100": internal.OCFParameter.WindVComponent100m, - "u200": internal.OCFParameter.WindUComponent200m, - "v200": internal.OCFParameter.WindVComponent200m, - "vis": internal.OCFParameter.VisibilityAGL, - } - - def _buildMarsRequest( - self, - *, - list_only: bool, - it: dt.datetime, - target: str, - params: list[str], - steps: list[int], - ) -> str: - """Build a MARS request according to the parameters of the client. - - Args: - list_only: Whether to build a request that only lists the files that match - the request, or whether to build a request that downloads the files - that match the request. - it: The initialisation time to request data for. - target: The path to the target file to write the data to. - params: The parameters to request data for. - steps: The steps to request data for. - - Returns: - The MARS request. - """ - marsReq: str = f""" - {"list" if list_only else "retrieve"}, - class = od, - date = {it.strftime("%Y%m%d")}, - expver = 1, - levtype = sfc, - param = {'/'.join(params)}, - step = {'/'.join(map(str, steps))}, - stream = oper, - time = {it.strftime("%H")}, - type = fc, - area = {AREA_MAP[self.area]}, - grid = 0.1/0.1, - target = "{target}" - """ - - return inspect.cleandoc(marsReq) - - -def _parseListing(fileData: str) -> dict[str, list[str] | list[int]]: - """Parse the response from a MARS list request. - - When calling LIST to MARS, the response is a file containing the available - parameters, steps, times and sizes etc. This function parses the file to - extract the available parameters. - - The files contains some metadata, followed by a table as follows: - - ``` - file length missing offset param step - 0 13204588 . 149401026 20.3 0 - 0 13204588 . 502365532 47.128 0 - 0 13204588 . 568388472 57.128 0 - 0 19804268 . 911707760 141.128 0 - 0 13204588 . 1050353320 164.128 0 - - Grand Total - ``` - - This function uses positive lookahead and lookbehind regex to extract the - lines between the table header and the "Grand Total" line. The fourth - column of each line is the parameter. The fifth is the step. - - Args: - fileData: The data from the file. - - Returns: - A dict of parameters and steps available in the remote file. - """ - tablematch = re.search( - pattern=r"(? 4: - out["steps"].add(int(line.split()[5])) - out["params"].add(line.split()[4]) - out = {k: sorted(list(v)) for k, v in out.items()} - return out diff --git a/src/nwp_consumer/internal/inputs/ecmwf/s3.py b/src/nwp_consumer/internal/inputs/ecmwf/s3.py deleted file mode 100644 index aefc08bf..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/s3.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Input covering an OCF-specific use case of pulling ECMWF data from an s3 bucket.""" - -import datetime as dt -import pathlib -import typing - -import cfgrib -import s3fs -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._models import ECMWFLiveFileInfo -from ... 
import OCFParameter - -log = structlog.getLogger() - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "latitude", "longitude") - - -class S3Client(internal.FetcherInterface): - """Implements a client to fetch ECMWF data from S3.""" - - area: str - desired_params: list[str] - bucket: pathlib.Path - - __fs: s3fs.S3FileSystem - - bucketPath: str = "ecmwf" - - def __init__( - self, - bucket: str, - region: str, - area: str = "uk", - key: str | None = "", - secret: str | None = "", - endpointURL: str = "", - ) -> None: - """Creates a new ECMWF S3 client. - - Exposes a client for fetching ECMWF data from an S3 bucket conforming to the - FetcherInterface. ECMWF S3 data is order-based, so parameters and steps cannot be - requested by this client. - - Args: - bucket: The name of the S3 bucket to fetch data from. - region: The AWS region to connect to. - key: The AWS access key to use for authentication. - secret: The AWS secret key to use for authentication. - area: The area for which to fetch data. - endpointURL: The endpoint URL to use for the S3 connection. - """ - if (key, secret) == ("", ""): - log.info( - event="attempting AWS connection using default credentials", - ) - key, secret = None, None - - self.__fs: s3fs.S3FileSystem = s3fs.S3FileSystem( - key=key, - secret=secret, - client_kwargs={ - "region_name": region, - "endpoint_url": None if endpointURL == "" else endpointURL, - }, - ) - self.area = area - self.bucket = pathlib.Path(bucket) - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"ECMWF_{self.area}".upper() - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: - """Overrides the corresponding method in the parent class.""" - allFiles: list[str] = self.__fs.ls((self.bucket / self.bucketPath).as_posix()) - # List items are of the form "bucket/folder/filename, so extract just the filename - initTimeFiles: list[internal.FileInfoModel] = [ - ECMWFLiveFileInfo(fname=pathlib.Path(file).name) - for file in allFiles - if it.strftime("A2D%m%d%H") in file - ] - return initTimeFiles - - def downloadToCache( - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - """Overrides the corresponding method in the parent class.""" - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with open(cfp, "wb") as f, self.__fs.open( - (self.bucket / fi.filepath()).as_posix(), - "rb", - ) as s: - for chunk in iter(lambda: s.read(12 * 1024), b""): - f.write(chunk) - f.flush() - - if not cfp.exists(): - log.warn(event="Failed to download file", filepath=fi.filepath()) - return pathlib.Path() - - # Check the sizes are the same - s3size = self.__fs.info((self.bucket / fi.filepath()).as_posix())["size"] - if cfp.stat().st_size != s3size: - log.warn( - event="Downloaded file size does not match expected size", - expected=s3size, - actual=cfp.stat().st_size, - ) - return pathlib.Path() - - return cfp - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - """Overrides the corresponding method in the parent class.""" - all_dss: list[xr.Dataset] = cfgrib.open_datasets(p.as_posix()) - area_dss: list[xr.Dataset] = _filterDatasetsByArea(all_dss, self.area) - if len(area_dss) == 0: - log.warn( - event="No datasets found for area", - area=self.area, - file=p, - file_datasets=len(all_dss), - ) - return xr.Dataset() - - ds: xr.Dataset = xr.merge(area_dss, combine_attrs="drop_conflicts") - del area_dss, all_dss - - ds = ds.drop_vars( - names=[v for v in ds.coords if v not 
in COORDINATE_ALLOW_LIST], - errors="ignore", - ) - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - ds = ( - ds.rename({"time": "init_time"}) - .expand_dims("init_time") - .expand_dims("step") - .transpose("init_time", "step", "latitude", "longitude") - .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - "latitude": len(ds.latitude) // 2, - "longitude": len(ds.longitude) // 2, - }, - ) - ) - - return ds - - def getInitHours(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return [0, 6, 12, 18] - - def parameterConformMap(self) -> dict[str, OCFParameter]: - """Overrides the corresponding method in the parent class.""" - return { - "dsrp": internal.OCFParameter.DirectSolarRadiation, - "uvb": internal.OCFParameter.DownwardUVRadiationAtSurface, - "sd": internal.OCFParameter.SnowDepthWaterEquivalent, - "tcc": internal.OCFParameter.TotalCloudCover, - "clt": internal.OCFParameter.TotalCloudCover, - "u10": internal.OCFParameter.WindUComponentAGL, - "v10": internal.OCFParameter.WindVComponentAGL, - "t2m": internal.OCFParameter.TemperatureAGL, - "ssrd": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "strd": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "lcc": internal.OCFParameter.LowCloudCover, - "mcc": internal.OCFParameter.MediumCloudCover, - "hcc": internal.OCFParameter.HighCloudCover, - "vis": internal.OCFParameter.VisibilityAGL, - "u200": internal.OCFParameter.WindUComponent200m, - "v200": internal.OCFParameter.WindVComponent200m, - "u100": internal.OCFParameter.WindUComponent100m, - "v100": internal.OCFParameter.WindVComponent100m, - "tprate": internal.OCFParameter.RainPrecipitationRate, - } - - -def _filterDatasetsByArea(dss: list[xr.Dataset], area: str) -> list[xr.Dataset]: - """Filters a list of datasets by area.""" - match area: - case "uk": - return list(filter(lambda ds: ds.coords["latitude"].as_numpy().max() == 60, dss)) - case "nw-india": - return list(filter(lambda ds: ds.coords["latitude"].as_numpy().max() == 31, dss)) - case "india": - return list(filter(lambda ds: ds.coords["latitude"].as_numpy().max() == 35, dss)) - case _: - log.warn(event="Unknown area", area=area) - return [] diff --git a/src/nwp_consumer/internal/inputs/ecmwf/test_2params.grib b/src/nwp_consumer/internal/inputs/ecmwf/test_2params.grib deleted file mode 100644 index 7ddaab6e..00000000 Binary files a/src/nwp_consumer/internal/inputs/ecmwf/test_2params.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/ecmwf/test_list_response.txt b/src/nwp_consumer/internal/inputs/ecmwf/test_list_response.txt deleted file mode 100644 index 0fa26e8c..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/test_list_response.txt +++ /dev/null @@ -1,753 +0,0 @@ -class = od -date = 2017-09-11 -expver = 1 -file[0] = hpss:/mars/prod/od/o/oper/fc/sfc/marsodoper/0001/fc/20170911/sfc/1200/879664.20170927.205633 -id = 879664 -levtype = sfc -month = 201709 -stream = oper -time = 12:00:00 -type = fc -year = 2017 -file length missing offset param step -0 13204588 . 149401026 20.3 0 -0 13204588 . 502365532 47.128 0 -0 13204588 . 568388472 57.128 0 -0 19804268 . 911707760 141.128 0 -0 13204588 . 1050353320 164.128 0 -0 13204588 . 1063557908 165.128 0 -0 13204588 . 1076762496 166.128 0 -0 13204588 . 1089967084 167.128 0 -0 13204588 . 1116376260 169.128 0 -0 13204588 . 1155990024 175.128 0 -0 13204588 . 
1274831316 186.128 0 -0 13204588 . 1288035904 187.128 0 -0 13204588 . 1301240492 188.128 0 -0 13204588 . 1822819104 246.228 0 -0 13204588 . 1836023692 247.228 0 -0 13204588 . 2059730628 20.3 1 -0 13204588 . 2373076942 47.128 1 -0 13204588 . 2439099882 57.128 1 -0 19804268 . 2742805406 141.128 1 -0 13204588 . 2881450966 164.128 1 -0 13204588 . 2894655554 165.128 1 -0 13204588 . 2907860142 166.128 1 -0 13204588 . 2921064730 167.128 1 -0 13204588 . 2947473906 169.128 1 -0 13204588 . 2987087670 175.128 1 -0 13204588 . 3105928962 186.128 1 -0 13204588 . 3119133550 187.128 1 -0 13204588 . 3132338138 188.128 1 -0 13204588 . 3601098398 246.228 1 -0 13204588 . 3614302986 247.228 1 -0 13204588 . 3838001328 20.3 2 -0 13204588 . 4151280934 47.128 2 -0 13204588 . 4217303874 57.128 2 -0 19804268 . 4521009398 141.128 2 -0 13204588 . 4659654958 164.128 2 -0 13204588 . 4672859546 165.128 2 -0 13204588 . 4686064134 166.128 2 -0 13204588 . 4699268722 167.128 2 -0 13204588 . 4725677898 169.128 2 -0 13204588 . 4765291662 175.128 2 -0 13204588 . 4884132954 186.128 2 -0 13204588 . 4897337542 187.128 2 -0 13204588 . 4910542130 188.128 2 -0 13204588 . 5379302390 246.228 2 -0 13204588 . 5392506978 247.228 2 -0 13204588 . 5616173368 20.3 3 -0 13204588 . 5968973866 47.128 3 -0 13204588 . 6034996806 57.128 3 -0 19804268 . 6338702330 141.128 3 -0 13204588 . 6477347890 164.128 3 -0 13204588 . 6490552478 165.128 3 -0 13204588 . 6503757066 166.128 3 -0 13204588 . 6516961654 167.128 3 -0 13204588 . 6543370830 169.128 3 -0 13204588 . 6582984594 175.128 3 -0 13204588 . 6701825886 186.128 3 -0 13204588 . 6715030474 187.128 3 -0 13204588 . 6728235062 188.128 3 -0 13204588 . 7223404498 246.228 3 -0 13204588 . 7236609086 247.228 3 -0 13204588 . 7460219078 20.3 4 -0 13204588 . 7773355806 47.128 4 -0 13204588 . 7839378746 57.128 4 -0 19804268 . 8143084270 141.128 4 -0 13204588 . 8281729830 164.128 4 -0 13204588 . 8294934418 165.128 4 -0 13204588 . 8308139006 166.128 4 -0 13204588 . 8321343594 167.128 4 -0 13204588 . 8347752770 169.128 4 -0 13204588 . 8387366534 175.128 4 -0 13204588 . 8506207826 186.128 4 -0 13204588 . 8519412414 187.128 4 -0 13204588 . 8532617002 188.128 4 -0 13204588 . 9001377262 246.228 4 -0 13204588 . 9014581850 247.228 4 -0 13204588 . 9238129714 20.3 5 -0 13204588 . 9551226076 47.128 5 -0 13204588 . 9617249016 57.128 5 -0 19804268 . 9920954540 141.128 5 -0 13204588 . 10059600100 164.128 5 -0 13204588 . 10072804688 165.128 5 -0 13204588 . 10086009276 166.128 5 -0 13204588 . 10099213864 167.128 5 -0 13204588 . 10125623040 169.128 5 -0 13204588 . 10165236804 175.128 5 -0 13204588 . 10284078096 186.128 5 -0 13204588 . 10297282684 187.128 5 -0 13204588 . 10310487272 188.128 5 -0 13204588 . 10779247532 246.228 5 -0 13204588 . 10792452120 247.228 5 -0 13204588 . 11015964730 20.3 6 -0 13204588 . 11368638958 47.128 6 -0 13204588 . 11434661898 57.128 6 -0 19804268 . 11777981186 141.128 6 -0 13204588 . 11916626746 164.128 6 -0 13204588 . 11929831334 165.128 6 -0 13204588 . 11943035922 166.128 6 -0 13204588 . 11956240510 167.128 6 -0 13204588 . 11982649686 169.128 6 -0 13204588 . 12022263450 175.128 6 -0 13204588 . 12141104742 186.128 6 -0 13204588 . 12154309330 187.128 6 -0 13204588 . 12167513918 188.128 6 -0 13204588 . 12689092530 246.228 6 -0 13204588 . 12702297118 247.228 6 -0 13204588 . 12925782934 20.3 7 -0 13204588 . 13238809550 47.128 7 -0 13204588 . 13304832490 57.128 7 -0 19804268 . 13608538014 141.128 7 -0 13204588 . 13747183574 164.128 7 -0 13204588 . 13760388162 165.128 7 -0 13204588 . 
13773592750 166.128 7 -0 13204588 . 13786797338 167.128 7 -0 13204588 . 13813206514 169.128 7 -0 13204588 . 13852820278 175.128 7 -0 13204588 . 13971661570 186.128 7 -0 13204588 . 13984866158 187.128 7 -0 13204588 . 13998070746 188.128 7 -0 13204588 . 14466831006 246.228 7 -0 13204588 . 14480035594 247.228 7 -0 13204588 . 14703501378 20.3 8 -0 13204588 . 15016493576 47.128 8 -0 13204588 . 15082516516 57.128 8 -0 19804268 . 15386222040 141.128 8 -0 13204588 . 15524867600 164.128 8 -0 13204588 . 15538072188 165.128 8 -0 13204588 . 15551276776 166.128 8 -0 13204588 . 15564481364 167.128 8 -0 13204588 . 15590890540 169.128 8 -0 13204588 . 15630504304 175.128 8 -0 13204588 . 15749345596 186.128 8 -0 13204588 . 15762550184 187.128 8 -0 13204588 . 15775754772 188.128 8 -0 13204588 . 16244515032 246.228 8 -0 13204588 . 16257719620 247.228 8 -0 13204588 . 16481180904 20.3 9 -0 13204588 . 16833774492 47.128 9 -0 13204588 . 16899797432 57.128 9 -0 19804268 . 17203502956 141.128 9 -0 13204588 . 17342148516 164.128 9 -0 13204588 . 17355353104 165.128 9 -0 13204588 . 17368557692 166.128 9 -0 13204588 . 17381762280 167.128 9 -0 13204588 . 17408171456 169.128 9 -0 13204588 . 17447785220 175.128 9 -0 13204588 . 17566626512 186.128 9 -0 13204588 . 17579831100 187.128 9 -0 13204588 . 17593035688 188.128 9 -0 13204588 . 18088205124 246.228 9 -0 13204588 . 18101409712 247.228 9 -0 13204588 . 18324865646 20.3 10 -0 13204588 . 18637850684 47.128 10 -0 13204588 . 18703873624 57.128 10 -0 19804268 . 19007579148 141.128 10 -0 13204588 . 19146224708 164.128 10 -0 13204588 . 19159429296 165.128 10 -0 13204588 . 19172633884 166.128 10 -0 13204588 . 19185838472 167.128 10 -0 13204588 . 19212247648 169.128 10 -0 13204588 . 19251861412 175.128 10 -0 13204588 . 19370702704 186.128 10 -0 13204588 . 19383907292 187.128 10 -0 13204588 . 19397111880 188.128 10 -0 13204588 . 19865872140 246.228 10 -0 13204588 . 19879076728 247.228 10 -0 13204588 . 20102545664 20.3 11 -0 13204588 . 20415549110 47.128 11 -0 13204588 . 20481572050 57.128 11 -0 19804268 . 20785277574 141.128 11 -0 13204588 . 20923923134 164.128 11 -0 13204588 . 20937127722 165.128 11 -0 13204588 . 20950332310 166.128 11 -0 13204588 . 20963536898 167.128 11 -0 13204588 . 20989946074 169.128 11 -0 13204588 . 21029559838 175.128 11 -0 13204588 . 21148401130 186.128 11 -0 13204588 . 21161605718 187.128 11 -0 13204588 . 21174810306 188.128 11 -0 13204588 . 21643570566 246.228 11 -0 13204588 . 21656775154 247.228 11 -0 13204588 . 21880262776 20.3 12 -0 13204588 . 22232906434 47.128 12 -0 13204588 . 22298929374 57.128 12 -0 19804268 . 22642248662 141.128 12 -0 13204588 . 22780894222 164.128 12 -0 13204588 . 22794098810 165.128 12 -0 13204588 . 22807303398 166.128 12 -0 13204588 . 22820507986 167.128 12 -0 13204588 . 22846917162 169.128 12 -0 13204588 . 22886530926 175.128 12 -0 13204588 . 23005372218 186.128 12 -0 13204588 . 23018576806 187.128 12 -0 13204588 . 23031781394 188.128 12 -0 13204588 . 23553360006 246.228 12 -0 13204588 . 23566564594 247.228 12 -0 13204588 . 23790080002 20.3 13 -0 13204588 . 24103127738 47.128 13 -0 13204588 . 24169150678 57.128 13 -0 19804268 . 24472856202 141.128 13 -0 13204588 . 24611501762 164.128 13 -0 13204588 . 24624706350 165.128 13 -0 13204588 . 24637910938 166.128 13 -0 13204588 . 24651115526 167.128 13 -0 13204588 . 24677524702 169.128 13 -0 13204588 . 24717138466 175.128 13 -0 13204588 . 24835979758 186.128 13 -0 13204588 . 24849184346 187.128 13 -0 13204588 . 24862388934 188.128 13 -0 13204588 . 
25331149194 246.228 13 -0 13204588 . 25344353782 247.228 13 -0 13204588 . 25567894082 20.3 14 -0 13204588 . 25880963710 47.128 14 -0 13204588 . 25946986650 57.128 14 -0 19804268 . 26250692174 141.128 14 -0 13204588 . 26389337734 164.128 14 -0 13204588 . 26402542322 165.128 14 -0 13204588 . 26415746910 166.128 14 -0 13204588 . 26428951498 167.128 14 -0 13204588 . 26455360674 169.128 14 -0 13204588 . 26494974438 175.128 14 -0 13204588 . 26613815730 186.128 14 -0 13204588 . 26627020318 187.128 14 -0 13204588 . 26640224906 188.128 14 -0 13204588 . 27108985166 246.228 14 -0 13204588 . 27122189754 247.228 14 -0 13204588 . 27345740392 20.3 15 -0 13204588 . 27698449984 47.128 15 -0 13204588 . 27764472924 57.128 15 -0 19804268 . 28068178448 141.128 15 -0 13204588 . 28206824008 164.128 15 -0 13204588 . 28220028596 165.128 15 -0 13204588 . 28233233184 166.128 15 -0 13204588 . 28246437772 167.128 15 -0 13204588 . 28272846948 169.128 15 -0 13204588 . 28312460712 175.128 15 -0 13204588 . 28431302004 186.128 15 -0 13204588 . 28444506592 187.128 15 -0 13204588 . 28457711180 188.128 15 -0 13204588 . 28952880616 246.228 15 -0 13204588 . 28966085204 247.228 15 -0 13204588 . 29189649042 20.3 16 -0 13204588 . 29502773520 47.128 16 -0 13204588 . 29568796460 57.128 16 -0 19804268 . 29872501984 141.128 16 -0 13204588 . 30011147544 164.128 16 -0 13204588 . 30024352132 165.128 16 -0 13204588 . 30037556720 166.128 16 -0 13204588 . 30050761308 167.128 16 -0 13204588 . 30077170484 169.128 16 -0 13204588 . 30116784248 175.128 16 -0 13204588 . 30235625540 186.128 16 -0 13204588 . 30248830128 187.128 16 -0 13204588 . 30262034716 188.128 16 -0 13204588 . 30730794976 246.228 16 -0 13204588 . 30743999564 247.228 16 -0 13204588 . 30967569542 20.3 17 -0 13204588 . 31280699790 47.128 17 -0 13204588 . 31346722730 57.128 17 -0 19804268 . 31650428254 141.128 17 -0 13204588 . 31789073814 164.128 17 -0 13204588 . 31802278402 165.128 17 -0 13204588 . 31815482990 166.128 17 -0 13204588 . 31828687578 167.128 17 -0 13204588 . 31855096754 169.128 17 -0 13204588 . 31894710518 175.128 17 -0 13204588 . 32013551810 186.128 17 -0 13204588 . 32026756398 187.128 17 -0 13204588 . 32039960986 188.128 17 -0 13204588 . 32508721246 246.228 17 -0 13204588 . 32521925834 247.228 17 -0 13204588 . 32745476662 20.3 18 -0 13204588 . 33098200614 47.128 18 -0 13204588 . 33164223554 57.128 18 -0 19804268 . 33507542842 141.128 18 -0 13204588 . 33646188402 164.128 18 -0 13204588 . 33659392990 165.128 18 -0 13204588 . 33672597578 166.128 18 -0 13204588 . 33685802166 167.128 18 -0 13204588 . 33712211342 169.128 18 -0 13204588 . 33751825106 175.128 18 -0 13204588 . 33870666398 186.128 18 -0 13204588 . 33883870986 187.128 18 -0 13204588 . 33897075574 188.128 18 -0 13204588 . 34418654186 246.228 18 -0 13204588 . 34431858774 247.228 18 -0 13204588 . 34655395052 20.3 19 -0 13204588 . 34968483438 47.128 19 -0 13204588 . 35034506378 57.128 19 -0 19804268 . 35338211902 141.128 19 -0 13204588 . 35476857462 164.128 19 -0 13204588 . 35490062050 165.128 19 -0 13204588 . 35503266638 166.128 19 -0 13204588 . 35516471226 167.128 19 -0 13204588 . 35542880402 169.128 19 -0 13204588 . 35582494166 175.128 19 -0 13204588 . 35701335458 186.128 19 -0 13204588 . 35714540046 187.128 19 -0 13204588 . 35727744634 188.128 19 -0 13204588 . 36196504894 246.228 19 -0 13204588 . 36209709482 247.228 19 -0 13204588 . 36433237374 20.3 20 -0 13204588 . 36746346620 47.128 20 -0 13204588 . 36812369560 57.128 20 -0 19804268 . 37116075084 141.128 20 -0 13204588 . 37254720644 164.128 20 -0 13204588 . 
37267925232 165.128 20 -0 13204588 . 37281129820 166.128 20 -0 13204588 . 37294334408 167.128 20 -0 13204588 . 37320743584 169.128 20 -0 13204588 . 37360357348 175.128 20 -0 13204588 . 37479198640 186.128 20 -0 13204588 . 37492403228 187.128 20 -0 13204588 . 37505607816 188.128 20 -0 13204588 . 37974368076 246.228 20 -0 13204588 . 37987572664 247.228 20 -0 13204588 . 38211115288 20.3 21 -0 13204588 . 38563846038 47.128 21 -0 13204588 . 38629868978 57.128 21 -0 19804268 . 38933574502 141.128 21 -0 13204588 . 39072220062 164.128 21 -0 13204588 . 39085424650 165.128 21 -0 13204588 . 39098629238 166.128 21 -0 13204588 . 39111833826 167.128 21 -0 13204588 . 39138243002 169.128 21 -0 13204588 . 39177856766 175.128 21 -0 13204588 . 39296698058 186.128 21 -0 13204588 . 39309902646 187.128 21 -0 13204588 . 39323107234 188.128 21 -0 13204588 . 39818276670 246.228 21 -0 13204588 . 39831481258 247.228 21 -0 13204588 . 40055020832 20.3 22 -0 13204588 . 40368123314 47.128 22 -0 13204588 . 40434146254 57.128 22 -0 19804268 . 40737851778 141.128 22 -0 13204588 . 40876497338 164.128 22 -0 13204588 . 40889701926 165.128 22 -0 13204588 . 40902906514 166.128 22 -0 13204588 . 40916111102 167.128 22 -0 13204588 . 40942520278 169.128 22 -0 13204588 . 40982134042 175.128 22 -0 13204588 . 41100975334 186.128 22 -0 13204588 . 41114179922 187.128 22 -0 13204588 . 41127384510 188.128 22 -0 13204588 . 41596144770 246.228 22 -0 13204588 . 41609349358 247.228 22 -0 13204588 . 41832865678 20.3 23 -0 13204588 . 42145953858 47.128 23 -0 13204588 . 42211976798 57.128 23 -0 19804268 . 42515682322 141.128 23 -0 13204588 . 42654327882 164.128 23 -0 13204588 . 42667532470 165.128 23 -0 13204588 . 42680737058 166.128 23 -0 13204588 . 42693941646 167.128 23 -0 13204588 . 42720350822 169.128 23 -0 13204588 . 42759964586 175.128 23 -0 13204588 . 42878805878 186.128 23 -0 13204588 . 42892010466 187.128 23 -0 13204588 . 42905215054 188.128 23 -0 13204588 . 43373975314 246.228 23 -0 13204588 . 43387179902 247.228 23 -0 13204588 . 43610686714 20.3 24 -0 13204588 . 43963368300 47.128 24 -0 13204588 . 44029391240 57.128 24 -0 19804268 . 44372710528 141.128 24 -0 13204588 . 44511356088 164.128 24 -0 13204588 . 44524560676 165.128 24 -0 13204588 . 44537765264 166.128 24 -0 13204588 . 44550969852 167.128 24 -0 13204588 . 44577379028 169.128 24 -0 13204588 . 44616992792 175.128 24 -0 13204588 . 44735834084 186.128 24 -0 13204588 . 44749038672 187.128 24 -0 13204588 . 44762243260 188.128 24 -0 13204588 . 45283821872 246.228 24 -0 13204588 . 45297026460 247.228 24 -0 13204588 . 45520524088 20.3 25 -0 13204588 . 45833552868 47.128 25 -0 13204588 . 45899575808 57.128 25 -0 19804268 . 46203281332 141.128 25 -0 13204588 . 46341926892 164.128 25 -0 13204588 . 46355131480 165.128 25 -0 13204588 . 46368336068 166.128 25 -0 13204588 . 46381540656 167.128 25 -0 13204588 . 46407949832 169.128 25 -0 13204588 . 46447563596 175.128 25 -0 13204588 . 46566404888 186.128 25 -0 13204588 . 46579609476 187.128 25 -0 13204588 . 46592814064 188.128 25 -0 13204588 . 47061574324 246.228 25 -0 13204588 . 47074778912 247.228 25 -0 13204588 . 47298259896 20.3 26 -0 13204588 . 47611250486 47.128 26 -0 13204588 . 47677273426 57.128 26 -0 19804268 . 47980978950 141.128 26 -0 13204588 . 48119624510 164.128 26 -0 13204588 . 48132829098 165.128 26 -0 13204588 . 48146033686 166.128 26 -0 13204588 . 48159238274 167.128 26 -0 13204588 . 48185647450 169.128 26 -0 13204588 . 48225261214 175.128 26 -0 13204588 . 48344102506 186.128 26 -0 13204588 . 
48357307094 187.128 26 -0 13204588 . 48370511682 188.128 26 -0 13204588 . 48839271942 246.228 26 -0 13204588 . 48852476530 247.228 26 -0 13204588 . 49075946602 20.3 27 -0 13204588 . 49428498458 47.128 27 -0 13204588 . 49494521398 57.128 27 -0 19804268 . 49798226922 141.128 27 -0 13204588 . 49936872482 164.128 27 -0 13204588 . 49950077070 165.128 27 -0 13204588 . 49963281658 166.128 27 -0 13204588 . 49976486246 167.128 27 -0 13204588 . 50002895422 169.128 27 -0 13204588 . 50042509186 175.128 27 -0 13204588 . 50161350478 186.128 27 -0 13204588 . 50174555066 187.128 27 -0 13204588 . 50187759654 188.128 27 -0 13204588 . 50682929090 246.228 27 -0 13204588 . 50696133678 247.228 27 -0 13204588 . 50919591620 20.3 28 -0 13204588 . 51232499532 47.128 28 -0 13204588 . 51298522472 57.128 28 -0 19804268 . 51602227996 141.128 28 -0 13204588 . 51740873556 164.128 28 -0 13204588 . 51754078144 165.128 28 -0 13204588 . 51767282732 166.128 28 -0 13204588 . 51780487320 167.128 28 -0 13204588 . 51806896496 169.128 28 -0 13204588 . 51846510260 175.128 28 -0 13204588 . 51965351552 186.128 28 -0 13204588 . 51978556140 187.128 28 -0 13204588 . 51991760728 188.128 28 -0 13204588 . 52460520988 246.228 28 -0 13204588 . 52473725576 247.228 28 -0 13204588 . 52697157692 20.3 29 -0 13204588 . 53010049064 47.128 29 -0 13204588 . 53076072004 57.128 29 -0 19804268 . 53379777528 141.128 29 -0 13204588 . 53518423088 164.128 29 -0 13204588 . 53531627676 165.128 29 -0 13204588 . 53544832264 166.128 29 -0 13204588 . 53558036852 167.128 29 -0 13204588 . 53584446028 169.128 29 -0 13204588 . 53624059792 175.128 29 -0 13204588 . 53742901084 186.128 29 -0 13204588 . 53756105672 187.128 29 -0 13204588 . 53769310260 188.128 29 -0 13204588 . 54238070520 246.228 29 -0 13204588 . 54251275108 247.228 29 -0 13204588 . 54474679350 20.3 30 -0 13204588 . 54827191178 47.128 30 -0 13204588 . 54893214118 57.128 30 -0 19804268 . 55236533406 141.128 30 -0 13204588 . 55375178966 164.128 30 -0 13204588 . 55388383554 165.128 30 -0 13204588 . 55401588142 166.128 30 -0 13204588 . 55414792730 167.128 30 -0 13204588 . 55441201906 169.128 30 -0 13204588 . 55480815670 175.128 30 -0 13204588 . 55599656962 186.128 30 -0 13204588 . 55612861550 187.128 30 -0 13204588 . 55626066138 188.128 30 -0 13204588 . 56147644750 246.228 30 -0 13204588 . 56160849338 247.228 30 -0 13204588 . 56384230992 20.3 31 -0 13204588 . 56697120466 47.128 31 -0 13204588 . 56763143406 57.128 31 -0 19804268 . 57066848930 141.128 31 -0 13204588 . 57205494490 164.128 31 -0 13204588 . 57218699078 165.128 31 -0 13204588 . 57231903666 166.128 31 -0 13204588 . 57245108254 167.128 31 -0 13204588 . 57271517430 169.128 31 -0 13204588 . 57311131194 175.128 31 -0 13204588 . 57429972486 186.128 31 -0 13204588 . 57443177074 187.128 31 -0 13204588 . 57456381662 188.128 31 -0 13204588 . 57925141922 246.228 31 -0 13204588 . 57938346510 247.228 31 -0 13204588 . 58161715180 20.3 32 -0 13204588 . 58474580614 47.128 32 -0 13204588 . 58540603554 57.128 32 -0 19804268 . 58844309078 141.128 32 -0 13204588 . 58982954638 164.128 32 -0 13204588 . 58996159226 165.128 32 -0 13204588 . 59009363814 166.128 32 -0 13204588 . 59022568402 167.128 32 -0 13204588 . 59048977578 169.128 32 -0 13204588 . 59088591342 175.128 32 -0 13204588 . 59207432634 186.128 32 -0 13204588 . 59220637222 187.128 32 -0 13204588 . 59233841810 188.128 32 -0 13204588 . 59702602070 246.228 32 -0 13204588 . 59715806658 247.228 32 -0 13204588 . 59939173910 20.3 33 -0 13204588 . 60291640332 47.128 33 -0 13204588 . 60357663272 57.128 33 -0 19804268 . 
60661368796 141.128 33 -0 13204588 . 60800014356 164.128 33 -0 13204588 . 60813218944 165.128 33 -0 13204588 . 60826423532 166.128 33 -0 13204588 . 60839628120 167.128 33 -0 13204588 . 60866037296 169.128 33 -0 13204588 . 60905651060 175.128 33 -0 13204588 . 61024492352 186.128 33 -0 13204588 . 61037696940 187.128 33 -0 13204588 . 61050901528 188.128 33 -0 13204588 . 61546070964 246.228 33 -0 13204588 . 61559275552 247.228 33 -0 13204588 . 61782632774 20.3 34 -0 13204588 . 62095478808 47.128 34 -0 13204588 . 62161501748 57.128 34 -0 19804268 . 62465207272 141.128 34 -0 13204588 . 62603852832 164.128 34 -0 13204588 . 62617057420 165.128 34 -0 13204588 . 62630262008 166.128 34 -0 13204588 . 62643466596 167.128 34 -0 13204588 . 62669875772 169.128 34 -0 13204588 . 62709489536 175.128 34 -0 13204588 . 62828330828 186.128 34 -0 13204588 . 62841535416 187.128 34 -0 13204588 . 62854740004 188.128 34 -0 13204588 . 63323500264 246.228 34 -0 13204588 . 63336704852 247.228 34 -0 13204588 . 63560060114 20.3 35 -0 13204588 . 63872912524 47.128 35 -0 13204588 . 63938935464 57.128 35 -0 19804268 . 64242640988 141.128 35 -0 13204588 . 64381286548 164.128 35 -0 13204588 . 64394491136 165.128 35 -0 13204588 . 64407695724 166.128 35 -0 13204588 . 64420900312 167.128 35 -0 13204588 . 64447309488 169.128 35 -0 13204588 . 64486923252 175.128 35 -0 13204588 . 64605764544 186.128 35 -0 13204588 . 64618969132 187.128 35 -0 13204588 . 64632173720 188.128 35 -0 13204588 . 65100933980 246.228 35 -0 13204588 . 65114138568 247.228 35 -0 13204588 . 65337505120 20.3 36 -0 13204588 . 65689975490 47.128 36 -0 13204588 . 65755998430 57.128 36 -0 19804268 . 66099317718 141.128 36 -0 13204588 . 66237963278 164.128 36 -0 13204588 . 66251167866 165.128 36 -0 13204588 . 66264372454 166.128 36 -0 13204588 . 66277577042 167.128 36 -0 13204588 . 66303986218 169.128 36 -0 13204588 . 66343599982 175.128 36 -0 13204588 . 66462441274 186.128 36 -0 13204588 . 66475645862 187.128 36 -0 13204588 . 66488850450 188.128 36 -0 13204588 . 67010429062 246.228 36 -0 13204588 . 67023633650 247.228 36 -0 13204588 . 67247021976 20.3 37 -0 13204588 . 67559886024 47.128 37 -0 13204588 . 67625908964 57.128 37 -0 19804268 . 67929614488 141.128 37 -0 13204588 . 68068260048 164.128 37 -0 13204588 . 68081464636 165.128 37 -0 13204588 . 68094669224 166.128 37 -0 13204588 . 68107873812 167.128 37 -0 13204588 . 68134282988 169.128 37 -0 13204588 . 68173896752 175.128 37 -0 13204588 . 68292738044 186.128 37 -0 13204588 . 68305942632 187.128 37 -0 13204588 . 68319147220 188.128 37 -0 13204588 . 68787907480 246.228 37 -0 13204588 . 68801112068 247.228 37 -0 13204588 . 69024509532 20.3 38 -0 13204588 . 69337403932 47.128 38 -0 13204588 . 69403426872 57.128 38 -0 19804268 . 69707132396 141.128 38 -0 13204588 . 69845777956 164.128 38 -0 13204588 . 69858982544 165.128 38 -0 13204588 . 69872187132 166.128 38 -0 13204588 . 69885391720 167.128 38 -0 13204588 . 69911800896 169.128 38 -0 13204588 . 69951414660 175.128 38 -0 13204588 . 70070255952 186.128 38 -0 13204588 . 70083460540 187.128 38 -0 13204588 . 70096665128 188.128 38 -0 13204588 . 70565425388 246.228 38 -0 13204588 . 70578629976 247.228 38 -0 13204588 . 70802043166 20.3 39 -0 13204588 . 71154568210 47.128 39 -0 13204588 . 71220591150 57.128 39 -0 19804268 . 71524296674 141.128 39 -0 13204588 . 71662942234 164.128 39 -0 13204588 . 71676146822 165.128 39 -0 13204588 . 71689351410 166.128 39 -0 13204588 . 71702555998 167.128 39 -0 13204588 . 71728965174 169.128 39 -0 13204588 . 
71768578938 175.128 39 -0 13204588 . 71887420230 186.128 39 -0 13204588 . 71900624818 187.128 39 -0 13204588 . 71913829406 188.128 39 -0 13204588 . 72408998842 246.228 39 -0 13204588 . 72422203430 247.228 39 -0 13204588 . 72645638886 20.3 40 -0 13204588 . 72958547860 47.128 40 -0 13204588 . 73024570800 57.128 40 -0 19804268 . 73328276324 141.128 40 -0 13204588 . 73466921884 164.128 40 -0 13204588 . 73480126472 165.128 40 -0 13204588 . 73493331060 166.128 40 -0 13204588 . 73506535648 167.128 40 -0 13204588 . 73532944824 169.128 40 -0 13204588 . 73572558588 175.128 40 -0 13204588 . 73691399880 186.128 40 -0 13204588 . 73704604468 187.128 40 -0 13204588 . 73717809056 188.128 40 -0 13204588 . 74186569316 246.228 40 -0 13204588 . 74199773904 247.228 40 -0 13204588 . 74423238860 20.3 41 -0 13204588 . 74736120340 47.128 41 -0 13204588 . 74802143280 57.128 41 -0 19804268 . 75105848804 141.128 41 -0 13204588 . 75244494364 164.128 41 -0 13204588 . 75257698952 165.128 41 -0 13204588 . 75270903540 166.128 41 -0 13204588 . 75284108128 167.128 41 -0 13204588 . 75310517304 169.128 41 -0 13204588 . 75350131068 175.128 41 -0 13204588 . 75468972360 186.128 41 -0 13204588 . 75482176948 187.128 41 -0 13204588 . 75495381536 188.128 41 -0 13204588 . 75964141796 246.228 41 -0 13204588 . 75977346384 247.228 41 -0 13204588 . 76200835622 20.3 42 -0 13204588 . 76553294646 47.128 42 -0 13204588 . 76619317586 57.128 42 -0 19804268 . 76962636874 141.128 42 -0 13204588 . 77101282434 164.128 42 -0 13204588 . 77114487022 165.128 42 -0 13204588 . 77127691610 166.128 42 -0 13204588 . 77140896198 167.128 42 -0 13204588 . 77167305374 169.128 42 -0 13204588 . 77206919138 175.128 42 -0 13204588 . 77325760430 186.128 42 -0 13204588 . 77338965018 187.128 42 -0 13204588 . 77352169606 188.128 42 -0 13204588 . 77873748218 246.228 42 -0 13204588 . 77886952806 247.228 42 -0 13204588 . 78110453028 20.3 43 -0 13204588 . 78423282710 47.128 43 -0 13204588 . 78489305650 57.128 43 -0 19804268 . 78793011174 141.128 43 -0 13204588 . 78931656734 164.128 43 -0 13204588 . 78944861322 165.128 43 -0 13204588 . 78958065910 166.128 43 -0 13204588 . 78971270498 167.128 43 -0 13204588 . 78997679674 169.128 43 -0 13204588 . 79037293438 175.128 43 -0 13204588 . 79156134730 186.128 43 -0 13204588 . 79169339318 187.128 43 -0 13204588 . 79182543906 188.128 43 -0 13204588 . 79651304166 246.228 43 -0 13204588 . 79664508754 247.228 43 -0 13204588 . 79888021814 20.3 44 -0 13204588 . 80200843328 47.128 44 -0 13204588 . 80266866268 57.128 44 -0 19804268 . 80570571792 141.128 44 -0 13204588 . 80709217352 164.128 44 -0 13204588 . 80722421940 165.128 44 -0 13204588 . 80735626528 166.128 44 -0 13204588 . 80748831116 167.128 44 -0 13204588 . 80775240292 169.128 44 -0 13204588 . 80814854056 175.128 44 -0 13204588 . 80933695348 186.128 44 -0 13204588 . 80946899936 187.128 44 -0 13204588 . 80960104524 188.128 44 -0 13204588 . 81428864784 246.228 44 -0 13204588 . 81442069372 247.228 44 -0 13204588 . 81665595122 20.3 45 -0 13204588 . 82018034998 47.128 45 -0 13204588 . 82084057938 57.128 45 -0 19804268 . 82387763462 141.128 45 -0 13204588 . 82526409022 164.128 45 -0 13204588 . 82539613610 165.128 45 -0 13204588 . 82552818198 166.128 45 -0 13204588 . 82566022786 167.128 45 -0 13204588 . 82592431962 169.128 45 -0 13204588 . 82632045726 175.128 45 -0 13204588 . 82750887018 186.128 45 -0 13204588 . 82764091606 187.128 45 -0 13204588 . 82777296194 188.128 45 -0 13204588 . 83272465630 246.228 45 -0 13204588 . 83285670218 247.228 45 -0 13204588 . 
83509197478 20.3 46 -0 13204588 . 83822017948 47.128 46 -0 13204588 . 83888040888 57.128 46 -0 19804268 . 84191746412 141.128 46 -0 13204588 . 84330391972 164.128 46 -0 13204588 . 84343596560 165.128 46 -0 13204588 . 84356801148 166.128 46 -0 13204588 . 84370005736 167.128 46 -0 13204588 . 84396414912 169.128 46 -0 13204588 . 84436028676 175.128 46 -0 13204588 . 84554869968 186.128 46 -0 13204588 . 84568074556 187.128 46 -0 13204588 . 84581279144 188.128 46 -0 13204588 . 85050039404 246.228 46 -0 13204588 . 85063243992 247.228 46 -0 13204588 . 85286768454 20.3 47 -0 13204588 . 85599585576 47.128 47 -0 13204588 . 85665608516 57.128 47 -0 19804268 . 85969314040 141.128 47 -0 13204588 . 86107959600 164.128 47 -0 13204588 . 86121164188 165.128 47 -0 13204588 . 86134368776 166.128 47 -0 13204588 . 86147573364 167.128 47 -0 13204588 . 86173982540 169.128 47 -0 13204588 . 86213596304 175.128 47 -0 13204588 . 86332437596 186.128 47 -0 13204588 . 86345642184 187.128 47 -0 13204588 . 86358846772 188.128 47 -0 13204588 . 86827607032 246.228 47 -0 13204588 . 86840811620 247.228 47 -0 13204588 . 87064340310 20.3 48 -0 13204588 . 87416750940 47.128 48 -0 13204588 . 87482773880 57.128 48 -0 19804268 . 87826093168 141.128 48 -0 13204588 . 87964738728 164.128 48 -0 13204588 . 87977943316 165.128 48 -0 13204588 . 87991147904 166.128 48 -0 13204588 . 88004352492 167.128 48 -0 13204588 . 88030761668 169.128 48 -0 13204588 . 88070375432 175.128 48 -0 13204588 . 88189216724 186.128 48 -0 13204588 . 88202421312 187.128 48 -0 13204588 . 88215625900 188.128 48 -0 13204588 . 88737204512 246.228 48 -0 13204588 . 88750409100 247.228 48 - -Grand Total: -============ - -Entries : 735 -Total : 10,028,756,500 (9.34001 Gbytes) diff --git a/src/nwp_consumer/internal/inputs/ecmwf/test_mars.py b/src/nwp_consumer/internal/inputs/ecmwf/test_mars.py deleted file mode 100644 index bc260abb..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/test_mars.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Tests for the ecmwf module.""" - -import datetime as dt -import pathlib -import unittest.mock - -from .mars import PARAMETER_ECMWFCODE_MAP, MARSClient, _parseListing - -# --------- Test setup --------- # - -testMARSClient = MARSClient( - area="uk", - hours=48, -) - -test_list_response: str = """ -class = od -date = 2017-09-11 -expver = 1 -file[0] = hpss:/mars/prod/od/o/oper/fc/sfc/marsodoper/0001/fc/20170911/sfc/1200/879664.20170927.205633 -id = 879664 -levtype = sfc -month = 201709 -stream = oper -time = 12:00:00 -type = fc -year = 2017 -file length missing offset param step -0 13204588 . 1089967084 167.128 0 -0 13204588 . 1116376260 169.128 0 -0 13204588 . 2921064730 167.128 1 -0 13204588 . 2947473906 169.128 1 -0 13204588 . 4699268722 167.128 2 -0 13204588 . 4725677898 169.128 2 -0 13204588 . 6516961654 167.128 3 -0 13204588 . 
6543370830 169.128 3 - -Grand Total: -============ - -Entries : 8 -Total : 105,636,704 (100.743 Mbytes) -""" - - -# --------- Client methods --------- # - - -class TestECMWFMARSClient(unittest.TestCase): - """Tests for the ECMWFMARSClient method.""" - - def test_init(self) -> None: - with self.assertRaises(KeyError): - _ = MARSClient(area="not a valid area", hours=48) - - def test_mapCachedRaw(self) -> None: - testFilePath: pathlib.Path = pathlib.Path(__file__).parent / "test_2params.grib" - - out = testMARSClient.mapCachedRaw(p=testFilePath) - - # Ensure the dimensions have the right sizes - self.assertDictEqual( - {"init_time": 1, "step": 49, "latitude": 241, "longitude": 301}, - dict(out.sizes.items()), - ) - # Ensure the dimensions of the variables are in the correct order - self.assertEqual( - ("init_time", "step", "latitude", "longitude"), - out[next(iter(out.data_vars.keys()))].dims, - ) - # Ensure the correct datavars are in the dataset - self.assertCountEqual(["tprate", "sd"], list(out.data_vars.keys())) - - def test_buildMarsRequest(self) -> None: - testFilePath: pathlib.Path = pathlib.Path(__file__).parent / "test_2params.grib" - - # Test that the request is build correctly for the default client - testDefaultClient = MARSClient() - out = testDefaultClient._buildMarsRequest( - list_only=True, - target=testFilePath.as_posix(), - it=dt.datetime(2020, 1, 1, tzinfo=dt.UTC), - params=testDefaultClient.desired_params, - steps=range(4), - ) - - out.replace(" ", "") - lines = out.split("\n") - self.assertEqual(lines[0], "list,") - - d: dict = {} - for line in lines[1:]: - key, value = line.split("=") - d[key.strip()] = value.strip().replace(",", "") - - self.assertEqual(d["param"], "/".join(PARAMETER_ECMWFCODE_MAP.keys())) - self.assertEqual(d["date"], "20200101") - - # Test that the request is build correctly with the basic parameters - - testBasicClient = MARSClient( - area="uk", - hours=4, - param_group="basic", - ) - - out = testBasicClient._buildMarsRequest( - list_only=False, - target=testFilePath.as_posix(), - it=dt.datetime(2020, 1, 1, tzinfo=dt.UTC), - params=testBasicClient.desired_params, - steps=range(4), - ) - - out.replace(" ", "") - lines = out.split("\n") - self.assertEqual(lines[0], "retrieve,") - - d2: dict = {} - for line in lines[1:]: - key, value = line.split("=") - d2[key.strip()] = value.strip().replace(",", "") - - self.assertEqual(d2["param"], "167.128/169.128") - self.assertEqual(d2["date"], "20200101") - - -# --------- Static methods --------- # - - -class TestParseAvailableParams(unittest.TestCase): - def test_parsesSmallFileCorrectly(self) -> None: - out = _parseListing(fileData=test_list_response) - - self.assertDictEqual( - { - "params": ["167.128", "169.128"], - "steps": [0, 1, 2, 3], - }, - out, - ) - - def test_parsesParamsCorrectly(self) -> None: - testFilePath: pathlib.Path = pathlib.Path(__file__).parent / "test_list_response.txt" - - filedata: str = testFilePath.read_text() - - out = _parseListing(fileData=filedata) - - self.maxDiff = None - self.assertDictEqual( - { - "params": ["141.128","164.128","165.128","166.128","167.128","169.128","175.128","186.128","187.128","188.128","20.3","246.228","247.228","47.128","57.128"], - "steps": list(range(0, 49)), - }, - out, - ) diff --git a/src/nwp_consumer/internal/inputs/ecmwf/test_multiarea.grib b/src/nwp_consumer/internal/inputs/ecmwf/test_multiarea.grib deleted file mode 100644 index 26837676..00000000 Binary files a/src/nwp_consumer/internal/inputs/ecmwf/test_multiarea.grib and /dev/null differ diff 
--git a/src/nwp_consumer/internal/inputs/ecmwf/test_s3.py b/src/nwp_consumer/internal/inputs/ecmwf/test_s3.py deleted file mode 100644 index 25801a77..00000000 --- a/src/nwp_consumer/internal/inputs/ecmwf/test_s3.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Unit tests for the S3Client class.""" - -import datetime as dt -import unittest -from pathlib import Path - -import xarray as xr -from botocore.client import BaseClient as BotocoreClient -from botocore.session import Session -from moto.server import ThreadedMotoServer -import numpy as np - -from ._models import ECMWFLiveFileInfo -from .s3 import S3Client - -ENDPOINT_URL = "http://localhost:5000" -BUCKET = "test-bucket" -KEY = "test-key" -SECRET = "test-secret" # noqa: S105 -REGION = "us-east-1" - -RAW = Path("ecmwf") - - -class TestS3Client(unittest.TestCase): - testS3: BotocoreClient - client: S3Client - server: ThreadedMotoServer - - @classmethod - def setUpClass(cls) -> None: - # Start a local S3 server - cls.server = ThreadedMotoServer() - cls.server.start() - - session = Session() - cls.testS3 = session.create_client( - service_name="s3", - region_name=REGION, - endpoint_url=ENDPOINT_URL, - aws_access_key_id=KEY, - aws_secret_access_key=SECRET, - ) - - # Create a mock S3 bucket - cls.testS3.create_bucket( - Bucket=BUCKET, - ) - - # Create an instance of the S3Client class - cls.client = S3Client( - area="uk", - key=KEY, - secret=SECRET, - region=REGION, - bucket=BUCKET, - endpointURL=ENDPOINT_URL, - ) - - @classmethod - def tearDownClass(cls) -> None: - # Delete all objects in bucket - response = cls.testS3.list_objects_v2( - Bucket=BUCKET, - ) - if "Contents" in response: - for obj in response["Contents"]: - cls.testS3.delete_object( - Bucket=BUCKET, - Key=obj["Key"], - ) - cls.server.stop() - - def test_listFilesForInitTime(self) -> None: - files = [ - "A2D01010000010100001", - "A2D01010000010101001", - "A2D01010000010102011", - "A2D01010000010103001", - "A2D01011200010112001", # Different init time - "A2D02191200010112001", # Leap year on 2024-02-29 - ] - for file in files: - # Create files in the mock bucket - self.testS3.put_object( - Bucket=BUCKET, - Key=(RAW / file).as_posix(), - Body=b"test", - ) - - # Test the listFilesForInitTime method - initTime = dt.datetime(2021, 1, 1, 0, 0, 0, tzinfo=dt.UTC) - out = self.client.listRawFilesForInitTime(it=initTime) - self.assertEqual(len(out), 4) - - def test_downloadRawFile(self) -> None: - # Create a file in the mock bucket - self.testS3.put_object( - Bucket=BUCKET, - Key=(RAW / "A2D01010000010100001").as_posix(), - Body=b"test", - ) - - # Test the downloadRawFile method - out = self.client.downloadToCache(fi=ECMWFLiveFileInfo(fname="A2D01010000010100001")) - self.assertEqual(out.read_bytes(), b"test") - - out.unlink() - - def test_mapCached(self) -> None: - testfile: Path = Path(__file__).parent / "test_multiarea.grib" - out: xr.Dataset = self.client.mapCachedRaw(p=testfile) - - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - ("init_time", "step", "latitude", "longitude"), - ) - self.assertEqual(len(out.data_vars.keys()), 18) - self.assertEqual(out.coords["latitude"].to_numpy().max(), 60) - self.assertIn("t2m", list(out.data_vars.keys())) - self.assertTrue(np.all(out.data_vars["t2m"].values)) - - print(out) - - # Check that setting the area maps only the relevant data - indiaClient = S3Client( - area="nw-india", - key=KEY, - secret=SECRET, - region=REGION, - bucket=BUCKET, - endpointURL=ENDPOINT_URL, - ) - out = indiaClient.mapCachedRaw(p=testfile) - 
self.assertEqual(out.coords["latitude"].to_numpy().max(), 31) - self.assertIn("t2m", list(out.data_vars.keys())) - self.assertTrue(np.all(out.data_vars["t2m"].values)) - diff --git a/src/nwp_consumer/internal/inputs/icon/__init__.py b/src/nwp_consumer/internal/inputs/icon/__init__.py deleted file mode 100644 index 02fde8c9..00000000 --- a/src/nwp_consumer/internal/inputs/icon/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["Client"] - -from .client import Client diff --git a/src/nwp_consumer/internal/inputs/icon/_consts.py b/src/nwp_consumer/internal/inputs/icon/_consts.py deleted file mode 100644 index bf746d14..00000000 --- a/src/nwp_consumer/internal/inputs/icon/_consts.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Defines all parameters available from icon.""" - - -EU_SL_VARS = [ - "alb_rad", - "alhfl_s", - "ashfl_s", - "asob_s", - "asob_t", - "aswdifd_s", - "aswdifu_s", - "aswdir_s", - "athb_s", - "athb_t", - "aumfl_s", - "avmfl_s", - "cape_con", - "cape_ml", - "clch", - "clcl", - "clcm", - "clct", - "clct_mod", - "cldepth", - "h_snow", - "hbas_con", - "htop_con", - "htop_dc", - "hzerocl", - "pmsl", - "ps", - "qv_2m", - "qv_s", - "rain_con", - "rain_gsp", - "relhum_2m", - "rho_snow", - "runoff_g", - "runoff_s", - "snow_con", - "snow_gsp", - "snowlmt", - "synmsg_bt_cl_ir10.8", - "t_2m", - "t_g", - "t_snow", - "tch", - "tcm", - "td_2m", - "tmax_2m", - "tmin_2m", - "tot_prec", - "tqc", - "tqi", - "u_10m", - "v_10m", - "vmax_10m", - "w_snow", - "w_so", - "ww", - "z0", -] - -EU_ML_VARS = ["clc", "fi", "omega", "p", "qv", "relhum", "t", "tke", "u", "v", "w"] - -GLOBAL_SL_VARS = [ - *EU_SL_VARS, - "alb_rad", - "c_t_lk", - "freshsnw", - "fr_ice", - "h_ice", - "h_ml_lk", - "t_ice", - "t_s", - "tqr", - "tqs", - "tqv", -] - -GLOBAL_ML_VARS: list[str] = ["fi", "relhum", "t", "u", "v"] diff --git a/src/nwp_consumer/internal/inputs/icon/_models.py b/src/nwp_consumer/internal/inputs/icon/_models.py deleted file mode 100644 index adb165fe..00000000 --- a/src/nwp_consumer/internal/inputs/icon/_models.py +++ /dev/null @@ -1,37 +0,0 @@ -import datetime as dt - -from nwp_consumer import internal - - -class IconFileInfo(internal.FileInfoModel): - def __init__( - self, it: dt.datetime, filename: str, currentURL: str, step: int, - ) -> None: - self._it = it - # The name of the file when stored by the storer. We decompress from bz2 - # at download time, so we don't want that extension on the filename. 
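The comment above notes that ICON files are decompressed from bz2 as they are downloaded, which is why the cached filename drops the .bz2 suffix. A minimal sketch of that streaming pattern, using a hypothetical URL and destination path rather than the removed client's real helpers:

    import bz2
    import pathlib
    import urllib.request

    def download_and_decompress(url: str, dest: pathlib.Path) -> pathlib.Path:
        # Stream the remote .bz2 file and write the decompressed bytes to `dest`.
        dec = bz2.BZ2Decompressor()
        with urllib.request.urlopen(url) as response, dest.open("wb") as f:
            for chunk in iter(lambda: response.read(16 * 1024), b""):
                f.write(dec.decompress(chunk))
        return dest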
- self._filename = filename.replace(".bz2", "") - self._url = currentURL - self.step = step - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._filename - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - # The filename in the fully-qualified filepath still has the .bz2 extension - # so add it back in - return self._url + "/" + self._filename + ".bz2" - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self._it - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return [self.step] - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() diff --git a/src/nwp_consumer/internal/inputs/icon/client.py b/src/nwp_consumer/internal/inputs/icon/client.py deleted file mode 100644 index e8d8009b..00000000 --- a/src/nwp_consumer/internal/inputs/icon/client.py +++ /dev/null @@ -1,439 +0,0 @@ -"""Implements a client to fetch ICON data from DWD.""" -import bz2 -import datetime as dt -import pathlib -import re -import urllib.request - -import numpy as np -import requests -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._consts import EU_ML_VARS, EU_SL_VARS, GLOBAL_ML_VARS, GLOBAL_SL_VARS -from ._models import IconFileInfo - -log = structlog.getLogger() - - -class Client(internal.FetcherInterface): - """Implements a client to fetch ICON data from DWD.""" - - baseurl: str # The base URL for the ICON model - model: str # The model to fetch data for - parameters: list[str] # The parameters to fetch - - def __init__(self, model: str, hours: int = 48, param_group: str = "default") -> None: - """Create a new Icon Client. - - Exposes a client for ICON data from DWD that conforms to the FetcherInterface. - - Args: - model: The model to fetch data for. Valid models are "europe" and "global". - hours: The number of hours to fetch data for. - param_group: The set of parameters to fetch. - Valid groups are "default", "full", and "basic". - """ - self.baseurl = "https://opendata.dwd.de/weather/nwp" - - match model: - case "europe": - self.baseurl += "/icon-eu/grib" - case "global": - self.baseurl += "/icon/grib" - case _: - raise ValueError( - f"unknown icon model {model}. Valid models are 'europe' and 'global'", - ) - - match (param_group, model): - case ("default", _): - self.parameters = [ - "t_2m", - "clch", - "clcm", - "clcl", - "asob_s", - "athb_s", - "w_snow", - "relhum_2m", - "u_10m", - "v_10m", - "clat", - "clon", - ] - case ("basic", "europe"): - self.parameters = ["t_2m", "asob_s"] - case ("basic", "global"): - self.parameters = ["t_2m", "asob_s", "clat", "clon"] - case ("single-level", "europe"): - self.parameters = EU_SL_VARS - case ("single-level", "global"): - self.parameters = [*GLOBAL_SL_VARS, "clat", "clon"] - case ("multi-level", "europe"): - self.parameters = EU_ML_VARS - case ("multi-level", "global"): - self.parameters = [*GLOBAL_ML_VARS, "clat", "clon"] - case ("full", "europe"): - self.parameters = EU_SL_VARS + EU_ML_VARS - case ("full", "global"): - self.parameters = [*GLOBAL_SL_VARS, *GLOBAL_ML_VARS, "clat", "clon"] - case (_, _): - raise ValueError( - f"unknown parameter group {param_group}." 
- "Valid groups are 'default', 'full', 'basic', 'single-level', 'multi-level'", - ) - - self.model = model - self.hours = hours - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"ICON_{self.model}".upper() - - def getInitHours(self) -> list[int]: # noqa: D102 - return [0, 6, 12, 18] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - # ICON data is only available for today's date. If data hasn't been uploaded for that init - # time yet, then yesterday's data will still be present on the server. - if dt.datetime.now(dt.UTC) - it > dt.timedelta(days=1): - log.warn( - event="requested init time is too old", - inittime=it.strftime("%Y-%m-%d %H:%M"), - ) - return [] - - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - files: list[internal.FileInfoModel] = [] - - # Files are split per parameter, level, and step, with a webpage per parameter - # * The webpage contains a list of files for the parameter - # * Find these files for each parameter and add them to the list - for param in self.parameters: - # The list of files for the parameter - parameterFiles: list[internal.FileInfoModel] = [] - - # Fetch DWD webpage detailing the available files for the parameter - response = requests.get(f"{self.baseurl}/{it.strftime('%H')}/{param}/", timeout=3) - - if response.status_code != 200: - log.warn( - event="error fetching filelisting webpage for parameter", - status=response.status_code, - url=response.url, - param=param, - inittime=it.strftime("%Y-%m-%d %H:%M"), - ) - continue - - # The webpage's HTML contains a list of tags - # * Each tag has a href, most of which point to a file) - for line in response.text.splitlines(): - # Check if the line contains a href, if not, skip it - refmatch = re.search(pattern=r'href="(.+)">', string=line) - if refmatch is None: - continue - - # The href contains the name of a file - parse this into a FileInfo object - fi: IconFileInfo | None = None - # Find the corresponding files for the parameter - fi = _parseIconFilename( - name=refmatch.groups()[0], - baseurl=self.baseurl, - match_ml=True, - match_pl=True, - ) - # Ignore the file if it is not for today's date - # or has a step > desired hours - if fi is None or fi.it() != it or (fi.step > self.hours): - continue - - # Add the file to the list - parameterFiles.append(fi) - - log.debug( - event="listed files for parameter", - param=param, - inittime=it.strftime("%Y-%m-%d %H:%M"), - url=response.url, - numfiles=len(parameterFiles), - ) - - # Add the files for the parameter to the list of all files - files.extend(parameterFiles) - - return files - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - """Overrides the corresponding method in the parent class.""" - if p.suffix != ".grib2": - log.warn( - event="cannot map non-grib file to dataset", - filepath=p.as_posix(), - ) - return xr.Dataset() - - if "_CLAT" in p.stem or "_CLON" in p.stem: - # Ignore the latitude and longitude files - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the raw file as a dataset - try: - ds = xr.open_dataset( - p.as_posix(), - engine="cfgrib", - chunks={ - "time": 1, - "step": 1, - "latitude": "auto", - "longitude": "auto", - }, - backend_kwargs={"indexpath": ""}, - ) - except Exception as e: - log.warn( - event="error converting raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return 
xr.Dataset() - - # Most datasets are opened as xarray datasets with "step" as a scalar (nonindexed) coordinate - # Some do not, so add it in manually - if "step" not in ds.coords: - ds = ds.assign_coords({"step": np.timedelta64(0, 'ns')}) - - # The global data is stacked as a 1D values array without lat or long data - # * Manually add it in from the CLAT and CLON files - if self.model == "global": - ds = _addLatLon(ds=ds, p=p) - - # Rename variables to match their listing online to prevent single/multi overlap - # * This assumes the name of the file locally is the same as online - pmatch = re.search(r"_\d{3}_([A-Z0-9_]+).grib", p.name) - if pmatch is not None: - var_name = pmatch.groups()[0] - ds = ds.rename({list(ds.data_vars.keys())[0]: var_name.lower()}) - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - ds = ( - ds.rename({"time": "init_time"}) - .expand_dims(["init_time", "step"]) - .drop_vars(["valid_time", "number", "surface", "heightAboveGround", "level", "isobaricLevel"], errors="ignore") - .sortby("step") - .transpose("init_time", "step", ...) - .chunk( - { - "init_time": 1, - "step": -1, - }, - ) - ) - - return ds - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - try: - response = urllib.request.urlopen(fi.filepath()) - except Exception as e: - log.warn( - event="error calling url for file", - url=fi.filepath(), - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - if response.status != 200: - log.warn( - event="error downloading file", - status=response.status, - url=fi.filepath(), - filename=fi.filename(), - ) - return pathlib.Path() - - # Extract the bz2 file when downloading - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with open(str(cfp), "wb") as f: - dec = bz2.BZ2Decompressor() - for chunk in iter(lambda: response.read(16 * 1024), b""): - f.write(dec.decompress(chunk)) - f.flush() - - if not cfp.exists(): - log.warn( - event="error extracting bz2 file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - ) - return pathlib.Path() - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - # See https://d-nb.info/1081305452/34 for a list of ICON parameters - return { - "t_2m": internal.OCFParameter.TemperatureAGL, - "clch": internal.OCFParameter.HighCloudCover, - "clcm": internal.OCFParameter.MediumCloudCover, - "clcl": internal.OCFParameter.LowCloudCover, - "asob_s": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "athb_s": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "w_snow": internal.OCFParameter.SnowDepthWaterEquivalent, - "relhum_2m": internal.OCFParameter.RelativeHumidityAGL, - "u_10m": internal.OCFParameter.WindUComponentAGL, - "v_10m": internal.OCFParameter.WindVComponentAGL, - "clat": "lat", # Icon has a seperate dataset for latitude... - "clon": "lon", # ... and longitude (for the global model)! 
Go figure - } - - -def _parseIconFilename( - name: str, - baseurl: str, - match_sl: bool = True, - match_ti: bool = True, - match_ml: bool = False, - match_pl: bool = False, -) -> IconFileInfo | None: - """Parse a string of HTML into an IconFileInfo object, if it contains one. - - Args: - name: The name of the file to parse - baseurl: The base URL for the ICON model - match_sl: Whether to match single-level files - match_ti: Whether to match time-invariant files - match_ml: Whether to match model-level files - match_pl: Whether to match pressure-level files - """ - # Define the regex patterns to match the different types of file; X is step, L is level - # * Single Level: `MODEL_single-level_YYYYDDMMHH_XXX_SOME_PARAM.grib2.bz2` - slRegex = r"single-level_(\d{10})_(\d{3})_([A-Za-z_\d]+).grib" - # * Time Invariant: `MODEL_time-invariant_YYYYDDMMHH_SOME_PARAM.grib2.bz2` - tiRegex = r"time-invariant_(\d{10})_([A-Za-z_\d]+).grib" - # * Model Level: `MODEL_model-level_YYYYDDMMHH_XXX_LLL_SOME_PARAM.grib2.bz2` - mlRegex = r"model-level_(\d{10})_(\d{3})_(\d+)_([A-Za-z_\d]+).grib" - # * Pressure Level: `MODEL_pressure-level_YYYYDDMMHH_XXX_LLLL_SOME_PARAM.grib2.bz2` - plRegex = r"pressure-level_(\d{10})_(\d{3})_(\d+)_([A-Za-z_\d]+).grib" - - itstring = paramstring = "" - stepstring = "000" - # Try to match the href to one of the regex patterns - slmatch = re.search(pattern=slRegex, string=name) - timatch = re.search(pattern=tiRegex, string=name) - mlmatch = re.search(pattern=mlRegex, string=name) - plmatch = re.search(pattern=plRegex, string=name) - - if slmatch and match_sl: - itstring, stepstring, paramstring = slmatch.groups() - elif timatch and match_ti: - itstring, paramstring = timatch.groups() - elif mlmatch and match_ml: - itstring, stepstring, levelstring, paramstring = mlmatch.groups() - elif plmatch and match_pl: - itstring, stepstring, levelstring, paramstring = plmatch.groups() - else: - return None - - it = dt.datetime.strptime(itstring, "%Y%m%d%H").replace(tzinfo=dt.UTC) - - return IconFileInfo( - it=it, - filename=name, - currentURL=f"{baseurl}/{it.strftime('%H')}/{paramstring.lower()}/", - step=int(stepstring), - ) - - -def _addLatLon(*, ds: xr.Dataset, p: pathlib.Path) -> xr.Dataset: - """Add latitude and longitude data to the dataset. - - Global ICON files do not contain latitude and longitude data, - opting instead for a single `values` dimension. The lats and longs are then - accessible from seperate files. This function injects the lat and lon data - from these files into the dataset. 
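As the docstring above describes, global ICON GRIBs expose a flat `values` dimension and take their coordinates from separate CLAT/CLON files. The injection step it refers to amounts to an `assign_coords` call along that dimension; a self-contained sketch with synthetic arrays standing in for the CLAT/CLON data:

    import numpy as np
    import xarray as xr

    n_values = 10  # stand-in for the real icosahedral grid size
    lats = np.linspace(-90, 90, n_values)
    lons = np.linspace(-180, 180, n_values)

    ds = xr.Dataset({"t_2m": (("values",), np.random.rand(n_values))})

    # Attach latitude/longitude as coordinates along the existing `values` dimension
    ds = ds.assign_coords(
        {
            "latitude": ("values", lats),
            "longitude": ("values", lons),
        }
    )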
- - :param ds: The dataset to reshape - :param p: The path to the file being reshaped - """ - # Adapted from https://stackoverflow.com/a/62667154 and - # https://github.com/SciTools/iris-grib/issues/140#issuecomment-1398634288 - - # Inject latitude and longitude into the dataset if they are missing - if "latitude" not in ds.dims: - rawlats: list[pathlib.Path] = list(p.parent.glob("*CLAT.grib2")) - if len(rawlats) == 0: - log.warn( - event="no latitude file found for init time", - filepath=p.as_posix(), - init_time=p.parent.name, - ) - return xr.Dataset() - latds = xr.open_dataset( - rawlats[0], - engine="cfgrib", - backend_kwargs={"errors": "ignore"}, - ).load() - tiledlats = latds["tlat"].data - del latds - - if "longitude" not in ds: - rawlons: list[pathlib.Path] = list(p.parent.glob("*CLON.grib2")) - if len(rawlons) == 0: - log.warn( - event="no longitude file found for init time", - filepath=p.as_posix(), - init_time=p.parent.name, - ) - return xr.Dataset() - londs = xr.open_dataset( - rawlons[0], - engine="cfgrib", - backend_kwargs={"errors": "ignore"}, - ).load() - tiledlons = londs["tlon"].data - del londs - - if ds.sizes["values"] != len(tiledlats) or ds.sizes["values"] != len(tiledlons): - raise ValueError( - f"dataset has {ds.sizes['values']} values, " - f"but expected {len(tiledlats) * len(tiledlons)}", - ) - - # Create new coordinates, - # which give the `latitude` and `longitude` position for each position in the `values` dimension: - - ds = ds.assign_coords( - { - "latitude": ("values", tiledlats), - "longitude": ("values", tiledlons), - }, - ) - - return ds diff --git a/src/nwp_consumer/internal/inputs/icon/test_client.py b/src/nwp_consumer/internal/inputs/icon/test_client.py deleted file mode 100644 index c6dd6610..00000000 --- a/src/nwp_consumer/internal/inputs/icon/test_client.py +++ /dev/null @@ -1,142 +0,0 @@ -import datetime as dt -import pathlib -import unittest -from typing import TYPE_CHECKING - -import xarray as xr - -if TYPE_CHECKING: - from ._models import IconFileInfo - -from .client import Client, _parseIconFilename - -testClientGlobal = Client(model="global") -testClientEurope = Client(model="europe") - - -class TestClient(unittest.TestCase): - def test_mapCachedRawGlobal(self) -> None: - tests = [ - { - "filename": "test_icon_global_001_CLCL.grib2", - "expected_dims": ["init_time", "step", "values"], - "expected_var": "ccl", - }, - { - "filename": "test_icon_global_001_HTOP_CON.grib2", - "expected_dims": ["init_time", "step", "values"], - "expected_var": "hcct", - }, - { - "filename": "test_icon_global_001_CLCT_MOD.grib2", - "expected_dims": ["init_time", "step", "values"], - "expected_var": "CLCT_MOD", - }, - ] - - for tst in tests: - with self.subTest(f"test file {tst['filename']}"): - out = testClientGlobal.mapCachedRaw(p=pathlib.Path(__file__).parent / tst["filename"]) - print(out) - - # Check latitude and longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual((list(out.dims.keys())), tst["expected_dims"]) - - def test_mapCachedRawEurope(self) -> None: - tests = [ - { - "filename": "test_icon_europe_001_CLCL.grib2", - "expected_dims": ["init_time", "step", "latitude", "longitude"], - "expected_var": "ccl", - }, - ] - - for tst in tests: - with self.subTest(f"test file {tst['filename']}"): - out = testClientEurope.mapCachedRaw(p=pathlib.Path(__file__).parent / tst["filename"]) - print(out) - - # Check latitude and 
longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - # Check that the dimensions are correctly ordered and renamed - for data_var in out.data_vars: - self.assertEqual(list(out[data_var].dims), tst["expected_dims"]) - - def test_mergeRaw(self) -> None: - ds1 = testClientGlobal.mapCachedRaw( - p=pathlib.Path(__file__).parent / "test_icon_global_001_CLCT_MOD.grib2" - ) - ds2 = testClientGlobal.mapCachedRaw( - p=pathlib.Path(__file__).parent / "test_icon_global_001_HTOP_CON.grib2" - ) - - # This should merge without raising an error - _ = xr.merge([ds1, ds2]) - - -class TestParseIconFilename(unittest.TestCase): - baseurl = "https://opendata.dwd.de/weather/nwp/icon/grib" - - def test_parsesSingleLevel(self) -> None: - filename: str = "icon_global_icosahedral_single-level_2020090100_000_T_HUM.grib2.bz2" - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename.removesuffix(".bz2")) - self.assertEqual(out.it(), dt.datetime(2020, 9, 1, 0, tzinfo=dt.UTC)) - - def test_parsesTimeInvariant(self) -> None: - filename: str = "icon_global_icosahedral_time-invariant_2020090100_CLAT.grib2.bz2" - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename.removesuffix(".bz2")) - self.assertEqual(out.it(), dt.datetime(2020, 9, 1, 0, tzinfo=dt.UTC)) - - def test_parsesModelLevel(self) -> None: - filename: str = "icon_global_icosahedral_model-level_2020090100_048_32_CLCL.grib2.bz2" - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - match_ml=True, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename.removesuffix(".bz2")) - self.assertEqual(out.it(), dt.datetime(2020, 9, 1, 0, tzinfo=dt.UTC)) - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - match_ml=False, - ) - self.assertIsNone(out) - - def test_parsesPressureLevel(self) -> None: - filename: str = "icon_global_icosahedral_pressure-level_2020090100_048_1000_T.grib2.bz2" - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - match_pl=True, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename.removesuffix(".bz2")) - self.assertEqual(out.it(), dt.datetime(2020, 9, 1, 0, tzinfo=dt.UTC)) - - out: IconFileInfo | None = _parseIconFilename( - name=filename, - baseurl=self.baseurl, - match_pl=False, - ) - self.assertIsNone(out) diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_europe_000_ASOB_S.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_europe_000_ASOB_S.grib2 deleted file mode 100644 index a2b14f0d..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_europe_000_ASOB_S.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_europe_001_CLCL.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_europe_001_CLCL.grib2 deleted file mode 100644 index d77be855..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_europe_001_CLCL.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCL.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCL.grib2 deleted file mode 100644 index 7fad65ea..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCL.grib2 
and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCT_MOD.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCT_MOD.grib2 deleted file mode 100644 index 66f31e08..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_CLCT_MOD.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_HTOP_CON.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_HTOP_CON.grib2 deleted file mode 100644 index b0b229bb..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_global_001_HTOP_CON.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLAT.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLAT.grib2 deleted file mode 100644 index 5cbe15e0..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLAT.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLON.grib2 b/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLON.grib2 deleted file mode 100644 index b13f1e9f..00000000 Binary files a/src/nwp_consumer/internal/inputs/icon/test_icon_global_CLON.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/meteofrance/HP1_00H24H_t.grib2 b/src/nwp_consumer/internal/inputs/meteofrance/HP1_00H24H_t.grib2 deleted file mode 100644 index c2186000..00000000 Binary files a/src/nwp_consumer/internal/inputs/meteofrance/HP1_00H24H_t.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/meteofrance/IP1_00H24H_t.grib2 b/src/nwp_consumer/internal/inputs/meteofrance/IP1_00H24H_t.grib2 deleted file mode 100644 index ec160ed6..00000000 Binary files a/src/nwp_consumer/internal/inputs/meteofrance/IP1_00H24H_t.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/meteofrance/SP1_00H24H_t.grib2 b/src/nwp_consumer/internal/inputs/meteofrance/SP1_00H24H_t.grib2 deleted file mode 100644 index 61e69393..00000000 Binary files a/src/nwp_consumer/internal/inputs/meteofrance/SP1_00H24H_t.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/meteofrance/__init__.py b/src/nwp_consumer/internal/inputs/meteofrance/__init__.py deleted file mode 100644 index 02fde8c9..00000000 --- a/src/nwp_consumer/internal/inputs/meteofrance/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["Client"] - -from .client import Client diff --git a/src/nwp_consumer/internal/inputs/meteofrance/_consts.py b/src/nwp_consumer/internal/inputs/meteofrance/_consts.py deleted file mode 100644 index a512d1d5..00000000 --- a/src/nwp_consumer/internal/inputs/meteofrance/_consts.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Defines all parameters available from Arpege.""" - -ARPEGE_GLOBAL_VARIABLES = ['u10','v10','si10','wdir10','t2m','r2','gust','efg10','nfg10','ssrd','tp','sprate','d2m','sh2','sshf','slhf','strd','ssr','str','ewss','nsss','t','sp','tcwv','lcc','mcc','hcc','hpbl','h','ws','u','v','pres','r','wdir','u200','v200','si200','u100','v100','si100','z','q','clwc','ciwc','cc','dpt','tke','w','pv','vo','absv','papt',] -ARPEGE_GLOBAL_PARAMETER_SETS = ['HP1','HP2','IP1','IP2','IP3','IP4','SP1','SP2'] diff --git a/src/nwp_consumer/internal/inputs/meteofrance/_models.py b/src/nwp_consumer/internal/inputs/meteofrance/_models.py deleted file mode 100644 index ee594d65..00000000 --- a/src/nwp_consumer/internal/inputs/meteofrance/_models.py +++ /dev/null @@ -1,37 +0,0 @@ -import datetime as dt - -from nwp_consumer import 
internal - - -class ArpegeFileInfo(internal.FileInfoModel): - def __init__( - self, - it: dt.datetime, - filename: str, - currentURL: str, - step: int, - ) -> None: - self._it = it - self._filename = filename - self._url = currentURL - self.step = step - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._filename - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._url + self._filename - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self._it - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return [self.step] - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() diff --git a/src/nwp_consumer/internal/inputs/meteofrance/client.py b/src/nwp_consumer/internal/inputs/meteofrance/client.py deleted file mode 100644 index 7a512331..00000000 --- a/src/nwp_consumer/internal/inputs/meteofrance/client.py +++ /dev/null @@ -1,315 +0,0 @@ -"""Implements a client to fetch Arpege data from MeteoFrance AWS.""" -import datetime as dt -import pathlib -import re -import typing - -import cfgrib -import s3fs -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._consts import ARPEGE_GLOBAL_PARAMETER_SETS, ARPEGE_GLOBAL_VARIABLES -from ._models import ArpegeFileInfo - -log = structlog.getLogger() - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "latitude", "longitude") - - -class Client(internal.FetcherInterface): - """Implements a client to fetch Arpege data from AWS.""" - - baseurl: str # The base URL for the Argpege model - model: str # The model to fetch data for - parameters: list[str] # The parameters to fetch - - def __init__(self, model: str, hours: int = 48, param_group: str = "default") -> None: - """Create a new Arpege Client. - - Exposes a client for Arpege data from AWS MeteoFrance that conforms to the FetcherInterface. - - Args: - model: The model to fetch data for. Valid models are "europe" and "global". - param_group: The set of parameters to fetch. - Valid groups are "default", "full", and "basic". - """ - self.baseurl = "s3://mf-nwp-models/" - self.fs = s3fs.S3FileSystem(anon=True) - - match model: - case "europe": - self.baseurl += "arpege-europe/v1/" - case "global": - self.baseurl += "arpege-world/v1/" - case _: - raise ValueError( - f"unknown arpege model {model}. Valid models are 'europe' and 'global'", - ) - - match (param_group, model): - case ("default", _): - self.parameters = ["t2m", "hcc", "mcc", "lcc", "ssrd", "d2m", "u10", "v10"] - case ("basic", "europe"): - self.parameters = ["t2m", "ssrd"] - case ("basic", "global"): - self.parameters = ["t2m", "ssrd"] - case ("full", "europe"): - self.parameters = ARPEGE_GLOBAL_VARIABLES - case ("full", "global"): - self.parameters = ARPEGE_GLOBAL_VARIABLES - case (_, _): - raise ValueError( - f"unknown parameter group {param_group}." 
- "Valid groups are 'default', 'full', 'basic'", - ) - - self.model = model - self.hours = hours - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"MeteoFrance_{self.model}".upper() - - def getInitHours(self) -> list[int]: # noqa: D102 - return [0, 6, 12, 18] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - files: list[internal.FileInfoModel] = [] - - # Files are split per set of parameters, and set of steps - # The list of files for the parameter - parameterFiles: list[internal.FileInfoModel] = [] - - # Parameter sets - for parameter_set in ARPEGE_GLOBAL_PARAMETER_SETS: - # Fetch Arpege webpage detailing the available files for the parameter - files = self.fs.ls( - f"{self.baseurl}{it.strftime('%Y-%m-%d')}/{it.strftime('%H')}/{parameter_set}/" - ) - - # The webpage's HTML contains a list of tags - # * Each tag has a href, most of which point to a file) - for f in files: - if ".inv" in f: # Ignore the .inv files - continue - # The href contains the name of a file - parse this into a FileInfo object - fi: ArpegeFileInfo | None = None - fi = _parseArpegeFilename( - name=f.split("/")[-1], - baseurl=f"{self.baseurl}{it.strftime('%Y-%m-%d')}/{it.strftime('%H')}/{parameter_set}/", - match_hl=len(self.parameters) > 6, - match_pl=len(self.parameters) > 6, - ) - # Ignore the file if it is not for today's date or has a step > desired - if fi is None or fi.it() != it or (fi.step > self.hours): - continue - - # Add the file to the list - parameterFiles.append(fi) - - log.debug( - event="listed files for parameter", - param=parameter_set, - inittime=it.strftime("%Y-%m-%d %H:%M"), - url=f, - numfiles=len(parameterFiles), - ) - - # Add the files for the parameter to the list of all files - files.extend(parameterFiles) - - return files - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: # noqa: D102 - if p.suffix != ".grib2": - log.warn( - event="cannot map non-grib file to dataset", - filepath=p.as_posix(), - ) - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the raw file as a dataset - try: - ds = cfgrib.open_datasets( - p.as_posix(), - ) - except Exception as e: - log.warn( - event="error converting raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return xr.Dataset() - # Check if datasets is more than a single dataset or not - # * If it is, merge the datasets into a single dataset - if len(ds) > 1: - if "_IP" in str(p): # Pressure levels - for i, d in enumerate(ds): - if "isobaricInhPa" in d.coords and "isobaricInhPa" not in d.dims: - d = d.expand_dims("isobaricInhPa") - ds[i] = d - ds = xr.merge([d for d in ds if "isobaricInhPa" in d.coords], compat="override") - elif "_SP" in str(p): # Single levels - for i, d in enumerate(ds): - if "surface" in d.coords: - d = d.rename({"surface": "heightAboveGround"}) - # Make heightAboveGround a coordinate - if "heightAboveGround" in d.coords: - d = d.expand_dims("heightAboveGround") - ds[i] = d - # Merge all the datasets that have heightAboveGround - ds = xr.merge([d for d in ds if "heightAboveGround" in d.coords], compat="override") - elif "_HP" in str(p): # Height levels - for i, d in enumerate(ds): - if "heightAboveGround" in d.coords and "heightAboveGround" not in d.dims: - d = d.expand_dims("heightAboveGround") - ds[i] = d - ds = xr.merge([d for 
d in ds if "heightAboveGround" in d.coords], compat="override") - else: - ds = ds[0] - ds = ds.drop_vars("unknown", errors="ignore") - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - ds = ( - ds.rename({"time": "init_time"}) - .expand_dims("init_time") - .transpose("init_time", "step", ...) - .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - }, - ) - ) - - return ds - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - # Extract the bz2 file when downloading - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - - self.fs.get(str(fi.filepath()), str(cfp)) - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - # See https://mf-models-on-aws.org/en/doc/datasets/v1/ - # for a list of Arpege parameters - return { - "t2m": internal.OCFParameter.TemperatureAGL, - "hcc": internal.OCFParameter.HighCloudCover, - "mcc": internal.OCFParameter.MediumCloudCover, - "lcc": internal.OCFParameter.LowCloudCover, - "ssrd": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "d2m": internal.OCFParameter.RelativeHumidityAGL, - "u10": internal.OCFParameter.WindUComponentAGL, - "v10": internal.OCFParameter.WindVComponentAGL, - } - - -def _parseArpegeFilename( - name: str, - baseurl: str, - match_sl: bool = True, - match_hl: bool = True, - match_pl: bool = False, -) -> ArpegeFileInfo | None: - """Parse a string of HTML into an ArpegeFileInfo object, if it contains one. 
- - Args: - name: The name of the file to parse - baseurl: The base URL for the Arpege model - match_sl: Whether to match single-level files - match_hl: Whether to match height-level files - match_pl: Whether to match pressure-level files - """ - # Defined from the href of the file, its harder to split - # Define the regex patterns to match the different types of file; X is step, L is level - # * Single Level: `MODEL_single-level_YYYYDDMMHH_XXX_SOME_PARAM.grib2.bz2` - slRegex = r"s3://mf-nwp-models/arpege-([A-Za-z_\d]+)/v1/(\d{4})-(\d{2})-(\d{2})/(\d{2})/SP(\d{1})/(\d{2})H(\d{2})H.grib2" - # * Height Level: `MODEL_time-invariant_YYYYDDMMHH_SOME_PARAM.grib2.bz2` - hlRegex = r"s3://mf-nwp-models/arpege-([A-Za-z_\d]+)/v1/(\d{4})-(\d{2})-(\d{2})/(\d{2})/HP(\d{1})/(\d{2})H(\d{2})H.grib2" - # * Pressure Level: `MODEL_model-level_YYYYDDMMHH_XXX_LLL_SOME_PARAM.grib2.bz2` - plRegex = r"s3://mf-nwp-models/arpege-([A-Za-z_\d]+)/v1/(\d{4})-(\d{2})-(\d{2})/(\d{2})/IP(\d{1})/(\d{2})H(\d{2})H.grib2" - - itstring_year = itstring_month = itstring_day = itstring_hour = paramstring = "" - stepstring_start = stepstring_end = "00" - # Try to match the href to one of the regex patterns - slmatch = re.search(pattern=slRegex, string=baseurl + name) - hlmatch = re.search(pattern=hlRegex, string=baseurl + name) - plmatch = re.search(pattern=plRegex, string=baseurl + name) - - if slmatch and match_sl: - ( - _, - itstring_year, - itstring_month, - itstring_day, - itstring_hour, - paramstring, - stepstring_start, - stepstring_end, - ) = slmatch.groups() - elif hlmatch and match_hl: - ( - _, - itstring_year, - itstring_month, - itstring_day, - itstring_hour, - paramstring, - stepstring_start, - stepstring_end, - ) = hlmatch.groups() - elif plmatch and match_pl: - ( - _, - itstring_year, - itstring_month, - itstring_day, - itstring_hour, - paramstring, - stepstring_start, - stepstring_end, - ) = plmatch.groups() - else: - return None - - it = dt.datetime.strptime( - itstring_year + itstring_month + itstring_day + itstring_hour, "%Y%m%d%H" - ).replace(tzinfo=dt.UTC) - - # TODO Construct the public URL from S3 path? 
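For context, a minimal sketch of what the SP-group regex above pulls out of a full Arpege S3 path; the path is the one exercised by the removed test_client.py, and the variable names are illustrative only.

import datetime as dt
import re

SP_REGEX = (
    r"s3://mf-nwp-models/arpege-([A-Za-z_\d]+)/v1/"
    r"(\d{4})-(\d{2})-(\d{2})/(\d{2})/SP(\d{1})/(\d{2})H(\d{2})H.grib2"
)

# baseurl + "SP1/" + name, as in the removed tests
path = "s3://mf-nwp-models/arpege-world/v1/2023-12-03/12/SP1/00H24H.grib2"
model, year, month, day, hour, paramset, step_start, step_end = re.search(SP_REGEX, path).groups()
it = dt.datetime.strptime(year + month + day + hour, "%Y%m%d%H").replace(tzinfo=dt.UTC)
print(it, int(step_start), int(step_end))  # 2023-12-03 12:00:00+00:00 0 24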
- - return ArpegeFileInfo( - it=it, - filename=name, - currentURL=f"{baseurl}", - step=int(stepstring_start), - ) diff --git a/src/nwp_consumer/internal/inputs/meteofrance/test_client.py b/src/nwp_consumer/internal/inputs/meteofrance/test_client.py deleted file mode 100644 index 13b16128..00000000 --- a/src/nwp_consumer/internal/inputs/meteofrance/test_client.py +++ /dev/null @@ -1,99 +0,0 @@ -import datetime as dt -import pathlib -import unittest -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from ._models import ArpegeFileInfo - -from .client import Client, _parseArpegeFilename - -testClient = Client(model="global") - - -class TestClient(unittest.TestCase): - def test_mapCachedRaw(self) -> None: - - tests = [ - { - "filename": "SP1_00H24H_t.grib2", - "expected_dims": ("init_time", "step", "latitude", "longitude"), - "expected_var": "t", - }, - { - "filename": "HP1_00H24H_t.grib2", - "expected_dims": ("init_time", "step", "heightAboveGround", "latitude", "longitude"), - "expected_var": "t", - }, - { - "filename": "IP1_00H24H_t.grib2", - "expected_dims": ("init_time", "step", "isobaricInhPa", "latitude", "longitude"), - "expected_var": "t", - }, - ] - - for tst in tests: - with self.subTest(f"test file {tst['filename']}"): - out = testClient.mapCachedRaw(p=pathlib.Path(__file__).parent / tst["filename"]) - - # Check latitude and longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - tst["expected_dims"], - ) - - -class TestParseArpegeFilename(unittest.TestCase): - baseurl = "s3://mf-nwp-models/arpege-world/v1/2023-12-03/12/" - - def test_parsesSingleLevel(self) -> None: - filename: str = "00H24H.grib2" - - out: ArpegeFileInfo | None = _parseArpegeFilename( - name=filename, - baseurl=self.baseurl+"SP1/", - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename) - self.assertEqual(out.it(), dt.datetime(2023, 12, 3, 12, tzinfo=dt.timezone.utc)) - - def test_parsesHeightLevel(self) -> None: - filename: str = "00H24H.grib2" - - out: ArpegeFileInfo | None = _parseArpegeFilename( - name=filename, - baseurl=self.baseurl+"HP2/", - match_hl=True, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename) - self.assertEqual(out.it(), dt.datetime(2023, 12, 3, 12, tzinfo=dt.timezone.utc)) - - out: ArpegeFileInfo | None = _parseArpegeFilename( - name=filename, - baseurl=self.baseurl, - match_hl=False, - ) - self.assertIsNone(out) - - def test_parsesPressureLevel(self) -> None: - filename: str = "00H24H.grib2" - - out: ArpegeFileInfo | None = _parseArpegeFilename( - name=filename, - baseurl=self.baseurl+"IP4/", - match_pl=True, - ) - self.assertIsNotNone(out) - self.assertEqual(out.filename(), filename) - self.assertEqual(out.it(), dt.datetime(2023, 12, 3, 12, tzinfo=dt.timezone.utc)) - - out: ArpegeFileInfo | None = _parseArpegeFilename( - name=filename, - baseurl=self.baseurl, - match_pl=False, - ) - self.assertIsNone(out) diff --git a/src/nwp_consumer/internal/inputs/metoffice/README.md b/src/nwp_consumer/internal/inputs/metoffice/README.md deleted file mode 100644 index 2707687c..00000000 --- a/src/nwp_consumer/internal/inputs/metoffice/README.md +++ /dev/null @@ -1,202 +0,0 @@ -# MetOffice API - ---- - - -## Data - -Currently being fetched from our MetOffice orders: - -### `uk-5params-35steps` - -| Name | Long Name | Level | ID | Unit | 
-|-----------------------------------|----------------------------------------------------|--------------|-----------|--------| -| Low Cloud Cover | low-cloud-cover | `atmosphere` | `lcc` | % | -| Snow Depth | snow-depth-water-equivalent | `ground` | `sd` | kg m-2 | -| Downward Shortwave Radiation Flux | downward-short-wave-radiation-flux | `ground` | `dswrf` | W m-2 | -| Temperature at 1.5m | temperature | `agl` | `t2m` | K | -| Wind Direction at 10m | wind-direction-from-which-blowing-surface-adjusted | `agl` | `unknown` | | - -### `uk-11params-12steps` - -| Name | Long Name | Level | ID | Unit | -|--------------------------------------|------------------------------------|--------------|-----------|------------| -| High Cloud Cover | high-cloud-cover | `atmosphere` | `hcc` | % | -| Medium Cloud Cover | medium-cloud-cover | `atmosphere` | `mcc` | % | -| Low Cloud Cover | low-cloud-cover | `atmosphere` | `lcc` | % | -| Visibility at 1.5m | visibility | `agl` | `vis` | m | -| Relative Humidity at 1.5m | relative-humidity | `agl` | `r2` | % | -| Rain Precipitation Rate | rain-precipitation-rate | `ground` | `rprate` | kg m-2 s-1 | -| Snow Depth - ground | snow-depth-water-equivalent | `ground` | `sd` | kg m-2 | -| Downward Longwave Radiation Flux | downward-long-wave-radiation-flux | `ground` | `dlwrf` | W m-2 | -| Downward Shortwave Radiation Flux | downward-short-wave-radiation-flux | `ground` | `dswrf` | W m-2 | -| Temperature at 1.5m | temperature | `agl` | `t2m` | K | -| Wind Speed at 10m (Surface Adjusted) | wind-speed-surface-adjusted | `agl` | `unknown` | m s-1 | - -> :warning: **NOTE:** The two wind parameters are read in from their grib files as "unknown" - -## Parameter names in datasets - -These orders may provide multiple time steps per "latest" file list. - -Each parameter is loaded as a separate grib file. - -
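To tell the two `unknown` wind parameters apart, the client inspects the GRIB `parameterNumber` attribute (194 is wind direction, 195 is wind speed, both surface adjusted). A rough sketch of that check, assuming `cfgrib` is installed and using a placeholder filename:

    import xarray as xr

    # Placeholder path: any of the order's wind GRIB files would do.
    ds = xr.open_dataset(
        "agl_wind-speed-surface-adjusted_2023030810.grib",
        engine="cfgrib",
        backend_kwargs={"read_keys": ["name", "parameterNumber"], "indexpath": ""},
    )
    var = next(iter(ds.data_vars))  # loads as "unknown"
    number = ds[var].attrs["GRIB_parameterNumber"]
    print(var, number)  # 194 -> wind direction, 195 -> wind speed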
- Datasets - - --- relative-humidity-1.5 --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 12:00:00 - heightAboveGround float64 1.5 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - r2 (step, y, x) float32 ... - - --- temperature 1.5m --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 12:00:00 - heightAboveGround float64 1.5 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - t2m (step, y, x) float32 ... (t2m because it's called "temperature 2m", even though it's at 1.5m) - - --- visibility 1.5 --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 12:00:00 - heightAboveGround float64 1.5 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - vis (step, y, x) float32 ... - - --- wind speed surface adjusted --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 12:00:00 - heightAboveGround float64 10.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - unknown (step, y, x) float32 ... - - --- high cloud cover --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - atmosphere float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - hcc (step, y, x) float32 ... - - --- low cloud cover --- - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - atmosphere float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - lcc (step, y, x) float32 ... - - --- medium cloud cover --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - atmosphere float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - mcc (step, y, x) float32 ... - - --- downward longwave radiation flux --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - surface float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - dlwrf (step, y, x) float32 ... 
- - --- downward shortwave radiation flux --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - surface float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - dswrf (step, y, x) float32 ... - - --- snow depth --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T10:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - surface float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - sd (step, y, x) float32 ... - - --- rain precipitation rate --- - Dimensions: (step: 10, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T21:00:00 - * step (step) timedelta64[ns] 00:00:00 01:00:00 ... 08:00:00 12:00:00 - surface float64 0.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - rprate (step, y, x) float32 ... - - --- wind direction from which blowing surface adjusted --- - Dimensions: (step: 36, y: 639, x: 455) - Coordinates: - time datetime64[ns] 2023-03-08T21:00:00 - * step (step) timedelta64[ns] 00:00:00 ... 1 days 11:00:00 - heightAboveGround float64 10.0 - latitude (y, x) float64 ... - longitude (y, x) float64 ... - valid_time (step) datetime64[ns] ... - Dimensions without coordinates: y, x - Data variables: - unknown (step, y, x) float32 ... - -
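The dumps above all share the same shape: a scalar `time`, a `step` axis and a `(y, x)` grid with 2-D `latitude`/`longitude` coordinates. A minimal sketch of the reshaping convention the consumer applies to such data, on a purely synthetic dataset (the values and sizes are made up):

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic stand-in for one parameter: scalar time, a step axis, small y/x grid.
ds = xr.Dataset(
    data_vars={"dswrf": (("step", "y", "x"), np.zeros((3, 4, 5), dtype=np.float32))},
    coords={
        "time": pd.Timestamp("2023-03-08T10:00"),
        "step": pd.to_timedelta([0, 1, 2], unit="h"),
    },
)

# Rename time to init_time, promote it to a dimension, and fix the dim order,
# mirroring the mapCachedRaw implementations in this package.
ds = (
    ds.rename({"time": "init_time"})
    .expand_dims("init_time")
    .transpose("init_time", "step", "y", "x")
    .sortby("step")
)
print(ds.dswrf.dims)  # ('init_time', 'step', 'y', 'x')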
diff --git a/src/nwp_consumer/internal/inputs/metoffice/__init__.py b/src/nwp_consumer/internal/inputs/metoffice/__init__.py deleted file mode 100644 index 74f4c648..00000000 --- a/src/nwp_consumer/internal/inputs/metoffice/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ['Client'] - -from .client import Client diff --git a/src/nwp_consumer/internal/inputs/metoffice/_models.py b/src/nwp_consumer/internal/inputs/metoffice/_models.py deleted file mode 100644 index b6313151..00000000 --- a/src/nwp_consumer/internal/inputs/metoffice/_models.py +++ /dev/null @@ -1,58 +0,0 @@ -import datetime as dt -from typing import ClassVar - -from marshmallow import EXCLUDE, Schema, fields -from marshmallow_dataclass import dataclass - -from nwp_consumer import internal - - -@dataclass -class MetOfficeFileInfo(internal.FileInfoModel): - - class Meta: - unknown = EXCLUDE - - fileId: str - runDateTime: dt.datetime - - Schema: ClassVar[type[Schema]] = Schema # To prevent confusing type checkers - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self.runDateTime.replace(tzinfo=None) - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self.fileId + ".grib" - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"{self.fileId}/data" - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() - - def variables(self) -> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() - - -@dataclass -class MetOfficeOrderDetails: - - class Meta: - unknown = EXCLUDE - - files: list[MetOfficeFileInfo] = fields.List(fields.Nested(MetOfficeFileInfo.Schema())) - - Schema: ClassVar[type[Schema]] = Schema # To prevent confusing type checkers - - -@dataclass -class MetOfficeResponse: - - orderDetails: MetOfficeOrderDetails - - Schema: ClassVar[type[Schema]] = Schema # To prevent confusing type checkers diff --git a/src/nwp_consumer/internal/inputs/metoffice/client.py b/src/nwp_consumer/internal/inputs/metoffice/client.py deleted file mode 100644 index b003ecef..00000000 --- a/src/nwp_consumer/internal/inputs/metoffice/client.py +++ /dev/null @@ -1,337 +0,0 @@ -"""Implements a client to fetch the data from the MetOffice API.""" - -import datetime as dt -import pathlib -import urllib.request - -import pyproj -import requests -import structlog.stdlib -import xarray as xr - -from nwp_consumer import internal - -from ._models import MetOfficeFileInfo, MetOfficeResponse - -log = structlog.getLogger() - -class Client(internal.FetcherInterface): - """Implements a client to fetch the data from the MetOffice API.""" - - # Base https URL for MetOffice's data endpoint - baseurl: str - - # Query string headers to pass to the MetOffice API - __headers: dict[str, str] - - def __init__(self, *, orderID: str, apiKey: str) -> None: - """Create a new MetOfficeClient. - - Exposes a client for the MetOffice API which conforms to the FetcherInterface. - MetOffice API credentials must be provided, as well as an orderID for the - desired dataset. - - Args: - orderID: The orderID to fetch from the MetOffice API. - apiKey: The apiKey to use to authenticate with the MetOffice API. 
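A rough sketch of the request this client makes to list the latest files in an order, using placeholder credentials and assuming the response matches the schema in _models.py above; the real client wraps this in error handling and marshmallow schema validation.

import requests

order_id, api_key = "my-order-id", "my-api-key"  # placeholders

url = f"https://data.hub.api.metoffice.gov.uk/atmospheric-models/1.0.0/orders/{order_id}/latest"
response = requests.get(
    url,
    headers={"accept": "application/json", "apikey": api_key},
    params={"detail": "MINIMAL"},
    timeout=30,
)
for f in response.json()["orderDetails"]["files"]:
    print(f["fileId"], f["runDateTime"])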
- """ - if any(value in [None, "", "unset"] for value in [apiKey, orderID]): - raise KeyError("must provide apiKey and orderID for MetOffice API") - self.baseurl: str = ( - f"https://data.hub.api.metoffice.gov.uk/atmospheric-models/1.0.0/orders/{orderID}/latest" - ) - self.querystring: dict[str, str] = {"detail": "MINIMAL"} - self.__headers: dict[str, str] = { - "accept": "application/json, application/json", - "apikey": apiKey, - } - - def datasetName(self) -> str: - """Overrides the corresponding method in FetcherInterface.""" - return "UKV" - - def getInitHours(self) -> list[int]: # noqa: D102 - # NOTE: This will depend on the order you have with the MetOffice. - # Technically they can provide data for every hour of the day, - # but OpenClimateFix choose to match what is available from CEDA. - return [0, 3, 6, 9, 12, 15, 18, 21] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - if ( - self.__headers.get("apikey") is None - ): - log.error("all metoffice API credentials not provided") - return [] - - if it.date() != dt.datetime.now(tz=dt.UTC).date(): - log.warn("metoffice API only supports fetching data for the current day") - return [] - - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - # Fetch info for all files available on the input date - response: requests.Response = requests.request( - method="GET", - url=self.baseurl, - headers=self.__headers, - params=self.querystring, - ) - try: - rj: dict = response.json() - except Exception as e: - log.warn( - event="error parsing response from filelist endpoint", - error=e, - response=response.content, - ) - return [] - if not response.ok or ("httpCode" in rj and int(rj["httpCode"]) > 399): - log.warn( - event="error response from filelist endpoint", - url=response.url, - response=rj, - ) - return [] - - # Map the response to a MetOfficeResponse object - try: - responseObj: MetOfficeResponse = MetOfficeResponse.Schema().load(response.json()) - except Exception as e: - log.warn( - event="response from metoffice does not match expected schema", - error=e, - response=response.json(), - ) - return [] - - # Filter the file infos for the desired init time - wantedFileInfos: list[MetOfficeFileInfo] = [ - fo for fo in responseObj.orderDetails.files if _isWantedFile(fi=fo, dit=it) - ] - - return wantedFileInfos - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - if ( - self.__headers.get("apikey") is None - ): - log.error("all metoffice API credentials not provided") - return pathlib.Path() - - log.debug( - event="requesting download of file", - file=fi.filename(), - ) - url: str = f"{self.baseurl}/{fi.filepath()}" - try: - opener = urllib.request.build_opener() - opener.addheaders = list( - dict( - self.__headers, - **{"accept": "application/x-grib"}, - ).items(), - ) - urllib.request.install_opener(opener) - response = urllib.request.urlopen(url=url) - if response.status != 200: - log.warn( - event="error response received for download file request", - response=response.json(), - url=url, - ) - return pathlib.Path() - except Exception as e: - log.warn( - event="error calling url for file", - url=url, - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - # Stream the filedata into cache - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with cfp.open("wb") as f: - for chunk in iter(lambda: response.read(16 * 1024), b""): - 
f.write(chunk) - f.flush() - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=url, - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: # noqa: D102 - if p.suffix != ".grib": - log.warn( - event="cannot map non-grib file to dataset", - filepath=p.as_posix(), - ) - return xr.Dataset() - - log.debug( - event="mapping raw file to xarray dataset", - filepath=p.as_posix(), - ) - - # Cfgrib is built upon eccodes which needs an in-memory file to read from - # Load the GRIB file as a cube - try: - # Read the file as a dataset, also reading the values of the keys in 'read_keys' - parameterDataset: xr.Dataset = xr.open_dataset( - p.as_posix(), - engine="cfgrib", - backend_kwargs={"read_keys": ["name", "parameterNumber"], "indexpath": ""}, - chunks={ - "time": 1, - "step": -1, - "x": "auto", - "y": "auto", - }, - ) - except Exception as e: - log.warn( - event="error loading raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return xr.Dataset() - - # Make the DataArray OCF-compliant - # 1. Rename the parameter to the OCF short name - currentName = next(iter(parameterDataset.data_vars)) - parameterNumber = parameterDataset[currentName].attrs["GRIB_parameterNumber"] - - # The two wind dirs are the only parameters read in as "unknown" - # * Tell them apart via the parameterNumber attribute - # which lines up with the last number in the GRIB2 code specified below - # https://gridded-data-ui.cda.api.metoffice.gov.uk/glossary?groups=Wind&sortOrder=GRIB2_CODE - match currentName, parameterNumber: - case "unknown", 194: - parameterDataset = parameterDataset.rename( - { - currentName: internal.OCFParameter.WindDirectionFromWhichBlowingSurfaceAdjustedAGL.value, - }, - ) - case "unknown", 195: - parameterDataset = parameterDataset.rename( - {currentName: internal.OCFParameter.WindSpeedSurfaceAdjustedAGL.value}, - ) - - # There is some weird behaviour with the radiation parameters, and different setups - # this is a catch all situation (hopefully) - case "sdswrf", 7: - parameterDataset = parameterDataset.rename( - {currentName: 'dswrf'}, - ) - case "sdlwrf", 3: - parameterDataset = parameterDataset.rename( - {currentName: 'dlwrf'}, - ) - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - # * Reverse `latitude` so it's top-to-bottom via reindexing. - parameterDataset = ( - parameterDataset.drop_vars( - names=[ - "height", - "pressure", - "valid_time", - "surface", - "heightAboveGround", - "atmosphere", - "cloudBase", - "meanSea", - "heightAboveGroundLayer", - "level", - ], - errors="ignore", - ) - .rename({"time": "init_time"}) - .expand_dims(["init_time"]) - .sortby("y", ascending=False) - .transpose("init_time", "step", "y", "x") - .sortby("step") - .chunk( - { - "init_time": 1, - "step": -1, - "y": len(parameterDataset.y) // 2, - "x": len(parameterDataset.x) // 2, - }, - ) - ) - - # TODO: Remove this by moving this logic into ocf-datapipes and update PVNet1+2 to use that - # TODO: See issue #26 https://github.com/openclimatefix/nwp-consumer/issues/26 - # 5. Create osgb x and y coordinates from the lat/lon coordinates - # * The lat/lon coordinates are WGS84, i.e. 
EPSG:4326 - # * The OSGB coordinates are EPSG:27700 - # * Approximate the osgb values by taking the first row and column of the - # transformed x/y grids - latlonOsgbTransformer = pyproj.Transformer.from_crs( - crs_from=4326, - crs_to=27700, - always_xy=True, - ) - osgbX, osgbY = latlonOsgbTransformer.transform( - parameterDataset.longitude.values, - parameterDataset.latitude.values, - ) - osgbX = osgbX.astype(int) - osgbY = osgbY.astype(int) - parameterDataset = parameterDataset.assign_coords( - { - "x": osgbX[0], - "y": [osgbY[i][0] for i in range(len(osgbY))], - }, - ) - - return parameterDataset - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - return { - "t2m": internal.OCFParameter.TemperatureAGL, - "si10": internal.OCFParameter.WindSpeedSurfaceAdjustedAGL, - "wdir10": internal.OCFParameter.WindDirectionFromWhichBlowingSurfaceAdjustedAGL, - "hcc": internal.OCFParameter.HighCloudCover, - "mcc": internal.OCFParameter.MediumCloudCover, - "lcc": internal.OCFParameter.LowCloudCover, - "vis": internal.OCFParameter.VisibilityAGL, - "r2": internal.OCFParameter.RelativeHumidityAGL, - "rprate": internal.OCFParameter.RainPrecipitationRate, - "tprate": internal.OCFParameter.RainPrecipitationRate, - "sd": internal.OCFParameter.SnowDepthWaterEquivalent, - "dswrf": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "dlwrf": internal.OCFParameter.DownwardLongWaveRadiationFlux, - } - - -def _isWantedFile(*, fi: MetOfficeFileInfo, dit: dt.datetime) -> bool: - """Check if the input FileInfo corresponds to a wanted GRIB file. - - :param fi: FileInfo describing the file to check - :param dit: Desired init time - """ - # False if item has an init_time not equal to desired init time - if fi.it().replace(tzinfo=None) != dit.replace(tzinfo=None): - return False - # False if item is one of the ones ending in +HH - if "+" in fi.filename(): - return False - - return True diff --git a/src/nwp_consumer/internal/inputs/metoffice/test_client.py b/src/nwp_consumer/internal/inputs/metoffice/test_client.py deleted file mode 100644 index 1af02b34..00000000 --- a/src/nwp_consumer/internal/inputs/metoffice/test_client.py +++ /dev/null @@ -1,113 +0,0 @@ -"""Tests for the metoffice module.""" - -import datetime as dt -import pathlib -import unittest.mock - -from ._models import MetOfficeFileInfo -from .client import Client, _isWantedFile - -# --------- Test setup --------- # - -testClient = Client( - orderID="tmp", - apiKey="tmp", -) - -# --------- Client methods --------- # - - -class TestClient_Init(unittest.TestCase): - """Tests for the MetOfficeClient.__init__ method.""" - - def test_errorsWhenVariablesAreNotSet(self) -> None: - with self.assertRaises(KeyError): - _ = Client(orderID="tmp", apiKey="") - - -class TestClient(unittest.TestCase): - """Tests for the MetOfficeClient.""" - - def test_mapCachedRaw(self) -> None: - - tests = [ - { - "filename": "test_knownparam.grib", - "expected_dims": ["init_time", "step", "y", "x"], - "expected_var": "dswrf", - }, - { - "filename": "test_unknownparam1.grib", - "expected_dims": ["init_time", "step", "y", "x"], - "expected_var": "wdir10", - }, - { - "filename": "test_unknownparam2.grib", - "expected_dims": ["init_time", "step", "y", "x"], - "expected_var": "si10", - }, - ] - - for tst in tests: - with self.subTest(f"test file {tst['filename']}"): - out = testClient.mapCachedRaw(p=pathlib.Path(__file__).parent / tst["filename"]) - - # Ensure the dimensions of the variables are correct - 
for data_var in out.data_vars: - self.assertEqual(list(out[data_var].dims), tst["expected_dims"], - msg=f'Dims "{list(out[data_var].dims)}" not as expected in {tst}') - # Ensure the correct variable is in the data_vars - self.assertTrue(tst["expected_var"] in list(out.data_vars.keys()), - msg=f'Variable "{list(out.data_vars.keys())}" not as expected in {tst}') - # Ensure no unknowns - self.assertNotIn("unknown", list(out.data_vars.keys())) - - -# --------- Static methods --------- # - - -class Test_IsWantedFile(unittest.TestCase): - """Tests for the _isWantedFile method.""" - - def test_correctlyFiltersMetOfficeFileInfos(self) -> None: - initTime: dt.datetime = dt.datetime( - year=2023, - month=3, - day=24, - hour=0, - minute=0, - tzinfo=dt.timezone.utc, - ) - - wantedFileInfos: list[MetOfficeFileInfo] = [ - MetOfficeFileInfo( - fileId="agl_temperature_1.5_2023032400", - runDateTime=dt.datetime( - year=2023, month=3, day=24, hour=0, minute=0, tzinfo=dt.timezone.utc, - ), - ), - MetOfficeFileInfo( - fileId="ground_downward-short-wave-radiation-flux_2023032400", - runDateTime=dt.datetime( - year=2023, month=3, day=24, hour=0, minute=0, tzinfo=dt.timezone.utc, - ), - ), - ] - - unwantedFileInfos: list[MetOfficeFileInfo] = [ - MetOfficeFileInfo( - fileId="agl_temperature_1.5+00", - runDateTime=dt.datetime( - year=2023, month=3, day=24, hour=0, minute=0, tzinfo=dt.timezone.utc, - ), - ), - MetOfficeFileInfo( - fileId="agl_temperature_1.5_2023032403", - runDateTime=dt.datetime( - year=2023, month=3, day=24, hour=3, minute=0, tzinfo=dt.timezone.utc, - ), - ), - ] - - self.assertTrue(all(_isWantedFile(fi=fo, dit=initTime) for fo in wantedFileInfos)) - self.assertFalse(all(_isWantedFile(fi=fo, dit=initTime) for fo in unwantedFileInfos)) diff --git a/src/nwp_consumer/internal/inputs/metoffice/test_knownparam.grib b/src/nwp_consumer/internal/inputs/metoffice/test_knownparam.grib deleted file mode 100644 index bdae72b1..00000000 Binary files a/src/nwp_consumer/internal/inputs/metoffice/test_knownparam.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam1.grib b/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam1.grib deleted file mode 100644 index e5f86cf9..00000000 Binary files a/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam1.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam2.grib b/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam2.grib deleted file mode 100644 index df619082..00000000 Binary files a/src/nwp_consumer/internal/inputs/metoffice/test_unknownparam2.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/metoffice/test_wrongnameparam.grib b/src/nwp_consumer/internal/inputs/metoffice/test_wrongnameparam.grib deleted file mode 100644 index d7c94424..00000000 Binary files a/src/nwp_consumer/internal/inputs/metoffice/test_wrongnameparam.grib and /dev/null differ diff --git a/src/nwp_consumer/internal/inputs/noaa/__init__.py b/src/nwp_consumer/internal/inputs/noaa/__init__.py deleted file mode 100644 index c0ab0b44..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -__all__ = ["AWSClient", "NCARClient"] - -from .aws import Client as AWSClient -from .ncar import Client as NCARClient \ No newline at end of file diff --git a/src/nwp_consumer/internal/inputs/noaa/_consts.py b/src/nwp_consumer/internal/inputs/noaa/_consts.py deleted file mode 100644 index e6f4413f..00000000 --- 
a/src/nwp_consumer/internal/inputs/noaa/_consts.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Defines all parameters available from NOAA.""" - -GFS_VARIABLES = ['siconc_surface_instant', 'slt_surface_instant', 'cape_surface_instant', 't_surface_instant', - 'sp_surface_instant', 'lsm_surface_instant', 'sr_surface_instant', 'vis_surface_instant', - 'prate_surface_instant', 'acpcp_surface_accum', 'sde_surface_instant', 'cin_surface_instant', - 'orog_surface_instant', 'tp_surface_accum', 'lhtfl_surface_avg', 'shtfl_surface_avg', - 'crain_surface_instant', 'cfrzr_surface_instant', 'cicep_surface_instant', 'csnow_surface_instant', - 'cprat_surface_instant', 'cpofp_surface_instant', 'pevpr_surface_instant', 'sdwe_surface_instant', - 'uflx_surface_avg', 'vflx_surface_avg', 'gust_surface_instant', 'fricv_surface_instant', - 'u-gwd_surface_avg', 'v-gwd_surface_avg', 'hpbl_surface_instant', 'dswrf_surface_avg', - 'uswrf_surface_avg', 'dlwrf_surface_avg', 'ulwrf_surface_avg', 'lftx_surface_instant', - '4lftx_surface_instant', 'veg_surface_instant', 'watr_surface_accum', 'gflux_surface_avg', - 'fco2rec_surface_instant', 'hindex_surface_instant', 'wilt_surface_instant', 'fldcp_surface_instant', - 'al_surface_avg', 'SUNSD_surface_instant', 'prate_surface_avg', 'crain_surface_avg', - 'cfrzr_surface_avg', 'cicep_surface_avg', 'csnow_surface_avg', 'cprat_surface_avg', 'pres_instant', - 'q_instant', 't_instant', 'u_instant', 'v_instant', 'u10_instant', 'v10_instant', 't2m_instant', - 'd2m_instant', 'tmax_max', 'tmin_min', 'sh2_instant', 'r2_instant', 'aptmp_instant', 'u100_instant', - 'v100_instant', 'refd_instant', 't', 'u', 'v', 'q', 'w', 'gh', 'r', 'absv', 'o3mr', 'wz', 'tcc', - 'clwmr', 'icmr', 'rwmr', 'snmr', 'grle', ] - -MISSING_STEP_0_VARIABLES = ['slt_surface_instant', 'sr_surface_instant', 'acpcp_surface_accum', 'tp_surface_accum', - 'lhtfl_surface_avg', 'shtfl_surface_avg', 'cprat_surface_instant', 'pevpr_surface_instant', - 'uflx_surface_avg', 'vflx_surface_avg', 'fricv_surface_instant', 'u-gwd_surface_avg', - 'v-gwd_surface_avg', 'dswrf_surface_avg', 'uswrf_surface_avg', 'dlwrf_surface_avg', - 'ulwrf_surface_avg', 'veg_surface_instant', 'watr_surface_accum', 'gflux_surface_avg', - 'fco2rec_surface_instant', 'al_surface_avg', 'prate_surface_avg', 'crain_surface_avg', - 'cfrzr_surface_avg', 'cicep_surface_avg', 'csnow_surface_avg', 'cprat_surface_avg', - 'tmax_max', 'tmin_min', 'refd_instant', 'q', ] - -EXTRA_STEP_0_VARIABLES = ["landn_surface_instant", "5wavh"] diff --git a/src/nwp_consumer/internal/inputs/noaa/_models.py b/src/nwp_consumer/internal/inputs/noaa/_models.py deleted file mode 100644 index 15388605..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/_models.py +++ /dev/null @@ -1,37 +0,0 @@ -import datetime as dt - -from nwp_consumer import internal - - -class NOAAFileInfo(internal.FileInfoModel): - def __init__( - self, - it: dt.datetime, - filename: str, - currentURL: str, - step: int, - ) -> None: - self._it = it - self._filename = filename - self._url = currentURL - self.step = step - - def filename(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._filename - - def filepath(self) -> str: - """Overrides the corresponding method in the parent class.""" - return self._url + "/" + self._filename - - def it(self) -> dt.datetime: - """Overrides the corresponding method in the parent class.""" - return self._it - - def steps(self) -> list[int]: - """Overrides the corresponding method in the parent class.""" - return [self.step] - - def variables(self) 
-> list[str]: - """Overrides the corresponding method in the parent class.""" - raise NotImplementedError() diff --git a/src/nwp_consumer/internal/inputs/noaa/aws.py b/src/nwp_consumer/internal/inputs/noaa/aws.py deleted file mode 100644 index 522c2fe4..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/aws.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Implements a client to fetch NOAA data from AWS.""" -import datetime as dt -import pathlib -import typing -import urllib.request - -import cfgrib -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._consts import GFS_VARIABLES -from ._models import NOAAFileInfo - -log = structlog.getLogger() - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("init_time", "step", "latitude", "longitude") - - -class Client(internal.FetcherInterface): - """Implements a client to fetch NOAA data from AWS.""" - - baseurl: str # The base URL for the NOAA model - model: str # The model to fetch data for - parameters: list[str] # The parameters to fetch - - def __init__(self, model: str, hours: int = 48, param_group: str = "default") -> None: - """Create a new NOAA Client. - - Exposes a client for NOAA data from AWS that conforms to the FetcherInterface. - - Args: - model: The model to fetch data for. Valid models is "global". - param_group: The set of parameters to fetch. - Valid groups are "default", "full", and "basic". - """ - self.baseurl = "https://noaa-gfs-bdp-pds.s3.amazonaws.com" - - match (param_group, model): - case ("default", _): - self.parameters = [ - "t2m", - "tcc", - "mcc", - "hcc", - "lcc", - "dswrf", - "dlwrf", - "prate", - "sdwe", - "r", - "vis", - "u10", - "v10", - "u100", - "v100", - ] - case ("basic", "global"): - self.parameters = ["t2m", "dswrf"] - case ("full", "global"): - raise ValueError("full parameter group is not yet implemented for GFS") - case (_, _): - raise ValueError( - f"unknown parameter group {param_group}." 
- "Valid groups are 'default', 'full', 'basic'", - ) - - self.model = model - self.hours = hours - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"NOAA_{self.model}".upper() - - def getInitHours(self) -> list[int]: # noqa: D102 - return [0, 6, 12, 18] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - files: list[internal.FileInfoModel] = [] - - # Files are split per timestep - # And the url includes the time and init time - # https://noaa-gfs-bdp-pds.s3.amazonaws.com/gfs.20201206/00/atmos/gfs.t00z.pgrb2.1p00.f000 - for step in range(0, self.hours + 1, 3): - files.append( - NOAAFileInfo( - it=it, - filename=f"gfs.t{it.hour:02}z.pgrb2.1p00.f{step:03}", - currentURL=f"{self.baseurl}/gfs.{it.strftime('%Y%m%d')}/{it.hour:02}/atmos", - step=step, - ), - ) - - log.debug( - event="listed files for init time", - inittime=it.strftime("%Y-%m-%d %H:%M"), - numfiles=len(files), - ) - - return files - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: # noqa: D102 - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the raw file as a dataset - try: - ds = cfgrib.open_datasets( - p.as_posix(), - backend_kwargs={ - "indexpath": "", - "errors": "ignore", - }, - ) - except Exception as e: - log.warn( - event="error converting raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return xr.Dataset() - - log.debug(event=f"Loaded the file {p.as_posix()}, and now processing it") - # Process all the parameters into a single file - ds = [ - d - for d in ds - if any(x in d.coords for x in ["surface", "heightAboveGround", "isobaricInhPa"]) - ] - - # Split into surface, heightAboveGround, and isobaricInhPa lists - surface = [d for d in ds if "surface" in d.coords] - heightAboveGround = [d for d in ds if "heightAboveGround" in d.coords] - isobaricInhPa = [d for d in ds if "isobaricInhPa" in d.coords] - - # * Drop any variables we are not intrested in keeping - for i, d in enumerate(surface): - unwanted_variables = [v for v in d.data_vars if v not in self.parameters] - surface[i] = d.drop_vars(unwanted_variables) - for i, d in enumerate(heightAboveGround): - unwanted_variables = [v for v in d.data_vars if v not in self.parameters] - heightAboveGround[i] = d.drop_vars(unwanted_variables) - for i, d in enumerate(isobaricInhPa): - unwanted_variables = [v for v in d.data_vars if v not in self.parameters] - isobaricInhPa[i] = d.drop_vars(unwanted_variables) - - surface_merged = xr.merge(surface, compat="override").drop_vars( - ["unknown_surface_instant", "valid_time"], - errors="ignore", - ) - del surface - # Drop unknown data variable - hag_merged = xr.merge(heightAboveGround).drop_vars("valid_time", errors="ignore") - del heightAboveGround - iso_merged = xr.merge(isobaricInhPa).drop_vars("valid_time", errors="ignore") - del isobaricInhPa - - log.debug(event='Merging surface, hag and iso backtogether') - - total_ds = ( - xr.merge([surface_merged, hag_merged, iso_merged]) - .rename({"time": "init_time"}) - .expand_dims("init_time") - .expand_dims("step") - .transpose("init_time", "step", ...) 
- .sortby("step") - .chunk({"init_time": 1, "step": 1}) - ) - del surface_merged, hag_merged, iso_merged - - ds = total_ds.drop_dims([c for c in list(total_ds.sizes.keys()) if c not in COORDINATE_ALLOW_LIST]) - - log.debug(event='Finished mapping raw file to xarray', filename=p.as_posix()) - - return ds - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - try: - response = urllib.request.urlopen(fi.filepath()) - except Exception as e: - log.warn( - event="error calling url for file", - url=fi.filepath(), - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - if response.status != 200: - log.warn( - event="error downloading file", - status=response.status, - url=fi.filepath(), - filename=fi.filename(), - ) - return pathlib.Path() - - # Extract the bz2 file when downloading - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with open(cfp, "wb") as f: - f.write(response.read()) - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - # See https://www.nco.ncep.noaa.gov/pmb/products/gfs/gfs.t00z.pgrb2.0p25.f003.shtml for a list of NOAA GFS - return { - "t2m_instant": internal.OCFParameter.TemperatureAGL, - "tcc": internal.OCFParameter.HighCloudCover, - "dswrf_surface_avg": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "dlwrf_surface_avg": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "sdwe_surface_instant": internal.OCFParameter.SnowDepthWaterEquivalent, - "r": internal.OCFParameter.RelativeHumidityAGL, - "u10_instant": internal.OCFParameter.WindUComponentAGL, - "v10_instant": internal.OCFParameter.WindVComponentAGL, - "u100_instant": internal.OCFParameter.WindUComponent100m, - "v100_instant": internal.OCFParameter.WindVComponent100m, - } - diff --git a/src/nwp_consumer/internal/inputs/noaa/ncar.py b/src/nwp_consumer/internal/inputs/noaa/ncar.py deleted file mode 100644 index f3655379..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/ncar.py +++ /dev/null @@ -1,222 +0,0 @@ -"""Implements a client to fetch NOAA data from NCAR.""" -import datetime as dt -import pathlib -import typing -import urllib.request - -import cfgrib -import structlog -import xarray as xr - -from nwp_consumer import internal - -from ._consts import GFS_VARIABLES -from ._models import NOAAFileInfo - -log = structlog.getLogger() - -COORDINATE_ALLOW_LIST: typing.Sequence[str] = ("time", "step", "latitude", "longitude") - - -class Client(internal.FetcherInterface): - """Implements a client to fetch NOAA data from NCAR.""" - - baseurl: str # The base URL for the NOAA model - model: str # The model to fetch data for - parameters: list[str] # The parameters to fetch - - def __init__(self, model: str, hours: int = 48, param_group: str = "default") -> None: - """Create a new NOAA Client. - - Exposes a client for NOAA data from NCAR that conforms to the FetcherInterface. - - Args: - model: The model to fetch data for. Valid models are "global". - param_group: The set of parameters to fetch. - Valid groups are "default", "full", and "basic". 
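As a pointer for the two GFS clients in this diff, a small sketch of the download URLs they construct for a given init time and step; the init time here is arbitrary, and the layout simply mirrors listRawFilesForInitTime in aws.py and ncar.py.

import datetime as dt

it = dt.datetime(2023, 12, 3, 6, tzinfo=dt.UTC)
step = 3

# AWS open-data bucket (aws.py): one file per 3-hourly step.
aws_url = (
    f"https://noaa-gfs-bdp-pds.s3.amazonaws.com/gfs.{it:%Y%m%d}/{it:%H}/atmos/"
    f"gfs.t{it:%H}z.pgrb2.1p00.f{step:03}"
)

# NCAR RDA archive ds084.1 (ncar.py).
ncar_url = (
    f"https://data.rda.ucar.edu/ds084.1/{it:%Y}/{it:%Y%m%d}/"
    f"gfs.0p25.{it:%Y%m%d%H}.f{step:03}.grib2"
)
print(aws_url)
print(ncar_url)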
- """ - self.baseurl = "https://data.rda.ucar.edu/ds084.1" - - match (param_group, model): - case ("default", _): - self.parameters = ["t2m_instant", "tcc", "dswrf_surface_avg", "dlwrf_surface_avg", - "sdwe_surface_instant", "r", "u10_instant", "v10_instant"] - case ("basic", "global"): - self.parameters = ["t2m_instant", "dswrf_surface_avg"] - case ("full", "global"): - self.parameters = GFS_VARIABLES - case (_, _): - raise ValueError( - f"unknown parameter group {param_group}." - "Valid groups are 'default', 'full', 'basic'", - ) - - self.model = model - self.hours = hours - - def datasetName(self) -> str: - """Overrides the corresponding method in the parent class.""" - return f"NOAA_{self.model}".upper() - - def getInitHours(self) -> list[int]: # noqa: D102 - return [0, 6, 12, 18] - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: # noqa: D102 - - # Ignore inittimes that don't correspond to valid hours - if it.hour not in self.getInitHours(): - return [] - - # The GFS dataset goes from 2015-01-15 to present - # * https://rda.ucar.edu/datasets/ds084.1/ - if it < dt.datetime(2015, 1, 15, tzinfo=dt.UTC): - return [] - - files: list[internal.FileInfoModel] = [] - - # The GFS dataset has data in hour jumps of 3 up to 240 - for step in range(0, self.hours + 1, 3): - filename = f"gfs.0p25.{it.strftime('%Y%m%d%H')}.f{step:03}.grib2" - files.append( - NOAAFileInfo( - it=it, - filename=filename, - currentURL=f"{self.baseurl}/{it.strftime('%Y')}/{it.strftime('%Y%m%d')}", - step=step, - ), - ) - - return files - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: # noqa: D102 - if p.suffix != ".grib2": - log.warn( - event="cannot map non-grib file to dataset", - filepath=p.as_posix(), - ) - return xr.Dataset() - - log.debug(event="mapping raw file to xarray dataset", filepath=p.as_posix()) - - # Load the raw file as a list of datasets - try: - ds: list[xr.Dataset] = cfgrib.open_datasets( - p.as_posix(), - ) - except Exception as e: - log.error( - event="error converting raw file as dataset", - error=e, - filepath=p.as_posix(), - ) - return xr.Dataset() - - # Process all the parameters into a single file - ds = [ - d for d in ds - if any(x in d.coords for x in ["surface", "heightAboveGround", "isobaricInhPa"]) - ] - - # Split into surface, heightAboveGround, and isobaricInhPa lists - surface: list[xr.Dataset] = [d for d in ds if "surface" in d.coords] - heightAboveGround: list[xr.Dataset] = [d for d in ds if "heightAboveGround" in d.coords] - isobaricInhPa: list[xr.Dataset] = [d for d in ds if "isobaricInhPa" in d.coords] - del ds - - # Update name of each data variable based off the attribute GRIB_stepType - for i, d in enumerate(surface): - for variable in d.data_vars: - d = d.rename({variable: f"{variable}_surface_{d[f'{variable}'].attrs['GRIB_stepType']}"}) - surface[i] = d - for i, d in enumerate(heightAboveGround): - for variable in d.data_vars: - d = d.rename({variable: f"{variable}_{d[f'{variable}'].attrs['GRIB_stepType']}"}) - heightAboveGround[i] = d - - surface_merged: xr.Dataset = xr.merge(surface).drop_vars( - ["unknown_surface_instant", "valid_time"], errors="ignore", - ) - del surface - heightAboveGround_merged: xr.Dataset = xr.merge(heightAboveGround).drop_vars( - ["valid_time"], errors="ignore", - ) - del heightAboveGround - isobaricInhPa_merged: xr.Dataset = xr.merge(isobaricInhPa).drop_vars( - ["valid_time"], errors="ignore", - ) - del isobaricInhPa - - total_ds = xr.merge([surface_merged, heightAboveGround_merged, 
isobaricInhPa_merged]) - del surface_merged, heightAboveGround_merged, isobaricInhPa_merged - - # Map the data to the internal dataset representation - # * Transpose the Dataset so that the dimensions are correctly ordered - # * Rechunk the data to a more optimal size - total_ds = ( - total_ds.rename({"time": "init_time"}) - .expand_dims("init_time") - .expand_dims("step") - .transpose("init_time", "step", ...) - .sortby("step") - .chunk({"init_time": 1, "step": 1}) - ) - - return total_ds - - def downloadToCache( # noqa: D102 - self, - *, - fi: internal.FileInfoModel, - ) -> pathlib.Path: - log.debug(event="requesting download of file", file=fi.filename(), path=fi.filepath()) - try: - response = urllib.request.urlopen(fi.filepath()) - except Exception as e: - log.warn( - event="error calling url for file", - url=fi.filepath(), - filename=fi.filename(), - error=e, - ) - return pathlib.Path() - - if response.status != 200: - log.warn( - event="error downloading file", - status=response.status, - url=fi.filepath(), - filename=fi.filename(), - ) - return pathlib.Path() - - # Extract the bz2 file when downloading - cfp: pathlib.Path = internal.rawCachePath(it=fi.it(), filename=fi.filename()) - with open(cfp, "wb") as f: - f.write(response.read()) - - log.debug( - event="fetched all data from file", - filename=fi.filename(), - url=fi.filepath(), - filepath=cfp.as_posix(), - nbytes=cfp.stat().st_size, - ) - - return cfp - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - """Overrides the corresponding method in the parent class.""" - # See https://www.nco.ncep.noaa.gov/pmb/products/gfs/gfs.t00z.pgrb2.0p25.f003.shtml - # for a list of NOAA parameters - return { - "t2m_instant": internal.OCFParameter.TemperatureAGL, - "tcc": internal.OCFParameter.HighCloudCover, - "dswrf_surface_avg": internal.OCFParameter.DownwardShortWaveRadiationFlux, - "dlwrf_surface_avg": internal.OCFParameter.DownwardLongWaveRadiationFlux, - "sdwe_surface_instant": internal.OCFParameter.SnowDepthWaterEquivalent, - "r": internal.OCFParameter.RelativeHumidityAGL, - "u10_instant": internal.OCFParameter.WindUComponentAGL, - "v10_instant": internal.OCFParameter.WindVComponentAGL, - "u100_instant": internal.OCFParameter.WindUComponent100m, - "v100_instant": internal.OCFParameter.WindVComponent100m, - } diff --git a/src/nwp_consumer/internal/inputs/noaa/test_aws.py b/src/nwp_consumer/internal/inputs/noaa/test_aws.py deleted file mode 100644 index 9ea7112b..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/test_aws.py +++ /dev/null @@ -1,35 +0,0 @@ -import datetime as dt -import pathlib -import unittest -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from ._models import NOAAFileInfo - -from .aws import Client - -testClient = Client(model="global", param_group="basic") - - -class TestClient(unittest.TestCase): - def test_mapCachedRaw(self) -> None: - # Test with global file - testFilePath: pathlib.Path = ( - pathlib.Path(__file__).parent / "test_surface_000.grib2" - ) - out = testClient.mapCachedRaw(p=testFilePath) - # Check latitude and longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - print(out) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - ("init_time", "step", "latitude", "longitude"), - ) - self.assertEqual(len(out["latitude"].values), 721) - self.assertEqual(len(out["longitude"].values), 1440) - self.assertEqual(len(out["init_time"].values), 1) - 
self.assertEqual(len(out["step"].values), 1) - self.assertListEqual(list(out.data_vars.keys()), ["t2m"]) - diff --git a/src/nwp_consumer/internal/inputs/noaa/test_ncar.py b/src/nwp_consumer/internal/inputs/noaa/test_ncar.py deleted file mode 100644 index 5d0038a9..00000000 --- a/src/nwp_consumer/internal/inputs/noaa/test_ncar.py +++ /dev/null @@ -1,27 +0,0 @@ -import pathlib -import unittest - -from .ncar import Client - -testClient = Client(model="global", param_group="full") - - -class TestClient(unittest.TestCase): - def test_mapCachedRaw(self) -> None: - # Test with global file - testFilePath: pathlib.Path = ( - pathlib.Path(__file__).parent / "test_surface_000.grib2" - ) - out = testClient.mapCachedRaw(p=testFilePath) - # Check latitude and longitude are injected - self.assertTrue("latitude" in out.coords) - self.assertTrue("longitude" in out.coords) - # Check that the dimensions are correctly ordered and renamed - self.assertEqual( - out[next(iter(out.data_vars.keys()))].dims, - ("init_time", "step", "latitude", "longitude"), - ) - self.assertEqual(len(out["latitude"].values), 721) - self.assertEqual(len(out["longitude"].values), 1440) - self.assertEqual(len(out["init_time"].values), 1) - self.assertEqual(len(out["step"].values), 1) diff --git a/src/nwp_consumer/internal/inputs/noaa/test_surface_000.grib2 b/src/nwp_consumer/internal/inputs/noaa/test_surface_000.grib2 deleted file mode 100644 index a24e14a2..00000000 Binary files a/src/nwp_consumer/internal/inputs/noaa/test_surface_000.grib2 and /dev/null differ diff --git a/src/nwp_consumer/internal/models.py b/src/nwp_consumer/internal/models.py deleted file mode 100644 index 0e1d24ee..00000000 --- a/src/nwp_consumer/internal/models.py +++ /dev/null @@ -1,203 +0,0 @@ -"""Contains both ports and domain models for the nwp_consumer package.""" - -import abc -import datetime as dt -import pathlib -from enum import Enum - -import xarray as xr - - -# ------- Domain models ------- # - - -class OCFParameter(str, Enum): - """Short names for the OCF parameters.""" - - LowCloudCover = "lcc" - MediumCloudCover = "mcc" - HighCloudCover = "hcc" - TotalCloudCover = "clt" - VisibilityAGL = "vis" - RelativeHumidityAGL = "r" - RainPrecipitationRate = "prate" - SnowDepthWaterEquivalent = "sde" - DownwardShortWaveRadiationFlux = "dswrf" - DownwardLongWaveRadiationFlux = "dlwrf" - TemperatureAGL = "t" - WindSpeedSurfaceAdjustedAGL = "si10" - WindDirectionFromWhichBlowingSurfaceAdjustedAGL = "wdir10" - WindUComponentAGL = "u10" - WindVComponentAGL = "v10" - WindUComponent100m = "u100" - WindVComponent100m = "v100" - WindUComponent200m = "u200" - WindVComponent200m = "v200" - DirectSolarRadiation = "sr" - DownwardUVRadiationAtSurface = "duvrs" - - -class FileInfoModel(abc.ABC): - """Information about a raw file. - - FileInfoModel assumes the following properties exist for all - raw NWP files that may be encountered in a provider's archive: - - 1. The file has a name - 2. The file has a path - 3. The file corresponds to a single forecast init time - 4. The file corresponds to one or more time steps - 5. The file corresponds to one or more variables - - These assumptions are reflected in the abstract methods of this class. 
- """ - - @abc.abstractmethod - def filename(self) -> str: - """Return the file name including extension.""" - pass - - @abc.abstractmethod - def filepath(self) -> str: - """Return the remote file path, not including protocols and TLDs.""" - pass - - @abc.abstractmethod - def it(self) -> dt.datetime: - """Return the init time of the file.""" - pass - - @abc.abstractmethod - def steps(self) -> list[int]: - """Return the time steps of the file.""" - pass - - @abc.abstractmethod - def variables(self) -> list[str]: - """Return the variables of the file.""" - pass - - -# ------- Interfaces ------- # -# Represent ports in the hexagonal architecture pattern - -class FetcherInterface(abc.ABC): - """Generic interface for fetching and converting NWP data from an API. - - Used for dependency injection. NWP data from any source shares common properties: - - It is presented in one or many files for a given init_time - - These files can be read as raw bytes - - There is an expected number of files per init_time which correspond to an equivalent - number of variables and steps in the dataset - - The following functions define generic transforms based around these principals. - """ - - @abc.abstractmethod - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[FileInfoModel]: - """List the relative path of all files available from source for the given init_time. - - :param it: Init Time to list files for - """ - pass - - @abc.abstractmethod - def downloadToCache(self, *, fi: FileInfoModel) -> pathlib.Path: - """Fetch the bytes of a single raw file from source and save to a cache file. - - :param fi: File Info object describing the file to fetch - :return: Path to the local cache file, or pathlib.Path() if the file was not fetched - """ - pass - - @abc.abstractmethod - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - """Create an xarray dataset from the given RAW data in a cache file. - - :param p: Path to cached file holding raw data - :return: Dataset created from the raw data - """ - pass - - @abc.abstractmethod - def getInitHours(self) -> list[int]: - """Get the forecast init hours available from the source. - - :return: List of forecast init hours - """ - pass - - @abc.abstractmethod - def parameterConformMap(self) -> dict[str, OCFParameter]: - """The mapping from the source's parameter names to the OCF short names. - - :return: Dictionary of parameter mappings - """ - pass - - @abc.abstractmethod - def datasetName(self) -> str: - """Return the name of the dataset. - - :return: Name of the dataset - """ - pass - - -class StorageInterface(abc.ABC): - """Generic interface for storing data, used for dependency injection.""" - - @abc.abstractmethod - def exists(self, *, dst: pathlib.Path) -> bool: - """Check if the given path exists. - - :param dst: Path to check - :return: True if the path exists, False otherwise - """ - pass - - @abc.abstractmethod - def store(self, *, src: pathlib.Path, dst: pathlib.Path) -> pathlib.Path: - """Move a file to the store. - - :param src: Path to file to store - :param dst: Desired path in store - :return: Location in raw store - """ - pass - - @abc.abstractmethod - def listInitTimes(self, *, prefix: pathlib.Path) -> list[dt.datetime]: - """List all initTime folders in the given prefix. - - :param prefix: Path to prefix to list initTimes for - :return: List of initTimes - """ - pass - - @abc.abstractmethod - def copyITFolderToCache(self, *, prefix: pathlib.Path, it: dt.datetime) \ - -> list[pathlib.Path]: - """Copy all files in given folder to cache. 
- - :param prefix: Path of folder in which to find initTimes - :param it: InitTime to copy files for - :return: List of paths to cached files - """ - pass - - @abc.abstractmethod - def delete(self, *, p: pathlib.Path) -> None: - """Delete the given path. - - :param p: Path to delete - """ - pass - - @abc.abstractmethod - def name(self) -> str: - """Return the name of the storage provider. - - :return: Name of the storage provider - """ - pass diff --git a/src/nwp_consumer/internal/outputs/__init__.py b/src/nwp_consumer/internal/outputs/__init__.py deleted file mode 100644 index dd8ce0fc..00000000 --- a/src/nwp_consumer/internal/outputs/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Output modules the consumer can write to.""" - -from . import ( - huggingface, - localfs, - s3, -) - -__all__ = [ - "localfs", - "s3", - "huggingface", -] diff --git a/src/nwp_consumer/internal/outputs/huggingface/__init__.py b/src/nwp_consumer/internal/outputs/huggingface/__init__.py deleted file mode 100644 index f274eb57..00000000 --- a/src/nwp_consumer/internal/outputs/huggingface/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -__all__ = ['Client'] - -from .client import Client - diff --git a/src/nwp_consumer/internal/outputs/huggingface/client.py b/src/nwp_consumer/internal/outputs/huggingface/client.py deleted file mode 100644 index c2f2f725..00000000 --- a/src/nwp_consumer/internal/outputs/huggingface/client.py +++ /dev/null @@ -1,313 +0,0 @@ -"""Client for HuggingFace.""" - -import datetime as dt -import pathlib - -import huggingface_hub as hfh -import structlog -from huggingface_hub.hf_api import ( - RepoFile, - RepoFolder, - RevisionNotFoundError, -) - -from nwp_consumer import internal - -log = structlog.getLogger() - - -class Client(internal.StorageInterface): - """Client for HuggingFace.""" - - # HuggingFace API - __api: hfh.HfApi - - # DatasetURL - dsURL: str - - def __init__(self, repoID: str, token: str | None = None, endpoint: str | None = None) -> None: - """Create a new client for HuggingFace. - - Exposes a client for the HuggingFace filesystem API that conforms to the StorageInterface. - - Args: - repoID: The ID of the repo to use for the dataset. - token: The HuggingFace authentication token. - endpoint: The HuggingFace endpoint to use. - """ - self.__api = hfh.HfApi(token=token, endpoint=endpoint) - # Get the URL to the dataset, e.g. 
https://huggingface.co/datasets/username/dataset - self.dsURL = hfh.hf_hub_url( - endpoint=endpoint, - repo_id=repoID, - repo_type="dataset", - filename="", - ) - # Repo ID - self.repoID = repoID - - try: - self.__api.dataset_info( - repo_id=repoID, - ) - except Exception as e: - log.warn( - event="failed to authenticate with huggingface for given repo", - repo_id=repoID, - error=e, - ) - - def name(self) -> str: - """Overrides the corresponding method of the parent class.""" - return "huggingface" - - def exists(self, *, dst: pathlib.Path) -> bool: - """Overrides the corresponding method of the parent class.""" - try: - path_infos: list[RepoFile | RepoFolder] = self.__api.get_paths_info( - repo_id=self.repoID, - repo_type="dataset", - paths=[dst.as_posix()], - ) - if len(path_infos) == 0: - return False - except RevisionNotFoundError: - return False - return True - - def store(self, *, src: pathlib.Path, dst: pathlib.Path) -> pathlib.Path: - """Overrides the corresponding method of the parent class.""" - # Remove any leading slashes as they are not allowed in huggingface - dst = dst.relative_to("/") if dst.is_absolute() else dst - - # Get the hash of the latest commit - sha: str = self.__api.dataset_info(repo_id=self.repoID).sha - # Handle the case where we are trying to upload a folder - if src.is_dir(): - # Upload the folder using the huggingface API - future = self.__api.upload_folder( - repo_id=self.repoID, - repo_type="dataset", - folder_path=src.as_posix(), - path_in_repo=dst.as_posix(), - parent_commit=sha, - run_as_future=True, - ) - # Handle the case where we are trying to upload a file - else: - # Upload the file using the huggingface API - future = self.__api.upload_file( - repo_id=self.repoID, - repo_type="dataset", - path_or_fileobj=src.as_posix(), - path_in_repo=dst.as_posix(), - parent_commit=sha, - run_as_future=True, - ) - - # Block until the upload is complete to prevent overlapping commits - url = future.result(timeout=120) - log.info("Uploaded to huggingface", commiturl=url) - - # Perform a check on the size of the file - size = self._get_size(p=dst) - if size != src.stat().st_size and future.done(): - log.warn( - event="stored file size does not match source file size", - src=src.as_posix(), - dst=dst.as_posix(), - srcsize=src.stat().st_size, - dstsize=size, - ) - else: - log.debug( - event=f"stored file {dst.name}", - filepath=dst.as_posix(), - nbytes=size, - ) - return dst - - def listInitTimes(self, *, prefix: pathlib.Path) -> list[dt.datetime]: - """Overrides the corresponding method of the parent class.""" - # Remove any leading slashes as they are not allowed in huggingface - prefix = prefix.relative_to("/") if prefix.is_absolute() else prefix - # Get the path relative to the prefix of every folder in the repo - allDirs: list[pathlib.Path] = [ - pathlib.Path(f.path).relative_to(prefix) - for f in self.__api.list_repo_tree( - repo_id=self.repoID, - repo_type="dataset", - path_in_repo=prefix.as_posix(), - recursive=True, - ) - if isinstance(f, RepoFolder) - ] - - # Get the initTime from the folder pattern - initTimes = set() - for d in allDirs: - if d.match(internal.IT_FOLDER_GLOBSTR_RAW): - try: - # Try to parse the folder name as a datetime - ddt = dt.datetime.strptime( - d.as_posix(), - internal.IT_FOLDER_STRUCTURE_RAW, - ).replace(tzinfo=dt.UTC) - initTimes.add(ddt) - except ValueError: - log.debug( - event="ignoring invalid folder name", - name=d.as_posix(), - within=prefix.as_posix(), - ) - - sortedInitTimes = sorted(initTimes) - log.debug( - event=f"found 
{len(initTimes)} init times in raw directory", - earliest=sortedInitTimes[0], - latest=sortedInitTimes[-1], - ) - return sortedInitTimes - - def copyITFolderToCache(self, *, prefix: pathlib.Path, it: dt.datetime) -> list[pathlib.Path]: - """Overrides the corresponding method of the parent class.""" - # Remove any leading slashes as they are not allowed in huggingface - prefix = prefix.relative_to("/") if prefix.is_absolute() else prefix - - # Get the paths of all files in the folder - paths: list[pathlib.Path] = [ - pathlib.Path(p.path) - for p in self.__api.list_repo_tree( - repo_id=self.repoID, - repo_type="dataset", - path_in_repo=(prefix / it.strftime(internal.IT_FOLDER_STRUCTURE_RAW)).as_posix(), - recursive=True, - ) - if isinstance(p, RepoFile) - ] - - log.debug( - event="copying it folder to cache", - inittime=it.strftime(internal.IT_FOLDER_STRUCTURE_RAW), - numfiles=len(paths), - ) - - # Read all files into cache - cachedPaths: list[pathlib.Path] = [] - for path in paths: - # Huggingface replicates the full path from repo root on download - # to local directory. - cfp: pathlib.Path = internal.CACHE_DIR / path.as_posix() - - # Use existing cached file if it already exists in the cache - if cfp.exists() and cfp.stat().st_size > 0: - log.debug( - event="file already exists in cache, skipping", - filepath=path.as_posix(), - cachepath=cfp.as_posix(), - ) - cachedPaths.append(cfp) - continue - - # Don't copy file from the store if it is empty - if self.exists(dst=path) is False: - log.warn( - event="file does not exist in store, skipping", - filepath=path.as_posix(), - ) - continue - - # Copy the file from the store to cache - self.__api.hf_hub_download( - repo_id=self.repoID, - repo_type="dataset", - filename=path.as_posix(), - local_dir=internal.CACHE_DIR.as_posix(), - local_dir_use_symlinks=False, - ) - - # Check that the file was copied correctly - if cfp.stat().st_size != self._get_size(p=path) or cfp.stat().st_size == 0: - log.warn( - event="copied file size does not match source file size", - src=path.as_posix(), - dst=cfp.as_posix(), - srcsize=self._get_size(p=path), - dstsize=cfp.stat().st_size, - ) - else: - cachedPaths.append(cfp) - - log.debug( - event="copied it folder to cache", - nbytes=[p.stat().st_size for p in cachedPaths], - inittime=it.strftime("%Y-%m-%d %H:%M"), - ) - - return cachedPaths - - def delete(self, *, p: pathlib.Path) -> None: - """Overrides the corresponding method of the parent class.""" - # Remove any leading slashes as they are not allowed in huggingface - p = p.relative_to("/") if p.is_absolute() else p - - # Determine if the path corresponds to a file or a folder - info: RepoFile | RepoFolder = self.__api.get_paths_info( - repo_id=self.repoID, - repo_type="dataset", - paths=[p.as_posix()], - recursive=False, - )[0] - # Call the relevant delete function using the huggingface API - if isinstance(info, RepoFolder): - self.__api.delete_folder( - repo_id=self.repoID, - repo_type="dataset", - path_in_repo=p.as_posix(), - ) - else: - self.__api.delete_file( - repo_id=self.repoID, - repo_type="dataset", - path_in_repo=p.as_posix(), - ) - - def _get_size(self, *, p: pathlib.Path) -> int: - """Gets the size of a file or folder in the huggingface dataset.""" - # Remove any leading slashes as they are not allowed in huggingface - p = p.relative_to("/") if p.is_absolute() else p - - size: int = 0 - # Get the info of the path - path_info: RepoFile | RepoFolder = self.__api.get_paths_info( - repo_id=self.repoID, - repo_type="dataset", - paths=[p.as_posix()], - ) - 
- if len(path_info) == 0: - # The path in question doesn't exist - log.warn( - event="path does not exist in huggingface dataset", - path=p.as_posix(), - ) - return size - - # Calculate the size of the file or folder - if isinstance(path_info[0], RepoFolder): - size = sum( - [ - f.size - for f in self.__api.list_repo_tree( - repo_id=self.repoID, - repo_type="dataset", - path_in_repo=p.as_posix(), - recursive=True, - ) - if isinstance(f, RepoFile) - ], - ) - elif isinstance(path_info[0], RepoFile): - size = path_info[0].size - - return size diff --git a/src/nwp_consumer/internal/outputs/huggingface/test_client.py b/src/nwp_consumer/internal/outputs/huggingface/test_client.py deleted file mode 100644 index f1698faa..00000000 --- a/src/nwp_consumer/internal/outputs/huggingface/test_client.py +++ /dev/null @@ -1,43 +0,0 @@ -import datetime as dt -import pathlib -import unittest - -from nwp_consumer import internal - -from .client import Client - -USER = "openclimatefix" -RAW = pathlib.Path("raw") - - -class TestHuggingFaceClient(unittest.TestCase): - repoID: str - client: Client - - @classmethod - def setUpClass(cls) -> None: - cls.repoID = "PolyAI/minds14" - cls.client = Client(repoID=cls.repoID) - - def test_get_size(self) -> None: - """Test that the size of a file is returned correctly.""" - name_size_map: dict[str, int] = { - "README.md": 5276, - "data": 471355396, - } - for name, exp in name_size_map.items(): - with self.subTest(msg=name): - self.assertEqual(self.client._get_size(p=pathlib.Path(name)), exp) - - def test_exists(self) -> None: - """Test that the existence of a file is returned correctly.""" - name_exists_map: dict[str, bool] = { - "README.md": True, - "data": True, - "nonexistent1": False, - "nonexistent/nonexistent2": False, - } - for name, exp in name_exists_map.items(): - with self.subTest(msg=name): - self.assertEqual(self.client.exists(dst=pathlib.Path(name)), exp) - diff --git a/src/nwp_consumer/internal/outputs/localfs/__init__.py b/src/nwp_consumer/internal/outputs/localfs/__init__.py deleted file mode 100644 index 74f4c648..00000000 --- a/src/nwp_consumer/internal/outputs/localfs/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ['Client'] - -from .client import Client diff --git a/src/nwp_consumer/internal/outputs/localfs/client.py b/src/nwp_consumer/internal/outputs/localfs/client.py deleted file mode 100644 index c60095f6..00000000 --- a/src/nwp_consumer/internal/outputs/localfs/client.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Client for local filesystem.""" - -import datetime as dt -import os -import pathlib -import shutil - -import structlog - -from nwp_consumer import internal - -log = structlog.getLogger() - - -class Client(internal.StorageInterface): - """Client for local filesystem. - - This class implements the StorageInterface for the local filesystem. 
- """ - - def name(self) -> str: - """Overrides the corresponding method in the parent class.""" - return "localfilesystem" - - def exists(self, *, dst: pathlib.Path) -> bool: - """Overrides the corresponding method in the parent class.""" - return dst.exists() - - def store(self, *, src: pathlib.Path, dst: pathlib.Path) -> pathlib.Path: - """Overrides the corresponding method in the parent class.""" - if src == dst: - return dst - - dst.parent.mkdir(parents=True, exist_ok=True) - if src.is_dir(): - shutil.copytree(src=src, dst=dst) - else: - shutil.copy(src=src, dst=dst) - - if src.stat().st_size != dst.stat().st_size: - log.warn( - event="file size mismatch", - src=src.as_posix(), - dst=dst.as_posix(), - srcbytes=src.stat().st_size, - dstbytes=dst.stat().st_size, - ) - else: - log.debug( - event="stored file locally", - src=src.as_posix(), - dst=dst.as_posix(), - nbytes=dst.stat().st_size, - ) - - # Delete the cache to avoid double storage - try: - src.unlink() - except: - log.warn( - event="could not delete source file. Will be cleaned up at end of run", - src=src.as_posix(), - ) - - return dst - - def listInitTimes(self, *, prefix: pathlib.Path) -> list[dt.datetime]: - """Overrides the corresponding method in the parent class.""" - # List all the inittime folders in the given directory - dirs = [ - f.relative_to(prefix) - for f in prefix.glob(internal.IT_FOLDER_GLOBSTR_RAW) - if f.suffix == "" - ] - - initTimes = set() - for dir in dirs: - try: - # Try to parse the dir as a datetime - ddt: dt.datetime = dt.datetime.strptime( - dir.as_posix(), - internal.IT_FOLDER_STRUCTURE_RAW, - ).replace(tzinfo=dt.UTC) - # Add the initTime to the set - initTimes.add(ddt) - except ValueError: - log.debug( - event="ignoring invalid folder name", - name=dir.as_posix(), - within=prefix.as_posix(), - ) - - if len(initTimes) == 0: - log.debug( - event="no init times found in raw directory", - within=prefix.as_posix(), - ) - return [] - - sortedInitTimes = sorted(initTimes) - log.debug( - event=f"found {len(initTimes)} init times in raw directory", - earliest=sortedInitTimes[0], - latest=sortedInitTimes[-1], - ) - - return sortedInitTimes - - def copyITFolderToCache(self, *, prefix: pathlib.Path, it: dt.datetime) -> list[pathlib.Path]: - """Overrides the corresponding method in the parent class.""" - # Check if the folder exists - if not (prefix / it.strftime(internal.IT_FOLDER_STRUCTURE_RAW)).exists(): - log.debug( - event="Init time folder not present", - path=(prefix / it.strftime(internal.IT_FOLDER_STRUCTURE_RAW)).as_posix(), - ) - return [] - filesInFolder = list((prefix / it.strftime(internal.IT_FOLDER_STRUCTURE_RAW)).iterdir()) - - cfps: list[pathlib.Path] = [] - for file in filesInFolder: - # Copy the file to the cache if it isn't already there - dst: pathlib.Path = internal.rawCachePath(it=it, filename=file.name) - if not dst.exists(): - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src=file, dst=dst) - cfps.append(dst) - - return cfps - - def delete(self, *, p: pathlib.Path) -> None: - """Overrides the corresponding method in the parent class.""" - if not p.exists(): - raise FileNotFoundError(f"file does not exist: {p}") - if p.is_file(): - p.unlink() - elif p.is_dir(): - shutil.rmtree(p.as_posix()) - else: - raise ValueError(f"path is not a file or directory: {p}") - return diff --git a/src/nwp_consumer/internal/outputs/localfs/test_client.py b/src/nwp_consumer/internal/outputs/localfs/test_client.py deleted file mode 100644 index 6c9384a4..00000000 --- 
a/src/nwp_consumer/internal/outputs/localfs/test_client.py +++ /dev/null @@ -1,187 +0,0 @@ -import datetime as dt -import shutil -import unittest -import uuid -from pathlib import Path - -import numpy as np -import xarray as xr - -from nwp_consumer import internal - -from .client import Client - -RAW = Path("test_raw_dir") -ZARR = Path("test_zarr_dir") - - -class TestLocalFSClient(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - # Make test directories - RAW.mkdir(parents=True, exist_ok=True) - ZARR.mkdir(parents=True, exist_ok=True) - - cls.testClient = Client() - - @classmethod - def tearDownClass(cls) -> None: - # Clean up the temporary directory - shutil.rmtree(RAW.as_posix()) - shutil.rmtree(ZARR.as_posix()) - - def test_exists(self) -> None: - initTime = dt.datetime(2021, 1, 1, 0, 0, 0, tzinfo=dt.UTC) - - # Create a file in the raw directory - path = RAW / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" / "test_file.grib" - path.parent.mkdir(parents=True, exist_ok=True) - path.touch() - - # Check if the file exists using the function - exists = self.testClient.exists(dst=path) - - # Assert that the file exists - self.assertTrue(exists) - - # Remove the init time folder - shutil.rmtree(RAW / "2021") - - # Check that the function returns false when the file does not exist - exists = self.testClient.exists( - dst=RAW / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" / "not_exists.grib", - ) - - # Assert that the file does not exist - self.assertFalse(exists) - - # Create a zarr file in the zarr directory - testDS = xr.Dataset( - data_vars={ - "UKV": ( - ("init_time", "variable", "step", "x", "y"), - np.random.rand(1, 2, 12, 100, 100), - ), - }, - coords={ - "init_time": [np.datetime64(initTime)], - "variable": ["t", "r"], - "step": range(12), - "x": range(100), - "y": range(100), - }, - ) - - testDS.to_zarr(store=ZARR / "test_file.zarr", compute=True) - - # Check if the file exists using the function - exists = self.testClient.exists(dst=ZARR / "test_file.zarr") - - # Assert that the file exists - self.assertTrue(exists) - - def test_store(self) -> None: - initTime = dt.datetime(2021, 1, 2, 0, 0, 0, tzinfo=dt.UTC) - dst = RAW / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" / "test_store.grib" - src = internal.CACHE_DIR / f"nwpc-{uuid.uuid4()}" - # Create a temporary file to simulate a file to be stored - src.parent.mkdir(parents=True, exist_ok=True) - src.write_bytes(bytes("test_file_contents", "utf-8")) - - # Store the file using the function - out = self.testClient.store(src=src, dst=dst) - - # Assert that the file exists - self.assertTrue(dst.exists()) - # Assert that the file has the correct size - self.assertEqual(out, dst) - # Assert that the temporary file has been deleted - self.assertFalse(src.exists()) - - def test_listInitTimes(self) -> None: - expectedTimes = [ - dt.datetime(2023, 1, 1, 3, tzinfo=dt.UTC), - dt.datetime(2023, 1, 2, 6, tzinfo=dt.UTC), - dt.datetime(2023, 1, 3, 9, tzinfo=dt.UTC), - ] - - # Create some files in the raw directory - dirs = [RAW / t.strftime(internal.IT_FOLDER_STRUCTURE_RAW) for t in expectedTimes] - - for d in dirs: - d.mkdir(parents=True, exist_ok=True) - - # Get the list of init times - initTimes = self.testClient.listInitTimes(prefix=Path(RAW)) - - # Assert that the list of init times is correct - self.assertEqual(initTimes, expectedTimes) - - # Remove the files - for d in dirs: - shutil.rmtree(d) - - def test_copyITFolderToCache(self) -> None: - # Make some files in the raw directory - initTime = dt.datetime(2023, 1, 1, 
3, tzinfo=dt.UTC) - files = [ - RAW / f"{initTime:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp1.grib", - RAW / f"{initTime:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp2.grib", - RAW / f"{initTime:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp3.grib", - ] - for f in files: - f.parent.mkdir(parents=True, exist_ok=True) - f.write_bytes(bytes("test_file_contents", "utf-8")) - - # Test the function - paths = self.testClient.copyITFolderToCache(prefix=RAW, it=initTime) - - # Assert the contents of the temp files is correct - for _i, path in enumerate(paths): - self.assertEqual(path.read_bytes(), bytes("test_file_contents", "utf-8")) - - # Remove the files - shutil.rmtree(files[0].parent) - - def test_delete(self) -> None: - # Create a file in the raw directory - initTime = dt.datetime(2023, 1, 1, 3, tzinfo=dt.UTC) - path = RAW / f"{initTime:%Y/%m/%d/%H%M}" / "test_delete.grib" - path.parent.mkdir(parents=True, exist_ok=True) - path.touch() - - # Delete the file using the function - self.testClient.delete(p=path) - - # Assert that the file no longer exists - self.assertFalse(path.exists()) - - # Create a zarr folder in the zarr directory - path = ZARR / "test_delete.zarr" - testDS = xr.Dataset( - data_vars={ - "UKV": ( - ("init_time", "variable", "step", "x", "y"), - np.random.rand(1, 2, 12, 100, 100), - ), - }, - coords={ - "init_time": [np.datetime64(initTime)], - "variable": ["t", "r"], - "step": range(12), - "x": range(100), - "y": range(100), - }, - ) - - testDS.to_zarr(store=path, compute=True) - - # Delete the folder using the function - self.testClient.delete(p=path) - - # Assert that the folder no longer exists - self.assertFalse(path.exists()) - - -if __name__ == "__main__": - unittest.main() diff --git a/src/nwp_consumer/internal/outputs/s3/__init__.py b/src/nwp_consumer/internal/outputs/s3/__init__.py deleted file mode 100644 index 74f4c648..00000000 --- a/src/nwp_consumer/internal/outputs/s3/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ['Client'] - -from .client import Client diff --git a/src/nwp_consumer/internal/outputs/s3/client.py b/src/nwp_consumer/internal/outputs/s3/client.py deleted file mode 100644 index 2d5e3664..00000000 --- a/src/nwp_consumer/internal/outputs/s3/client.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Client for AWS S3.""" - -import datetime as dt -import pathlib - -import s3fs -import structlog - -from nwp_consumer import internal - -log = structlog.getLogger() - - -class Client(internal.StorageInterface): - """Storage Interface client for AWS S3.""" - - # S3 Bucket - __bucket: pathlib.Path - - # S3 Filesystem - __fs: s3fs.S3FileSystem - - def __init__( - self, - *, - bucket: str, - region: str, - key: str | None = "", - secret: str| None = "", - endpointURL: str = "", - ) -> None: - """Create a new S3Client. - - Exposes a client that conforms to the StorageInterface. - Provide credentials either explicitly via key and secret - or fallback to default credentials if not provided or empty. - - Args: - bucket: S3 bucket name to use for storage. - region: S3 region the bucket is in. - key: Use this access key, if specified. - secret: Use this secret, if specified. - endpointURL: Use this endpoint URL, if specified. 
- """ - if (key, secret) == ("", ""): - log.info( - event="attempting AWS connection using default credentials", - ) - key, secret = None, None - - self.__fs: s3fs.S3FileSystem = s3fs.S3FileSystem( - key=key, - secret=secret, - client_kwargs={ - "region_name": region, - "endpoint_url": None if endpointURL == "" else endpointURL, - }, - ) - - self.__bucket = pathlib.Path(bucket) - - def name(self) -> str: - """Overrides the corresponding method in the parent class.""" - return "s3" - - def exists(self, *, dst: pathlib.Path) -> bool: - """Overrides the corresponding method in the parent class.""" - return self.__fs.exists((self.__bucket / dst).as_posix()) - - def store(self, *, src: pathlib.Path, dst: pathlib.Path) -> pathlib.Path: - """Overrides the corresponding method in the parent class.""" - log.debug( - event="storing file in s3", - src=src.as_posix(), - dst=(self.__bucket / dst).as_posix(), - ) - - # If file already exists in store and is of the same size, skip the upload - if self.exists(dst=dst) and self.__fs.du((self.__bucket / dst).as_posix()) == src.stat().st_size: - log.debug( - event="file of same size already exists in s3, skipping", - src=src.as_posix(), - dst=(self.__bucket / dst).as_posix(), - ) - return dst - - # Upload the file to the store - self.__fs.put(lpath=src.as_posix(), rpath=(self.__bucket / dst).as_posix(), recursive=True) - # Don't delete cached file as user may want to do further processing locally. - remote_size_bytes: int = self.__fs.du((self.__bucket / dst).as_posix()) - local_size_bytes: int = src.stat().st_size - if src.is_dir(): - local_size_bytes: int = sum( - f.stat().st_size - for f in src.rglob("*") - if f.is_file() - ) - if remote_size_bytes != local_size_bytes: - log.warn( - event="file size mismatch", - src=src.as_posix(), - dst=(self.__bucket / dst).as_posix(), - srcsize=src.stat().st_size, - dstsize=remote_size_bytes, - ) - else: - log.debug( - event="stored file in s3", - src=src.as_posix(), - dst=(self.__bucket / dst).as_posix(), - remote_size_bytes=remote_size_bytes, - ) - return dst - - def listInitTimes(self, *, prefix: pathlib.Path) -> list[dt.datetime]: - """Overrides the corresponding method in the parent class.""" - allDirs = [ - pathlib.Path(d).relative_to(self.__bucket / prefix) - for d in self.__fs.glob(f"{self.__bucket}/{prefix}/{internal.IT_FOLDER_GLOBSTR_RAW}") - if self.__fs.isdir(d) - ] - - # Get the initTime from the folder pattern - initTimes = set() - for dir in allDirs: - if dir.match(internal.IT_FOLDER_GLOBSTR_RAW): - try: - # Try to parse the folder name as a datetime - ddt = dt.datetime.strptime(dir.as_posix(), internal.IT_FOLDER_STRUCTURE_RAW).replace( - tzinfo=dt.UTC, - ) - initTimes.add(ddt) - except ValueError: - log.debug( - event="ignoring invalid folder name", - name=dir.as_posix(), - within=prefix.as_posix(), - ) - - sortedInitTimes = sorted(initTimes) - log.debug( - event=f"found {len(initTimes)} init times in raw directory", - earliest=sortedInitTimes[0], - latest=sortedInitTimes[-1], - ) - return sortedInitTimes - - def copyITFolderToCache(self, *, prefix: pathlib.Path, it: dt.datetime) -> list[pathlib.Path]: - """Overrides the corresponding method in the parent class.""" - initTimeDirPath = self.__bucket / prefix / it.strftime(internal.IT_FOLDER_STRUCTURE_RAW) - - if not self.__fs.exists(initTimeDirPath.as_posix()) or not self.__fs.isdir(initTimeDirPath.as_posix()): - log.warn( - event="init time folder does not exist in store", - path=it.strftime(internal.IT_FOLDER_STRUCTURE_RAW), - ) - return [] - - paths = 
[ - pathlib.Path(p).relative_to(self.__bucket) - for p in self.__fs.ls(initTimeDirPath.as_posix()) - ] - - log.debug( - event="copying it folder to cache", - inittime=it.strftime(internal.IT_FOLDER_STRUCTURE_RAW), - numfiles=len(paths), - ) - - # Read all files into cache - cachedPaths: list[pathlib.Path] = [] - for path in paths: - cfp: pathlib.Path = internal.rawCachePath(it=it, filename=path.name) - - # Use existing cached file if it exists and is not empty - if cfp.exists() and cfp.stat().st_size > 0: - log.debug( - event="file already exists in cache, skipping", - filepath=path.as_posix(), - cachepath=cfp.as_posix(), - ) - cachedPaths.append(cfp) - continue - - # Don't copy file from the store if it is empty - if ( - self.exists(dst=path) is False - or self.__fs.du(path=(self.__bucket / path).as_posix()) == 0 - ): - log.warn( - event="file in store is empty", - filepath=path.as_posix(), - ) - continue - - # Copy the file from the store to cache - with self.__fs.open(path=(self.__bucket / path).as_posix(), mode="rb") as infile: - with cfp.open("wb") as tmpfile: - for chunk in iter(lambda: infile.read(16 * 1024), b""): - tmpfile.write(chunk) - tmpfile.flush() - cachedPaths.append(cfp) - - log.debug( - event="copied it folder to cache", - nbytes=[p.stat().st_size for p in cachedPaths], - inittime=it.strftime("%Y-%m-%d %H:%M"), - ) - - return cachedPaths - - def delete(self, *, p: pathlib.Path) -> None: - """Overrides the corresponding method in the parent class.""" - if self.__fs.isdir((self.__bucket / p).as_posix()): - self.__fs.rm((self.__bucket / p).as_posix(), recursive=True) - else: - self.__fs.rm((self.__bucket / p).as_posix()) diff --git a/src/nwp_consumer/internal/outputs/s3/test_client.py b/src/nwp_consumer/internal/outputs/s3/test_client.py deleted file mode 100644 index 893542ea..00000000 --- a/src/nwp_consumer/internal/outputs/s3/test_client.py +++ /dev/null @@ -1,262 +0,0 @@ -import datetime as dt -import inspect -import unittest -import uuid -from pathlib import Path - -from botocore.client import BaseClient as BotocoreClient -from botocore.session import Session -from moto.server import ThreadedMotoServer - -from nwp_consumer import internal - -from .client import Client - -ENDPOINT_URL = "http://localhost:5000" -BUCKET = "test-bucket" -KEY = "test-key" -SECRET = "test-secret" # noqa: S105 -REGION = "us-east-1" - -RAW = Path("raw") -ZARR = Path("zarr") - - -class TestS3Client(unittest.TestCase): - testS3: BotocoreClient - client: Client - server: ThreadedMotoServer - - @classmethod - def setUpClass(cls) -> None: - # Start a local S3 server - cls.server = ThreadedMotoServer() - cls.server.start() - - session = Session() - cls.testS3 = session.create_client( - service_name="s3", - region_name=REGION, - endpoint_url=ENDPOINT_URL, - aws_access_key_id=KEY, - aws_secret_access_key=SECRET, - ) - - # Create a mock S3 bucket - cls.testS3.create_bucket( - Bucket=BUCKET, - ) - - # Create an instance of the S3Client class - cls.client = Client( - key=KEY, - secret=SECRET, - region=REGION, - bucket=BUCKET, - endpointURL=ENDPOINT_URL, - ) - - @classmethod - def tearDownClass(cls) -> None: - # Delete all objects in bucket - response = cls.testS3.list_objects_v2( - Bucket=BUCKET, - ) - if "Contents" in response: - for obj in response["Contents"]: - cls.testS3.delete_object( - Bucket=BUCKET, - Key=obj["Key"], - ) - cls.server.stop() - - def test_exists(self) -> None: - # Create a mock file in the raw directory - initTime = dt.datetime(2023, 1, 1, tzinfo=dt.UTC) - fileName = 
inspect.stack()[0][3] + ".grib" - filePath = RAW / f"{initTime:%Y/%m/%d/%H%M}" / fileName - self.testS3.put_object( - Bucket=BUCKET, - Key=filePath.as_posix(), - Body=bytes(fileName, "utf-8"), - ) - - # Call the existsInRawDir method - exists = self.client.exists(dst=filePath) - - # Verify the existence of the file - self.assertTrue(exists) - - # Call the existsInRawDir method on a non-existent file - exists = self.client.exists(dst=Path("non_existent_file.grib")) - - # Verify the non-existence of the file - self.assertFalse(exists) - - # Delete the created files - self.testS3.delete_object( - Bucket=BUCKET, - Key=filePath.as_posix(), - ) - - def test_store(self) -> None: - initTime = dt.datetime(2023, 1, 2, tzinfo=dt.UTC) - fileName = inspect.stack()[0][3] + ".grib" - dst = RAW / f"{initTime:%Y/%m/%d/%H%M}" / fileName - src = internal.CACHE_DIR / f"nwpc-{uuid.uuid4()}" - src.parent.mkdir(parents=True, exist_ok=True) - - # Write the data to the temporary file - src.write_bytes(bytes(fileName, "utf-8")) - - name = self.client.store(src=src, dst=dst) - - # Verify the written file in the raw directory - response = self.testS3.get_object(Bucket=BUCKET, Key=dst.as_posix()) - self.assertEqual(response["Body"].read(), bytes(fileName, "utf-8")) - - # Verify the correct number of bytes was written - self.assertEqual(name, dst) - - # Delete the created file and the temp file - self.testS3.delete_object(Bucket=BUCKET, Key=dst.as_posix()) - src.unlink(missing_ok=True) - - ## Test the store doesn't overwrite an existing file of equivalent size - - # Create a mock file in the store - self.testS3.put_object( - Bucket=BUCKET, - Key=dst.as_posix(), - Body=bytes(fileName, "utf-8"), - ) - - # Create a temporary file with the same data - src.write_bytes(bytes(fileName, "utf-8")) - - # Get the modified date of the file in the store - response = self.testS3.head_object(Bucket=BUCKET, Key=dst.as_posix()) - lastModified = response["LastModified"] - - # Call the store method on the file - name = self.client.store(src=src, dst=dst) - - # Verify the file in the store was not overwritten - response = self.testS3.get_object(Bucket=BUCKET, Key=dst.as_posix()) - self.assertEqual(response["Body"].read(), bytes(fileName, "utf-8")) - self.assertEqual(lastModified, response["LastModified"]) - - - def test_listInitTimes(self) -> None: - # Create mock folders/files in the raw directory - self.testS3.put_object( - Bucket=BUCKET, - Key=f"{RAW}/2023/01/03/0000/test_raw_file1.grib", - Body=b"test_data", - ) - self.testS3.put_object( - Bucket=BUCKET, - Key=f"{RAW}/2023/01/04/0300/test_raw_file2.grib", - Body=b"test_data", - ) - - # Call the listInitTimesInRawDir method - init_times = self.client.listInitTimes(prefix=RAW) - - # Verify the returned list of init times - expected_init_times = [ - dt.datetime(2023, 1, 3, 0, 0, tzinfo=dt.UTC), - dt.datetime(2023, 1, 4, 3, 0, tzinfo=dt.UTC), - ] - self.assertEqual(init_times, expected_init_times) - - # Delete the created files - self.testS3.delete_object( - Bucket=BUCKET, - Key=f"{RAW}/2023/01/03/0000/test_raw_file1.grib", - ) - self.testS3.delete_object( - Bucket=BUCKET, - Key=f"{RAW}/2023/01/04/0300/test_raw_file2.grib", - ) - - def test_copyITFolderToCache(self) -> None: - # Make some files in the raw directory - initTime = dt.datetime(2023, 1, 1, 3, tzinfo=dt.UTC) - files = [ - RAW - / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" - / "test_copyITFolderToTemp1.grib", - RAW - / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" - / "test_copyITFolderToTemp2.grib", - RAW - / 
f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" - / "test_copyITFolderToTemp3.grib", - ] - for f in files: - self.testS3.put_object( - Bucket=BUCKET, - Key=f.as_posix(), - Body=bytes("test_file_contents", "utf-8"), - ) - - # Call the copyItFolderToCache method - paths = self.client.copyITFolderToCache(prefix=RAW, it=initTime) - - # Assert the contents of the cached files is correct - for _i, path in enumerate(paths): - self.assertEqual(path.read_bytes(), bytes("test_file_contents", "utf-8")) - - # Delete the cached files - path.unlink() - - # Delete the files in S3 - for f in files: - self.testS3.delete_object(Bucket=BUCKET, Key=f.as_posix()) - - # Make some more RAW files in the raw directory AND in the cache directory - initTime2 = dt.datetime(2023, 1, 1, 6, tzinfo=dt.UTC) - files2 = [ - RAW / f"{initTime2:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp1.grib", - RAW / f"{initTime2:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp2.grib", - RAW / f"{initTime2:%Y/%m/%d/%H%M}" / "test_copyITFolderToTemp3.grib", - ] - for f in files2: - self.testS3.put_object( - Bucket=BUCKET, - Key=f.as_posix(), - Body=bytes("test_file_contents", "utf-8"), - ) - with open(internal.CACHE_DIR / f.name, "w") as f: - f.write("test_file_contents") - - # Call the copyITFolderToCache method again - paths = self.client.copyITFolderToCache(prefix=RAW, it=initTime2) - self.assertEqual(len(paths), 3) - - # Delete the files in S3 - for f in files2: - self.testS3.delete_object(Bucket=BUCKET, Key=f.as_posix()) - - @unittest.skip("Broken on github ci") - def test_delete(self) -> None: - # Create a file in the raw directory - initTime = dt.datetime(2023, 1, 1, 3, tzinfo=dt.UTC) - path = RAW / f"{initTime:{internal.IT_FOLDER_STRUCTURE_RAW}}" / "test_delete.grib" - self.testS3.put_object( - Bucket=BUCKET, - Key=path.as_posix(), - Body=bytes("test_delete", "utf-8"), - ) - - # Delete the file using the function - self.client.delete(p=path) - - # Assert that the file no longer exists - with self.assertRaises(Exception): - self.testS3.get_object(Bucket=BUCKET, Key=path.as_posix()) - - -if __name__ == "__main__": - unittest.main() diff --git a/src/nwp_consumer/internal/ports/__init__.py b/src/nwp_consumer/internal/ports/__init__.py new file mode 100644 index 00000000..a51e2b33 --- /dev/null +++ b/src/nwp_consumer/internal/ports/__init__.py @@ -0,0 +1,19 @@ +"""Interfaces for actor-core communication. + +The ports module defines abstract interfaces that specify the signatures +any actors (driving and driven) must obey in order to interact with the core. + +*Driving* actors are found in the `services` module, and *driven* actors are found +in the `repositories` module. +""" + +from .services import ConsumeUseCase, ArchiveUseCase +from .repositories import ModelRepository, ZarrRepository, NotificationRepository + +__all__ = [ + "ConsumeUseCase", + "ArchiveUseCase", + "ModelRepository", + "ZarrRepository", + "NotificationRepository", +] diff --git a/src/nwp_consumer/internal/ports/repositories.py b/src/nwp_consumer/internal/ports/repositories.py new file mode 100644 index 00000000..4ac0094b --- /dev/null +++ b/src/nwp_consumer/internal/ports/repositories.py @@ -0,0 +1,150 @@ +"""Repository interfaces for NWP data sources and stores. + +These interfaces define the signatures that *driven* actors must conform to +in order to interact with the core. +Also sometimes referred to as *secondary ports*. + +All NWP providers use some kind of model to generate their data. 
This repository
+can be physics-based, such as ERA5, or a machine learning model, such as
+Google's GraphCast. The `ModelRepository` interface is used to abstract the
+differences between these models, allowing the core to interact with them
+in a uniform way.
+"""
+
+import abc
+import datetime as dt
+import logging
+import pathlib
+from collections.abc import Callable, Iterator
+
+import xarray as xr
+from returns.result import ResultE
+
+from nwp_consumer.internal import entities
+
+log = logging.getLogger("nwp-consumer")
+
+
+class ModelRepository(abc.ABC):
+    """Interface for a repository that produces raw NWP data.
+
+    Since different producers of NWP data have different data storage
+    implementations, a ModelRepository needs to define its own download
+    and processing methods.
+
+    A source may provide one or more files for a given init time.
+    To keep memory usage at a minimum, when converting raw data to zarr,
+    converted data is persisted to disk in a store.
+    In this manner, writes can be done in parallel, but a schema needs to be known
+    in advance.
+
+    As such, an important distinction is made between:
+    - the *fileset*: Raw store data for an init time
+    - the *store*: The Zarr store containing the processed data
+    """
+
+    @classmethod
+    @abc.abstractmethod
+    def authenticate(cls) -> ResultE["ModelRepository"]:
+        """Create a new authenticated instance of the class."""
+        pass
+
+
+    @abc.abstractmethod
+    def fetch_init_data(self, it: dt.datetime) \
+            -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]:
+        """Fetch raw data files for an init time as xarray datasets.
+
+        As per the typing, the return value is a generator of functions that
+        may produce one or more xarray datasets.
+        The generator-of-functions approach (typed here as ``Iterator[Callable...]``)
+        is important, as it allows for lazy evaluation:
+        by returning a generator of delayed objects, joblib can parallelize
+        the download and the results can be accumulated in a low-memory fashion (see
+        `the JobLib documentation on parallel generators
+        `_).
+
+        An example pseudocode implementation is shown below:
+
+        >>> from joblib import delayed
+        >>> from returns.result import ResultE, Success
+        >>> from typing import override
+        >>> from collections.abc import Callable, Iterator
+        >>> import xarray as xr
+        >>> import datetime as dt
+        >>>
+        >>> # Pseudocode for a model repository
+        >>> class MyModelRepository(ModelRepository):
+        ...     @override
+        ...     def fetch_init_data(self, it: dt.datetime) \
+        ...             -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]:
+        ...         for file in ["raw_file1.grib", "raw_file2.grib"]:
+        ...             yield delayed(self._download_and_convert)(file)
+        ...
+        ...     def _download_and_convert(self, file: str) -> ResultE[list[xr.DataArray]]:
+        ...         '''Download and convert a raw file to an xarray dataset.'''
+        ...         return Success([xr.open_dataset(file).to_dataarray()])
+
+        .. warning:: No downloading or processing should be done in this method. All of that
+            should be handled in the function that is yielded by the generator -
+            ``_download_and_convert`` in the example above.
+            This is to allow for parallelization of the download and processing.
+
+        .. note:: It is, however, worth considering the most efficient way to download and process
+            the data. The above assumes that the data comes in many files, but there is a possibility
+            of the case where the source provides one large file with many underlying datasets within.
+            In this case, it may be more efficient to download the large file in the
+            `fetch_init_data` method and then process the datasets within via the yielded functions.
+
+        .. note:: For the moment, this returns a list of ``xarray.DataArray`` objects. It may be
+            more efficient to return a generator here to avoid reading all the datasets into
+            memory at once; however, the source of these datasets is often ``cfgrib.open_datasets``,
+            which has no option for returning a generator, hence the current choice of ``list``.
+            This may be revisited in the future, for instance by recreating the ``open_datasets``
+            function in a manner which returns a generator of datasets.
+
+        Args:
+            it: The initialization time for which to fetch data.
+
+        Returns:
+            A generator of delayed xarray dataarrays for the init time.
+        """
+        pass
+
+
+    @staticmethod
+    @abc.abstractmethod
+    def repository() -> entities.ModelRepositoryMetadata:
+        """Metadata about the model repository."""
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def model() -> entities.ModelMetadata:
+        """Metadata about the model."""
+        pass
+
+
+class ZarrRepository(abc.ABC):
+    """Interface for a repository that stores Zarr NWP data."""
+
+    @abc.abstractmethod
+    def save(self, src: pathlib.Path, dst: pathlib.Path) -> ResultE[str]:
+        """Save NWP store data in the repository."""
+        pass
+
+
+class NotificationRepository(abc.ABC):
+    """Interface for a repository that sends notifications.
+
+    Adaptors for this port enable sending notifications to
+    a desired notification channel.
+    """
+
+    @abc.abstractmethod
+    def notify(
+        self,
+        message: entities.StoreAppendedNotification | entities.StoreCreatedNotification,
+    ) -> ResultE[str]:
+        """Send a notification."""
+        pass
diff --git a/src/nwp_consumer/internal/ports/services.py b/src/nwp_consumer/internal/ports/services.py
new file mode 100644
index 00000000..f19a1888
--- /dev/null
+++ b/src/nwp_consumer/internal/ports/services.py
@@ -0,0 +1,96 @@
+"""Service interfaces for consumer services.
+
+These interfaces define the signatures that *driving* actors must conform to
+in order to interact with the core.
+
+Also sometimes referred to as *primary ports*.
+"""
+
+import abc
+import datetime as dt
+
+from returns.result import ResultE
+
+from nwp_consumer.internal import entities
+
+
+class ConsumeUseCase(abc.ABC):
+    """Interface for the consumer use case.
+
+    Defines the business-critical methods for the following use cases:
+
+    - 'A user should be able to consume NWP data for a given initialization time.'
+    """
+
+
+    @abc.abstractmethod
+    def consume(self, it: dt.datetime | None = None) -> ResultE[str]:
+        """Consume NWP data to Zarr format for desired init time.
+
+        Where possible, the implementation should be memory-efficient.
+        The designs of the repository methods also enable parallel processing within
+        the implementation.
+
+        Args:
+            it: The initialization time for which to consume data.
+                If None, the latest available forecast should be consumed.
+
+        Returns:
+            The path to the produced Zarr store.
+
+        See Also:
+            - `repositories.ModelRepository.fetch_init_data`
+            - `tensorstore.TensorStore.write_to_region`
+            - https://joblib.readthedocs.io/en/stable/auto_examples/parallel_generator.html
+        """
+        pass
+
+    @abc.abstractmethod
+    def postprocess(self, options: entities.PostProcessOptions) -> ResultE[str]:
+        """Postprocess the produced Zarr according to given options."""
+        pass
+
+
+class ArchiveUseCase(abc.ABC):
+    """Interface for the archive use case.
+ + Defines the business-critical methods for the following use cases: + + - 'A user should be able to archive NWP data for a given time period.' + """ + + @abc.abstractmethod + def archive(self, year: int, month: int) -> ResultE[str]: + """Archive NWP data to Zarr format for the given month. + + Args: + year: The year for which to archive data. + month: The month for which to archive data. + + Returns: + The path to the produced Zarr store. + """ + pass + +class InfoUseCase(abc.ABC): + """Interface for the notification use case. + + Defines the business-critical methods for the following use cases: + + - 'A user should be able to retrieve information about the service.' + """ + + @abc.abstractmethod + def available_models(self) -> list[str]: + """Get a list of available models.""" + pass + + @abc.abstractmethod + def model_repository_info(self) -> str: + """Get information about the model repository.""" + pass + + @abc.abstractmethod + def model_info(self) -> str: + """Get information about the model.""" + pass diff --git a/src/nwp_consumer/internal/repositories/__init__.py b/src/nwp_consumer/internal/repositories/__init__.py new file mode 100644 index 00000000..7b5f9e02 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/__init__.py @@ -0,0 +1,38 @@ +"""Implementation of adaptors for driven actors. + +Driven actors +-------------- + +A driven actor is an external component that is acted upon by the core logic. +Also referred to as *secondary* actors, a driven actor represents an external +system that the core logic interacts with. They extend the core driven ports +(see `nwp_consumer.internal.ports`) in their implementation. + +Examples of driven or secondary actors include: + +- a database +- a message queue +- a filesystem + +Since they are stores of data, they are referred to in this package +(and often in hexagonal architecture documentation) as *repositories*. + +This module +----------- + +This module contains implementations for the following driven actors: + +- Notification Repository - Somewhere to send notifications to +- Model Repository - A source of NWP data + +Both inherit from the repository ports specified in the core via `nwp_consumer.internal.ports`. +""" +from . import ( + model_repositories, + notification_repositories, +) + +__all__ = [ + "model_repositories", + "notification_repositories", +] diff --git a/src/nwp_consumer/internal/repositories/model_repositories/__init__.py b/src/nwp_consumer/internal/repositories/model_repositories/__init__.py new file mode 100644 index 00000000..15950c80 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/__init__.py @@ -0,0 +1,17 @@ +"""Model Repositories + +TODO: Add description +""" + +from .ceda_ftp import CEDAFTPModelRepository +from .ecmwf_realtime import ECMWFRealTimeS3ModelRepository +from .noaa_s3 import NOAAS3ModelRepository +from .mo_datahub import MetOfficeDatahubModelRepository + +__all__ = [ + "CEDAFTPModelRepository", + "ECMWFRealTimeS3ModelRepository", + "NOAAS3ModelRepository", + "MetOfficeDatahubModelRepository", +] + diff --git a/src/nwp_consumer/internal/repositories/model_repositories/ceda_ftp.py b/src/nwp_consumer/internal/repositories/model_repositories/ceda_ftp.py new file mode 100644 index 00000000..d03e6a4f --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/ceda_ftp.py @@ -0,0 +1,306 @@ +"""Model repository implementation for MetOffice Global data from CEDA. 
+ +Repository information: +======================= + +The original model is from the UK Met Office, who don't provide their own archive. +CEDA (Centre for Environmental Data Analysis) host the data on their FTP server [2]. +The CEDA catalogue for the Met Office Global can be found +`here `_, +and the spec sheet from the Met Office is detailed in +`this PDF `_. + +For further details on the repository, see the +`CEDAFTPModelRepository.repository` implementation. + +Data discrepancies and corrections +================================== + +MetOffice global model data is stored on CEDA in segments: + +- 4 areas for the northern hemisphere (A,B,C,D) +- 4 areas for the southern hemisphere (E,F,G,H) + +Each area contains a subset of the data for a given time step. + +Documented structure +-------------------- + +According to the MetOffice documentation [2], the files have the following structure:: + + Northern hemisphere: + - AreaA: Lat: 89.9 -> 0.3, Lon: -45 -> 45 + - AreaB: Lat: 89.9 -> 0.3, Lon: 45 -> 135 + - AreaC: Lat: 89.9 -> 0.3, Lon: 135 -> -135 (wraps around 180) + - AreaD: Lat: 89.9 -> 0.3, Lon: -135 -> -45 + + Southern hemisphere: + - AreaE: Lat: -0.3 -> -89.9, Lon: -45 -> 45 + - AreaF: Lat: -0.3 -> -89.9, Lon: 45 -> 135 + - AreaG: Lat: -0.3 -> -89.9, Lon: 135 -> -135 (wraps around 180) + - AreaH: Lat: -0.3 -> -89.9, Lon: -135 -> -45 + +With steps of 0.153 degrees in latitude and 0.234 degrees in longitude. + +Actual structure +---------------- + +In my experience however, the data is not quite as described in the documentation. +Using the eccodes grib tool as shown:: + + $ grib_ls -n geography -wcount=13 file.grib + +I found that the grids are in fact as follows:: + + - AreaA: Lat: 0 -> 89.856, Lon: 315 -> 45.09 + - AreaB: Lat: 0 -> 89.856, Lon: 45 -> 135.09 + - AreaC: Lat: 0 -> 89.856, Lon: 135 -> 225.09 (wraps around 180) + - AreaD: Lat: 0 -> 89.856, Lon: 225 -> 315.09 + - AreaE: Lat: -89.856 -> 0, Lon: 315 -> 45.09 + - AreaF: Lat: -89.856 -> 0, Lon: 45 -> 135.09 + - AreaG: Lat: -89.856 -> 0, Lon: 135 -> 225.09 (wraps around 180) + - AreaH: Lat: -89.856 -> 0, Lon: 225 -> 315.09 + +With steps of 0.156 degrees in latitude and 0.234 degrees in longitude. + +.. important:: Key takeaways from this are: + + - The latitude values are in reverse order as described in the documentation + - The longitude values overlap each other and combine to form a non-uniform step size + - The step size is slightly different + - Smaller lat/lon chunks are needed to allow for the partial area files to be written + in parallel + +As a result, the incoming data is modified to alleviate these issues. 
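+
+In practice this amounts to the following correction per area file (a minimal sketch of
+what the `_convert` method in this module does; ``da`` is the DataArray read from a
+single area file)::
+
+    # Drop the final longitude value, which overlaps with the neighbouring area,
+    # and reverse the latitude dimension into descending order
+    da = da.isel(longitude=slice(None, -1), latitude=slice(None, None, -1))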
+ +""" + +import datetime as dt +import logging +import os +import pathlib +import urllib.parse +import urllib.request +from collections.abc import Callable, Iterator +from typing import override + +import numpy as np +import xarray as xr +from joblib import delayed +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class CEDAFTPModelRepository(ports.ModelRepository): + """Repository implementation for the MetOffice global model data.""" + + url_base: str = "ftp.ceda.ac.uk/badc/ukmo-nwp/data/global-grib" + """The base URL for the CEDA FTP server.""" + _url_auth: str + """The URL prefix containing authentication information.""" + + def __init__(self, url_auth: str) -> None: + """Create a new instance.""" + self._url_auth = url_auth + + + @staticmethod + @override + def repository() -> entities.ModelRepositoryMetadata: + return entities.ModelRepositoryMetadata( + name="CEDA", + is_archive=True, + is_order_based=False, + running_hours=[0, 12], # 6 and 18 exist, but are lacking variables + delay_minutes=(60 * 24 * 7) + (60 * 12), # 7.5 days + max_connections=20, + required_env=["CEDA_FTP_USER", "CEDA_FTP_PASS"], + optional_env={}, + postprocess_options=entities.PostProcessOptions(), + ) + + @staticmethod + @override + def model() -> entities.ModelMetadata: + return entities.ModelMetadata( + name="UM-Global", + resolution="17km", + expected_coordinates = entities.NWPDimensionCoordinateMap( + init_time=[], + step=list(range(0, 48, 1)), + variable=[ + entities.Parameter.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL, + entities.Parameter.CLOUD_COVER_TOTAL, + entities.Parameter.CLOUD_COVER_HIGH, + entities.Parameter.CLOUD_COVER_LOW, + entities.Parameter.CLOUD_COVER_MEDIUM, + entities.Parameter.RELATIVE_HUMIDITY_SL, + entities.Parameter.SNOW_DEPTH_GL, + entities.Parameter.TEMPERATURE_SL, + entities.Parameter.WIND_U_COMPONENT_10m, + entities.Parameter.WIND_V_COMPONENT_10m, + entities.Parameter.VISIBILITY_SL, + ], + latitude=[ + float(f"{lat:.4f}") for lat in np.arange(89.856, -89.856 - 0.156, -0.156) + ], + longitude=[ + float(f"{lon:.4f}") for lon in np.concatenate([ + np.arange(-45, 45, 0.234), + np.arange(45, 135, 0.234), + np.arange(135, 225, 0.234), + np.arange(225, 315, 0.234), + ]) + ], + ), + ) + + @override + def fetch_init_data(self, it: dt.datetime) \ + -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]: + + parameter_stubs: list[str] = [ + "Total_Downward_Surface_SW_Flux", + "high_cloud_amount", + "low_cloud_amount", + "medium_cloud_amount", + "relative_humidity_1_5m", + "snow_depth", + "temperature_1_5m", + # "total_cloud", + # "total_precipitation_rate", Exists, but only has 3 hourly steps + "visibility_1_5m", + "wind_u_10m", + "wind_v_10m", + ] + + for parameter in parameter_stubs: + for area in [f"Area{c}" for c in "ABCDEFGH"]: + url = ( + f"{self.url_base}/{it:%Y/%m/%d}/" + + f"{it:%Y%m%d%H}_WSGlobal17km_{parameter}_{area}_000144.grib" + ) + yield delayed(self._download_and_convert)(url=url) + + pass + + def _download_and_convert(self, url: str) -> ResultE[list[xr.DataArray]]: + """Download and convert a file to xarray DataArrays. + + Args: + url: The URL of the file to download. + """ + return self._download(url).bind(self._convert) + + @classmethod + @override + def authenticate(cls) -> ResultE["CEDAFTPModelRepository"]: + """Authenticate with the CEDA FTP server. + + Returns: + A Result containing the instantiated class if successful, or an error if not. 
+ """ + missing_envs = cls.repository().missing_required_envs() + if len(missing_envs) > 0: + return Failure(OSError( + f"Cannot authenticate with CEDA FTP service due to " + f"missing required environment variables: {', '.join(missing_envs)}", + )) + username: str = urllib.parse.quote(os.environ["CEDA_FTP_USER"]) + password: str = urllib.parse.quote(os.environ["CEDA_FTP_PASS"]) + + return Success(cls(url_auth=f"ftp://{username}:{password}@")) + + def _download(self, url: str) -> ResultE[pathlib.Path]: + """Download a file from the CEDA FTP server. + + Args: + url: The URL of the file to download. + """ + local_path: pathlib.Path = ( + pathlib.Path( + os.getenv( + "RAWDIR", + f"~/.local/cache/nwp/{self.repository().name}/{self.model().name}/raw", + ), + ) / url.split("/")[-1] + ).expanduser() + + # Don't download the file if it already exists + if not local_path.exists(): + local_path.parent.mkdir(parents=True, exist_ok=True) + log.debug("Sending request to CEDA FTP server for: '%s'", url) + try: + response = urllib.request.urlopen( # noqa: S310 + self._url_auth + url, + timeout=30, + ) + except Exception as e: + return Failure(OSError(f"Error fetching {url}: {e}")) + + local_path.parent.mkdir(parents=True, exist_ok=True) + log.debug("Downloading %s to %s", url, local_path) + try: + with local_path.open("wb") as f: + for chunk in iter(lambda: response.read(16 * 1024), b""): + f.write(chunk) + f.flush() + log.debug( + f"Downloaded '{url}' to '{local_path}' (%s bytes)", + local_path.stat().st_size, + ) + except Exception as e: + return Failure( + OSError( + f"Error saving '{url}' to '{local_path}': {e}", + ), + ) + + return Success(local_path) + + @staticmethod + def _convert(path: pathlib.Path) -> ResultE[list[xr.DataArray]]: + """Convert a local grib file to xarray DataArrays. + + Args: + path: The path to the file to convert. + """ + try: + ds: xr.Dataset = xr.open_dataset(path, engine="cfgrib") + except Exception as e: + return Failure( + OSError( + f"Error opening '{path}' as xarray Dataset: {e}", + ), + ) + try: + da: xr.DataArray = ( + entities.Parameter.rename_else_drop_ds_vars( + ds=ds, + allowed_parameters=CEDAFTPModelRepository.model().expected_coordinates.variable, + ) + .sel(step=[np.timedelta64(i, "h") for i in range(0, 48, 1)]) + .expand_dims(dim={"init_time": [ds["time"].values]}) + .drop_vars( + names=[ + v + for v in ds.coords.variables + if v not in ["init_time", "step", "latitude", "longitude"] + ], + ) + .to_dataarray(name=CEDAFTPModelRepository.model().name) + .transpose("init_time", "step", "variable", "latitude", "longitude") + # Remove the last value of the longitude dimension as it overlaps with the next file + # Reverse the latitude dimension to be in descending order + .isel(longitude=slice(None, -1), latitude=slice(None, None, -1)) + ) + except Exception as e: + return Failure( + ValueError( + f"Error processing {path} to DataArray: {e}", + ), + ) + return Success([da]) diff --git a/src/nwp_consumer/internal/repositories/model_repositories/ecmwf_realtime.py b/src/nwp_consumer/internal/repositories/model_repositories/ecmwf_realtime.py new file mode 100644 index 00000000..b4f10fe5 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/ecmwf_realtime.py @@ -0,0 +1,321 @@ +"""Model repository implementation for ECMWF live data from S3. 
+ +Repository Information +====================== + +Documented Structure +-------------------- + +When getting live or realtime data from ECMWF, grib files are sent by +a data provider to a location of choice, in this case an S3 bucket. +The `ECMWF Dissemination Schedule `_ +describes the naming convention and time ordering for these files: + +- A 2-character prefix +- A 1-character dissemination stream indicator +- 8 digits representing the initialization time in the format mmddHHMM +- 8 digits representing the target time in the format mmddHHMM +- 1 digit representing the file number(?) + +So a file named ``A2D10250000D10260100`` would be for an initialization +time of 2024-10-25 00:00 and a target time of 2024-10-26 01:00 (step of 25 hours). + +The file contents is specific to the order agreed with the data provider. +For the order that OCF has created, there are four distinct datasets. +This is because OCF has ordered two separate regions and 17 variables, +which are split across two datasets. + +Also, some of the data contains larger steps than we are interested in due +to necessities in the order creation process. + +""" + +import datetime as dt +import logging +import os +import pathlib +import re +from collections.abc import Callable, Iterator +from typing import override + +import cfgrib +import s3fs +import xarray as xr +from joblib import delayed +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class ECMWFRealTimeS3ModelRepository(ports.ModelRepository): + """Model repository implementation for ECMWF live data from S3.""" + + bucket: str + _fs: s3fs.S3FileSystem + + def __init__(self, bucket: str, fs: s3fs.S3FileSystem) -> None: + """Create a new instance of the class.""" + self.bucket = bucket + self._fs = fs + + + @staticmethod + @override + def repository() -> entities.ModelRepositoryMetadata: + return entities.ModelRepositoryMetadata( + name="ECMWF-Realtime-S3", + is_archive=False, + is_order_based=True, + running_hours=[0, 6, 12, 18], + delay_minutes=(60 * 6), # 6 hours + max_connections=100, + required_env=[ + "ECMWF_REALTIME_S3_ACCESS_KEY", + "ECMWF_REALTIME_S3_ACCESS_SECRET", + "ECMWF_REALTIME_S3_BUCKET", + "ECMWF_REALTIME_S3_REGION", + ], + optional_env={ + "ECMWF_REALTIME_DISSEMINATION_FILE_PREFIX": "A2", + "ECMWF_REALTIME_S3_BUCKET_PREFIX": "ecmwf", + }, + postprocess_options=entities.PostProcessOptions(), + ) + + @staticmethod + @override + def model() -> entities.ModelMetadata: + return entities.ModelMetadata( + name="HRES-IFS", + resolution="0.1 degrees", + expected_coordinates=entities.NWPDimensionCoordinateMap( + init_time=[], + step=list(range(0, 85, 1)), + variable=sorted([ + entities.Parameter.WIND_U_COMPONENT_10m, + entities.Parameter.WIND_V_COMPONENT_10m, + entities.Parameter.WIND_U_COMPONENT_100m, + entities.Parameter.WIND_V_COMPONENT_100m, + entities.Parameter.WIND_U_COMPONENT_200m, + entities.Parameter.WIND_V_COMPONENT_200m, + entities.Parameter.TEMPERATURE_SL, + entities.Parameter.TOTAL_PRECIPITATION_RATE_GL, + entities.Parameter.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL, + entities.Parameter.DOWNWARD_LONGWAVE_RADIATION_FLUX_GL, + entities.Parameter.CLOUD_COVER_HIGH, + entities.Parameter.CLOUD_COVER_MEDIUM, + entities.Parameter.CLOUD_COVER_LOW, + entities.Parameter.CLOUD_COVER_TOTAL, + entities.Parameter.SNOW_DEPTH_GL, + entities.Parameter.VISIBILITY_SL, + entities.Parameter.DIRECT_SHORTWAVE_RADIATION_FLUX_GL, + 
entities.Parameter.DOWNWARD_ULTRAVIOLET_RADIATION_FLUX_GL, + ]), + latitude=[float(f"{lat / 10:.2f}") for lat in range(900, -900 - 1, -1)], + longitude=[float(f"{lon / 10:.2f}") for lon in range(-1800, 1800 + 1, 1)], + ), + ) + + @override + def fetch_init_data(self, it: dt.datetime) \ + -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]: + # List relevant files in the S3 bucket + try: + urls: list[str] = [ + f"s3://{f}" + for f in self._fs.ls(f"{self.bucket}/ecmwf") + if self._wanted_file( + filename=f.split("/")[-1], + it=it, + max_step=max(self.model().expected_coordinates.step), + ) + ] + except Exception as e: + yield delayed(Failure)(ValueError( + f"Failed to list files in bucket path '{self.bucket}/ecmwf'. " + "Ensure the path exists and the caller has relevant access permissions. " + f"Encountered error: {e}", + )) + return + + if len(urls) == 0: + yield delayed(Failure)(ValueError( + f"No raw files found for init time '{it.strftime('%Y-%m-%d %H:%M')}' " + f"in bucket path '{self.bucket}/ecmwf'. Ensure files exist at the given path " + "named with the expected pattern, e.g. 'A2S10250000102603001.", + )) + return + + log.debug( + f"Found {len(urls)} file(s) for init time '{it.strftime('%Y-%m-%d %H:%M')}' " + f"in bucket path '{self.bucket}/ecmwf'.", + ) + for url in urls: + yield delayed(self._download_and_convert)(url=url) + + @classmethod + @override + def authenticate(cls) -> ResultE["ECMWFRealTimeS3ModelRepository"]: + missing_envs = cls.repository().missing_required_envs() + if len(missing_envs) > 0: + return Failure(OSError( + f"Cannot authenticate with ECMWF Realtime S3 service due to " + f"missing required environment variables: {', '.join(missing_envs)}", + )) + try: + bucket: str = os.environ["ECMWF_REALTIME_S3_BUCKET"] + _fs: s3fs.S3FileSystem = s3fs.S3FileSystem( + key=os.environ["ECMWF_REALTIME_S3_ACCESS_KEY"], + secret=os.environ["ECMWF_REALTIME_S3_ACCESS_SECRET"], + client_kwargs={ + "endpoint_url": os.environ.get("AWS_ENDPOINT_URL", None), + "region_name": os.environ["ECMWF_REALTIME_S3_REGION"], + }, + ) + except Exception as e: + return Failure(ConnectionError( + "Failed to connect to S3 for ECMWF data. " + f"Credentials may be wrong or undefined. Encountered error: {e}", + )) + + log.debug(f"Successfully authenticated with S3 instance '{bucket}'") + return Success(cls(bucket=bucket, fs=_fs)) + + + def _download_and_convert(self, url: str) -> ResultE[list[xr.DataArray]]: + """Download and convert a file to xarray DataArrays. + + Args: + url: The URL of the file to download. + """ + return self._download(url=url).bind(self._convert) + + def _download(self, url: str) -> ResultE[pathlib.Path]: + """Download an ECMWF realtime file from S3. + + Args: + url: The URL to the S3 object. + """ + local_path: pathlib.Path = ( + pathlib.Path( + os.getenv( + "RAWDIR", + f"~/.local/cache/nwp/{self.repository().name}/{self.model().name}/raw", + ), + ) / url.split("/")[-1] + ).with_suffix(".grib").expanduser() + + # Only download the file if not already present + if not local_path.exists() or local_path.stat().st_size == 0: + local_path.parent.mkdir(parents=True, exist_ok=True) + log.debug("Requesting file from S3 at: '%s'", url) + + try: + if not self._fs.exists(url): + raise FileNotFoundError(f"File not found at '{url}'") + + with local_path.open("wb") as lf, self._fs.open(url, "rb") as rf: + for chunk in iter(lambda: rf.read(12 * 1024), b""): + lf.write(chunk) + lf.flush() + + except Exception as e: + return Failure(OSError( + f"Failed to download file from S3 at '{url}'. 
Encountered error: {e}", + )) + + if local_path.stat().st_size != self._fs.info(url)["size"]: + return Failure(ValueError( + f"Failed to download file from S3 at '{url}'. " + "File size mismatch. File may be corrupted.", + )) + + return Success(local_path) + + @staticmethod + def _convert(path: pathlib.Path) -> ResultE[list[xr.DataArray]]: + """Convert a grib file to an xarray DataArray. + + Args: + path: The path to the grib file. + """ + try: + dss: list[xr.Dataset] = cfgrib.open_datasets(path.as_posix()) + except Exception as e: + return Failure(OSError( + f"Error opening '{path}' as list of xarray Datasets: {e}", + )) + if len(dss) == 0: + return Failure(ValueError( + f"No datasets found in '{path}'. File may be corrupted. " + "A redownload of the file may be required.", + )) + + processed_das: list[xr.DataArray] = [] + for i, ds in enumerate(dss): + try: + da: xr.DataArray = ( + entities.Parameter.rename_else_drop_ds_vars( + ds=ds, + allowed_parameters=ECMWFRealTimeS3ModelRepository.model().expected_coordinates.variable, + ) + .rename(name_dict={"time": "init_time"}) + .expand_dims(dim="init_time") + .expand_dims(dim="step") + .to_dataarray(name=ECMWFRealTimeS3ModelRepository.model().name) + ) + da = ( + da.drop_vars( + names=[ + c for c in ds.coords + if c not in ["init_time", "step", "variable", "latitude", "longitude"] + ], + errors="ignore", + ) + .transpose("init_time", "step", "variable", "latitude", "longitude") + .sortby(variables=["step", "variable", "longitude"]) + .sortby(variables="latitude", ascending=False) + ) + except Exception as e: + return Failure(ValueError( + f"Error processing dataset {i} from '{path}' to DataArray: {e}", + )) + # Put each variable into its own DataArray: + # * Each raw file does not contain a full set of parameters + # * and so may not produce a contiguous subset of the expected coordinates. + processed_das.extend( + [ + da.where(cond=da["variable"] == v, drop=True) + for v in da["variable"].values + ], + ) + + return Success(processed_das) + + @staticmethod + def _wanted_file(filename: str, it: dt.datetime, max_step: int) -> bool: + """Determine if the file is wanted based on the init time. + + See module docstring for the file naming convention. + Returns True if the filename describes data corresponding to the input + initialization time and model metadata. + + Args: + filename: The name of the file. + it: The init time of the model run. + max_step: The maximum step in hours to consider. + """ + prefix: str = os.getenv("ECMWF_DISSEMINATION_REALTIME_FILE_PREFIX", "A2") + pattern: str = r"^" + prefix + r"[DS](\d{8})(\d{8})\d$" + match: re.Match[str] | None = re.search(pattern=pattern, string=filename) + if match is None: + return False + if it.strftime("%m%d%H%M") != match.group(1): + return False + tt: dt.datetime = dt.datetime.strptime( + str(it.year) + match.group(2) + "+0000", + "%Y%m%d%H%M%z", + ) + return tt < it + dt.timedelta(hours=max_step) diff --git a/src/nwp_consumer/internal/repositories/model_repositories/mo_datahub.py b/src/nwp_consumer/internal/repositories/model_repositories/mo_datahub.py new file mode 100644 index 00000000..d2ae3213 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/mo_datahub.py @@ -0,0 +1,332 @@ +"""Repository implementation for data from MetOffice's DataHub service. 
+ +Repository Information +====================== + +The API documentation for the MetOffice Weather Datahub can be found at: +https://datahub.metoffice.gov.uk/docs/f/category/atmospheric/type/atmospheric/api-documentation + +Documented Structure +-------------------- + +MetOffice provide a number of models, a few of which OCF consume. Their flagship deterministic +model us called the "Unified Model" (UM) and is run in two configurations: "Global" and "UK". +The "Global" model has a resolution of 10km and the "UK" model has a resolution of 2km. + +See https://datahub.metoffice.gov.uk/docs/f/category/atmospheric/overview for more information. + +Data is provided on a per-order basis, so the filestructure depends on the order ID. +For OCF's purposes, on file per parameter per step is requested. + +Actual Structure +---------------- + +The latitude and longitude increments are ascertained from the GRIB2 file's metadata: +.. code-block:: none + + iDirectionIncrementInDegrees: 0.140625 + jDirectionIncrementInDegrees: 0.09375 + +""" + +import datetime as dt +import json +import logging +import os +import pathlib +import urllib.error +import urllib.request +from collections.abc import Callable, Iterator +from typing import TYPE_CHECKING, ClassVar, override + +import numpy as np +import xarray as xr +from joblib import delayed +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +if TYPE_CHECKING: + import http.client + +log = logging.getLogger("nwp-consumer") + + +class MetOfficeDatahubModelRepository(ports.ModelRepository): + """Repository implementation for data from MetOffice's DataHub service.""" + + base_url: ClassVar[str] = "https://data.hub.api.metoffice.gov.uk/atmospheric-models/1.0.0/orders" + + request_url: str + order_id: str + _headers: dict[str, str] + + def __init__(self, order_id: str, api_key: str) -> None: + """Create a new instance.""" + self._headers = { + "Accept": "application/json", + "apikey": api_key, + } + self.order_id = order_id + self.request_url = f"{self.base_url}/{self.order_id}/latest" + + + @staticmethod + @override + def repository() -> entities.ModelRepositoryMetadata: + return entities.ModelRepositoryMetadata( + name="MetOffice-Weather-Datahub", + is_archive=False, + is_order_based=True, + running_hours=[0, 12], + delay_minutes=60, + max_connections=10, + required_env=["METOFFICE_API_KEY", "METOFFICE_ORDER_ID"], + optional_env={}, + postprocess_options=entities.PostProcessOptions(), + ) + + @staticmethod + @override + def model() -> entities.ModelMetadata: + return entities.ModelMetadata( + name="UM-Global", + resolution="10km", + expected_coordinates=entities.NWPDimensionCoordinateMap( + init_time=[], + step=list(range(0, 55)), + variable=sorted( + [ + entities.Parameter.CLOUD_COVER_TOTAL, + entities.Parameter.CLOUD_COVER_HIGH, + entities.Parameter.CLOUD_COVER_MEDIUM, + entities.Parameter.CLOUD_COVER_LOW, + entities.Parameter.VISIBILITY_SL, + entities.Parameter.RELATIVE_HUMIDITY_SL, + entities.Parameter.SNOW_DEPTH_GL, + entities.Parameter.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL, + entities.Parameter.TEMPERATURE_SL, + entities.Parameter.WIND_U_COMPONENT_10m, + entities.Parameter.WIND_V_COMPONENT_10m, + ], + ), + latitude=[ + float(f"{lat:.4f}") + for lat in np.arange(89.953125, -89.953125 - 0.09375, -0.09375) + ], + longitude=[ + float(f"{lon:.4f}") + for lon in np.arange(-179.929687, 179.929688 + 0.140625, 0.140625) + ], + ), + ) + + @classmethod + @override + def authenticate(cls) -> 
ResultE["MetOfficeDatahubModelRepository"]: + """Authenticate with the MetOffice DataHub service.""" + missing_envs = cls.repository().missing_required_envs() + if len(missing_envs) > 0: + return Failure(OSError( + f"Cannot authenticate with MetOffice DataHub service due to " + f"missing required environment variables: {', '.join(missing_envs)}", + )) + api_key: str = os.environ["METOFFICE_API_KEY"] + order_id: str = os.environ["METOFFICE_ORDER_ID"] + return Success(cls(order_id=order_id, api_key=api_key)) + + @override + def fetch_init_data( + self, it: dt.datetime, + ) -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]: + """Fetch raw data files for an init time as xarray datasets.""" + req: urllib.request.Request = urllib.request.Request( # noqa: S310 + url=self.request_url + f"?detail=MINIMAL&runfilter={it:%Y%m%d%H}", + headers=self._headers, + method="GET", + ) + log.debug( + f"Calling MetOffice Datahub at '{req.get_full_url()}'", + ) + + # Request the list of files + try: + response: http.client.HTTPResponse = urllib.request.urlopen(req, timeout=30) # noqa: S310 + except Exception as e: + yield delayed(Failure)(OSError( + "Unable to list files from MetOffice DataHub for order " + f"{self.order_id} at '{self.request_url}'. " + f"Ensure API key and Order ID are correct. Error context: {e}", + )) + return + try: + data = json.loads( + response.read().decode(response.info().get_param("charset") or "utf-8"), # type: ignore + ) + except Exception as e: + yield delayed(Failure)(ValueError( + "Unable to decode JSON response from MetOffice DataHub. " + "Check the response from the '/latest' endpoint looks as expected. " + f"Error context: {e}", + )) + return + urls: list[str] = [] + if "orderDetails" in data and "files" in data["orderDetails"]: + for filedata in data["orderDetails"]["files"]: + if "fileId" in filedata and "+" not in filedata["fileId"]: + urls.append(f"{self.request_url}/{filedata["fileId"]}/data") + + log.debug( + f"Found {len(urls)} file(s) for init time '{it.strftime('%Y-%m-%d %H:%M')}' " + f"in order '{self.order_id}'.", + ) + + for url in urls: + yield delayed(self._download_and_convert)(url) + + def _download_and_convert(self, url: str) -> ResultE[list[xr.DataArray]]: + """Download and convert a grib file from MetOffice Weather Datahub API. + + Args: + url: The URL of the file of interest. + """ + return self._download(url).bind(self._convert) + + def _download(self, url: str) -> ResultE[pathlib.Path]: + """Download a grib file from MetOffice Weather Datahub API. + + Args: + url: The URL of the file of interest. 
+ """ + local_path: pathlib.Path = ( + pathlib.Path( + os.getenv( + "RAWDIR", + f"~/.local/cache/nwp/{self.repository().name}/{self.model().name}/raw", + ), + ) / f"{url.split("/")[-2]}.grib" + ).expanduser() + + # Only download the file if not already present + if not local_path.exists() or local_path.stat().st_size == 0: + local_path.parent.mkdir(parents=True, exist_ok=True) + log.debug("Requesting file from MetOffice Weather Datahub API at: '%s'", url) + + req: urllib.request.Request = urllib.request.Request( # noqa: S310 + url=url, + headers=self._headers | {"Accept": "application/x-grib"}, + method="GET", + ) + + # Request the file + try: + response: http.client.HTTPResponse = urllib.request.urlopen( # noqa: S310 + req, + timeout=60, + ) + except Exception as e: + return Failure(OSError( + "Unable to request file data from MetOffice DataHub at " + f"'{url}': {e}", + )) + + # Download the file + log.debug("Downloading %s to %s", url, local_path) + try: + with local_path.open("wb") as f: + for chunk in iter(lambda: response.read(16 * 1024), b""): + f.write(chunk) + f.flush() + log.debug( + f"Downloaded '{url}' to '{local_path}' (%s bytes)", + local_path.stat().st_size, + ) + except Exception as e: + return Failure( + OSError( + f"Error saving '{url}' to '{local_path}': {e}", + ), + ) + + return Success(local_path) + + @staticmethod + def _convert(path: pathlib.Path) -> ResultE[list[xr.DataArray]]: + """Convert a local grib file to xarray DataArrays. + + Args: + path: The path to the file to convert. + """ + try: + # Read the file as a dataset, also reading the values of the keys in 'read_keys' + ds: xr.Dataset = xr.open_dataset( + path, + engine="cfgrib", + backend_kwargs={"read_keys": ["name", "parameterNumber"], "indexpath": ""}, + chunks={ + "time": 1, + "step": -1, + }, + ) + except Exception as e: + return Failure( + OSError( + f"Error opening '{path}' as xarray Dataset: {e}", + ), + ) + + # Some parameters are surfaced in the dataset as 'unknown' + # and have to be differentiated via the parameterNumber attribute + # which lines up with the last number in the GRIB2 code specified below + # https://datahub.metoffice.gov.uk/docs/glossary?sortOrder=GRIB2_CODE + name = next(iter(ds.data_vars)) + parameter_number = ds[name].attrs["GRIB_parameterNumber"] + match name, parameter_number: + case "unknown", 192: + ds = ds.rename({name: "u10"}) + case "unknown", 193: + ds = ds.rename({name: "v10"}) + case "unknown", 194: + ds = ds.rename({name: "wdir"}) + case "unknown", 195: + ds = ds.rename({name: "wdir10"}) + case "unknown", 1: + ds = ds.rename({name: "tcc"}) + case "unknown", _: + log.warning( + f"Encountered unknown parameter with parameterNumber {parameter_number} " + f"in file '{path}'.", + ) + + try: + da: xr.DataArray = ( + ds.pipe( + entities.Parameter.rename_else_drop_ds_vars, + allowed_parameters=MetOfficeDatahubModelRepository.model().expected_coordinates.variable, + ) + .rename(name_dict={"time": "init_time"}) + .expand_dims(dim="init_time") + .expand_dims(dim="step") + .to_dataarray(name=MetOfficeDatahubModelRepository.model().name) + ) + da = ( + da.drop_vars( + names=[ + c for c in ds.coords + if c not in ["init_time", "step", "variable", "latitude", "longitude"] + ], + errors="ignore", + ) + .transpose("init_time", "step", "variable", "latitude", "longitude") + .sortby(variables=["step", "variable", "longitude"]) + .sortby(variables="latitude", ascending=False) + ) + except Exception as e: + return Failure( + ValueError( + f"Error processing DataArray for path '{path}'. 
Error context: {e}", + ), + ) + + + return Success([da]) diff --git a/src/nwp_consumer/internal/repositories/model_repositories/noaa_s3.py b/src/nwp_consumer/internal/repositories/model_repositories/noaa_s3.py new file mode 100644 index 00000000..1766ad59 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/noaa_s3.py @@ -0,0 +1,294 @@ +"""Repository implementation for NOAA GFS data stored in S3. + +This module contains the implementation of the model repository for the +NOAA GFS data stored in an S3 bucket. + +Repository Information +====================== + +TODO: provide links etc + +Documented Structure +-------------------- + +TODO: document filestructure +""" + +import datetime as dt +import logging +import os +import pathlib +import re +from collections.abc import Callable, Iterator +from typing import override + +import cfgrib +import s3fs +import xarray as xr +from joblib import delayed +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class NOAAS3ModelRepository(ports.ModelRepository): + """Model repository implementation for GFS data stored in S3.""" + + @staticmethod + @override + def repository() -> entities.ModelRepositoryMetadata: + return entities.ModelRepositoryMetadata( + name="NOAA-GFS-S3", + is_archive=False, + is_order_based=False, + running_hours=[0, 6, 12, 18], + delay_minutes=(60 * 24 * 7), # 1 week + max_connections=100, + required_env=[], + optional_env={}, + postprocess_options=entities.PostProcessOptions(), + ) + + @staticmethod + @override + def model() -> entities.ModelMetadata: + return entities.ModelMetadata( + name="NCEP-GFS", + resolution="1 degree", + expected_coordinates=entities.NWPDimensionCoordinateMap( + init_time=[], + step=list(range(0, 49, 3)), + variable=sorted( + [ + entities.Parameter.TEMPERATURE_SL, + entities.Parameter.CLOUD_COVER_TOTAL, + entities.Parameter.CLOUD_COVER_HIGH, + entities.Parameter.CLOUD_COVER_MEDIUM, + entities.Parameter.CLOUD_COVER_LOW, + entities.Parameter.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL, + entities.Parameter.DOWNWARD_LONGWAVE_RADIATION_FLUX_GL, + entities.Parameter.TOTAL_PRECIPITATION_RATE_GL, + entities.Parameter.SNOW_DEPTH_GL, + entities.Parameter.RELATIVE_HUMIDITY_SL, + entities.Parameter.VISIBILITY_SL, + entities.Parameter.WIND_U_COMPONENT_10m, + entities.Parameter.WIND_V_COMPONENT_10m, + entities.Parameter.WIND_U_COMPONENT_100m, + entities.Parameter.WIND_V_COMPONENT_100m, + ], + ), + latitude=[float(lat) for lat in range(90, -90 - 1, -1)], + longitude=[float(lon) for lon in range(-180, 180 + 1, 1)], + ), + ) + + @override + def fetch_init_data( + self, it: dt.datetime, + ) -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]: + # List relevant files in the s3 bucket + bucket_path: str = f"noaa-gfs-bdp-pds/gfs.{it:%Y%m%d}/{it:%H}/atmos" + try: + fs = s3fs.S3FileSystem(anon=True) + urls: list[str] = [ + f"s3://{f}" + for f in fs.ls(bucket_path) + if self._wanted_file( + filename=f.split("/")[-1], + it=it, + max_step=max(self.model().expected_coordinates.step), + ) + ] + except Exception as e: + yield delayed(Failure)( + ValueError( + f"Failed to list file in bucket path '{bucket_path}'. " + "Ensure the path exists and the bucket does not require auth. " + f"Encountered error: '{e}'", + ), + ) + return + + if len(urls) == 0: + yield delayed(Failure)( + ValueError( + f"No files found for init time '{it:%Y-%m-%d %H:%M}'. " + "in bucket path '{bucket_path}'. 
Ensure files exists at the given path " + "with the expected filename pattern. ", + ), + ) + + for url in urls: + yield delayed(self._download_and_convert)(url=url) + + @classmethod + @override + def authenticate(cls) -> ResultE["NOAAS3ModelRepository"]: + return Success(cls()) + + def _download_and_convert(self, url: str) -> ResultE[list[xr.DataArray]]: + """Download and convert a file from S3. + + Args: + url: The URL to the S3 object. + """ + return self._download(url).bind(self._convert) + + def _download(self, url: str) -> ResultE[pathlib.Path]: + """Download an ECMWF realtime file from S3. + + Args: + url: The URL to the S3 object. + """ + local_path: pathlib.Path = ( + pathlib.Path( + os.getenv( + "RAWDIR", + f"~/.local/cache/nwp/{self.repository().name}/{self.model().name}/raw", + ), + ) / url.split("/")[-1] + ).with_suffix(".grib").expanduser() + + # Only download the file if not already present + if not local_path.exists(): + local_path.parent.mkdir(parents=True, exist_ok=True) + log.debug("Requesting file from S3 at: '%s'", url) + + fs = s3fs.S3FileSystem(anon=True) + try: + if not fs.exists(url): + raise FileNotFoundError(f"File not found at '{url}'") + + with local_path.open("wb") as lf, fs.open(url, "rb") as rf: + for chunk in iter(lambda: rf.read(12 * 1024), b""): + lf.write(chunk) + lf.flush() + + except Exception as e: + return Failure(OSError( + f"Failed to download file from S3 at '{url}'. Encountered error: {e}", + )) + + if local_path.stat().st_size != fs.info(url)["size"]: + return Failure(ValueError( + f"Failed to download file from S3 at '{url}'. " + "File size mismatch. File may be corrupted.", + )) + + # Also download the associated index file + # * This isn't critical, but speeds up reading the file in when converting + # TODO: Re-incorporate this when https://github.com/ecmwf/cfgrib/issues/350 + # TODO: is resolved. Currently downloaded index files are ignored due to + # TODO: path differences once downloaded. + index_url: str = url + ".idx" + index_path: pathlib.Path = local_path.with_suffix(".grib.idx") + try: + with index_path.open("wb") as lf, fs.open(index_url, "rb") as rf: + for chunk in iter(lambda: rf.read(12 * 1024), b""): + lf.write(chunk) + lf.flush() + except Exception as e: + log.warning( + f"Failed to download index file from S3 at '{url}'. " + "This will require a manual indexing when converting the file. " + f"Encountered error: {e}", + ) + + return Success(local_path) + + @staticmethod + def _convert(path: pathlib.Path) -> ResultE[list[xr.DataArray]]: + """Convert a GFS file to an xarray DataArray collection. + + Args: + path: The path to the local grib file. + """ + try: + # Use some options when opening the datasets: + # * 'squeeze' reduces length-1- dimensions to scalar coordinates, + # thus single-level variables should not have any extra dimensions + # * 'filter_by_keys' reduces the number of variables loaded to only those + # in the expected list + dss: list[xr.Dataset] = cfgrib.open_datasets( + path.as_posix(), + backend_kwargs={ + "squeeze": True, + "filter_by_keys": { + "shortName": [ + x for v in NOAAS3ModelRepository.model().expected_coordinates.variable + for x in v.metadata().alternate_shortnames + ], + }, + }, + ) + except Exception as e: + return Failure(ValueError( + f"Error opening '{path}' as list of xarray Datasets: {e}", + )) + + if len(dss) == 0: + return Failure(ValueError( + f"No datasets found in '{path}'. File may be corrupted. 
" + "A redownload of the file may be required.", + )) + + processed_das: list[xr.DataArray] = [] + for i, ds in enumerate(dss): + try: + ds = entities.Parameter.rename_else_drop_ds_vars( + ds=ds, + allowed_parameters=NOAAS3ModelRepository.model().expected_coordinates.variable, + ) + # Ignore datasets with no variables of interest + if len(ds.data_vars) == 0: + continue + # Ignore datasets with multi-level variables + # * This would not work without the "squeeze" option in the open_datasets call, + # which reduces single-length dimensions to scalar coordinates + if any(x not in ["latitude", "longitude" ,"time"] for x in ds.dims): + continue + da: xr.DataArray = ( + ds + .rename(name_dict={"time": "init_time"}) + .expand_dims(dim="init_time") + .expand_dims(dim="step") + .to_dataarray(name=NOAAS3ModelRepository.model().name) + ) + da = ( + da.drop_vars( + names=[ + c for c in da.coords + if c not in ["init_time", "step", "variable", "latitude", "longitude"] + ], + errors="raise", + ) + .transpose("init_time", "step", "variable", "latitude", "longitude") + .assign_coords(coords={"longitude": (da.coords["longitude"] + 180) % 360 - 180}) + .sortby(variables=["step", "variable", "longitude"]) + .sortby(variables="latitude", ascending=False) + ) + except Exception as e: + return Failure(ValueError( + f"Error processing dataset {i} from '{path}' to DataArray: {e}", + )) + processed_das.append(da) + + return Success(processed_das) + + @staticmethod + def _wanted_file(filename: str, it: dt.datetime, max_step: int) -> bool: + """Determine if a file is wanted based on the init time and max step. + + See module docstring for file naming convention. + """ + pattern: str = r"^gfs\.t(\d{2})z\.pgrb2\.1p00\.f(\d{3})$" + match: re.Match[str] | None = re.search(pattern=pattern, string=filename) + if match is None: + return False + if int(match.group(1)) != it.hour: + return False + return not int(match.group(2)) > max_step + + diff --git a/src/nwp_consumer/internal/repositories/model_repositories/test_ceda_ftp.py b/src/nwp_consumer/internal/repositories/model_repositories/test_ceda_ftp.py new file mode 100644 index 00000000..3d372c24 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/test_ceda_ftp.py @@ -0,0 +1,76 @@ +import dataclasses +import datetime as dt +import os +import unittest + +from returns.pipeline import flow, is_successful +from returns.pointfree import bind + +from nwp_consumer.internal import entities + +from .ceda_ftp import CEDAFTPModelRepository + + +class TestCEDAFTPModelRepository(unittest.TestCase): + """Test the business methods of the CEDAFTPModelRepository class.""" + + @unittest.skipIf( + condition="CI" in os.environ, + reason="Skipping integration test that requires FTP access.", + ) + def test__download_and_convert(self) -> None: + """Test the _download_and_convert method.""" + + auth_result = CEDAFTPModelRepository.authenticate() + self.assertTrue(is_successful(auth_result), msg=f"Error: {auth_result}") + c = auth_result.unwrap() + + test_it: dt.datetime = dt.datetime(2021, 1, 1, 0, tzinfo=dt.UTC) + test_coordinates: entities.NWPDimensionCoordinateMap = dataclasses.replace( + c.model().expected_coordinates, + init_time=[test_it], + ) + + @dataclasses.dataclass + class TestCase: + area: str + crop: str | None = None + + @property + def url(self) -> str: + return "".join( + ( + c.url_base, + f"/{test_it:%Y/%m/%d}", + f"/{test_it:%Y%m%d%H}_WSGlobal17km_Total_Downward_Surface_SW_Flux_{self.area}_000144.grib", + ), + ) + + tests = [ + TestCase(area="AreaC", 
crop="east"), + TestCase(area="AreaG", crop="west"), + TestCase(area="AreaE"), + ] + + for test in tests: + with (self.subTest(area=test.area)): + result = c._download_and_convert(test.url) + + self.assertTrue(is_successful(result), msg=f"Error: {result}") + + for da in result.unwrap(): + # Check resultant arrays are a subset of the expected coordinates + subset_result = flow( + da, + entities.NWPDimensionCoordinateMap.from_xarray, + bind(test_coordinates.determine_region), + ) + + self.assertTrue( + is_successful(subset_result), + msg=f"Error: {subset_result}", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/nwp_consumer/internal/repositories/model_repositories/test_ecmwf_realtime.py b/src/nwp_consumer/internal/repositories/model_repositories/test_ecmwf_realtime.py new file mode 100644 index 00000000..50de560a --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/test_ecmwf_realtime.py @@ -0,0 +1,108 @@ +import dataclasses +import datetime as dt +import os +import unittest +from typing import TYPE_CHECKING + +from returns.pipeline import is_successful + +from ...entities import NWPDimensionCoordinateMap +from .ecmwf_realtime import ECMWFRealTimeS3ModelRepository + +if TYPE_CHECKING: + import xarray as xr + + from nwp_consumer.internal import entities + + +class TestECMWFRealTimeS3ModelRepository(unittest.TestCase): + """Test the business methods of the ECMWFRealTimeS3ModelRepository class.""" + + @unittest.skipIf( + condition="CI" in os.environ, + reason="Skipping integration test that requires S3 access.", + ) # TODO: Move into integration tests, or remove + def test__download_and_convert(self) -> None: + """Test the _download_and_convert method.""" + + auth_result = ECMWFRealTimeS3ModelRepository.authenticate() + self.assertTrue(is_successful(auth_result), msg=f"Error: {auth_result.failure}") + c: ECMWFRealTimeS3ModelRepository = auth_result.unwrap() + + test_it: dt.datetime = dt.datetime(2024, 10, 25, 0, tzinfo=dt.UTC) + test_coordinates: entities.NWPDimensionCoordinateMap = dataclasses.replace( + c.model().expected_coordinates, + init_time=[test_it], + ) + + urls: list[str] = [ + f"s3://{f}" + for f in c._fs.ls(f"{c.bucket}/ecmwf") + if c._wanted_file( + filename=f.split("/")[-1], + it=test_it, + max_step=max(c.model().expected_coordinates.step), + ) + ] + + for url in urls: + with (self.subTest(url=url)): + result = c._download_and_convert(url) + + self.assertTrue(is_successful(result), msg=f"Error: {result}") + + da: xr.DataArray = result.unwrap()[0] + determine_region_result = NWPDimensionCoordinateMap.from_xarray(da).bind( + test_coordinates.determine_region, + ) + self.assertTrue( + is_successful(determine_region_result), + msg=f"Error: {determine_region_result}", + ) + + def test__wanted_file(self) -> None: + """Test the _wanted_file method.""" + + @dataclasses.dataclass + class TestCase: + name: str + filename: str + expected: bool + + test_it: dt.datetime = dt.datetime(2024, 10, 25, 0, tzinfo=dt.UTC) + + tests: list[TestCase] = [ + TestCase( + name="valid_filename", + filename=f"A2D{test_it:%m%d%H%M}102516001", + expected=True, + ), + TestCase( + name="invalid_init_time", + filename="A2D09250600102516002", + expected=False, + ), + TestCase( + name="invalid_prefix", + filename=f"GGC{test_it:%m%d%H%M}102516002", + expected=False, + ), + TestCase( + name="unexpected_extension", + filename="A2D10251200102516001.nc", + expected=False, + ), + TestCase( + name="step_too_large", + filename="A2D10251200102916001", + expected=False, + ), + 
] + + for t in tests: + with self.subTest(name=t.name): + result = ECMWFRealTimeS3ModelRepository._wanted_file( + filename=t.filename, + it=test_it, + max_step=max(ECMWFRealTimeS3ModelRepository.model().expected_coordinates.step)) + self.assertEqual(result, t.expected) diff --git a/src/nwp_consumer/internal/repositories/model_repositories/test_mo_datahub.py b/src/nwp_consumer/internal/repositories/model_repositories/test_mo_datahub.py new file mode 100644 index 00000000..c3d992fb --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/test_mo_datahub.py @@ -0,0 +1,32 @@ +import datetime as dt +import os +import unittest + +from returns.pipeline import is_successful + +from .mo_datahub import MetOfficeDatahubModelRepository + + +class TestMetOfficeDatahubModelRepository(unittest.TestCase): + """Test the business methods of the MetOfficeDatahubModelRepository class.""" + + @unittest.skipIf( + condition="CI" in os.environ, + reason="Skipping integration test that requires MetOffice DataHub access.", + ) + def test__download(self) -> None: + """Test the _download method.""" + + auth_result = MetOfficeDatahubModelRepository.authenticate() + self.assertTrue(is_successful(auth_result), msg=f"Error: {auth_result}") + c = auth_result.unwrap() + + test_it = c.repository().determine_latest_it_from(dt.datetime.now(tz=dt.UTC)) + + dl_result = c._download( + f"{c.request_url}/agl_u-component-of-wind-surface-adjusted_10.0_{test_it:%Y%m%d%H}_1/data", + ) + + self.assertTrue(is_successful(dl_result), msg=f"Error: {dl_result}") + + diff --git a/src/nwp_consumer/internal/repositories/model_repositories/test_noaa_s3.py b/src/nwp_consumer/internal/repositories/model_repositories/test_noaa_s3.py new file mode 100644 index 00000000..2e75452e --- /dev/null +++ b/src/nwp_consumer/internal/repositories/model_repositories/test_noaa_s3.py @@ -0,0 +1,111 @@ +import dataclasses +import datetime as dt +import os +import unittest +from typing import TYPE_CHECKING + +import s3fs +from returns.pipeline import is_successful + +from ...entities import NWPDimensionCoordinateMap +from .noaa_s3 import NOAAS3ModelRepository + +if TYPE_CHECKING: + import xarray as xr + + from nwp_consumer.internal import entities + + +class TestECMWFRealTimeS3ModelRepository(unittest.TestCase): + """Test the business methods of the ECMWFRealTimeS3ModelRepository class.""" + + @unittest.skipIf( + condition="CI" in os.environ, + reason="Skipping integration test that requires S3 access.", + ) # TODO: Move into integration tests, or remove + def test__download_and_convert(self) -> None: + """Test the _download_and_convert method.""" + + c: NOAAS3ModelRepository = NOAAS3ModelRepository.authenticate().unwrap() + + test_it: dt.datetime = dt.datetime(2024, 10, 24, 12, tzinfo=dt.UTC) + test_coordinates: entities.NWPDimensionCoordinateMap = dataclasses.replace( + c.model().expected_coordinates, + init_time=[test_it], + ) + + fs = s3fs.S3FileSystem(anon=True) + bucket_path: str = f"noaa-gfs-bdp-pds/gfs.{test_it:%Y%m%d}/{test_it:%H}/atmos" + urls: list[str] = [ + f"s3://{f}" + for f in fs.ls(bucket_path) + if c._wanted_file( + filename=f.split("/")[-1], + it=test_it, + max_step=max(c.model().expected_coordinates.step), + ) + ] + + for url in urls: + with self.subTest(url=url): + result = c._download_and_convert(url) + + self.assertTrue(is_successful(result), msg=f"Error: {result}") + + da: xr.DataArray = result.unwrap()[0] + determine_region_result = NWPDimensionCoordinateMap.from_xarray(da).bind( + test_coordinates.determine_region, + 
) + self.assertTrue( + is_successful(determine_region_result), + msg=f"Error: {determine_region_result}", + ) + + def test__wanted_file(self) -> None: + """Test the _wanted_file method.""" + + @dataclasses.dataclass + class TestCase: + name: str + filename: str + expected: bool + + test_it: dt.datetime = dt.datetime(2024, 10, 25, 0, tzinfo=dt.UTC) + + tests: list[TestCase] = [ + TestCase( + name="valid_filename", + filename=f"gfs.t{test_it:%H}z.pgrb2.1p00.f000", + expected=True, + ), + TestCase( + name="invalid_init_time", + filename="gfs.t02z.pgrb2.1p00.f000", + expected=False, + ), + TestCase( + name="invalid_prefix", + filename=f"gfs.t{test_it:%H}z.pgrb2.0p20.f006", + expected=False, + ), + TestCase( + name="unexpected_extension", + filename=f"gfs.t{test_it:%H}z.pgrb2.1p00.f030.nc", + expected=False, + ), + TestCase( + name="step_too_large", + filename=f"gfs.t{test_it:%H}z.pgrb2.1p00.f049", + expected=False, + ), + ] + + for t in tests: + with self.subTest(name=t.name): + result = NOAAS3ModelRepository._wanted_file( + filename=t.filename, + it=test_it, + max_step=max(NOAAS3ModelRepository.model().expected_coordinates.step), + ) + self.assertEqual(result, t.expected) + diff --git a/src/nwp_consumer/internal/repositories/notification_repositories/__init__.py b/src/nwp_consumer/internal/repositories/notification_repositories/__init__.py new file mode 100644 index 00000000..f420e549 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/notification_repositories/__init__.py @@ -0,0 +1,7 @@ +from .stdout import StdoutNotificationRepository +from .dagster import DagsterPipesNotificationRepository + +__all__ = [ + "StdoutNotificationRepository", + "DagsterPipesNotificationRepository", +] diff --git a/src/nwp_consumer/internal/repositories/notification_repositories/dagster.py b/src/nwp_consumer/internal/repositories/notification_repositories/dagster.py new file mode 100644 index 00000000..0998f1f8 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/notification_repositories/dagster.py @@ -0,0 +1,45 @@ +"""Dagster pipes notification repository implementation. + +`Dagster Pipes `_ +enables integration with Dagster for reporting asset materialization +and logging. This module enables dagster instances running this code to recieve +notifications. 
+ +See Also: + - https://docs.dagster.io/concepts/dagster-pipes/subprocess/create-subprocess-asset +""" + +import logging +from typing import override + +from dagster_pipes import PipesContext, open_dagster_pipes +from returns.result import ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class DagsterPipesNotificationRepository(ports.NotificationRepository): + """Dagster pipes notification repository.""" + + @override + def notify( + self, + message: entities.StoreCreatedNotification | entities.StoreAppendedNotification, + ) -> ResultE[str]: + with open_dagster_pipes(): + context = PipesContext.get() + context.report_asset_materialization( + metadata={ + "filename": {"raw_value": message.filename, "type": "text"}, + "size_mb": {"raw_value": message.size_mb, "type": "float"}, + "memory_mb": {"raw_value": message.performance.memory_mb, "type": "float"}, + "duration_minutes": { + "raw_value": int(message.performance.duration_seconds / 60), + "type": "int", + }, + }, + ) + return Success("Notification sent to dagster successfully.") + diff --git a/src/nwp_consumer/internal/repositories/notification_repositories/stdout.py b/src/nwp_consumer/internal/repositories/notification_repositories/stdout.py new file mode 100644 index 00000000..4437ba00 --- /dev/null +++ b/src/nwp_consumer/internal/repositories/notification_repositories/stdout.py @@ -0,0 +1,23 @@ +"""Stdout notification repository implementation.""" + +import logging +from typing import override + +from returns.result import ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class StdoutNotificationRepository(ports.NotificationRepository): + """Stdout notification repository.""" + + @override + def notify( + self, + message: entities.StoreCreatedNotification | entities.StoreAppendedNotification, + ) -> ResultE[str]: + log.info(f"{message}") + return Success("Notification sent to stdout successfully.") + diff --git a/src/nwp_consumer/internal/service/__init__.py b/src/nwp_consumer/internal/service/__init__.py deleted file mode 100644 index 9cbeddb8..00000000 --- a/src/nwp_consumer/internal/service/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__all__ = ["NWPConsumerService"] - -from .consumer import NWPConsumerService diff --git a/src/nwp_consumer/internal/service/consumer.py b/src/nwp_consumer/internal/service/consumer.py deleted file mode 100644 index 50b67037..00000000 --- a/src/nwp_consumer/internal/service/consumer.py +++ /dev/null @@ -1,452 +0,0 @@ -"""The service class for the NWP Consumer.""" - -import datetime as dt -import pathlib -import shutil -from collections.abc import Callable -from typing import TYPE_CHECKING - -import dask.bag -import pandas as pd -import psutil -import structlog -import xarray as xr -import zarr -from ocf_blosc2 import Blosc2 - -if TYPE_CHECKING: - import numpy as np - - -from nwp_consumer import internal - -log = structlog.getLogger() - - -class NWPConsumerService: - """The service class for the NWP Consumer. 
- - Each method on the class is a business use case for the consumer - """ - - # Dependency-injected attributes - fetcher: internal.FetcherInterface - storer: internal.StorageInterface - rawstorer: internal.StorageInterface - # Configuration options - rawdir: pathlib.Path - zarrdir: pathlib.Path - rename_vars: bool - variable_dim: bool - - - def __init__( - self, - *, - fetcher: internal.FetcherInterface, - storer: internal.StorageInterface, - rawdir: str, - zarrdir: str, - rawstorer: internal.StorageInterface | None = None, - rename_vars: bool = True, - variable_dim: bool = True, - ) -> None: - """Create a consumer service with the given dependencies. - - Args: - fetcher: The fetcher to use for downloading data - storer: The storer to use for saving data - rawdir: The directory to store raw data - zarrdir: The directory to store zarr data - rawstorer: The storer to use for saving raw data. Defaults to the storer. - """ - self.fetcher = fetcher - self.storer = storer - self.rawstorer = rawstorer if rawstorer is not None else storer - self.rawdir = pathlib.Path(rawdir) - self.zarrdir = pathlib.Path(zarrdir) - self.rename_vars = rename_vars - self.variable_dim = variable_dim - - def DownloadRawDataset(self, *, start: dt.datetime, end: dt.datetime) -> list[pathlib.Path]: - """Download and convert raw data for a given time range. - - Args: - start: The start of the time range - end: The end of the time range - Returns: - A list of the paths to the downloaded files - """ - return self._performFuncForMultipleInitTimes( - func=self._downloadSingleInitTime, - start=start, - end=end, - ) - - def ConvertRawDatasetToZarr( - self, *, start: dt.datetime, end: dt.datetime, - ) -> list[pathlib.Path]: - """Convert raw data for a given time range. - - Args: - start: The start of the time range - end: The end of the time range - Returns: - A list of the paths to the converted files - """ - return self._performFuncForMultipleInitTimes( - func=self._convertSingleInitTime, - start=start, - end=end, - ) - - def CreateLatestZarr(self) -> list[pathlib.Path]: - """Create a Zarr file for the latest init time.""" - # Get the latest init time - allInitTimes: list[dt.datetime] = self.rawstorer.listInitTimes(prefix=self.rawdir) - if not allInitTimes: - log.info(event="no init times found", within=self.rawdir) - return [] - latestInitTime = allInitTimes[-1] - - # Load the latest init time as a dataset - cachedPaths = self.rawstorer.copyITFolderToCache(it=latestInitTime, prefix=self.rawdir) - log.info( - event="creating latest zarr for initTime", - inittime=latestInitTime.strftime("%Y/%m/%d %H:%M"), - path=(self.zarrdir / "latest.zarr.zip").as_posix(), - ) - - # Create a pipeline to convert the raw files and merge them as a dataset - # * Then cache the dataset as a zarr file and store it in the store - bag: dask.bag.Bag = dask.bag.from_sequence(cachedPaths) - latestDataset = ( - bag.map(lambda tfp: self.fetcher.mapCachedRaw(p=tfp)) - .fold(lambda ds1, ds2: _mergeDatasets([ds1, ds2])) - .compute() - ) - if not _dataQualityFilter(ds=latestDataset): - return [] - if self.rename_vars: - for var in latestDataset.data_vars: - if var in self.fetcher.parameterConformMap(): - latestDataset = latestDataset.rename( - {var: self.fetcher.parameterConformMap()[var].value} - ) - if self.variable_dim: - latestDataset = ( - latestDataset.to_array(dim="variable", name=self.fetcher.datasetName()) - .to_dataset() - .transpose("variable", ...) 
- ) - datasets = dask.bag.from_sequence([latestDataset]) - # Save as zipped zarr - if self.storer.exists(dst=self.zarrdir / "latest.zarr.zip"): - self.storer.delete(p=self.zarrdir / "latest.zarr.zip") - storedFiles = ( - datasets.map(lambda ds: _cacheAsZipZarr(ds=ds)) - .map(lambda path: self.storer.store(src=path, dst=self.zarrdir / "latest.zarr.zip")) - .compute() - ) - - # Save as regular zarr - if self.storer.exists(dst=self.zarrdir / "latest.zarr"): - self.storer.delete(p=self.zarrdir / "latest.zarr") - storedFiles += ( - datasets.map(lambda ds: _cacheAsZarr(ds=ds)) - .map(lambda path: self.storer.store(src=path, dst=self.zarrdir / "latest.zarr")) - .compute() - ) - - # Delete the cached files - for f in cachedPaths: - f.unlink(missing_ok=True) - - return storedFiles - - def Check(self) -> int: - """Perform a healthcheck on the service.""" - unhealthy = False - - # Check eccodes is installed - try: - from cfgrib.messages import eccodes_version - - log.info(event="HEALTH: eccodes is installed", version=eccodes_version) - except Exception as e: - log.error(event="HEALTH: eccodes binary is not installed", error=str(e)) - unhealthy = True - - # Check the raw directory exists - if not self.storer.exists(dst=self.rawdir): - log.error(event="HEALTH: raw directory does not exist", path=self.rawdir.as_posix()) - unhealthy = True - else: - log.info(event="HEALTH: found raw directory", path=self.rawdir.as_posix()) - - # Check the zarr directory exists - if not self.storer.exists(dst=self.zarrdir): - log.error(event="HEALTH: zarr directory does not exist", path=self.zarrdir.as_posix()) - unhealthy = True - else: - log.info(event="HEALTH: found zarr directory", path=self.zarrdir.as_posix()) - - # Check that the cache directory is not approaching capacity - internal.CACHE_DIR.mkdir(parents=True, exist_ok=True) - cache_usage = shutil.disk_usage(internal.CACHE_DIR.as_posix()) - if cache_usage.free < 1e9: - log.error( - event="HEALTH: cache directory is full", - free=cache_usage.free, - total=cache_usage.total, - used=cache_usage.used, - ) - unhealthy = True - else: - log.info( - event="HEALTH: found cache directory", - free=cache_usage.free, - total=cache_usage.total, - used=cache_usage.used, - path=internal.CACHE_DIR.as_posix(), - ) - - # Check the ram usage - ram_usage = psutil.virtual_memory() - if ram_usage.percent > 95: - log.error( - event="HEALTH: ram usage is high", - available=ram_usage.available, - total=ram_usage.total, - used=ram_usage.used, - percent=ram_usage.percent, - ) - unhealthy = True - else: - log.info( - event="HEALTH: found ram usage", - free=ram_usage.free, - total=ram_usage.total, - used=ram_usage.used, - percent=ram_usage.percent, - ) - - # Check the CPU usage - cpu_usage = psutil.cpu_percent() - if cpu_usage > 95: - log.error(event="HEALTH: cpu usage is high", percent=cpu_usage) - unhealthy = True - else: - log.info(event="HEALTH: found cpu usage", percent=cpu_usage) - - if unhealthy: - return 1 - - return 0 - - def _downloadSingleInitTime(self, it: dt.datetime) -> list[pathlib.Path]: - """Download and convert raw data for a given init time. 
- - Args: - it: The init time to download - Returns: - A list of the paths to the downloaded files - """ - # Check the init time is valid for the fetcher - if it.hour not in self.fetcher.getInitHours(): - log.error( - event="init time not valid for chosen source", - inittime=it.strftime("%Y-%m-%d %H:%M"), - validHours=self.fetcher.getInitHours(), - ) - return [] - - # Get the list of files available from the source - allSourceFiles: list[internal.FileInfoModel] = self.fetcher.listRawFilesForInitTime( - it=it, - ) - # Cache any existing files from the raw storer - cachedFiles: list[pathlib.Path] = self.rawstorer.copyITFolderToCache( - prefix=self.rawdir, it=it - ) - - # Create a dask pipeline from the available files - rb = dask.bag.from_sequence(allSourceFiles) - # Download the files to the cache, filtering any already cached or failed downloads - rb = rb.map( - lambda fi: self.fetcher.downloadToCache(fi=fi) - if fi.filename() not in [cf.name for cf in cachedFiles] - else cachedFiles.pop(cachedFiles.index(internal.rawCachePath(it=it, filename=fi.filename()))) - ).filter( - lambda p: p != pathlib.Path() - ) - # Store the files using the raw storer - rb = rb.map( - lambda p: self.rawstorer.store( - src=p, - dst=self.rawdir / p.relative_to(internal.CACHE_DIR_RAW), - ) - ) - storedFiles: list[pathlib.Path] = rb.compute() - return storedFiles - - def _convertSingleInitTime(self, it: dt.datetime) -> list[pathlib.Path]: - """Convert raw data for a single init time to zarr. - - Args: - it: The init time to convert - Returns: - List of paths to converted files - """ - # Get the raw files for the init time - zbag = dask.bag.from_sequence(self.rawstorer.copyITFolderToCache(prefix=self.rawdir, it=it)) - # Load the raw files as xarray datasets - zbag = zbag.map(lambda p: self.fetcher.mapCachedRaw(p=p)) - # Merge the datasets into a single dataset for the init time - # * Bag.fold is a parallelized version of the reduce function, so - # * in this case, first the partitions are merged, followed by the results - zbag = zbag.fold(lambda a, b: _mergeDatasets([a, b])) - ds = zbag.compute() - - # Filter out datasets that are not of sufficient quality - if not _dataQualityFilter(ds=ds): - return [] - - if self.rename_vars: - for var in ds.data_vars: - if var in self.fetcher.parameterConformMap(): - ds = ds.rename({var: self.fetcher.parameterConformMap()[var].value}) - - if self.variable_dim: - ds = ( - ds.to_array(dim="variable", name=self.fetcher.datasetName()) - .to_dataset() - .transpose("variable", ...) 
- ) - # Cache the dataset as a zarr file - zpath = _cacheAsZipZarr(ds=ds) - # Store the zarr file using the storer - return [self.storer.store(src=zpath, dst=self.zarrdir / zpath.name)] - - - def _performFuncForMultipleInitTimes( - self, - *, - func=Callable[[dt.datetime], list[pathlib.Path]], - start: dt.datetime, - end: dt.datetime, - ): - """Perform a function for each init time in the fetcher's range.""" - allInitTimes: list[dt.datetime] = [ - pdt.to_pydatetime() - for pdt in pd.date_range( - start=start, - end=end, - inclusive="left", - freq="h", - tz=dt.UTC, - ).tolist() - if pdt.to_pydatetime().hour in self.fetcher.getInitHours() - ] - - log.info( - event="Carrying out function for multiple init times", - func=func.__name__, - start=start.strftime("%Y-%m-%d %H:%M"), - end=end.strftime("%Y-%m-%d %H:%M"), - num=len(allInitTimes), - ) - - paths: list[pathlib.Path] = [] - for it in allInitTimes: - paths.extend(func(it)) - - return paths - - -def _cacheAsZipZarr(ds: xr.Dataset) -> pathlib.Path: - """Save the dataset to the cache as a zipped zarr file.""" - # Get the name of the zarr file from the inittime and the zarr format string - dt64: np.datetime64 = ds.coords["init_time"].values[0] - initTime: dt.datetime = dt.datetime.fromtimestamp(dt64.astype(int) / 1e9, tz=dt.UTC) - cachePath: pathlib.Path = internal.zarrCachePath(it=initTime).with_suffix(".zarr.zip") - # Delete the cached zarr if it already exists - if cachePath.exists(): - cachePath.unlink() - cachePath.parent.mkdir(parents=True, exist_ok=True) - # Save the dataset to a zarr file - with zarr.ZipStore(path=cachePath.as_posix(), mode="w") as store: - ds.to_zarr( - store=store, - encoding=_generate_encoding(ds=ds), - ) - - log.debug("Saved as zipped zarr", path=cachePath.as_posix()) - return cachePath - - -def _cacheAsZarr(ds: xr.Dataset) -> pathlib.Path: - """Save the dataset to the cache as a zarr file.""" - # Get the name of the zarr file from the inittime and the zarr format string - dt64: np.datetime64 = ds.coords["init_time"].values[0] - initTime: dt.datetime = dt.datetime.fromtimestamp(dt64.astype(int) / 1e9, tz=dt.UTC) - cachePath: pathlib.Path = internal.zarrCachePath(it=initTime) - if cachePath.exists() and cachePath.is_dir(): - shutil.rmtree(cachePath.as_posix()) - ds.to_zarr( - store=cachePath.as_posix(), - encoding=_generate_encoding(ds=ds), - ) - return cachePath - - -def _generate_encoding(ds: xr.Dataset) -> dict[str, dict[str, str] | dict[str, Blosc2]]: - encoding = {"init_time": {"units": "nanoseconds since 1970-01-01"}} - for var in ds.data_vars: - encoding[var] = {"compressor": Blosc2(cname="zstd", clevel=5)} - return encoding - - -def _dataQualityFilter(ds: xr.Dataset) -> bool: - """Filter out data that is not of sufficient quality.""" - if ds == xr.Dataset(): - return False - - # Carry out a basic data quality check - for data_var in ds.data_vars: - if ds[f"{data_var}"].isnull().any(): - log.warn( - event=f"Dataset has NaNs in variable {data_var}", - initTime=str(ds.coords["init_time"].values[0])[:16], - variable=data_var, - ) - - return True - - -def _mergeDatasets(datasets: list[xr.Dataset]) -> xr.Dataset: - """Merge a list of datasets into a single dataset.""" - try: - ds: xr.Dataset = xr.merge(objects=datasets, combine_attrs="drop_conflicts") - except (xr.MergeError, ValueError, Exception) as e: - log.warn( - event="Merging datasets failed, trying to insert zeros for missing variables", - exception=str(e), - numdatasets=len(datasets), - datasets={ - i: { - "data_vars": list(datasets[i].data_vars.keys()), 
- "dimensions": datasets[i].sizes, - "indexes": list(datasets[i].indexes.keys()), - } - for i, ds in enumerate(datasets) - }, - ) - ds = xr.merge( - objects=datasets, - combine_attrs="drop_conflicts", - fill_value=0, - compat="override", - ) - del datasets - return ds diff --git a/src/nwp_consumer/internal/service/test_consumer.py b/src/nwp_consumer/internal/service/test_consumer.py deleted file mode 100644 index 43569e4b..00000000 --- a/src/nwp_consumer/internal/service/test_consumer.py +++ /dev/null @@ -1,191 +0,0 @@ -import datetime as dt -import pathlib -import unittest - -import numpy as np -import structlog -import xarray as xr - -from nwp_consumer import internal -from .consumer import NWPConsumerService, _cacheAsZipZarr, _mergeDatasets - -log = structlog.getLogger() - -IT = dt.datetime(2021, 1, 1, tzinfo=dt.UTC) -FILES = ["dswrf.grib", "prate.grib", "t2m.grib"] - - -class DummyStorer(internal.StorageInterface): - def name(self) -> str: - return "dummy" - - def exists(self, *, dst: pathlib.Path) -> bool: - return True - - def store(self, *, src: pathlib.Path, dst: pathlib.Path) -> pathlib.Path: - return dst - - def listInitTimes(self, prefix: pathlib.Path) -> list[dt.datetime]: - return [IT] - - def copyITFolderToCache(self, *, prefix: pathlib.Path, it: dt.datetime) -> list[pathlib.Path]: - return [ - pathlib.Path(internal.rawCachePath(it=it, filename=f)) - for f in FILES - ] - - def delete(self, *, p: pathlib.Path) -> None: - pass - - -class DummyFileInfo(internal.FileInfoModel): - def __init__(self, fileName: str, initTime: dt.datetime): - self.f = fileName - self.t = initTime - - def filename(self) -> str: - return self.f - - def it(self) -> dt.datetime: - return self.t - - def filepath(self) -> str: - return self.f - - def variables(self) -> list[str]: - raise NotImplementedError() - - def steps(self) -> list[int]: - return list(range(100)) - - -class DummyFetcher(internal.FetcherInterface): - def getInitHours(self) -> list[int]: - return [0, 6, 12, 18] - - def datasetName(self) -> str: - return "dummy" - - def listRawFilesForInitTime(self, *, it: dt.datetime) -> list[internal.FileInfoModel]: - return [DummyFileInfo(file, it) for file in FILES] - - def downloadToCache(self, *, fi: internal.FileInfoModel) -> pathlib.Path: - return internal.rawCachePath(it=fi.it(), filename=fi.filename()) - - def mapCachedRaw(self, *, p: pathlib.Path) -> xr.Dataset: - initTime = dt.datetime.strptime( - p.parent.relative_to(internal.CACHE_DIR_RAW).as_posix(), - "%Y/%m/%d/%H%M", - ).replace(tzinfo=dt.UTC) - return xr.Dataset( - data_vars={ - f"{p.stem}": ( - ("init_time", "step", "x", "y"), - np.random.rand(1, 12, 100, 100), - ) - }, - coords={ - "init_time": [np.datetime64(initTime)], - "step": range(12), - "x": range(100), - "y": range(100), - }, - ) - - def parameterConformMap(self) -> dict[str, internal.OCFParameter]: - return { - "t2m": internal.OCFParameter.TemperatureAGL, - } - - -# ------------- Client Methods -------------- # - - -class TestNWPConsumerService(unittest.TestCase): - service: NWPConsumerService - - @classmethod - def setUpClass(cls) -> None: - testStorer = DummyStorer() - testFetcher = DummyFetcher() - - cls.service = NWPConsumerService( - fetcher=testFetcher, - storer=testStorer, - rawdir="raw", - zarrdir="zarr", - ) - - def test_downloadSingleInitTime(self) -> None: - files = self.service._downloadSingleInitTime(it=IT) - self.assertEqual(3, len(files)) - - def test_convertSingleInitTime(self) -> None: - files = self.service._convertSingleInitTime(it=IT) - self.assertEqual(1, 
len(files)) - - def test_createLatestZarr(self) -> None: - files = self.service.CreateLatestZarr() - # 1 zarr, 1 zipped zarr - self.assertEqual(2, len(files)) - - -# ------------ Static Methods ----------- # - - -class TestCacheAsZipZarr(unittest.TestCase): - def test_createsValidZipZarr(self) -> None: - ds = DummyFetcher().mapCachedRaw( - p=pathlib.Path(f"{internal.CACHE_DIR_RAW}/2021/01/01/0000/dswrf.grib"), - ) - file = _cacheAsZipZarr(ds=ds) - outds = xr.open_zarr(f"zip::{file.as_posix()}") - self.assertEqual(ds.dims, outds.dims) - - -class TestMergeDatasets(unittest.TestCase): - def test_mergeDifferentDataVars(self) -> None: - """Test merging datasets with different data variables. - - This targets a bug seen in merging large ICON datasets, whereby - two datasets with different variables and number of steps would - not merge correctly. - - """ - datasets = [ - xr.Dataset( - data_vars={ - "msnswrf": ( - ("init_time", "step", "latitude", "longitude"), - np.random.rand(1, 2, 657, 1377), - ), - "t2m": ( - ("init_time", "step", "latitude", "longitude"), - np.random.rand(1, 2, 657, 1377), - ) - }, - coords={ - "init_time": [np.datetime64("2021-01-01T00:00:00")], - "step": [np.timedelta64(i, 's') for i in [7200, 10800]], - "latitude": range(657), - "longitude": range(1377), - }, - ), - xr.Dataset( - data_vars={ - "t2m": ( - ("init_time", "latitude", "longitude", "step"), - np.random.rand(1, 657, 1377, 1), - ), - }, - coords={ - "init_time": [np.datetime64("2021-01-01T00:00:00")], - "step": [0], - "latitude": range(657), - "longitude": range(1377), - }, - ), - ] - # Merge the datasets - merged = _mergeDatasets(datasets) - diff --git a/src/nwp_consumer/internal/services/__init__.py b/src/nwp_consumer/internal/services/__init__.py new file mode 100644 index 00000000..90d20dce --- /dev/null +++ b/src/nwp_consumer/internal/services/__init__.py @@ -0,0 +1,13 @@ +"""Interfaces for core services implementations. + +The services module defines abstract interfaces that specify the signatures +any services implementations must obey in order to interact with the core. 
+""" + +from .consumer_service import ConsumerService +from .archiver_service import ArchiverService + +__all__ = [ + "ConsumerService", + "ArchiverService" +] diff --git a/src/nwp_consumer/internal/services/_dummy_adaptors.py b/src/nwp_consumer/internal/services/_dummy_adaptors.py new file mode 100644 index 00000000..a1a40d53 --- /dev/null +++ b/src/nwp_consumer/internal/services/_dummy_adaptors.py @@ -0,0 +1,88 @@ +import datetime as dt +from collections.abc import Callable, Iterator +from typing import override + +import numpy as np +import xarray as xr +from joblib import delayed +from returns.result import ResultE, Success + +from nwp_consumer.internal import entities, ports + + +class DummyModelRepository(ports.ModelRepository): + + @classmethod + @override + def authenticate(cls) -> ResultE["DummyModelRepository"]: + return Success(cls()) + + @staticmethod + @override + def repository() -> entities.ModelRepositoryMetadata: + return entities.ModelRepositoryMetadata( + name="ACME-Test-Models", + is_archive=False, + is_order_based=False, + running_hours=[0, 6, 12, 18], + delay_minutes=60, + max_connections=4, + required_env=[], + optional_env={}, + postprocess_options=entities.PostProcessOptions(), + ) + + @staticmethod + @override + def model() -> entities.ModelMetadata: + return entities.ModelMetadata( + name="simple-random", + resolution="17km", + expected_coordinates=entities.NWPDimensionCoordinateMap( + init_time=[dt.datetime(2021, 1, 1, 0, 0, tzinfo=dt.UTC)], + step=list(range(0, 48, 1)), + variable=[ + entities.Parameter.TEMPERATURE_SL, + entities.Parameter.DOWNWARD_SHORTWAVE_RADIATION_FLUX_GL, + entities.Parameter.CLOUD_COVER_HIGH, + ], + latitude=np.linspace(90, -90, 721).tolist(), + longitude=np.linspace(-180, 179.8, 1440).tolist(), + ), + ) + + + @override + def fetch_init_data(self, it: dt.datetime) \ + -> Iterator[Callable[..., ResultE[list[xr.DataArray]]]]: + + def gen_dataset(step: int, variable: str) -> ResultE[list[xr.DataArray]]: + """Define a generator that provides one variable at one step.""" + da = xr.DataArray( + name=self.model().name, + dims=["init_time", "step", "variable", "latitude", "longitude"], + data=np.random.rand(1, 1, 1, 721, 1440), + coords=self.model().expected_coordinates.to_pandas() | { + "init_time": [np.datetime64(it.replace(tzinfo=None), "ns")], + "step": [step], + "variable": [variable], + }, + ) + return Success([da]) + + + for s in self.model().expected_coordinates.step: + for v in self.model().expected_coordinates.variable: + yield delayed(gen_dataset)(s, v.value) + + +class DummyNotificationRepository(ports.NotificationRepository): + + @override + def notify( + self, + message: entities.StoreAppendedNotification | entities.StoreCreatedNotification, + ) -> ResultE[str]: + return Success(str(message)) + + diff --git a/src/nwp_consumer/internal/services/archiver_service.py b/src/nwp_consumer/internal/services/archiver_service.py new file mode 100644 index 00000000..94901f1b --- /dev/null +++ b/src/nwp_consumer/internal/services/archiver_service.py @@ -0,0 +1,135 @@ +"""Implementation of the NWP consumer services.""" + +import dataclasses +import logging +import os +import pathlib +from typing import TYPE_CHECKING, override + +from joblib import Parallel, cpu_count +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +if TYPE_CHECKING: + import datetime as dt + +log = logging.getLogger("nwp-consumer") + + +class ArchiverService(ports.ArchiveUseCase): + """Service implementation of the 
archive use case. + + This service contains the business logic required to enact + the archive use case. It is responsible for consuming NWP data + for a given month and writing it to a Zarr store. + """ + + mr: type[ports.ModelRepository] + nr: type[ports.NotificationRepository] + + def __init__( + self, + model_repository: type[ports.ModelRepository], + notification_repository: type[ports.NotificationRepository], + ) -> None: + """Create a new instance.""" + self.mr = model_repository + self.nr = notification_repository + + @override + def archive(self, year: int, month: int) -> ResultE[str]: + monitor = entities.PerformanceMonitor() + with monitor: + + init_times = self.mr.repository().month_its(year=year, month=month) + + # Create a store for the archive + init_store_result: ResultE[entities.TensorStore] = \ + entities.TensorStore.initialize_empty_store( + model=self.mr.model().name, + repository=self.mr.repository().name, + coords=dataclasses.replace( + self.mr.model().expected_coordinates, + init_time=init_times, + ), + ) + + if isinstance(init_store_result, Failure): + return Failure(OSError( + f"Failed to initialize store for {year}-{month}: {init_store_result!s}"), + ) + store = init_store_result.unwrap() + + missing_times_result = store.missing_times() + if isinstance(missing_times_result, Failure): + return Failure(missing_times_result.failure()) + log.info(f"{len(missing_times_result.unwrap())} missing init_times in store.") + + failed_times: list[dt.datetime] = [] + for n, it in enumerate(missing_times_result.unwrap()): + log.info( + f"Consuming data from {self.mr.repository().name} for {it:%Y-%m-%d %H:%M} " + f"(time {n + 1}/{len(missing_times_result.unwrap())})", + ) + + # Authenticate with the model repository + amr_result = self.mr.authenticate() + if isinstance(amr_result, Failure): + store.delete_store() + return Failure(OSError( + "Unable to authenticate with model repository " + f"'{self.mr.repository().name}': " + f"{amr_result.failure()}", + )) + amr = amr_result.unwrap() + + # Create a generator to fetch and process raw data + n_jobs: int = max(cpu_count() - 1, self.mr.repository().max_connections) + if os.getenv("CONCURRENCY", "True").capitalize() == "False": + n_jobs = 1 + log.debug(f"Downloading using {n_jobs} concurrent thread(s)") + da_result_generator = Parallel( + n_jobs=n_jobs, + prefer="threads", + return_as="generator_unordered", + )(amr.fetch_init_data(it=it)) + + # Regionally write the results of the generator as they are ready + for da_result in da_result_generator: + write_result = da_result.bind(store.write_to_region) + # Fail soft if a region fails to write + if isinstance(write_result, Failure): + log.error(f"Failed to write time {it:%Y-%m-%d %H:%M}: {write_result}") + failed_times.append(it) + + del da_result_generator + + # Add the failed times to the store's metadata + store.update_attrs({ + "failed_times": ", ".join([t.strftime("Day %d %H:%M") for t in failed_times]), + }) + + # Postprocess the dataset as required + # postprocess_result = store.postprocess(self._mr.metadata().postprocess_options) + # if isinstance(postprocess_result, Failure): + # return Failure(postprocess_result.failure()) + + notify_result = self.nr().notify( + message=entities.StoreCreatedNotification( + filename=pathlib.Path(store.path).name, + size_mb=store.size_kb // 1024, + performance=entities.PerformanceMetadata( + duration_seconds=monitor.get_runtime(), + memory_mb=monitor.max_memory_mb(), + ), + ), + ) + if isinstance(notify_result, Failure): + return Failure(OSError( + "Failed to
notify of store creation: " + f"{notify_result.failure()}", + )) + + return Success(store.path) + diff --git a/src/nwp_consumer/internal/services/consumer_service.py b/src/nwp_consumer/internal/services/consumer_service.py new file mode 100644 index 00000000..749c90b9 --- /dev/null +++ b/src/nwp_consumer/internal/services/consumer_service.py @@ -0,0 +1,148 @@ +"""Implementation of the NWP consumer services.""" + +import dataclasses +import datetime as dt +import logging +import os +import pathlib +from typing import override + +from joblib import Parallel, cpu_count +from returns.result import Failure, ResultE, Success + +from nwp_consumer.internal import entities, ports + +log = logging.getLogger("nwp-consumer") + + +class ConsumerService(ports.ConsumeUseCase): + """Service implementation of the consumer use case. + + This service contains the business logic required to enact + the consumer use case. It is responsible for consuming NWP data + and writing it to a Zarr store. + """ + + mr: type[ports.ModelRepository] + nr: type[ports.NotificationRepository] + + def __init__( + self, + model_repository: type[ports.ModelRepository], + notification_repository: type[ports.NotificationRepository], + ) -> None: + """Create a new instance.""" + self.mr = model_repository + self.nr = notification_repository + + @override + def consume(self, it: dt.datetime | None = None) -> ResultE[str]: + # Note that the usage of the returns here is not in the spirit of + # 'railway oriented programming', mostly due to the number of + # generators involved - it seemed clearer to be explicit. However, + # it would be much neater to refactor this to be more functional. + monitor = entities.PerformanceMonitor() + with monitor: + if it is None: + it = self.mr.repository().determine_latest_it_from(dt.datetime.now(tz=dt.UTC)) + log.info( + f"Consuming data from repository '{self.mr.repository().name}' " + f"for the '{self.mr.model().name}' model " + f"spanning init time '{it:%Y-%m-%d %H:%M}'", + ) + + # Create a store for the init time + init_store_result: ResultE[entities.TensorStore] = \ + entities.TensorStore.initialize_empty_store( + model=self.mr.model().name, + repository=self.mr.repository().name, + coords=dataclasses.replace( + self.mr.model().expected_coordinates, + init_time=[it], + ), + ) + + if isinstance(init_store_result, Failure): + return Failure(OSError( + f"Failed to initialize store for init time: {init_store_result!s}", + )) + store = init_store_result.unwrap() + + amr_result = self.mr.authenticate() + if isinstance(amr_result, Failure): + store.delete_store() + return Failure(OSError( + "Unable to authenticate with model repository " + f"'{self.mr.repository().name}': " + f"{amr_result.failure()}", + )) + amr = amr_result.unwrap() + + # Create a generator to fetch and process raw data + n_jobs: int = max(cpu_count() - 1, self.mr.repository().max_connections) + if os.getenv("CONCURRENCY", "True").capitalize() == "False": + n_jobs = 1 + log.debug(f"Downloading using {n_jobs} concurrent thread(s)") + fetch_result_generator = Parallel( + n_jobs=n_jobs, + prefer="threads", + return_as="generator_unordered", + )(amr.fetch_init_data(it=it)) + + # Regionally write the results of the generator as they are ready + failed_etls: int = 0 + for fetch_result in fetch_result_generator: + if isinstance(fetch_result, Failure): + log.error( + f"Error fetching data for init time '{it:%Y-%m-%d %H:%M}' " + f"and model {self.mr.repository().name}: {fetch_result.failure()!s}", + ) + failed_etls += 1 + continue + for
da in fetch_result.unwrap(): + write_result = store.write_to_region(da) + if isinstance(write_result, Failure): + log.error( + f"Error writing data for init time '{it:%Y-%m-%d %H:%M}' " + f"and model {self.mr.repository().name}: " + f"{write_result.failure()!s}", + ) + failed_etls += 1 + + del fetch_result_generator + # Fail hard if any of the writes failed + # * TODO: Consider just how hard we want to fail in this instance + if failed_etls > 0: + store.delete_store() + return Failure(OSError( + f"Failed to write {failed_etls} regions " + f"for init time '{it:%Y-%m-%d %H:%M}'. " + "See error logs for details.", + )) + + # Postprocess the dataset as required + # postprocess_result = store.postprocess(self.mr.repository().postprocess_options) + # if isinstance(postprocess_result, Failure): + # return Failure(postprocess_result.failure()) + + notify_result = self.nr().notify( + message=entities.StoreCreatedNotification( + filename=pathlib.Path(store.path).name, + size_mb=store.size_kb // 1024, # TODO: 2024-11-19 check this is right + performance=entities.PerformanceMetadata( + duration_seconds=monitor.get_runtime(), + memory_mb=monitor.max_memory_mb(), + ), + ), + ) + if isinstance(notify_result, Failure): + return Failure(OSError( + "Failed to notify of store creation: " + f"{notify_result.failure()}", + )) + + return Success(store.path) + + @override + def postprocess(self, options: entities.PostProcessOptions) -> ResultE[str]: + return Failure(NotImplementedError("Postprocessing not yet implemented")) diff --git a/src/nwp_consumer/internal/services/test_archiver.py b/src/nwp_consumer/internal/services/test_archiver.py new file mode 100644 index 00000000..3dd6cd86 --- /dev/null +++ b/src/nwp_consumer/internal/services/test_archiver.py @@ -0,0 +1,40 @@ +import shutil +import unittest + +import xarray as xr +from returns.pipeline import is_successful + +from nwp_consumer.internal.services.archiver_service import ArchiverService + +from ._dummy_adaptors import DummyModelRepository, DummyNotificationRepository + + +class TestParallelConsumer(unittest.TestCase): + + @unittest.skip("Takes an age to run, need to figure out a better way.") + def test_archive(self) -> None: + """Test the archive method of the ArchiverService class.""" + + test_consumer = ArchiverService( + model_repository=DummyModelRepository, + notification_repository=DummyNotificationRepository, + ) + + result = test_consumer.archive(year=2021, month=1) + + self.assertTrue(is_successful(result), msg=result) + + da: xr.DataArray = xr.open_dataarray(result.unwrap(), engine="zarr") + + self.assertEqual( + list(da.sizes.keys()), + ["init_time", "step", "variable", "latitude", "longitude"], + ) + + path = result.unwrap() + shutil.rmtree(path) + + +if __name__ == "__main__": + unittest.main() + diff --git a/src/nwp_consumer/internal/services/test_consumer.py b/src/nwp_consumer/internal/services/test_consumer.py new file mode 100644 index 00000000..85baea11 --- /dev/null +++ b/src/nwp_consumer/internal/services/test_consumer.py @@ -0,0 +1,40 @@ +import datetime as dt +import shutil +import unittest + +import xarray as xr +from returns.pipeline import is_successful + +from nwp_consumer.internal.services.consumer_service import ConsumerService + +from ._dummy_adaptors import DummyModelRepository, DummyNotificationRepository + + +class TestParallelConsumer(unittest.TestCase): + + def test_consume(self) -> None: + """Test the consume method of the ConsumerService class.""" + + test_consumer = ConsumerService( +
model_repository=DummyModelRepository, + notification_repository=DummyNotificationRepository, + ) + + result = test_consumer.consume(it=dt.datetime(2021, 1, 1, tzinfo=dt.UTC)) + + self.assertTrue(is_successful(result), msg=result) + + da: xr.DataArray = xr.open_dataarray(result.unwrap(), engine="zarr") + + self.assertEqual( + list(da.sizes.keys()), + ["init_time", "step", "variable", "latitude", "longitude"], + ) + + path = result.unwrap() + shutil.rmtree(path) + + +if __name__ == "__main__": + unittest.main() + diff --git a/src/test_integration/test_inputs_integration.py b/src/test_integration/test_inputs_integration.py deleted file mode 100644 index f7c2bc7e..00000000 --- a/src/test_integration/test_inputs_integration.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Integration tests for the `inputs` module. - -WARNING: Requires environment variables to be set for the MetOffice and CEDA APIs. -Just tests connections to the APIs. Tests assume that attempts to download the -source files would raise an exception in the first TIMEOUT seconds of running, -and will be considered passed if no exception is raised within that time. -""" - -import datetime as dt -import unittest - -from nwp_consumer.internal import config, inputs, outputs - -storageClient = outputs.localfs.Client() - - -TIMEOUT = 10 - - -class TestListRawFilesForInitTime(unittest.TestCase): - def test_getsFileInfosFromCEDA(self) -> None: - cedaInitTime: dt.datetime = dt.datetime( - year=2022, - month=1, - day=1, - hour=0, - minute=0, - tzinfo=dt.UTC, - ) - c = config.CEDAEnv() - cedaClient = inputs.ceda.Client( - ftpUsername=c.CEDA_FTP_USER, - ftpPassword=c.CEDA_FTP_PASS, - ) - fileInfos = cedaClient.listRawFilesForInitTime(it=cedaInitTime) - self.assertTrue(len(fileInfos) > 0) - - def test_getsFileInfosFromMetOffice(self) -> None: - metOfficeInitTime: dt.datetime = dt.datetime.now(tz=dt.UTC).replace( - hour=0, - minute=0, - second=0, - microsecond=0, - ) - c = config.MetOfficeEnv() - metOfficeClient = inputs.metoffice.Client( - orderID=c.METOFFICE_ORDER_ID, - apiKey=c.METOFFICE_API_KEY, - ) - fileInfos = metOfficeClient.listRawFilesForInitTime(it=metOfficeInitTime) - self.assertTrue(len(fileInfos) > 0) - - def test_getsFileInfosFromECMWFMARS(self) -> None: - ecmwfMarsInitTime: dt.datetime = dt.datetime( - year=2022, - month=1, - day=1, - hour=0, - minute=0, - tzinfo=dt.UTC, - ) - c = config.ECMWFMARSEnv() - ecmwfMarsClient = inputs.ecmwf.MARSClient( - area=c.ECMWF_AREA, - hours=4, - ) - fileInfos = ecmwfMarsClient.listRawFilesForInitTime(it=ecmwfMarsInitTime) - self.assertTrue(len(fileInfos) > 0) - - def test_getsFileInfosFromICON(self) -> None: - iconInitTime: dt.datetime = dt.datetime.now(tz=dt.UTC).replace( - hour=0, - minute=0, - second=0, - microsecond=0, - ) - iconClient = inputs.icon.Client( - model="global", - hours=4, - param_group="basic", - ) - fileInfos = iconClient.listRawFilesForInitTime(it=iconInitTime) - self.assertTrue(len(fileInfos) > 0) - - iconClient = inputs.icon.Client( - model="europe", - hours=4, - param_group="basic", - ) - euFileInfos = iconClient.listRawFilesForInitTime(it=iconInitTime) - self.assertTrue(len(euFileInfos) > 0) - self.assertNotEqual(fileInfos, euFileInfos) - - def test_getsFileInfosFromCMC(self) -> None: - cmcInitTime: dt.datetime = dt.datetime.now(tz=dt.UTC).replace( - hour=0, - minute=0, - second=0, - microsecond=0, - ) - cmcClient = inputs.cmc.Client( - model="gdps", - hours=4, - param_group="basic", - ) - fileInfos = cmcClient.listRawFilesForInitTime(it=cmcInitTime) - 
self.assertGreater(len(fileInfos), 0) - - cmcClient = inputs.cmc.Client( - model="geps", - hours=4, - param_group="basic", - ) - gepsFileInfos = cmcClient.listRawFilesForInitTime(it=cmcInitTime) - self.assertGreater(len(gepsFileInfos), 0) - self.assertNotEqual(fileInfos, gepsFileInfos) - - def test_getsFileInfosFromMeteoFrance(self) -> None: - arpegeInitTime: dt.datetime = dt.datetime.now(tz=dt.UTC).replace( - hour=0, - minute=0, - second=0, - microsecond=0, - ) - arpegeClient = inputs.meteofrance.Client( - model="global", - hours=4, - param_group="basic", - ) - fileInfos = arpegeClient.listRawFilesForInitTime(it=arpegeInitTime) - self.assertTrue(len(fileInfos) > 0) - - arpegeClient = inputs.meteofrance.Client( - model="europe", - hours=4, - param_group="basic", - ) - europeFileInfos = arpegeClient.listRawFilesForInitTime(it=arpegeInitTime) - self.assertTrue(len(europeFileInfos) > 0) - self.assertNotEqual(fileInfos, europeFileInfos) - - def test_getsFilesFromNOAANCAR(self) -> None: - ncarInitTime: dt.datetime = dt.datetime( - year=2023, - month=12, - day=19, - tzinfo=dt.UTC, - ) - ncarClient = inputs.noaa.NCARClient( - model="global", - param_group="full", - hours=4, - ) - fileInfos = ncarClient.listRawFilesForInitTime(it=ncarInitTime) - self.assertTrue(len(fileInfos) > 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/src/test_integration/test_integration.py b/src/test_integration/test_integration.py new file mode 100644 index 00000000..a60e2ec5 --- /dev/null +++ b/src/test_integration/test_integration.py @@ -0,0 +1,28 @@ +import datetime as dt +import unittest + +import xarray as xr +from returns.pipeline import is_successful + +from nwp_consumer.internal import handlers, repositories, services + + +class TestIntegration(unittest.TestCase): + def test_ceda_metoffice_global_model(self) -> None: + c = handlers.CLIHandler( + consumer_usecase=services.ConsumerService( + model_repository=repositories.model_repositories.CEDAFTPModelRepository, + notification_repository=repositories.notification_repositories.StdoutNotificationRepository, + ), + archiver_usecase=services.ArchiverService( + model_repository=repositories.model_repositories.CEDAFTPModelRepository, + notification_repository=repositories.notification_repositories.StdoutNotificationRepository, + ), + ) + result = c._consumer_usecase.consume(it=dt.datetime(2021, 1, 1, tzinfo=dt.UTC)) + + self.assertTrue(is_successful(result), msg=f"{result}") + + da = xr.open_dataarray(result.unwrap(), engine="zarr") + + self.assertTrue(da.sizes["init_time"] > 0) diff --git a/src/test_integration/test_service_integration.py b/src/test_integration/test_service_integration.py deleted file mode 100644 index 103fae6b..00000000 --- a/src/test_integration/test_service_integration.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Integration tests for the NWPConsumerService class. - -WARNING: Requires environment variables to be set for the MetOffice and CEDA APIs. -Will download up to a GB of data. Costs may apply for usage of the APIs. 
- -Runs the main function of the consumer as it would appear externally imported -""" - -import datetime as dt -import os -import shutil -import unittest -import unittest.mock - -import numpy as np -import ocf_blosc2 # noqa: F401 -import xarray as xr -from nwp_consumer.cmd.main import run - - -class TestNWPConsumerService_MetOffice(unittest.TestCase): - """Integration tests for the NWPConsumerService class.""" - - def setUp(self) -> None: - self.rawdir = "data/me_raw" - self.zarrdir = "data/me_zarr" - - def test_downloadAndConvertDataset(self) -> None: - initTime: dt.datetime = dt.datetime.now(tz=dt.UTC) - - raw_files, zarr_files = run( - [ - "consume", - "--source=metoffice", - "--rdir=" + self.rawdir, - "--zdir=" + self.zarrdir, - "--from=" + initTime.strftime("%Y-%m-%dT00:00"), - ], - ) - - self.assertGreater(len(raw_files), 0) - self.assertEqual(len(zarr_files), 1) - - for path in zarr_files: - ds = xr.open_zarr(store=f"zip::{path.as_posix()}") - - # The number of variables in the dataset depends on the order from MetOffice - numVars = len(ds.coords["variable"].values) - - # Ensure the dimensions have the right sizes - self.assertDictEqual( - {"variable": numVars, "init_time": 1, "step": 5, "y": 639, "x": 455}, - dict(ds.sizes.items()), - ) - # Ensure the dimensions of the variables are in the correct order - self.assertEqual(("variable", "init_time", "step", "y", "x"), ds["UKV"].dims) - # Ensure the init time is correct - self.assertEqual( - np.datetime64(initTime.strftime("%Y-%m-%dT00:00")), - ds.coords["init_time"].values[0], - ) - - shutil.rmtree(self.rawdir) - shutil.rmtree(self.zarrdir) - - -class TestNWPConsumerService_CEDA(unittest.TestCase): - """Integration tests for the NWPConsumerService class.""" - - def setUp(self) -> None: - self.rawdir = "data/cd_raw" - self.zarrdir = "data/cd_zarr" - - def test_downloadAndConvertDataset(self) -> None: - raw_files, zarr_files = run( - [ - "consume", - "--source=ceda", - "--rdir=" + self.rawdir, - "--zdir=" + self.zarrdir, - "--from=2022-01-01T12:00", - ], - ) - - self.assertGreater(len(raw_files), 0) - self.assertEqual(len(zarr_files), 1) - - for path in zarr_files: - ds = xr.open_zarr(store=f"zip::{path.as_posix()}").compute() - - # Enusre the data variables are correct - self.assertEqual(["UKV"], list(ds.data_vars)) - # Ensure the dimensions have the right sizes - self.assertEqual( - {"variable": 12, "init_time": 1, "step": 51, "y": 704, "x": 548}, - dict(ds.sizes.items()), - ) - # Ensure the init time is correct - self.assertEqual( - np.datetime64("2022-01-01T12:00"), - ds.coords["init_time"].values[0], - ) - - shutil.rmtree(self.rawdir) - shutil.rmtree(self.zarrdir) - - -class TestNWPConverterService_ECMWFMARS(unittest.TestCase): - def setUp(self) -> None: - self.rawdir = "data/ec_raw" - self.zarrdir = "data/ec_zarr" - - @unittest.mock.patch.dict(os.environ, {"ECMWF_PARAMETER_GROUP": "basic", "ECMWF_HOURS": "3"}) - def test_downloadAndConvertDataset(self) -> None: - initTime: dt.datetime = dt.datetime(year=2022, month=1, day=1, tzinfo=dt.UTC) - - raw_files, zarr_files = run( - [ - "consume", - "--source=ecmwf-mars", - "--rdir=" + self.rawdir, - "--zdir=" + self.zarrdir, - "--from=" + initTime.strftime("%Y-%m-%dT00:00"), - ], - ) - - self.assertGreater(len(raw_files), 0) - self.assertEqual(len(zarr_files), 1) - - for path in zarr_files: - ds = xr.open_zarr(store=f"zip::{path.as_posix()}").compute() - - # Ensure the data variables are correct - self.assertEqual(["ECMWF_UK"], list(ds.data_vars)) - # Ensure the dimensions have the right 
sizes. - # * Should be two variables due to the "basic" parameter group - # * Should be 4 steps due to the "3" hours - self.assertEqual( - { - "variable": 2, - "init_time": 1, - "step": 3, - "latitude": 141, - "longitude": 151, - }, - dict(ds.sizes.items()), - ) - # Ensure the init time is correct - self.assertEqual( - np.datetime64(initTime.strftime("%Y-%m-%dT00:00")), - ds.coords["init_time"].values[0], - ) - - shutil.rmtree(self.rawdir) - shutil.rmtree(self.zarrdir) - - -class TestNWPConsumerService_ICON(unittest.TestCase): - """Integration tests for the NWPConsumerService class.""" - - def setUp(self) -> None: - self.rawdir = "data/ic_raw" - self.zarrdir = "data/ic_zarr" - - @unittest.mock.patch.dict(os.environ, {"ICON_PARAMETER_GROUP": "basic", "ICON_HOURS": "3"}) - def test_downloadAndConvertDataset(self) -> None: - initTime: dt.datetime = dt.datetime.now(tz=dt.UTC) - - raw_files, zarr_files = run( - [ - "consume", - "--source=icon", - "--rdir=" + self.rawdir, - "--zdir=" + self.zarrdir, - "--from=" + initTime.strftime("%Y-%m-%dT00:00"), - ], - ) - - self.assertGreater(len(raw_files), 0) - self.assertEqual(len(zarr_files), 1) - - for path in zarr_files: - ds = xr.open_zarr(store=f"zip::{path.as_posix()}").compute() - - # Ensure the data variables are correct - self.assertEqual(["ICON_EUROPE"], list(ds.data_vars)) - # Ensure the dimensions have the right sizes - # * Should be two variables due to the "basic" parameter group - # * Should be 4 steps due to the "3" hours - self.assertEqual( - {"variable": 2, "init_time": 1, "step": 4, "latitude": 657, "longitude": 1377}, - ds.sizes, - ) - # Ensure the init time is correct - self.assertEqual( - np.datetime64(initTime.strftime("%Y-%m-%dT00:00")), - ds.coords["init_time"].values[0], - ) - - shutil.rmtree(self.rawdir) - shutil.rmtree(self.zarrdir) diff --git a/taskfile.yml b/taskfile.yml deleted file mode 100644 index 8ed78417..00000000 --- a/taskfile.yml +++ /dev/null @@ -1,47 +0,0 @@ -version: '3' - -# If you want to run with python from a specific environment, -# set the PYTHON_PREFIX environment variable to -# /path/to/python/dir/ - -tasks: - - install-dependencies: - aliases: ["install"] - desc: "Install application dependencies as defined in pyproject.toml" - cmds: - - ${PYTHON_PREFIX}python -m pip install -q -e . - - install-dev-dependencies: - aliases: ["install-dev"] - desc: "Installs development dependencies as defined in pyproject.toml" - cmds: - - ${PYTHON_PREFIX}python -m pip install --upgrade -q pip wheel setuptools - - ${PYTHON_PREFIX}python -m pip install -q -e .[dev] - - test-unit: - aliases: ["ut"] - deps: [install-dev-dependencies] - desc: "Run all application unittests" - cmds: - - ${PYTHON_PREFIX}python -m xmlrunner discover -s src/nwp_consumer -p "test_*.py" --output-file ut-report.xml - - test-integration: - aliases: ["it"] - deps: [install-dev-dependencies] - desc: "Run all application integration tests" - cmds: - - ${PYTHON_PREFIX}python -m xmlrunner discover -s src/test_integration -p "test_*.py" --output-file it-report.xml - - build-wheel: - aliases: ["wheel"] - desc: "Build python wheel" - cmds: - - ${PYTHON_PREFIX}python -m pip wheel . --no-deps --wheel-dir dist - - build-container: - aliases: ["cont"] - desc: "Build container" - cmds: - - docker build -f Containerfile . --tag nwp-consumer:local --progress=plain -