diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 8e2560d910d..74b54ad55d6 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -79,7 +79,8 @@ jobs: # # If dependencies emit warnings we can't do anything about, add ignores to # `xarray/tests/__init__.py`. - python -m pytest --doctest-modules xarray --ignore xarray/tests -Werror + # [MHS, 01/25/2024] Skip datatree_ documentation remove after #8572 + python -m pytest --doctest-modules xarray --ignore xarray/tests --ignore xarray/datatree_ -Werror mypy: name: Mypy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4af262a0a04..64d6bcaebf9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,7 @@ # https://pre-commit.com/ ci: autoupdate_schedule: monthly +exclude: 'xarray/datatree_.*' repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000000..032b620f433 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +prune xarray/datatree_* diff --git a/pyproject.toml b/pyproject.toml index 2a185933b47..81d41a38939 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,7 +85,10 @@ exclude_lines = ["pragma: no cover", "if TYPE_CHECKING"] [tool.mypy] enable_error_code = "redundant-self" -exclude = 'xarray/util/generate_.*\.py' +exclude = [ + 'xarray/util/generate_.*\.py', + 'xarray/datatree_/.*\.py', +] files = "xarray" show_error_codes = true show_error_context = true diff --git a/xarray/datatree_/.flake8 b/xarray/datatree_/.flake8 new file mode 100644 index 00000000000..f1e3f9271e1 --- /dev/null +++ b/xarray/datatree_/.flake8 @@ -0,0 +1,15 @@ +[flake8] +ignore = + # whitespace before ':' - doesn't work well with black + E203 + # module level import not at top of file + E402 + # line too long - let black worry about that + E501 + # do not assign a lambda expression, use a def + E731 + # line break before binary operator + W503 +exclude= + .eggs + doc diff --git a/xarray/datatree_/.git_archival.txt b/xarray/datatree_/.git_archival.txt new file mode 100644 index 00000000000..3994ec0a83e --- /dev/null +++ b/xarray/datatree_/.git_archival.txt @@ -0,0 +1,4 @@ +node: $Format:%H$ +node-date: $Format:%cI$ +describe-name: $Format:%(describe:tags=true)$ +ref-names: $Format:%D$ diff --git a/xarray/datatree_/.github/dependabot.yml b/xarray/datatree_/.github/dependabot.yml new file mode 100644 index 00000000000..d1d1190be70 --- /dev/null +++ b/xarray/datatree_/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: daily + - package-ecosystem: "github-actions" + directory: "/" + schedule: + # Check for updates to GitHub Actions every weekday + interval: "daily" diff --git a/xarray/datatree_/.github/pull_request_template.md b/xarray/datatree_/.github/pull_request_template.md new file mode 100644 index 00000000000..8270498108a --- /dev/null +++ b/xarray/datatree_/.github/pull_request_template.md @@ -0,0 +1,7 @@ + + +- [ ] Closes #xxxx +- [ ] Tests added +- [ ] Passes `pre-commit run --all-files` +- [ ] New functions/methods are listed in `api.rst` +- [ ] Changes are summarized in `docs/source/whats-new.rst` diff --git a/xarray/datatree_/.github/workflows/main.yaml b/xarray/datatree_/.github/workflows/main.yaml new file mode 100644 index 00000000000..37034fc5900 --- /dev/null +++ b/xarray/datatree_/.github/workflows/main.yaml @@ -0,0 +1,97 @@ +name: CI + +on: + push: + branches: + - main + pull_request: + branches: + - main + schedule: + - cron: "0 0 * * *" + +jobs: + + test: + name: ${{ matrix.python-version }}-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Create conda environment + uses: mamba-org/provision-with-micromamba@main + with: + cache-downloads: true + micromamba-version: 'latest' + environment-file: ci/environment.yml + extra-specs: | + python=${{ matrix.python-version }} + + - name: Conda info + run: conda info + + - name: Install datatree + run: | + python -m pip install -e . --no-deps --force-reinstall + + - name: Conda list + run: conda list + + - name: Running Tests + run: | + python -m pytest --cov=./ --cov-report=xml --verbose + + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false + + + test-upstream: + name: ${{ matrix.python-version }}-dev-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Create conda environment + uses: mamba-org/provision-with-micromamba@main + with: + cache-downloads: true + micromamba-version: 'latest' + environment-file: ci/environment.yml + extra-specs: | + python=${{ matrix.python-version }} + + - name: Conda info + run: conda info + + - name: Install dev reqs + run: | + python -m pip install --no-deps --upgrade \ + git+https://github.com/pydata/xarray \ + git+https://github.com/Unidata/netcdf4-python + + python -m pip install -e . --no-deps --force-reinstall + + - name: Conda list + run: conda list + + - name: Running Tests + run: | + python -m pytest --verbose diff --git a/xarray/datatree_/.github/workflows/pypipublish.yaml b/xarray/datatree_/.github/workflows/pypipublish.yaml new file mode 100644 index 00000000000..7dc36d87691 --- /dev/null +++ b/xarray/datatree_/.github/workflows/pypipublish.yaml @@ -0,0 +1,84 @@ +name: Build distribution +on: + release: + types: + - published + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-artifacts: + runs-on: ubuntu-latest + if: github.repository == 'xarray-contrib/datatree' + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@v5 + name: Install Python + with: + python-version: 3.9 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build + + - name: Build tarball and wheels + run: | + git clean -xdf + git restore -SW . + python -m build --sdist --wheel . + + + - uses: actions/upload-artifact@v4 + with: + name: releases + path: dist + + test-built-dist: + needs: build-artifacts + runs-on: ubuntu-latest + steps: + - uses: actions/setup-python@v5 + name: Install Python + with: + python-version: '3.10' + - uses: actions/download-artifact@v4 + with: + name: releases + path: dist + - name: List contents of built dist + run: | + ls -ltrh + ls -ltrh dist + + - name: Verify the built dist/wheel is valid + run: | + python -m pip install --upgrade pip + python -m pip install dist/xarray_datatree*.whl + python -c "import datatree; print(datatree.__version__)" + + upload-to-pypi: + needs: test-built-dist + if: github.event_name == 'release' + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v4 + with: + name: releases + path: dist + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@v1.8.11 + with: + user: ${{ secrets.PYPI_USERNAME }} + password: ${{ secrets.PYPI_PASSWORD }} + verbose: true diff --git a/xarray/datatree_/.gitignore b/xarray/datatree_/.gitignore new file mode 100644 index 00000000000..88af9943a90 --- /dev/null +++ b/xarray/datatree_/.gitignore @@ -0,0 +1,136 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/source/generated + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# version +_version.py + +# Ignore vscode specific settings +.vscode/ diff --git a/xarray/datatree_/.pre-commit-config.yaml b/xarray/datatree_/.pre-commit-config.yaml new file mode 100644 index 00000000000..ea73c38d73e --- /dev/null +++ b/xarray/datatree_/.pre-commit-config.yaml @@ -0,0 +1,58 @@ +# https://pre-commit.com/ +ci: + autoupdate_schedule: monthly +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + # isort should run before black as black sometimes tweaks the isort output + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + # https://github.com/python/black#version-control-integration + - repo: https://github.com/psf/black + rev: 23.12.1 + hooks: + - id: black + - repo: https://github.com/keewis/blackdoc + rev: v0.3.9 + hooks: + - id: blackdoc + - repo: https://github.com/PyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + # - repo: https://github.com/Carreau/velin + # rev: 0.0.8 + # hooks: + # - id: velin + # args: ["--write", "--compact"] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + # Copied from setup.cfg + exclude: "properties|asv_bench|docs" + additional_dependencies: [ + # Type stubs + types-python-dateutil, + types-pkg_resources, + types-PyYAML, + types-pytz, + # Dependencies that are typed + numpy, + typing-extensions>=4.1.0, + ] + # run this occasionally, ref discussion https://github.com/pydata/xarray/pull/3194 + # - repo: https://github.com/asottile/pyupgrade + # rev: v1.22.1 + # hooks: + # - id: pyupgrade + # args: + # - "--py3-only" + # # remove on f-strings in Py3.7 + # - "--keep-percent-format" diff --git a/xarray/datatree_/LICENSE b/xarray/datatree_/LICENSE new file mode 100644 index 00000000000..d68e7230919 --- /dev/null +++ b/xarray/datatree_/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) 2022 onwards, datatree developers + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/xarray/datatree_/README.md b/xarray/datatree_/README.md new file mode 100644 index 00000000000..e41a13b4cb6 --- /dev/null +++ b/xarray/datatree_/README.md @@ -0,0 +1,95 @@ +# datatree + +| CI | [![GitHub Workflow Status][github-ci-badge]][github-ci-link] [![Code Coverage Status][codecov-badge]][codecov-link] [![pre-commit.ci status][pre-commit.ci-badge]][pre-commit.ci-link] | +| :---------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| **Docs** | [![Documentation Status][rtd-badge]][rtd-link] | +| **Package** | [![Conda][conda-badge]][conda-link] [![PyPI][pypi-badge]][pypi-link] | +| **License** | [![License][license-badge]][repo-link] | + + +**Datatree is a prototype implementation of a tree-like hierarchical data structure for xarray.** + +Datatree was born after the xarray team recognised a [need for a new hierarchical data structure](https://github.com/pydata/xarray/issues/4118), +that was more flexible than a single `xarray.Dataset` object. +The initial motivation was to represent netCDF files / Zarr stores with multiple nested groups in a single in-memory object, +but `datatree.DataTree` objects have many other uses. + +### DEPRECATION NOTICE + +Datatree is in the process of being merged upstream into xarray (as of [v0.0.14](https://github.com/xarray-contrib/datatree/releases/tag/v0.0.14), see xarray issue [#8572](https://github.com/pydata/xarray/issues/8572)). We are aiming to preserve the record of contributions to this repository during the migration process. However whilst we will hapily accept new PRs to this repository, this repo will be deprecated and any PRs since [v0.0.14](https://github.com/xarray-contrib/datatree/releases/tag/v0.0.14) might be later copied across to xarray without full git attribution. + +Hopefully for users the disruption will be minimal - and just mean that in some future version of xarray you only need to do `from xarray import DataTree` rather than `from datatree import DataTree`. Once the migration is complete this repository will be archived. + +### Installation +You can install datatree via pip: +```shell +pip install xarray-datatree +``` + +or via conda-forge +```shell +conda install -c conda-forge xarray-datatree +``` + +### Why Datatree? + +You might want to use datatree for: + +- Organising many related datasets, e.g. results of the same experiment with different parameters, or simulations of the same system using different models, +- Analysing similar data at multiple resolutions simultaneously, such as when doing a convergence study, +- Comparing heterogenous but related data, such as experimental and theoretical data, +- I/O with nested data formats such as netCDF / Zarr groups. + +[**Talk slides on Datatree from AMS-python 2023**](https://speakerdeck.com/tomnicholas/xarray-datatree-hierarchical-data-structures-for-multi-model-science) + +### Features + +The approach used here is based on benbovy's [`DatasetNode` example](https://gist.github.com/benbovy/92e7c76220af1aaa4b3a0b65374e233a) - the basic idea is that each tree node wraps a up to a single `xarray.Dataset`. The differences are that this effort: +- Uses a node structure inspired by [anytree](https://github.com/xarray-contrib/datatree/issues/7) for the tree, +- Implements path-like getting and setting, +- Has functions for mapping user-supplied functions over every node in the tree, +- Automatically dispatches *some* of `xarray.Dataset`'s API over every node in the tree (such as `.isel`), +- Has a bunch of tests, +- Has a printable representation that currently looks like this: +drawing + +### Get Started + +You can create a `DataTree` object in 3 ways: +1) Load from a netCDF file (or Zarr store) that has groups via `open_datatree()`. +2) Using the init method of `DataTree`, which creates an individual node. + You can then specify the nodes' relationships to one other, either by setting `.parent` and `.children` attributes, + or through `__get/setitem__` access, e.g. `dt['path/to/node'] = DataTree()`. +3) Create a tree from a dictionary of paths to datasets using `DataTree.from_dict()`. + +### Development Roadmap + +Datatree currently lives in a separate repository to the main xarray package. +This allows the datatree developers to make changes to it, experiment, and improve it faster. + +Eventually we plan to fully integrate datatree upstream into xarray's main codebase, at which point the [github.com/xarray-contrib/datatree](https://github.com/xarray-contrib/datatree>) repository will be archived. +This should not cause much disruption to code that depends on datatree - you will likely only have to change the import line (i.e. from ``from datatree import DataTree`` to ``from xarray import DataTree``). + +However, until this full integration occurs, datatree's API should not be considered to have the same [level of stability as xarray's](https://docs.xarray.dev/en/stable/contributing.html#backwards-compatibility). + +### User Feedback + +We really really really want to hear your opinions on datatree! +At this point in development, user feedback is critical to help us create something that will suit everyone's needs. +Please raise any thoughts, issues, suggestions or bugs, no matter how small or large, on the [github issue tracker](https://github.com/xarray-contrib/datatree/issues). + + +[github-ci-badge]: https://img.shields.io/github/actions/workflow/status/xarray-contrib/datatree/main.yaml?branch=main&label=CI&logo=github +[github-ci-link]: https://github.com/xarray-contrib/datatree/actions?query=workflow%3ACI +[codecov-badge]: https://img.shields.io/codecov/c/github/xarray-contrib/datatree.svg?logo=codecov +[codecov-link]: https://codecov.io/gh/xarray-contrib/datatree +[rtd-badge]: https://img.shields.io/readthedocs/xarray-datatree/latest.svg +[rtd-link]: https://xarray-datatree.readthedocs.io/en/latest/?badge=latest +[pypi-badge]: https://img.shields.io/pypi/v/xarray-datatree?logo=pypi +[pypi-link]: https://pypi.org/project/xarray-datatree +[conda-badge]: https://img.shields.io/conda/vn/conda-forge/xarray-datatree?logo=anaconda +[conda-link]: https://anaconda.org/conda-forge/xarray-datatree +[license-badge]: https://img.shields.io/github/license/xarray-contrib/datatree +[repo-link]: https://github.com/xarray-contrib/datatree +[pre-commit.ci-badge]: https://results.pre-commit.ci/badge/github/xarray-contrib/datatree/main.svg +[pre-commit.ci-link]: https://results.pre-commit.ci/latest/github/xarray-contrib/datatree/main diff --git a/xarray/datatree_/ci/doc.yml b/xarray/datatree_/ci/doc.yml new file mode 100644 index 00000000000..f3b95f71bd4 --- /dev/null +++ b/xarray/datatree_/ci/doc.yml @@ -0,0 +1,25 @@ +name: datatree-doc +channels: + - conda-forge +dependencies: + - pip + - python>=3.9 + - netcdf4 + - scipy + - sphinx>=4.2.0 + - sphinx-copybutton + - sphinx-panels + - sphinx-autosummary-accessors + - sphinx-book-theme >= 0.0.38 + - nbsphinx + - sphinxcontrib-srclinks + - pickleshare + - pydata-sphinx-theme>=0.4.3 + - ipython + - h5netcdf + - zarr + - xarray + - pip: + - -e .. + - sphinxext-rediraffe + - sphinxext-opengraph diff --git a/xarray/datatree_/ci/environment.yml b/xarray/datatree_/ci/environment.yml new file mode 100644 index 00000000000..fc0c6d97e9f --- /dev/null +++ b/xarray/datatree_/ci/environment.yml @@ -0,0 +1,16 @@ +name: datatree-test +channels: + - conda-forge + - nodefaults +dependencies: + - python>=3.9 + - netcdf4 + - pytest + - flake8 + - black + - codecov + - pytest-cov + - h5netcdf + - zarr + - pip: + - xarray>=2022.05.0.dev0 diff --git a/xarray/datatree_/codecov.yml b/xarray/datatree_/codecov.yml new file mode 100644 index 00000000000..44fd739d417 --- /dev/null +++ b/xarray/datatree_/codecov.yml @@ -0,0 +1,21 @@ +codecov: + require_ci_to_pass: false + max_report_age: off + +comment: false + +ignore: + - 'datatree/tests/*' + - 'setup.py' + - 'conftest.py' + +coverage: + precision: 2 + round: down + status: + project: + default: + target: 95 + informational: true + patch: off + changes: false diff --git a/xarray/datatree_/conftest.py b/xarray/datatree_/conftest.py new file mode 100644 index 00000000000..7ef19174298 --- /dev/null +++ b/xarray/datatree_/conftest.py @@ -0,0 +1,3 @@ +import pytest + +pytest.register_assert_rewrite("datatree.testing") diff --git a/xarray/datatree_/datatree/__init__.py b/xarray/datatree_/datatree/__init__.py new file mode 100644 index 00000000000..03342d34ff5 --- /dev/null +++ b/xarray/datatree_/datatree/__init__.py @@ -0,0 +1,25 @@ +# import public API +from xarray.datatree_.datatree.datatree import DataTree +from xarray.datatree_.datatree.extensions import register_datatree_accessor +from xarray.datatree_.datatree.io import open_datatree +from xarray.datatree_.datatree.mapping import TreeIsomorphismError, map_over_subtree +from xarray.datatree_.datatree.treenode import InvalidTreeError, NotFoundInTreeError + +try: + # NOTE: the `_version.py` file must not be present in the git repository + # as it is generated by setuptools at install time + from xarray.datatree_.datatree._version import __version__ +except ImportError: # pragma: no cover + # Local copy or not installed with setuptools + __version__ = "999" + +__all__ = ( + "DataTree", + "open_datatree", + "TreeIsomorphismError", + "InvalidTreeError", + "NotFoundInTreeError", + "map_over_subtree", + "register_datatree_accessor", + "__version__", +) diff --git a/xarray/datatree_/datatree/common.py b/xarray/datatree_/datatree/common.py new file mode 100644 index 00000000000..e4d52925ede --- /dev/null +++ b/xarray/datatree_/datatree/common.py @@ -0,0 +1,105 @@ +""" +This file and class only exists because it was easier to copy the code for AttrAccessMixin from xarray.core.common +with some slight modifications than it was to change the behaviour of an inherited xarray internal here. + +The modifications are marked with # TODO comments. +""" + +import warnings +from contextlib import suppress +from typing import Any, Hashable, Iterable, List, Mapping + + +class TreeAttrAccessMixin: + """Mixin class that allows getting keys with attribute access""" + + __slots__ = () + + def __init_subclass__(cls, **kwargs): + """Verify that all subclasses explicitly define ``__slots__``. If they don't, + raise error in the core xarray module and a FutureWarning in third-party + extensions. + """ + if not hasattr(object.__new__(cls), "__dict__"): + pass + # TODO reinstate this once integrated upstream + # elif cls.__module__.startswith("datatree."): + # raise AttributeError(f"{cls.__name__} must explicitly define __slots__") + # else: + # cls.__setattr__ = cls._setattr_dict + # warnings.warn( + # f"xarray subclass {cls.__name__} should explicitly define __slots__", + # FutureWarning, + # stacklevel=2, + # ) + super().__init_subclass__(**kwargs) + + @property + def _attr_sources(self) -> Iterable[Mapping[Hashable, Any]]: + """Places to look-up items for attribute-style access""" + yield from () + + @property + def _item_sources(self) -> Iterable[Mapping[Hashable, Any]]: + """Places to look-up items for key-autocompletion""" + yield from () + + def __getattr__(self, name: str) -> Any: + if name not in {"__dict__", "__setstate__"}: + # this avoids an infinite loop when pickle looks for the + # __setstate__ attribute before the xarray object is initialized + for source in self._attr_sources: + with suppress(KeyError): + return source[name] + raise AttributeError( + f"{type(self).__name__!r} object has no attribute {name!r}" + ) + + # This complicated two-method design boosts overall performance of simple operations + # - particularly DataArray methods that perform a _to_temp_dataset() round-trip - by + # a whopping 8% compared to a single method that checks hasattr(self, "__dict__") at + # runtime before every single assignment. All of this is just temporary until the + # FutureWarning can be changed into a hard crash. + def _setattr_dict(self, name: str, value: Any) -> None: + """Deprecated third party subclass (see ``__init_subclass__`` above)""" + object.__setattr__(self, name, value) + if name in self.__dict__: + # Custom, non-slotted attr, or improperly assigned variable? + warnings.warn( + f"Setting attribute {name!r} on a {type(self).__name__!r} object. Explicitly define __slots__ " + "to suppress this warning for legitimate custom attributes and " + "raise an error when attempting variables assignments.", + FutureWarning, + stacklevel=2, + ) + + def __setattr__(self, name: str, value: Any) -> None: + """Objects with ``__slots__`` raise AttributeError if you try setting an + undeclared attribute. This is desirable, but the error message could use some + improvement. + """ + try: + object.__setattr__(self, name, value) + except AttributeError as e: + # Don't accidentally shadow custom AttributeErrors, e.g. + # DataArray.dims.setter + if str(e) != "{!r} object has no attribute {!r}".format( + type(self).__name__, name + ): + raise + raise AttributeError( + f"cannot set attribute {name!r} on a {type(self).__name__!r} object. Use __setitem__ style" + "assignment (e.g., `ds['name'] = ...`) instead of assigning variables." + ) from e + + def __dir__(self) -> List[str]: + """Provide method name lookup and completion. Only provide 'public' + methods. + """ + extra_attrs = { + item + for source in self._attr_sources + for item in source + if isinstance(item, str) + } + return sorted(set(dir(type(self))) | extra_attrs) diff --git a/xarray/datatree_/datatree/datatree.py b/xarray/datatree_/datatree/datatree.py new file mode 100644 index 00000000000..61636012ba2 --- /dev/null +++ b/xarray/datatree_/datatree/datatree.py @@ -0,0 +1,1545 @@ +from __future__ import annotations + +import copy +import itertools +from collections import OrderedDict +from html import escape +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generic, + Hashable, + Iterable, + Iterator, + List, + Mapping, + MutableMapping, + Optional, + Set, + Tuple, + Union, + overload, +) + +from xarray.core import utils +from xarray.core.coordinates import DatasetCoordinates +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset, DataVariables +from xarray.core.indexes import Index, Indexes +from xarray.core.merge import dataset_update_method +from xarray.core.options import OPTIONS as XR_OPTS +from xarray.core.utils import ( + Default, + Frozen, + HybridMappingProxy, + _default, + either_dict_or_kwargs, + maybe_wrap_array, +) +from xarray.core.variable import Variable + +from xarray.datatree_.datatree import formatting, formatting_html +from xarray.datatree_.datatree.common import TreeAttrAccessMixin +from xarray.datatree_.datatree.mapping import ( + TreeIsomorphismError, + check_isomorphic, + map_over_subtree, +) +from xarray.datatree_.datatree.ops import ( + DataTreeArithmeticMixin, + MappedDatasetMethodsMixin, + MappedDataWithCoords, +) +from xarray.datatree_.datatree.render import RenderTree +from xarray.datatree_.datatree.treenode import NamedNode, NodePath, Tree + +try: + from xarray.core.variable import calculate_dimensions +except ImportError: + # for xarray versions 2022.03.0 and earlier + from xarray.core.dataset import calculate_dimensions + +if TYPE_CHECKING: + import pandas as pd + from xarray.core.merge import CoercibleValue + from xarray.core.types import ErrorOptions + +# """ +# DEVELOPERS' NOTE +# ---------------- +# The idea of this module is to create a `DataTree` class which inherits the tree structure from TreeNode, and also copies +# the entire API of `xarray.Dataset`, but with certain methods decorated to instead map the dataset function over every +# node in the tree. As this API is copied without directly subclassing `xarray.Dataset` we instead create various Mixin +# classes (in ops.py) which each define part of `xarray.Dataset`'s extensive API. +# +# Some of these methods must be wrapped to map over all nodes in the subtree. Others are fine to inherit unaltered +# (normally because they (a) only call dataset properties and (b) don't return a dataset that should be nested into a new +# tree) and some will get overridden by the class definition of DataTree. +# """ + + +T_Path = Union[str, NodePath] + + +def _coerce_to_dataset(data: Dataset | DataArray | None) -> Dataset: + if isinstance(data, DataArray): + ds = data.to_dataset() + elif isinstance(data, Dataset): + ds = data + elif data is None: + ds = Dataset() + else: + raise TypeError( + f"data object is not an xarray Dataset, DataArray, or None, it is of type {type(data)}" + ) + return ds + + +def _check_for_name_collisions( + children: Iterable[str], variables: Iterable[Hashable] +) -> None: + colliding_names = set(children).intersection(set(variables)) + if colliding_names: + raise KeyError( + f"Some names would collide between variables and children: {list(colliding_names)}" + ) + + +class DatasetView(Dataset): + """ + An immutable Dataset-like view onto the data in a single DataTree node. + + In-place operations modifying this object should raise an AttributeError. + This requires overriding all inherited constructors. + + Operations returning a new result will return a new xarray.Dataset object. + This includes all API on Dataset, which will be inherited. + """ + + # TODO what happens if user alters (in-place) a DataArray they extracted from this object? + + __slots__ = ( + "_attrs", + "_cache", + "_coord_names", + "_dims", + "_encoding", + "_close", + "_indexes", + "_variables", + ) + + def __init__( + self, + data_vars: Optional[Mapping[Any, Any]] = None, + coords: Optional[Mapping[Any, Any]] = None, + attrs: Optional[Mapping[Any, Any]] = None, + ): + raise AttributeError("DatasetView objects are not to be initialized directly") + + @classmethod + def _from_node( + cls, + wrapping_node: DataTree, + ) -> DatasetView: + """Constructor, using dataset attributes from wrapping node""" + + obj: DatasetView = object.__new__(cls) + obj._variables = wrapping_node._variables + obj._coord_names = wrapping_node._coord_names + obj._dims = wrapping_node._dims + obj._indexes = wrapping_node._indexes + obj._attrs = wrapping_node._attrs + obj._close = wrapping_node._close + obj._encoding = wrapping_node._encoding + + return obj + + def __setitem__(self, key, val) -> None: + raise AttributeError( + "Mutation of the DatasetView is not allowed, please use `.__setitem__` on the wrapping DataTree node, " + "or use `dt.to_dataset()` if you want a mutable dataset. If calling this from within `map_over_subtree`," + "use `.copy()` first to get a mutable version of the input dataset." + ) + + def update(self, other) -> None: + raise AttributeError( + "Mutation of the DatasetView is not allowed, please use `.update` on the wrapping DataTree node, " + "or use `dt.to_dataset()` if you want a mutable dataset. If calling this from within `map_over_subtree`," + "use `.copy()` first to get a mutable version of the input dataset." + ) + + # FIXME https://github.com/python/mypy/issues/7328 + @overload + def __getitem__(self, key: Mapping) -> Dataset: # type: ignore[misc] + ... + + @overload + def __getitem__(self, key: Hashable) -> DataArray: # type: ignore[misc] + ... + + @overload + def __getitem__(self, key: Any) -> Dataset: + ... + + def __getitem__(self, key) -> DataArray: + # TODO call the `_get_item` method of DataTree to allow path-like access to contents of other nodes + # For now just call Dataset.__getitem__ + return Dataset.__getitem__(self, key) + + @classmethod + def _construct_direct( + cls, + variables: dict[Any, Variable], + coord_names: set[Hashable], + dims: Optional[dict[Any, int]] = None, + attrs: Optional[dict] = None, + indexes: Optional[dict[Any, Index]] = None, + encoding: Optional[dict] = None, + close: Optional[Callable[[], None]] = None, + ) -> Dataset: + """ + Overriding this method (along with ._replace) and modifying it to return a Dataset object + should hopefully ensure that the return type of any method on this object is a Dataset. + """ + if dims is None: + dims = calculate_dimensions(variables) + if indexes is None: + indexes = {} + obj = object.__new__(Dataset) + obj._variables = variables + obj._coord_names = coord_names + obj._dims = dims + obj._indexes = indexes + obj._attrs = attrs + obj._close = close + obj._encoding = encoding + return obj + + def _replace( + self, + variables: Optional[dict[Hashable, Variable]] = None, + coord_names: Optional[set[Hashable]] = None, + dims: Optional[dict[Any, int]] = None, + attrs: dict[Hashable, Any] | None | Default = _default, + indexes: Optional[dict[Hashable, Index]] = None, + encoding: dict | None | Default = _default, + inplace: bool = False, + ) -> Dataset: + """ + Overriding this method (along with ._construct_direct) and modifying it to return a Dataset object + should hopefully ensure that the return type of any method on this object is a Dataset. + """ + + if inplace: + raise AttributeError("In-place mutation of the DatasetView is not allowed") + + return Dataset._replace( + self, + variables=variables, + coord_names=coord_names, + dims=dims, + attrs=attrs, + indexes=indexes, + encoding=encoding, + inplace=inplace, + ) + + def map( + self, + func: Callable, + keep_attrs: bool | None = None, + args: Iterable[Any] = (), + **kwargs: Any, + ) -> Dataset: + """Apply a function to each data variable in this dataset + + Parameters + ---------- + func : callable + Function which can be called in the form `func(x, *args, **kwargs)` + to transform each DataArray `x` in this dataset into another + DataArray. + keep_attrs : bool or None, optional + If True, both the dataset's and variables' attributes (`attrs`) will be + copied from the original objects to the new ones. If False, the new dataset + and variables will be returned without copying the attributes. + args : iterable, optional + Positional arguments passed on to `func`. + **kwargs : Any + Keyword arguments passed on to `func`. + + Returns + ------- + applied : Dataset + Resulting dataset from applying ``func`` to each data variable. + + Examples + -------- + >>> da = xr.DataArray(np.random.randn(2, 3)) + >>> ds = xr.Dataset({"foo": da, "bar": ("x", [-1, 2])}) + >>> ds + + Dimensions: (dim_0: 2, dim_1: 3, x: 2) + Dimensions without coordinates: dim_0, dim_1, x + Data variables: + foo (dim_0, dim_1) float64 1.764 0.4002 0.9787 2.241 1.868 -0.9773 + bar (x) int64 -1 2 + >>> ds.map(np.fabs) + + Dimensions: (dim_0: 2, dim_1: 3, x: 2) + Dimensions without coordinates: dim_0, dim_1, x + Data variables: + foo (dim_0, dim_1) float64 1.764 0.4002 0.9787 2.241 1.868 0.9773 + bar (x) float64 1.0 2.0 + """ + + # Copied from xarray.Dataset so as not to call type(self), which causes problems (see datatree GH188). + # TODO Refactor xarray upstream to avoid needing to overwrite this. + # TODO This copied version will drop all attrs - the keep_attrs stuff should be re-instated + variables = { + k: maybe_wrap_array(v, func(v, *args, **kwargs)) + for k, v in self.data_vars.items() + } + # return type(self)(variables, attrs=attrs) + return Dataset(variables) + + +class DataTree( + NamedNode, + MappedDatasetMethodsMixin, + MappedDataWithCoords, + DataTreeArithmeticMixin, + TreeAttrAccessMixin, + Generic[Tree], + Mapping, +): + """ + A tree-like hierarchical collection of xarray objects. + + Attempts to present an API like that of xarray.Dataset, but methods are wrapped to also update all the tree's child nodes. + """ + + # TODO Some way of sorting children by depth + + # TODO do we need a watch out for if methods intended only for root nodes are called on non-root nodes? + + # TODO dataset methods which should not or cannot act over the whole tree, such as .to_array + + # TODO .loc method + + # TODO a lot of properties like .variables could be defined in a DataMapping class which both Dataset and DataTree inherit from + + # TODO all groupby classes + + # TODO a lot of properties like .variables could be defined in a DataMapping class which both Dataset and DataTree inherit from + + # TODO __slots__ + + # TODO all groupby classes + + _name: Optional[str] + _parent: Optional[DataTree] + _children: OrderedDict[str, DataTree] + _attrs: Optional[Dict[Hashable, Any]] + _cache: Dict[str, Any] + _coord_names: Set[Hashable] + _dims: Dict[Hashable, int] + _encoding: Optional[Dict[Hashable, Any]] + _close: Optional[Callable[[], None]] + _indexes: Dict[Hashable, Index] + _variables: Dict[Hashable, Variable] + + __slots__ = ( + "_name", + "_parent", + "_children", + "_attrs", + "_cache", + "_coord_names", + "_dims", + "_encoding", + "_close", + "_indexes", + "_variables", + ) + + def __init__( + self, + data: Optional[Dataset | DataArray] = None, + parent: Optional[DataTree] = None, + children: Optional[Mapping[str, DataTree]] = None, + name: Optional[str] = None, + ): + """ + Create a single node of a DataTree. + + The node may optionally contain data in the form of data and coordinate variables, stored in the same way as + data is stored in an xarray.Dataset. + + Parameters + ---------- + data : Dataset, DataArray, or None, optional + Data to store under the .ds attribute of this node. DataArrays will be promoted to Datasets. + Default is None. + parent : DataTree, optional + Parent node to this node. Default is None. + children : Mapping[str, DataTree], optional + Any child nodes of this node. Default is None. + name : str, optional + Name for this node of the tree. Default is None. + + Returns + ------- + DataTree + + See Also + -------- + DataTree.from_dict + """ + + # validate input + if children is None: + children = {} + ds = _coerce_to_dataset(data) + _check_for_name_collisions(children, ds.variables) + + super().__init__(name=name) + + # set data attributes + self._replace( + inplace=True, + variables=ds._variables, + coord_names=ds._coord_names, + dims=ds._dims, + indexes=ds._indexes, + attrs=ds._attrs, + encoding=ds._encoding, + ) + self._close = ds._close + + # set tree attributes (must happen after variables set to avoid initialization errors) + self.children = children + self.parent = parent + + @property + def parent(self: DataTree) -> DataTree | None: + """Parent of this node.""" + return self._parent + + @parent.setter + def parent(self: DataTree, new_parent: DataTree) -> None: + if new_parent and self.name is None: + raise ValueError("Cannot set an unnamed node as a child of another node") + self._set_parent(new_parent, self.name) + + @property + def ds(self) -> DatasetView: + """ + An immutable Dataset-like view onto the data in this node. + + For a mutable Dataset containing the same data as in this node, use `.to_dataset()` instead. + + See Also + -------- + DataTree.to_dataset + """ + return DatasetView._from_node(self) + + @ds.setter + def ds(self, data: Optional[Union[Dataset, DataArray]] = None) -> None: + ds = _coerce_to_dataset(data) + + _check_for_name_collisions(self.children, ds.variables) + + self._replace( + inplace=True, + variables=ds._variables, + coord_names=ds._coord_names, + dims=ds._dims, + indexes=ds._indexes, + attrs=ds._attrs, + encoding=ds._encoding, + ) + self._close = ds._close + + def _pre_attach(self: DataTree, parent: DataTree) -> None: + """ + Method which superclass calls before setting parent, here used to prevent having two + children with duplicate names (or a data variable with the same name as a child). + """ + super()._pre_attach(parent) + if self.name in list(parent.ds.variables): + raise KeyError( + f"parent {parent.name} already contains a data variable named {self.name}" + ) + + def to_dataset(self) -> Dataset: + """ + Return the data in this node as a new xarray.Dataset object. + + See Also + -------- + DataTree.ds + """ + return Dataset._construct_direct( + self._variables, + self._coord_names, + self._dims, + self._attrs, + self._indexes, + self._encoding, + self._close, + ) + + @property + def has_data(self): + """Whether or not there are any data variables in this node.""" + return len(self._variables) > 0 + + @property + def has_attrs(self) -> bool: + """Whether or not there are any metadata attributes in this node.""" + return len(self.attrs.keys()) > 0 + + @property + def is_empty(self) -> bool: + """False if node contains any data or attrs. Does not look at children.""" + return not (self.has_data or self.has_attrs) + + @property + def is_hollow(self) -> bool: + """True if only leaf nodes contain data.""" + return not any(node.has_data for node in self.subtree if not node.is_leaf) + + @property + def variables(self) -> Mapping[Hashable, Variable]: + """Low level interface to node contents as dict of Variable objects. + + This ordered dictionary is frozen to prevent mutation that could + violate Dataset invariants. It contains all variable objects + constituting this DataTree node, including both data variables and + coordinates. + """ + return Frozen(self._variables) + + @property + def attrs(self) -> Dict[Hashable, Any]: + """Dictionary of global attributes on this node object.""" + if self._attrs is None: + self._attrs = {} + return self._attrs + + @attrs.setter + def attrs(self, value: Mapping[Any, Any]) -> None: + self._attrs = dict(value) + + @property + def encoding(self) -> Dict: + """Dictionary of global encoding attributes on this node object.""" + if self._encoding is None: + self._encoding = {} + return self._encoding + + @encoding.setter + def encoding(self, value: Mapping) -> None: + self._encoding = dict(value) + + @property + def dims(self) -> Mapping[Hashable, int]: + """Mapping from dimension names to lengths. + + Cannot be modified directly, but is updated when adding new variables. + + Note that type of this object differs from `DataArray.dims`. + See `DataTree.sizes`, `Dataset.sizes`, and `DataArray.sizes` for consistently named + properties. + """ + return Frozen(self._dims) + + @property + def sizes(self) -> Mapping[Hashable, int]: + """Mapping from dimension names to lengths. + + Cannot be modified directly, but is updated when adding new variables. + + This is an alias for `DataTree.dims` provided for the benefit of + consistency with `DataArray.sizes`. + + See Also + -------- + DataArray.sizes + """ + return self.dims + + @property + def _attr_sources(self) -> Iterable[Mapping[Hashable, Any]]: + """Places to look-up items for attribute-style access""" + yield from self._item_sources + yield self.attrs + + @property + def _item_sources(self) -> Iterable[Mapping[Any, Any]]: + """Places to look-up items for key-completion""" + yield self.data_vars + yield HybridMappingProxy(keys=self._coord_names, mapping=self.coords) + + # virtual coordinates + yield HybridMappingProxy(keys=self.dims, mapping=self) + + # immediate child nodes + yield self.children + + def _ipython_key_completions_(self) -> List[str]: + """Provide method for the key-autocompletions in IPython. + See http://ipython.readthedocs.io/en/stable/config/integrating.html#tab-completion + For the details. + """ + + # TODO allow auto-completing relative string paths, e.g. `dt['path/to/../ node'` + # Would require changes to ipython's autocompleter, see https://github.com/ipython/ipython/issues/12420 + # Instead for now we only list direct paths to all node in subtree explicitly + + items_on_this_node = self._item_sources + full_file_like_paths_to_all_nodes_in_subtree = { + node.path[1:]: node for node in self.subtree + } + + all_item_sources = itertools.chain( + items_on_this_node, [full_file_like_paths_to_all_nodes_in_subtree] + ) + + items = { + item + for source in all_item_sources + for item in source + if isinstance(item, str) + } + return list(items) + + def __contains__(self, key: object) -> bool: + """The 'in' operator will return true or false depending on whether + 'key' is either an array stored in the datatree or a child node, or neither. + """ + return key in self.variables or key in self.children + + def __bool__(self) -> bool: + return bool(self.ds.data_vars) or bool(self.children) + + def __iter__(self) -> Iterator[Hashable]: + return itertools.chain(self.ds.data_vars, self.children) + + def __array__(self, dtype=None): + raise TypeError( + "cannot directly convert a DataTree into a " + "numpy array. Instead, create an xarray.DataArray " + "first, either with indexing on the DataTree or by " + "invoking the `to_array()` method." + ) + + def __repr__(self) -> str: + return formatting.datatree_repr(self) + + def __str__(self) -> str: + return formatting.datatree_repr(self) + + def _repr_html_(self): + """Make html representation of datatree object""" + if XR_OPTS["display_style"] == "text": + return f"
{escape(repr(self))}
" + return formatting_html.datatree_repr(self) + + @classmethod + def _construct_direct( + cls, + variables: dict[Any, Variable], + coord_names: set[Hashable], + dims: Optional[dict[Any, int]] = None, + attrs: Optional[dict] = None, + indexes: Optional[dict[Any, Index]] = None, + encoding: Optional[dict] = None, + name: str | None = None, + parent: DataTree | None = None, + children: Optional[OrderedDict[str, DataTree]] = None, + close: Optional[Callable[[], None]] = None, + ) -> DataTree: + """Shortcut around __init__ for internal use when we want to skip costly validation.""" + + # data attributes + if dims is None: + dims = calculate_dimensions(variables) + if indexes is None: + indexes = {} + if children is None: + children = OrderedDict() + + obj: DataTree = object.__new__(cls) + obj._variables = variables + obj._coord_names = coord_names + obj._dims = dims + obj._indexes = indexes + obj._attrs = attrs + obj._close = close + obj._encoding = encoding + + # tree attributes + obj._name = name + obj._children = children + obj._parent = parent + + return obj + + def _replace( + self: DataTree, + variables: Optional[dict[Hashable, Variable]] = None, + coord_names: Optional[set[Hashable]] = None, + dims: Optional[dict[Any, int]] = None, + attrs: dict[Hashable, Any] | None | Default = _default, + indexes: Optional[dict[Hashable, Index]] = None, + encoding: dict | None | Default = _default, + name: str | None | Default = _default, + parent: DataTree | None = _default, + children: Optional[OrderedDict[str, DataTree]] = None, + inplace: bool = False, + ) -> DataTree: + """ + Fastpath constructor for internal use. + + Returns an object with optionally replaced attributes. + + Explicitly passed arguments are *not* copied when placed on the new + datatree. It is up to the caller to ensure that they have the right type + and are not used elsewhere. + """ + # TODO Adding new children inplace using this method will cause bugs. + # You will end up with an inconsistency between the name of the child node and the key the child is stored under. + # Use ._set() instead for now + if inplace: + if variables is not None: + self._variables = variables + if coord_names is not None: + self._coord_names = coord_names + if dims is not None: + self._dims = dims + if attrs is not _default: + self._attrs = attrs + if indexes is not None: + self._indexes = indexes + if encoding is not _default: + self._encoding = encoding + if name is not _default: + self._name = name + if parent is not _default: + self._parent = parent + if children is not None: + self._children = children + obj = self + else: + if variables is None: + variables = self._variables.copy() + if coord_names is None: + coord_names = self._coord_names.copy() + if dims is None: + dims = self._dims.copy() + if attrs is _default: + attrs = copy.copy(self._attrs) + if indexes is None: + indexes = self._indexes.copy() + if encoding is _default: + encoding = copy.copy(self._encoding) + if name is _default: + name = self._name # no need to copy str objects or None + if parent is _default: + parent = copy.copy(self._parent) + if children is _default: + children = copy.copy(self._children) + obj = self._construct_direct( + variables, + coord_names, + dims, + attrs, + indexes, + encoding, + name, + parent, + children, + ) + return obj + + def copy( + self: DataTree, + deep: bool = False, + ) -> DataTree: + """ + Returns a copy of this subtree. + + Copies this node and all child nodes. + + If `deep=True`, a deep copy is made of each of the component variables. + Otherwise, a shallow copy of each of the component variable is made, so + that the underlying memory region of the new datatree is the same as in + the original datatree. + + Parameters + ---------- + deep : bool, default: False + Whether each component variable is loaded into memory and copied onto + the new object. Default is False. + + Returns + ------- + object : DataTree + New object with dimensions, attributes, coordinates, name, encoding, + and data of this node and all child nodes copied from original. + + See Also + -------- + xarray.Dataset.copy + pandas.DataFrame.copy + """ + return self._copy_subtree(deep=deep) + + def _copy_subtree( + self: DataTree, + deep: bool = False, + memo: dict[int, Any] | None = None, + ) -> DataTree: + """Copy entire subtree""" + new_tree = self._copy_node(deep=deep) + for node in self.descendants: + path = node.relative_to(self) + new_tree[path] = node._copy_node(deep=deep) + return new_tree + + def _copy_node( + self: DataTree, + deep: bool = False, + ) -> DataTree: + """Copy just one node of a tree""" + new_node: DataTree = DataTree() + new_node.name = self.name + new_node.ds = self.to_dataset().copy(deep=deep) + return new_node + + def __copy__(self: DataTree) -> DataTree: + return self._copy_subtree(deep=False) + + def __deepcopy__(self: DataTree, memo: dict[int, Any] | None = None) -> DataTree: + return self._copy_subtree(deep=True, memo=memo) + + def get( + self: DataTree, key: str, default: Optional[DataTree | DataArray] = None + ) -> Optional[DataTree | DataArray]: + """ + Access child nodes, variables, or coordinates stored in this node. + + Returned object will be either a DataTree or DataArray object depending on whether the key given points to a + child or variable. + + Parameters + ---------- + key : str + Name of variable / child within this node. Must lie in this immediate node (not elsewhere in the tree). + default : DataTree | DataArray, optional + A value to return if the specified key does not exist. Default return value is None. + """ + if key in self.children: + return self.children[key] + elif key in self.ds: + return self.ds[key] + else: + return default + + def __getitem__(self: DataTree, key: str) -> DataTree | DataArray: + """ + Access child nodes, variables, or coordinates stored anywhere in this tree. + + Returned object will be either a DataTree or DataArray object depending on whether the key given points to a + child or variable. + + Parameters + ---------- + key : str + Name of variable / child within this node, or unix-like path to variable / child within another node. + + Returns + ------- + Union[DataTree, DataArray] + """ + + # Either: + if utils.is_dict_like(key): + # dict-like indexing + raise NotImplementedError("Should this index over whole tree?") + elif isinstance(key, str): + # TODO should possibly deal with hashables in general? + # path-like: a name of a node/variable, or path to a node/variable + path = NodePath(key) + return self._get_item(path) + elif utils.is_list_like(key): + # iterable of variable names + raise NotImplementedError( + "Selecting via tags is deprecated, and selecting multiple items should be " + "implemented via .subset" + ) + else: + raise ValueError(f"Invalid format for key: {key}") + + def _set(self, key: str, val: DataTree | CoercibleValue) -> None: + """ + Set the child node or variable with the specified key to value. + + Counterpart to the public .get method, and also only works on the immediate node, not other nodes in the tree. + """ + if isinstance(val, DataTree): + # create and assign a shallow copy here so as not to alter original name of node in grafted tree + new_node = val.copy(deep=False) + new_node.name = key + new_node.parent = self + else: + if not isinstance(val, (DataArray, Variable)): + # accommodate other types that can be coerced into Variables + val = DataArray(val) + + self.update({key: val}) + + def __setitem__( + self, + key: str, + value: Any, + ) -> None: + """ + Add either a child node or an array to the tree, at any position. + + Data can be added anywhere, and new nodes will be created to cross the path to the new location if necessary. + + If there is already a node at the given location, then if value is a Node class or Dataset it will overwrite the + data already present at that node, and if value is a single array, it will be merged with it. + """ + # TODO xarray.Dataset accepts other possibilities, how do we exactly replicate all the behaviour? + if utils.is_dict_like(key): + raise NotImplementedError + elif isinstance(key, str): + # TODO should possibly deal with hashables in general? + # path-like: a name of a node/variable, or path to a node/variable + path = NodePath(key) + return self._set_item(path, value, new_nodes_along_path=True) + else: + raise ValueError("Invalid format for key") + + def update(self, other: Dataset | Mapping[str, DataTree | DataArray]) -> None: + """ + Update this node's children and / or variables. + + Just like `dict.update` this is an in-place operation. + """ + # TODO separate by type + new_children = {} + new_variables = {} + for k, v in other.items(): + if isinstance(v, DataTree): + # avoid named node being stored under inconsistent key + new_child = v.copy() + new_child.name = k + new_children[k] = new_child + elif isinstance(v, (DataArray, Variable)): + # TODO this should also accommodate other types that can be coerced into Variables + new_variables[k] = v + else: + raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree") + + vars_merge_result = dataset_update_method(self.to_dataset(), new_variables) + # TODO are there any subtleties with preserving order of children like this? + merged_children = OrderedDict({**self.children, **new_children}) + self._replace( + inplace=True, children=merged_children, **vars_merge_result._asdict() + ) + + def assign( + self, items: Mapping[Any, Any] | None = None, **items_kwargs: Any + ) -> DataTree: + """ + Assign new data variables or child nodes to a DataTree, returning a new object + with all the original items in addition to the new ones. + + Parameters + ---------- + items : mapping of hashable to Any + Mapping from variable or child node names to the new values. If the new values + are callable, they are computed on the Dataset and assigned to new + data variables. If the values are not callable, (e.g. a DataTree, DataArray, + scalar, or array), they are simply assigned. + **items_kwargs + The keyword arguments form of ``variables``. + One of variables or variables_kwargs must be provided. + + Returns + ------- + dt : DataTree + A new DataTree with the new variables or children in addition to all the + existing items. + + Notes + ----- + Since ``kwargs`` is a dictionary, the order of your arguments may not + be preserved, and so the order of the new variables is not well-defined. + Assigning multiple items within the same ``assign`` is + possible, but you cannot reference other variables created within the + same ``assign`` call. + + See Also + -------- + xarray.Dataset.assign + pandas.DataFrame.assign + """ + items = either_dict_or_kwargs(items, items_kwargs, "assign") + dt = self.copy() + dt.update(items) + return dt + + def drop_nodes( + self: DataTree, names: str | Iterable[str], *, errors: ErrorOptions = "raise" + ) -> DataTree: + """ + Drop child nodes from this node. + + Parameters + ---------- + names : str or iterable of str + Name(s) of nodes to drop. + errors : {"raise", "ignore"}, default: "raise" + If 'raise', raises a KeyError if any of the node names + passed are not present as children of this node. If 'ignore', + any given names that are present are dropped and no error is raised. + + Returns + ------- + dropped : DataTree + A copy of the node with the specified children dropped. + """ + # the Iterable check is required for mypy + if isinstance(names, str) or not isinstance(names, Iterable): + names = {names} + else: + names = set(names) + + if errors == "raise": + extra = names - set(self.children) + if extra: + raise KeyError(f"Cannot drop all nodes - nodes {extra} not present") + + children_to_keep = OrderedDict( + {name: child for name, child in self.children.items() if name not in names} + ) + return self._replace(children=children_to_keep) + + @classmethod + def from_dict( + cls, + d: MutableMapping[str, Dataset | DataArray | DataTree | None], + name: Optional[str] = None, + ) -> DataTree: + """ + Create a datatree from a dictionary of data objects, organised by paths into the tree. + + Parameters + ---------- + d : dict-like + A mapping from path names to xarray.Dataset, xarray.DataArray, or DataTree objects. + + Path names are to be given as unix-like path. If path names containing more than one part are given, new + tree nodes will be constructed as necessary. + + To assign data to the root node of the tree use "/" as the path. + name : Hashable, optional + Name for the root node of the tree. Default is None. + + Returns + ------- + DataTree + + Notes + ----- + If your dictionary is nested you will need to flatten it before using this method. + """ + + # First create the root node + root_data = d.pop("/", None) + obj = cls(name=name, data=root_data, parent=None, children=None) + + if d: + # Populate tree with children determined from data_objects mapping + for path, data in d.items(): + # Create and set new node + node_name = NodePath(path).name + if isinstance(data, cls): + new_node = data.copy() + new_node.orphan() + else: + new_node = cls(name=node_name, data=data) + obj._set_item( + path, + new_node, + allow_overwrite=False, + new_nodes_along_path=True, + ) + + return obj + + def to_dict(self) -> Dict[str, Dataset]: + """ + Create a dictionary mapping of absolute node paths to the data contained in those nodes. + + Returns + ------- + Dict[str, Dataset] + """ + return {node.path: node.to_dataset() for node in self.subtree} + + @property + def nbytes(self) -> int: + return sum(node.to_dataset().nbytes for node in self.subtree) + + def __len__(self) -> int: + return len(self.children) + len(self.data_vars) + + @property + def indexes(self) -> Indexes[pd.Index]: + """Mapping of pandas.Index objects used for label based indexing. + + Raises an error if this DataTree node has indexes that cannot be coerced + to pandas.Index objects. + + See Also + -------- + DataTree.xindexes + """ + return self.xindexes.to_pandas_indexes() + + @property + def xindexes(self) -> Indexes[Index]: + """Mapping of xarray Index objects used for label based indexing.""" + return Indexes(self._indexes, {k: self._variables[k] for k in self._indexes}) + + @property + def coords(self) -> DatasetCoordinates: + """Dictionary of xarray.DataArray objects corresponding to coordinate + variables + """ + return DatasetCoordinates(self.to_dataset()) + + @property + def data_vars(self) -> DataVariables: + """Dictionary of DataArray objects corresponding to data variables""" + return DataVariables(self.to_dataset()) + + def isomorphic( + self, + other: DataTree, + from_root: bool = False, + strict_names: bool = False, + ) -> bool: + """ + Two DataTrees are considered isomorphic if every node has the same number of children. + + Nothing about the data in each node is checked. + + Isomorphism is a necessary condition for two trees to be used in a nodewise binary operation, + such as ``tree1 + tree2``. + + By default this method does not check any part of the tree above the given node. + Therefore this method can be used as default to check that two subtrees are isomorphic. + + Parameters + ---------- + other : DataTree + The other tree object to compare to. + from_root : bool, optional, default is False + Whether or not to first traverse to the root of the two trees before checking for isomorphism. + If neither tree has a parent then this has no effect. + strict_names : bool, optional, default is False + Whether or not to also check that every node in the tree has the same name as its counterpart in the other + tree. + + See Also + -------- + DataTree.equals + DataTree.identical + """ + try: + check_isomorphic( + self, + other, + require_names_equal=strict_names, + check_from_root=from_root, + ) + return True + except (TypeError, TreeIsomorphismError): + return False + + def equals(self, other: DataTree, from_root: bool = True) -> bool: + """ + Two DataTrees are equal if they have isomorphic node structures, with matching node names, + and if they have matching variables and coordinates, all of which are equal. + + By default this method will check the whole tree above the given node. + + Parameters + ---------- + other : DataTree + The other tree object to compare to. + from_root : bool, optional, default is True + Whether or not to first traverse to the root of the two trees before checking for isomorphism. + If neither tree has a parent then this has no effect. + + See Also + -------- + Dataset.equals + DataTree.isomorphic + DataTree.identical + """ + if not self.isomorphic(other, from_root=from_root, strict_names=True): + return False + + return all( + [ + node.ds.equals(other_node.ds) + for node, other_node in zip(self.subtree, other.subtree) + ] + ) + + def identical(self, other: DataTree, from_root=True) -> bool: + """ + Like equals, but will also check all dataset attributes and the attributes on + all variables and coordinates. + + By default this method will check the whole tree above the given node. + + Parameters + ---------- + other : DataTree + The other tree object to compare to. + from_root : bool, optional, default is True + Whether or not to first traverse to the root of the two trees before checking for isomorphism. + If neither tree has a parent then this has no effect. + + See Also + -------- + Dataset.identical + DataTree.isomorphic + DataTree.equals + """ + if not self.isomorphic(other, from_root=from_root, strict_names=True): + return False + + return all( + node.ds.identical(other_node.ds) + for node, other_node in zip(self.subtree, other.subtree) + ) + + def filter(self: DataTree, filterfunc: Callable[[DataTree], bool]) -> DataTree: + """ + Filter nodes according to a specified condition. + + Returns a new tree containing only the nodes in the original tree for which `fitlerfunc(node)` is True. + Will also contain empty nodes at intermediate positions if required to support leaves. + + Parameters + ---------- + filterfunc: function + A function which accepts only one DataTree - the node on which filterfunc will be called. + + Returns + ------- + DataTree + + See Also + -------- + match + pipe + map_over_subtree + """ + filtered_nodes = { + node.path: node.ds for node in self.subtree if filterfunc(node) + } + return DataTree.from_dict(filtered_nodes, name=self.root.name) + + def match(self, pattern: str) -> DataTree: + """ + Return nodes with paths matching pattern. + + Uses unix glob-like syntax for pattern-matching. + + Parameters + ---------- + pattern: str + A pattern to match each node path against. + + Returns + ------- + DataTree + + See Also + -------- + filter + pipe + map_over_subtree + + Examples + -------- + >>> dt = DataTree.from_dict( + ... { + ... "/a/A": None, + ... "/a/B": None, + ... "/b/A": None, + ... "/b/B": None, + ... } + ... ) + >>> dt.match("*/B") + DataTree('None', parent=None) + ├── DataTree('a') + │ └── DataTree('B') + └── DataTree('b') + └── DataTree('B') + """ + matching_nodes = { + node.path: node.ds + for node in self.subtree + if NodePath(node.path).match(pattern) + } + return DataTree.from_dict(matching_nodes, name=self.root.name) + + def map_over_subtree( + self, + func: Callable, + *args: Iterable[Any], + **kwargs: Any, + ) -> DataTree | Tuple[DataTree]: + """ + Apply a function to every dataset in this subtree, returning a new tree which stores the results. + + The function will be applied to any dataset stored in this node, as well as any dataset stored in any of the + descendant nodes. The returned tree will have the same structure as the original subtree. + + func needs to return a Dataset in order to rebuild the subtree. + + Parameters + ---------- + func : callable + Function to apply to datasets with signature: + `func(node.ds, *args, **kwargs) -> Dataset`. + + Function will not be applied to any nodes without datasets. + *args : tuple, optional + Positional arguments passed on to `func`. + **kwargs : Any + Keyword arguments passed on to `func`. + + Returns + ------- + subtrees : DataTree, Tuple of DataTrees + One or more subtrees containing results from applying ``func`` to the data at each node. + """ + # TODO this signature means that func has no way to know which node it is being called upon - change? + + # TODO fix this typing error + return map_over_subtree(func)(self, *args, **kwargs) # type: ignore[operator] + + def map_over_subtree_inplace( + self, + func: Callable, + *args: Iterable[Any], + **kwargs: Any, + ) -> None: + """ + Apply a function to every dataset in this subtree, updating data in place. + + Parameters + ---------- + func : callable + Function to apply to datasets with signature: + `func(node.ds, *args, **kwargs) -> Dataset`. + + Function will not be applied to any nodes without datasets, + *args : tuple, optional + Positional arguments passed on to `func`. + **kwargs : Any + Keyword arguments passed on to `func`. + """ + + # TODO if func fails on some node then the previous nodes will still have been updated... + + for node in self.subtree: + if node.has_data: + node.ds = func(node.ds, *args, **kwargs) + + def pipe( + self, func: Callable | tuple[Callable, str], *args: Any, **kwargs: Any + ) -> Any: + """Apply ``func(self, *args, **kwargs)`` + + This method replicates the pandas method of the same name. + + Parameters + ---------- + func : callable + function to apply to this xarray object (Dataset/DataArray). + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the xarray object. + *args + positional arguments passed into ``func``. + **kwargs + a dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : Any + the return type of ``func``. + + Notes + ----- + Use ``.pipe`` when chaining together functions that expect + xarray or pandas objects, e.g., instead of writing + + .. code:: python + + f(g(h(dt), arg1=a), arg2=b, arg3=c) + + You can write + + .. code:: python + + (dt.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)) + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``f`` takes its data as ``arg2``: + + .. code:: python + + (dt.pipe(h).pipe(g, arg1=a).pipe((f, "arg2"), arg1=a, arg3=c)) + + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + raise ValueError( + f"{target} is both the pipe target and a keyword argument" + ) + kwargs[target] = self + else: + args = (self,) + args + return func(*args, **kwargs) + + def render(self): + """Print tree structure, including any data stored at each node.""" + for pre, fill, node in RenderTree(self): + print(f"{pre}DataTree('{self.name}')") + for ds_line in repr(node.ds)[1:]: + print(f"{fill}{ds_line}") + + def merge(self, datatree: DataTree) -> DataTree: + """Merge all the leaves of a second DataTree into this one.""" + raise NotImplementedError + + def merge_child_nodes(self, *paths, new_path: T_Path) -> DataTree: + """Merge a set of child nodes into a single new node.""" + raise NotImplementedError + + # TODO some kind of .collapse() or .flatten() method to merge a subtree + + def as_array(self) -> DataArray: + return self.ds.as_dataarray() + + @property + def groups(self): + """Return all netCDF4 groups in the tree, given as a tuple of path-like strings.""" + return tuple(node.path for node in self.subtree) + + def to_netcdf( + self, filepath, mode: str = "w", encoding=None, unlimited_dims=None, **kwargs + ): + """ + Write datatree contents to a netCDF file. + + Parameters + ---------- + filepath : str or Path + Path to which to save this datatree. + mode : {"w", "a"}, default: "w" + Write ('w') or append ('a') mode. If mode='w', any existing file at + this location will be overwritten. If mode='a', existing variables + will be overwritten. Only appies to the root group. + encoding : dict, optional + Nested dictionary with variable names as keys and dictionaries of + variable specific encodings as values, e.g., + ``{"root/set1": {"my_variable": {"dtype": "int16", "scale_factor": 0.1, + "zlib": True}, ...}, ...}``. See ``xarray.Dataset.to_netcdf`` for available + options. + unlimited_dims : dict, optional + Mapping of unlimited dimensions per group that that should be serialized as unlimited dimensions. + By default, no dimensions are treated as unlimited dimensions. + Note that unlimited_dims may also be set via + ``dataset.encoding["unlimited_dims"]``. + kwargs : + Addional keyword arguments to be passed to ``xarray.Dataset.to_netcdf`` + """ + from xarray.datatree_.datatree.io import _datatree_to_netcdf + + _datatree_to_netcdf( + self, + filepath, + mode=mode, + encoding=encoding, + unlimited_dims=unlimited_dims, + **kwargs, + ) + + def to_zarr( + self, + store, + mode: str = "w-", + encoding=None, + consolidated: bool = True, + **kwargs, + ): + """ + Write datatree contents to a Zarr store. + + Parameters + ---------- + store : MutableMapping, str or Path, optional + Store or path to directory in file system + mode : {{"w", "w-", "a", "r+", None}, default: "w-" + Persistence mode: “w” means create (overwrite if exists); “w-” means create (fail if exists); + “a” means override existing variables (create if does not exist); “r+” means modify existing + array values only (raise an error if any metadata or shapes would change). The default mode + is “a” if append_dim is set. Otherwise, it is “r+” if region is set and w- otherwise. + encoding : dict, optional + Nested dictionary with variable names as keys and dictionaries of + variable specific encodings as values, e.g., + ``{"root/set1": {"my_variable": {"dtype": "int16", "scale_factor": 0.1}, ...}, ...}``. + See ``xarray.Dataset.to_zarr`` for available options. + consolidated : bool + If True, apply zarr's `consolidate_metadata` function to the store + after writing metadata for all groups. + kwargs : + Additional keyword arguments to be passed to ``xarray.Dataset.to_zarr`` + """ + from xarray.datatree_.datatree.io import _datatree_to_zarr + + _datatree_to_zarr( + self, + store, + mode=mode, + encoding=encoding, + consolidated=consolidated, + **kwargs, + ) + + def plot(self): + raise NotImplementedError diff --git a/xarray/datatree_/datatree/extensions.py b/xarray/datatree_/datatree/extensions.py new file mode 100644 index 00000000000..51a8ee577ab --- /dev/null +++ b/xarray/datatree_/datatree/extensions.py @@ -0,0 +1,20 @@ +from xarray.core.extensions import _register_accessor + +from xarray.datatree_.datatree.datatree import DataTree + + +def register_datatree_accessor(name): + """Register a custom accessor on DataTree objects. + + Parameters + ---------- + name : str + Name under which the accessor should be registered. A warning is issued + if this name conflicts with a preexisting attribute. + + See Also + -------- + xarray.register_dataarray_accessor + xarray.register_dataset_accessor + """ + return _register_accessor(name, DataTree) diff --git a/xarray/datatree_/datatree/formatting.py b/xarray/datatree_/datatree/formatting.py new file mode 100644 index 00000000000..070dd09ca5b --- /dev/null +++ b/xarray/datatree_/datatree/formatting.py @@ -0,0 +1,91 @@ +from typing import TYPE_CHECKING + +from xarray.core.formatting import _compat_to_str, diff_dataset_repr + +from xarray.datatree_.datatree.mapping import diff_treestructure +from xarray.datatree_.datatree.render import RenderTree + +if TYPE_CHECKING: + from xarray.datatree_.datatree.datatree import DataTree + + +def diff_nodewise_summary(a, b, compat): + """Iterates over all corresponding nodes, recording differences between data at each location.""" + + compat_str = _compat_to_str(compat) + + summary = [] + for node_a, node_b in zip(a.subtree, b.subtree): + a_ds, b_ds = node_a.ds, node_b.ds + + if not a_ds._all_compat(b_ds, compat): + dataset_diff = diff_dataset_repr(a_ds, b_ds, compat_str) + data_diff = "\n".join(dataset_diff.split("\n", 1)[1:]) + + nodediff = ( + f"\nData in nodes at position '{node_a.path}' do not match:" + f"{data_diff}" + ) + summary.append(nodediff) + + return "\n".join(summary) + + +def diff_tree_repr(a, b, compat): + summary = [ + f"Left and right {type(a).__name__} objects are not {_compat_to_str(compat)}" + ] + + # TODO check root parents? + + strict_names = True if compat in ["equals", "identical"] else False + treestructure_diff = diff_treestructure(a, b, strict_names) + + # If the trees structures are different there is no point comparing each node + # TODO we could show any differences in nodes up to the first place that structure differs? + if treestructure_diff or compat == "isomorphic": + summary.append("\n" + treestructure_diff) + else: + nodewise_diff = diff_nodewise_summary(a, b, compat) + summary.append("\n" + nodewise_diff) + + return "\n".join(summary) + + +def datatree_repr(dt): + """A printable representation of the structure of this entire tree.""" + renderer = RenderTree(dt) + + lines = [] + for pre, fill, node in renderer: + node_repr = _single_node_repr(node) + + node_line = f"{pre}{node_repr.splitlines()[0]}" + lines.append(node_line) + + if node.has_data or node.has_attrs: + ds_repr = node_repr.splitlines()[2:] + for line in ds_repr: + if len(node.children) > 0: + lines.append(f"{fill}{renderer.style.vertical}{line}") + else: + lines.append(f"{fill}{' ' * len(renderer.style.vertical)}{line}") + + # Tack on info about whether or not root node has a parent at the start + first_line = lines[0] + parent = f'"{dt.parent.name}"' if dt.parent is not None else "None" + first_line_with_parent = first_line[:-1] + f", parent={parent})" + lines[0] = first_line_with_parent + + return "\n".join(lines) + + +def _single_node_repr(node: "DataTree") -> str: + """Information about this node, not including its relationships to other nodes.""" + node_info = f"DataTree('{node.name}')" + + if node.has_data or node.has_attrs: + ds_info = "\n" + repr(node.ds) + else: + ds_info = "" + return node_info + ds_info diff --git a/xarray/datatree_/datatree/formatting_html.py b/xarray/datatree_/datatree/formatting_html.py new file mode 100644 index 00000000000..4531f5aec18 --- /dev/null +++ b/xarray/datatree_/datatree/formatting_html.py @@ -0,0 +1,138 @@ +from functools import partial +from html import escape +from typing import Any, Mapping + +from xarray.core.formatting_html import ( + _mapping_section, + _obj_repr, + attr_section, + coord_section, + datavar_section, + dim_section, +) +from xarray.core.options import OPTIONS + +OPTIONS["display_expand_groups"] = "default" + + +def summarize_children(children: Mapping[str, Any]) -> str: + N_CHILDREN = len(children) - 1 + + # Get result from node_repr and wrap it + lines_callback = lambda n, c, end: _wrap_repr(node_repr(n, c), end=end) + + children_html = "".join( + lines_callback(n, c, end=False) # Long lines + if i < N_CHILDREN + else lines_callback(n, c, end=True) # Short lines + for i, (n, c) in enumerate(children.items()) + ) + + return "".join( + [ + "
", + children_html, + "
", + ] + ) + + +children_section = partial( + _mapping_section, + name="Groups", + details_func=summarize_children, + max_items_collapse=1, + expand_option_name="display_expand_groups", +) + + +def node_repr(group_title: str, dt: Any) -> str: + header_components = [f"
{escape(group_title)}
"] + + ds = dt.ds + + sections = [ + children_section(dt.children), + dim_section(ds), + coord_section(ds.coords), + datavar_section(ds.data_vars), + attr_section(ds.attrs), + ] + + return _obj_repr(ds, header_components, sections) + + +def _wrap_repr(r: str, end: bool = False) -> str: + """ + Wrap HTML representation with a tee to the left of it. + + Enclosing HTML tag is a
with :code:`display: inline-grid` style. + + Turns: + [ title ] + | details | + |_____________| + + into (A): + |─ [ title ] + | | details | + | |_____________| + + or (B): + └─ [ title ] + | details | + |_____________| + + Parameters + ---------- + r: str + HTML representation to wrap. + end: bool + Specify if the line on the left should continue or end. + + Default is True. + + Returns + ------- + str + Wrapped HTML representation. + + Tee color is set to the variable :code:`--xr-border-color`. + """ + # height of line + end = bool(end) + height = "100%" if end is False else "1.2em" + return "".join( + [ + "
", + "
", + "
", + "
", + "
", + "
", + "
    ", + r, + "
" "
", + "
", + ] + ) + + +def datatree_repr(dt: Any) -> str: + obj_type = f"datatree.{type(dt).__name__}" + return node_repr(obj_type, dt) diff --git a/xarray/datatree_/datatree/io.py b/xarray/datatree_/datatree/io.py new file mode 100644 index 00000000000..cd6d5d0ec17 --- /dev/null +++ b/xarray/datatree_/datatree/io.py @@ -0,0 +1,223 @@ +from xarray import Dataset, open_dataset + +from xarray.datatree_.datatree.datatree import DataTree, NodePath + + +def _iter_zarr_groups(root, parent="/"): + parent = NodePath(parent) + for path, group in root.groups(): + gpath = parent / path + yield str(gpath) + yield from _iter_zarr_groups(group, parent=gpath) + + +def _iter_nc_groups(root, parent="/"): + parent = NodePath(parent) + for path, group in root.groups.items(): + gpath = parent / path + yield str(gpath) + yield from _iter_nc_groups(group, parent=gpath) + + +def _get_nc_dataset_class(engine): + if engine == "netcdf4": + from netCDF4 import Dataset # type: ignore + elif engine == "h5netcdf": + from h5netcdf.legacyapi import Dataset # type: ignore + elif engine is None: + try: + from netCDF4 import Dataset + except ImportError: + from h5netcdf.legacyapi import Dataset # type: ignore + else: + raise ValueError(f"unsupported engine: {engine}") + return Dataset + + +def open_datatree(filename_or_obj, engine=None, **kwargs) -> DataTree: + """ + Open and decode a dataset from a file or file-like object, creating one Tree node for each group in the file. + + Parameters + ---------- + filename_or_obj : str, Path, file-like, or DataStore + Strings and Path objects are interpreted as a path to a netCDF file or Zarr store. + engine : str, optional + Xarray backend engine to us. Valid options include `{"netcdf4", "h5netcdf", "zarr"}`. + kwargs : + Additional keyword arguments passed to ``xarray.open_dataset`` for each group. + + Returns + ------- + DataTree + """ + + if engine == "zarr": + return _open_datatree_zarr(filename_or_obj, **kwargs) + elif engine in [None, "netcdf4", "h5netcdf"]: + return _open_datatree_netcdf(filename_or_obj, engine=engine, **kwargs) + else: + raise ValueError("Unsupported engine") + + +def _open_datatree_netcdf(filename: str, **kwargs) -> DataTree: + ncDataset = _get_nc_dataset_class(kwargs.get("engine", None)) + + ds = open_dataset(filename, **kwargs) + tree_root = DataTree.from_dict({"/": ds}) + with ncDataset(filename, mode="r") as ncds: + for path in _iter_nc_groups(ncds): + subgroup_ds = open_dataset(filename, group=path, **kwargs) + + # TODO refactor to use __setitem__ once creation of new nodes by assigning Dataset works again + node_name = NodePath(path).name + new_node: DataTree = DataTree(name=node_name, data=subgroup_ds) + tree_root._set_item( + path, + new_node, + allow_overwrite=False, + new_nodes_along_path=True, + ) + return tree_root + + +def _open_datatree_zarr(store, **kwargs) -> DataTree: + import zarr # type: ignore + + zds = zarr.open_group(store, mode="r") + ds = open_dataset(store, engine="zarr", **kwargs) + tree_root = DataTree.from_dict({"/": ds}) + for path in _iter_zarr_groups(zds): + try: + subgroup_ds = open_dataset(store, engine="zarr", group=path, **kwargs) + except zarr.errors.PathNotFoundError: + subgroup_ds = Dataset() + + # TODO refactor to use __setitem__ once creation of new nodes by assigning Dataset works again + node_name = NodePath(path).name + new_node: DataTree = DataTree(name=node_name, data=subgroup_ds) + tree_root._set_item( + path, + new_node, + allow_overwrite=False, + new_nodes_along_path=True, + ) + return tree_root + + +def _create_empty_netcdf_group(filename, group, mode, engine): + ncDataset = _get_nc_dataset_class(engine) + + with ncDataset(filename, mode=mode) as rootgrp: + rootgrp.createGroup(group) + + +def _datatree_to_netcdf( + dt: DataTree, + filepath, + mode: str = "w", + encoding=None, + unlimited_dims=None, + **kwargs, +): + if kwargs.get("format", None) not in [None, "NETCDF4"]: + raise ValueError("to_netcdf only supports the NETCDF4 format") + + engine = kwargs.get("engine", None) + if engine not in [None, "netcdf4", "h5netcdf"]: + raise ValueError("to_netcdf only supports the netcdf4 and h5netcdf engines") + + if kwargs.get("group", None) is not None: + raise NotImplementedError( + "specifying a root group for the tree has not been implemented" + ) + + if not kwargs.get("compute", True): + raise NotImplementedError("compute=False has not been implemented yet") + + if encoding is None: + encoding = {} + + # In the future, we may want to expand this check to insure all the provided encoding + # options are valid. For now, this simply checks that all provided encoding keys are + # groups in the datatree. + if set(encoding) - set(dt.groups): + raise ValueError( + f"unexpected encoding group name(s) provided: {set(encoding) - set(dt.groups)}" + ) + + if unlimited_dims is None: + unlimited_dims = {} + + for node in dt.subtree: + ds = node.ds + group_path = node.path + if ds is None: + _create_empty_netcdf_group(filepath, group_path, mode, engine) + else: + ds.to_netcdf( + filepath, + group=group_path, + mode=mode, + encoding=encoding.get(node.path), + unlimited_dims=unlimited_dims.get(node.path), + **kwargs, + ) + mode = "r+" + + +def _create_empty_zarr_group(store, group, mode): + import zarr # type: ignore + + root = zarr.open_group(store, mode=mode) + root.create_group(group, overwrite=True) + + +def _datatree_to_zarr( + dt: DataTree, + store, + mode: str = "w-", + encoding=None, + consolidated: bool = True, + **kwargs, +): + from zarr.convenience import consolidate_metadata # type: ignore + + if kwargs.get("group", None) is not None: + raise NotImplementedError( + "specifying a root group for the tree has not been implemented" + ) + + if not kwargs.get("compute", True): + raise NotImplementedError("compute=False has not been implemented yet") + + if encoding is None: + encoding = {} + + # In the future, we may want to expand this check to insure all the provided encoding + # options are valid. For now, this simply checks that all provided encoding keys are + # groups in the datatree. + if set(encoding) - set(dt.groups): + raise ValueError( + f"unexpected encoding group name(s) provided: {set(encoding) - set(dt.groups)}" + ) + + for node in dt.subtree: + ds = node.ds + group_path = node.path + if ds is None: + _create_empty_zarr_group(store, group_path, mode) + else: + ds.to_zarr( + store, + group=group_path, + mode=mode, + encoding=encoding.get(node.path), + consolidated=False, + **kwargs, + ) + if "w" in mode: + mode = "a" + + if consolidated: + consolidate_metadata(store) diff --git a/xarray/datatree_/datatree/iterators.py b/xarray/datatree_/datatree/iterators.py new file mode 100644 index 00000000000..a3c092b004c --- /dev/null +++ b/xarray/datatree_/datatree/iterators.py @@ -0,0 +1,116 @@ +from abc import abstractmethod +from collections import abc +from typing import Callable, Iterator, List, Optional + +from xarray.datatree_.datatree.treenode import Tree + +"""These iterators are copied from anytree.iterators, with minor modifications.""" + + +class AbstractIter(abc.Iterator): + def __init__( + self, + node: Tree, + filter_: Optional[Callable] = None, + stop: Optional[Callable] = None, + maxlevel: Optional[int] = None, + ): + """ + Iterate over tree starting at `node`. + Base class for all iterators. + Keyword Args: + filter_: function called with every `node` as argument, `node` is returned if `True`. + stop: stop iteration at `node` if `stop` function returns `True` for `node`. + maxlevel (int): maximum descending in the node hierarchy. + """ + self.node = node + self.filter_ = filter_ + self.stop = stop + self.maxlevel = maxlevel + self.__iter = None + + def __init(self): + node = self.node + maxlevel = self.maxlevel + filter_ = self.filter_ or AbstractIter.__default_filter + stop = self.stop or AbstractIter.__default_stop + children = ( + [] + if AbstractIter._abort_at_level(1, maxlevel) + else AbstractIter._get_children([node], stop) + ) + return self._iter(children, filter_, stop, maxlevel) + + @staticmethod + def __default_filter(node): + return True + + @staticmethod + def __default_stop(node): + return False + + def __iter__(self) -> Iterator[Tree]: + return self + + def __next__(self) -> Iterator[Tree]: + if self.__iter is None: + self.__iter = self.__init() + item = next(self.__iter) # type: ignore[call-overload] + return item + + @staticmethod + @abstractmethod + def _iter(children: List[Tree], filter_, stop, maxlevel) -> Iterator[Tree]: + ... + + @staticmethod + def _abort_at_level(level, maxlevel): + return maxlevel is not None and level > maxlevel + + @staticmethod + def _get_children(children: List[Tree], stop) -> List[Tree]: + return [child for child in children if not stop(child)] + + +class PreOrderIter(AbstractIter): + """ + Iterate over tree applying pre-order strategy starting at `node`. + Start at root and go-down until reaching a leaf node. + Step upwards then, and search for the next leafs. + """ + + @staticmethod + def _iter(children, filter_, stop, maxlevel): + for child_ in children: + if stop(child_): + continue + if filter_(child_): + yield child_ + if not AbstractIter._abort_at_level(2, maxlevel): + descendantmaxlevel = maxlevel - 1 if maxlevel else None + for descendant_ in PreOrderIter._iter( + list(child_.children.values()), filter_, stop, descendantmaxlevel + ): + yield descendant_ + + +class LevelOrderIter(AbstractIter): + """ + Iterate over tree applying level-order strategy starting at `node`. + """ + + @staticmethod + def _iter(children, filter_, stop, maxlevel): + level = 1 + while children: + next_children = [] + for child in children: + if filter_(child): + yield child + next_children += AbstractIter._get_children( + list(child.children.values()), stop + ) + children = next_children + level += 1 + if AbstractIter._abort_at_level(level, maxlevel): + break diff --git a/xarray/datatree_/datatree/mapping.py b/xarray/datatree_/datatree/mapping.py new file mode 100644 index 00000000000..9b008c2e840 --- /dev/null +++ b/xarray/datatree_/datatree/mapping.py @@ -0,0 +1,346 @@ +from __future__ import annotations + +import functools +import sys +from itertools import repeat +from textwrap import dedent +from typing import TYPE_CHECKING, Callable, Tuple + +from xarray import DataArray, Dataset + +from xarray.datatree_.datatree.iterators import LevelOrderIter +from xarray.datatree_.datatree.treenode import NodePath, TreeNode + +if TYPE_CHECKING: + from xarray.datatree_.datatree.datatree import DataTree + + +class TreeIsomorphismError(ValueError): + """Error raised if two tree objects do not share the same node structure.""" + + pass + + +def check_isomorphic( + a: DataTree, + b: DataTree, + require_names_equal: bool = False, + check_from_root: bool = True, +): + """ + Check that two trees have the same structure, raising an error if not. + + Does not compare the actual data in the nodes. + + By default this function only checks that subtrees are isomorphic, not the entire tree above (if it exists). + Can instead optionally check the entire trees starting from the root, which will ensure all + + Can optionally check if corresponding nodes should have the same name. + + Parameters + ---------- + a : DataTree + b : DataTree + require_names_equal : Bool + Whether or not to also check that each node has the same name as its counterpart. + check_from_root : Bool + Whether or not to first traverse to the root of the trees before checking for isomorphism. + If a & b have no parents then this has no effect. + + Raises + ------ + TypeError + If either a or b are not tree objects. + TreeIsomorphismError + If a and b are tree objects, but are not isomorphic to one another. + Also optionally raised if their structure is isomorphic, but the names of any two + respective nodes are not equal. + """ + + if not isinstance(a, TreeNode): + raise TypeError(f"Argument `a` is not a tree, it is of type {type(a)}") + if not isinstance(b, TreeNode): + raise TypeError(f"Argument `b` is not a tree, it is of type {type(b)}") + + if check_from_root: + a = a.root + b = b.root + + diff = diff_treestructure(a, b, require_names_equal=require_names_equal) + + if diff: + raise TreeIsomorphismError("DataTree objects are not isomorphic:\n" + diff) + + +def diff_treestructure(a: DataTree, b: DataTree, require_names_equal: bool) -> str: + """ + Return a summary of why two trees are not isomorphic. + If they are isomorphic return an empty string. + """ + + # Walking nodes in "level-order" fashion means walking down from the root breadth-first. + # Checking for isomorphism by walking in this way implicitly assumes that the tree is an ordered tree + # (which it is so long as children are stored in a tuple or list rather than in a set). + for node_a, node_b in zip(LevelOrderIter(a), LevelOrderIter(b)): + path_a, path_b = node_a.path, node_b.path + + if require_names_equal: + if node_a.name != node_b.name: + diff = dedent( + f"""\ + Node '{path_a}' in the left object has name '{node_a.name}' + Node '{path_b}' in the right object has name '{node_b.name}'""" + ) + return diff + + if len(node_a.children) != len(node_b.children): + diff = dedent( + f"""\ + Number of children on node '{path_a}' of the left object: {len(node_a.children)} + Number of children on node '{path_b}' of the right object: {len(node_b.children)}""" + ) + return diff + + return "" + + +def map_over_subtree(func: Callable) -> Callable: + """ + Decorator which turns a function which acts on (and returns) Datasets into one which acts on and returns DataTrees. + + Applies a function to every dataset in one or more subtrees, returning new trees which store the results. + + The function will be applied to any data-containing dataset stored in any of the nodes in the trees. The returned + trees will have the same structure as the supplied trees. + + `func` needs to return one Datasets, DataArrays, or None in order to be able to rebuild the subtrees after + mapping, as each result will be assigned to its respective node of a new tree via `DataTree.__setitem__`. Any + returned value that is one of these types will be stacked into a separate tree before returning all of them. + + The trees passed to the resulting function must all be isomorphic to one another. Their nodes need not be named + similarly, but all the output trees will have nodes named in the same way as the first tree passed. + + Parameters + ---------- + func : callable + Function to apply to datasets with signature: + + `func(*args, **kwargs) -> Union[Dataset, Iterable[Dataset]]`. + + (i.e. func must accept at least one Dataset and return at least one Dataset.) + Function will not be applied to any nodes without datasets. + *args : tuple, optional + Positional arguments passed on to `func`. If DataTrees any data-containing nodes will be converted to Datasets + via .ds . + **kwargs : Any + Keyword arguments passed on to `func`. If DataTrees any data-containing nodes will be converted to Datasets + via .ds . + + Returns + ------- + mapped : callable + Wrapped function which returns one or more tree(s) created from results of applying ``func`` to the dataset at + each node. + + See also + -------- + DataTree.map_over_subtree + DataTree.map_over_subtree_inplace + DataTree.subtree + """ + + # TODO examples in the docstring + + # TODO inspect function to work out immediately if the wrong number of arguments were passed for it? + + @functools.wraps(func) + def _map_over_subtree(*args, **kwargs) -> DataTree | Tuple[DataTree, ...]: + """Internal function which maps func over every node in tree, returning a tree of the results.""" + from xarray.datatree_.datatree.datatree import DataTree + + all_tree_inputs = [a for a in args if isinstance(a, DataTree)] + [ + a for a in kwargs.values() if isinstance(a, DataTree) + ] + + if len(all_tree_inputs) > 0: + first_tree, *other_trees = all_tree_inputs + else: + raise TypeError("Must pass at least one tree object") + + for other_tree in other_trees: + # isomorphism is transitive so this is enough to guarantee all trees are mutually isomorphic + check_isomorphic( + first_tree, other_tree, require_names_equal=False, check_from_root=False + ) + + # Walk all trees simultaneously, applying func to all nodes that lie in same position in different trees + # We don't know which arguments are DataTrees so we zip all arguments together as iterables + # Store tuples of results in a dict because we don't yet know how many trees we need to rebuild to return + out_data_objects = {} + args_as_tree_length_iterables = [ + a.subtree if isinstance(a, DataTree) else repeat(a) for a in args + ] + n_args = len(args_as_tree_length_iterables) + kwargs_as_tree_length_iterables = { + k: v.subtree if isinstance(v, DataTree) else repeat(v) + for k, v in kwargs.items() + } + for node_of_first_tree, *all_node_args in zip( + first_tree.subtree, + *args_as_tree_length_iterables, + *list(kwargs_as_tree_length_iterables.values()), + ): + node_args_as_datasetviews = [ + a.ds if isinstance(a, DataTree) else a for a in all_node_args[:n_args] + ] + node_kwargs_as_datasetviews = dict( + zip( + [k for k in kwargs_as_tree_length_iterables.keys()], + [ + v.ds if isinstance(v, DataTree) else v + for v in all_node_args[n_args:] + ], + ) + ) + func_with_error_context = _handle_errors_with_path_context( + node_of_first_tree.path + )(func) + + if node_of_first_tree.has_data: + # call func on the data in this particular set of corresponding nodes + results = func_with_error_context( + *node_args_as_datasetviews, **node_kwargs_as_datasetviews + ) + elif node_of_first_tree.has_attrs: + # propagate attrs + results = node_of_first_tree.ds + else: + # nothing to propagate so use fastpath to create empty node in new tree + results = None + + # TODO implement mapping over multiple trees in-place using if conditions from here on? + out_data_objects[node_of_first_tree.path] = results + + # Find out how many return values we received + num_return_values = _check_all_return_values(out_data_objects) + + # Reconstruct 1+ subtrees from the dict of results, by filling in all nodes of all result trees + original_root_path = first_tree.path + result_trees = [] + for i in range(num_return_values): + out_tree_contents = {} + for n in first_tree.subtree: + p = n.path + if p in out_data_objects.keys(): + if isinstance(out_data_objects[p], tuple): + output_node_data = out_data_objects[p][i] + else: + output_node_data = out_data_objects[p] + else: + output_node_data = None + + # Discard parentage so that new trees don't include parents of input nodes + relative_path = str(NodePath(p).relative_to(original_root_path)) + relative_path = "/" if relative_path == "." else relative_path + out_tree_contents[relative_path] = output_node_data + + new_tree = DataTree.from_dict( + out_tree_contents, + name=first_tree.name, + ) + result_trees.append(new_tree) + + # If only one result then don't wrap it in a tuple + if len(result_trees) == 1: + return result_trees[0] + else: + return tuple(result_trees) + + return _map_over_subtree + + +def _handle_errors_with_path_context(path): + """Wraps given function so that if it fails it also raises path to node on which it failed.""" + + def decorator(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + if sys.version_info >= (3, 11): + # Add the context information to the error message + e.add_note( + f"Raised whilst mapping function over node with path {path}" + ) + raise + + return wrapper + + return decorator + + +def add_note(err: BaseException, msg: str) -> None: + # TODO: remove once python 3.10 can be dropped + if sys.version_info < (3, 11): + err.__notes__ = getattr(err, "__notes__", []) + [msg] # type: ignore[attr-defined] + else: + err.add_note(msg) + + +def _check_single_set_return_values(path_to_node, obj): + """Check types returned from single evaluation of func, and return number of return values received from func.""" + if isinstance(obj, (Dataset, DataArray)): + return 1 + elif isinstance(obj, tuple): + for r in obj: + if not isinstance(r, (Dataset, DataArray)): + raise TypeError( + f"One of the results of calling func on datasets on the nodes at position {path_to_node} is " + f"of type {type(r)}, not Dataset or DataArray." + ) + return len(obj) + else: + raise TypeError( + f"The result of calling func on the node at position {path_to_node} is of type {type(obj)}, not " + f"Dataset or DataArray, nor a tuple of such types." + ) + + +def _check_all_return_values(returned_objects): + """Walk through all values returned by mapping func over subtrees, raising on any invalid or inconsistent types.""" + + if all(r is None for r in returned_objects.values()): + raise TypeError( + "Called supplied function on all nodes but found a return value of None for" + "all of them." + ) + + result_data_objects = [ + (path_to_node, r) + for path_to_node, r in returned_objects.items() + if r is not None + ] + + if len(result_data_objects) == 1: + # Only one node in the tree: no need to check consistency of results between nodes + path_to_node, result = result_data_objects[0] + num_return_values = _check_single_set_return_values(path_to_node, result) + else: + prev_path, _ = result_data_objects[0] + prev_num_return_values, num_return_values = None, None + for path_to_node, obj in result_data_objects[1:]: + num_return_values = _check_single_set_return_values(path_to_node, obj) + + if ( + num_return_values != prev_num_return_values + and prev_num_return_values is not None + ): + raise TypeError( + f"Calling func on the nodes at position {path_to_node} returns {num_return_values} separate return " + f"values, whereas calling func on the nodes at position {prev_path} instead returns " + f"{prev_num_return_values} separate return values." + ) + + prev_path, prev_num_return_values = path_to_node, num_return_values + + return num_return_values diff --git a/xarray/datatree_/datatree/ops.py b/xarray/datatree_/datatree/ops.py new file mode 100644 index 00000000000..b4687447be4 --- /dev/null +++ b/xarray/datatree_/datatree/ops.py @@ -0,0 +1,262 @@ +import textwrap + +from xarray import Dataset + +from xarray.datatree_.datatree.mapping import map_over_subtree + +""" +Module which specifies the subset of xarray.Dataset's API which we wish to copy onto DataTree. + +Structured to mirror the way xarray defines Dataset's various operations internally, but does not actually import from +xarray's internals directly, only the public-facing xarray.Dataset class. +""" + + +_MAPPED_DOCSTRING_ADDENDUM = textwrap.fill( + "This method was copied from xarray.Dataset, but has been altered to " + "call the method on the Datasets stored in every node of the subtree. " + "See the `map_over_subtree` function for more details.", + width=117, +) + +# TODO equals, broadcast_equals etc. +# TODO do dask-related private methods need to be exposed? +_DATASET_DASK_METHODS_TO_MAP = [ + "load", + "compute", + "persist", + "unify_chunks", + "chunk", + "map_blocks", +] +_DATASET_METHODS_TO_MAP = [ + "as_numpy", + "set_coords", + "reset_coords", + "info", + "isel", + "sel", + "head", + "tail", + "thin", + "broadcast_like", + "reindex_like", + "reindex", + "interp", + "interp_like", + "rename", + "rename_dims", + "rename_vars", + "swap_dims", + "expand_dims", + "set_index", + "reset_index", + "reorder_levels", + "stack", + "unstack", + "merge", + "drop_vars", + "drop_sel", + "drop_isel", + "drop_dims", + "transpose", + "dropna", + "fillna", + "interpolate_na", + "ffill", + "bfill", + "combine_first", + "reduce", + "map", + "diff", + "shift", + "roll", + "sortby", + "quantile", + "rank", + "differentiate", + "integrate", + "cumulative_integrate", + "filter_by_attrs", + "polyfit", + "pad", + "idxmin", + "idxmax", + "argmin", + "argmax", + "query", + "curvefit", +] +_ALL_DATASET_METHODS_TO_MAP = _DATASET_DASK_METHODS_TO_MAP + _DATASET_METHODS_TO_MAP + +_DATA_WITH_COORDS_METHODS_TO_MAP = [ + "squeeze", + "clip", + "assign_coords", + "where", + "close", + "isnull", + "notnull", + "isin", + "astype", +] + +REDUCE_METHODS = ["all", "any"] +NAN_REDUCE_METHODS = [ + "max", + "min", + "mean", + "prod", + "sum", + "std", + "var", + "median", +] +NAN_CUM_METHODS = ["cumsum", "cumprod"] +_TYPED_DATASET_OPS_TO_MAP = [ + "__add__", + "__sub__", + "__mul__", + "__pow__", + "__truediv__", + "__floordiv__", + "__mod__", + "__and__", + "__xor__", + "__or__", + "__lt__", + "__le__", + "__gt__", + "__ge__", + "__eq__", + "__ne__", + "__radd__", + "__rsub__", + "__rmul__", + "__rpow__", + "__rtruediv__", + "__rfloordiv__", + "__rmod__", + "__rand__", + "__rxor__", + "__ror__", + "__iadd__", + "__isub__", + "__imul__", + "__ipow__", + "__itruediv__", + "__ifloordiv__", + "__imod__", + "__iand__", + "__ixor__", + "__ior__", + "__neg__", + "__pos__", + "__abs__", + "__invert__", + "round", + "argsort", + "conj", + "conjugate", +] +# TODO NUM_BINARY_OPS apparently aren't defined on DatasetArithmetic, and don't appear to be injected anywhere... +_ARITHMETIC_METHODS_TO_MAP = ( + REDUCE_METHODS + + NAN_REDUCE_METHODS + + NAN_CUM_METHODS + + _TYPED_DATASET_OPS_TO_MAP + + ["__array_ufunc__"] +) + + +def _wrap_then_attach_to_cls( + target_cls_dict, source_cls, methods_to_set, wrap_func=None +): + """ + Attach given methods on a class, and optionally wrap each method first. (i.e. with map_over_subtree) + + Result is like having written this in the classes' definition: + ``` + @wrap_func + def method_name(self, *args, **kwargs): + return self.method(*args, **kwargs) + ``` + + Every method attached here needs to have a return value of Dataset or DataArray in order to construct a new tree. + + Parameters + ---------- + target_cls_dict : MappingProxy + The __dict__ attribute of the class which we want the methods to be added to. (The __dict__ attribute can also + be accessed by calling vars() from within that classes' definition.) This will be updated by this function. + source_cls : class + Class object from which we want to copy methods (and optionally wrap them). Should be the actual class object + (or instance), not just the __dict__. + methods_to_set : Iterable[Tuple[str, callable]] + The method names and definitions supplied as a list of (method_name_string, method) pairs. + This format matches the output of inspect.getmembers(). + wrap_func : callable, optional + Function to decorate each method with. Must have the same return type as the method. + """ + for method_name in methods_to_set: + orig_method = getattr(source_cls, method_name) + wrapped_method = ( + wrap_func(orig_method) if wrap_func is not None else orig_method + ) + target_cls_dict[method_name] = wrapped_method + + if wrap_func is map_over_subtree: + # Add a paragraph to the method's docstring explaining how it's been mapped + orig_method_docstring = orig_method.__doc__ + # if orig_method_docstring is not None: + # if "\n" in orig_method_docstring: + # new_method_docstring = orig_method_docstring.replace( + # "\n", _MAPPED_DOCSTRING_ADDENDUM, 1 + # ) + # else: + # new_method_docstring = ( + # orig_method_docstring + f"\n\n{_MAPPED_DOCSTRING_ADDENDUM}" + # ) + setattr(target_cls_dict[method_name], "__doc__", orig_method_docstring) + + +class MappedDatasetMethodsMixin: + """ + Mixin to add methods defined specifically on the Dataset class such as .query(), but wrapped to map over all nodes + in the subtree. + """ + + _wrap_then_attach_to_cls( + target_cls_dict=vars(), + source_cls=Dataset, + methods_to_set=_ALL_DATASET_METHODS_TO_MAP, + wrap_func=map_over_subtree, + ) + + +class MappedDataWithCoords: + """ + Mixin to add coordinate-aware Dataset methods such as .where(), but wrapped to map over all nodes in the subtree. + """ + + # TODO add mapped versions of groupby, weighted, rolling, rolling_exp, coarsen, resample + _wrap_then_attach_to_cls( + target_cls_dict=vars(), + source_cls=Dataset, + methods_to_set=_DATA_WITH_COORDS_METHODS_TO_MAP, + wrap_func=map_over_subtree, + ) + + +class DataTreeArithmeticMixin: + """ + Mixin to add Dataset arithmetic operations such as __add__, reduction methods such as .mean(), and enable numpy + ufuncs such as np.sin(), but wrapped to map over all nodes in the subtree. + """ + + _wrap_then_attach_to_cls( + target_cls_dict=vars(), + source_cls=Dataset, + methods_to_set=_ARITHMETIC_METHODS_TO_MAP, + wrap_func=map_over_subtree, + ) diff --git a/xarray/datatree_/datatree/py.typed b/xarray/datatree_/datatree/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/xarray/datatree_/datatree/render.py b/xarray/datatree_/datatree/render.py new file mode 100644 index 00000000000..7f407606722 --- /dev/null +++ b/xarray/datatree_/datatree/render.py @@ -0,0 +1,271 @@ +""" +String Tree Rendering. Copied from anytree. +""" + +import collections +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from xarray.datatree_.datatree.datatree import DataTree + +Row = collections.namedtuple("Row", ("pre", "fill", "node")) + + +class AbstractStyle(object): + def __init__(self, vertical, cont, end): + """ + Tree Render Style. + Args: + vertical: Sign for vertical line. + cont: Chars for a continued branch. + end: Chars for the last branch. + """ + super(AbstractStyle, self).__init__() + self.vertical = vertical + self.cont = cont + self.end = end + assert ( + len(cont) == len(vertical) == len(end) + ), f"'{vertical}', '{cont}' and '{end}' need to have equal length" + + @property + def empty(self): + """Empty string as placeholder.""" + return " " * len(self.end) + + def __repr__(self): + return f"{self.__class__.__name__}()" + + +class ContStyle(AbstractStyle): + def __init__(self): + """ + Continued style, without gaps. + + >>> from anytree import Node, RenderTree + >>> root = Node("root") + >>> s0 = Node("sub0", parent=root) + >>> s0b = Node("sub0B", parent=s0) + >>> s0a = Node("sub0A", parent=s0) + >>> s1 = Node("sub1", parent=root) + >>> print(RenderTree(root, style=ContStyle())) + + Node('/root') + ├── Node('/root/sub0') + │ ├── Node('/root/sub0/sub0B') + │ └── Node('/root/sub0/sub0A') + └── Node('/root/sub1') + """ + super(ContStyle, self).__init__( + "\u2502 ", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 " + ) + + +class RenderTree(object): + def __init__( + self, node: "DataTree", style=ContStyle(), childiter=list, maxlevel=None + ): + """ + Render tree starting at `node`. + Keyword Args: + style (AbstractStyle): Render Style. + childiter: Child iterator. + maxlevel: Limit rendering to this depth. + :any:`RenderTree` is an iterator, returning a tuple with 3 items: + `pre` + tree prefix. + `fill` + filling for multiline entries. + `node` + :any:`NodeMixin` object. + It is up to the user to assemble these parts to a whole. + >>> from anytree import Node, RenderTree + >>> root = Node("root", lines=["c0fe", "c0de"]) + >>> s0 = Node("sub0", parent=root, lines=["ha", "ba"]) + >>> s0b = Node("sub0B", parent=s0, lines=["1", "2", "3"]) + >>> s0a = Node("sub0A", parent=s0, lines=["a", "b"]) + >>> s1 = Node("sub1", parent=root, lines=["Z"]) + Simple one line: + >>> for pre, _, node in RenderTree(root): + ... print("%s%s" % (pre, node.name)) + ... + root + ├── sub0 + │ ├── sub0B + │ └── sub0A + └── sub1 + Multiline: + >>> for pre, fill, node in RenderTree(root): + ... print("%s%s" % (pre, node.lines[0])) + ... for line in node.lines[1:]: + ... print("%s%s" % (fill, line)) + ... + c0fe + c0de + ├── ha + │ ba + │ ├── 1 + │ │ 2 + │ │ 3 + │ └── a + │ b + └── Z + `maxlevel` limits the depth of the tree: + >>> print(RenderTree(root, maxlevel=2)) + Node('/root', lines=['c0fe', 'c0de']) + ├── Node('/root/sub0', lines=['ha', 'ba']) + └── Node('/root/sub1', lines=['Z']) + The `childiter` is responsible for iterating over child nodes at the + same level. An reversed order can be achived by using `reversed`. + >>> for row in RenderTree(root, childiter=reversed): + ... print("%s%s" % (row.pre, row.node.name)) + ... + root + ├── sub1 + └── sub0 + ├── sub0A + └── sub0B + Or writing your own sort function: + >>> def mysort(items): + ... return sorted(items, key=lambda item: item.name) + ... + >>> for row in RenderTree(root, childiter=mysort): + ... print("%s%s" % (row.pre, row.node.name)) + ... + root + ├── sub0 + │ ├── sub0A + │ └── sub0B + └── sub1 + :any:`by_attr` simplifies attribute rendering and supports multiline: + >>> print(RenderTree(root).by_attr()) + root + ├── sub0 + │ ├── sub0B + │ └── sub0A + └── sub1 + >>> print(RenderTree(root).by_attr("lines")) + c0fe + c0de + ├── ha + │ ba + │ ├── 1 + │ │ 2 + │ │ 3 + │ └── a + │ b + └── Z + And can be a function: + >>> print(RenderTree(root).by_attr(lambda n: " ".join(n.lines))) + c0fe c0de + ├── ha ba + │ ├── 1 2 3 + │ └── a b + └── Z + """ + if not isinstance(style, AbstractStyle): + style = style() + self.node = node + self.style = style + self.childiter = childiter + self.maxlevel = maxlevel + + def __iter__(self): + return self.__next(self.node, tuple()) + + def __next(self, node, continues, level=0): + yield RenderTree.__item(node, continues, self.style) + children = node.children.values() + level += 1 + if children and (self.maxlevel is None or level < self.maxlevel): + children = self.childiter(children) + for child, is_last in _is_last(children): + for grandchild in self.__next( + child, continues + (not is_last,), level=level + ): + yield grandchild + + @staticmethod + def __item(node, continues, style): + if not continues: + return Row("", "", node) + else: + items = [style.vertical if cont else style.empty for cont in continues] + indent = "".join(items[:-1]) + branch = style.cont if continues[-1] else style.end + pre = indent + branch + fill = "".join(items) + return Row(pre, fill, node) + + def __str__(self): + lines = ["%s%r" % (pre, node) for pre, _, node in self] + return "\n".join(lines) + + def __repr__(self): + classname = self.__class__.__name__ + args = [ + repr(self.node), + "style=%s" % repr(self.style), + "childiter=%s" % repr(self.childiter), + ] + return "%s(%s)" % (classname, ", ".join(args)) + + def by_attr(self, attrname="name"): + """ + Return rendered tree with node attribute `attrname`. + >>> from anytree import AnyNode, RenderTree + >>> root = AnyNode(id="root") + >>> s0 = AnyNode(id="sub0", parent=root) + >>> s0b = AnyNode(id="sub0B", parent=s0, foo=4, bar=109) + >>> s0a = AnyNode(id="sub0A", parent=s0) + >>> s1 = AnyNode(id="sub1", parent=root) + >>> s1a = AnyNode(id="sub1A", parent=s1) + >>> s1b = AnyNode(id="sub1B", parent=s1, bar=8) + >>> s1c = AnyNode(id="sub1C", parent=s1) + >>> s1ca = AnyNode(id="sub1Ca", parent=s1c) + >>> print(RenderTree(root).by_attr("id")) + root + ├── sub0 + │ ├── sub0B + │ └── sub0A + └── sub1 + ├── sub1A + ├── sub1B + └── sub1C + └── sub1Ca + """ + + def get(): + for pre, fill, node in self: + attr = ( + attrname(node) + if callable(attrname) + else getattr(node, attrname, "") + ) + if isinstance(attr, (list, tuple)): + lines = attr + else: + lines = str(attr).split("\n") + yield "%s%s" % (pre, lines[0]) + for line in lines[1:]: + yield "%s%s" % (fill, line) + + return "\n".join(get()) + + +def _is_last(iterable): + iter_ = iter(iterable) + try: + nextitem = next(iter_) + except StopIteration: + pass + else: + item = nextitem + while True: + try: + nextitem = next(iter_) + yield item, False + except StopIteration: + yield nextitem, True + break + item = nextitem diff --git a/xarray/datatree_/datatree/testing.py b/xarray/datatree_/datatree/testing.py new file mode 100644 index 00000000000..658056cf795 --- /dev/null +++ b/xarray/datatree_/datatree/testing.py @@ -0,0 +1,120 @@ +from xarray.testing.assertions import ensure_warnings + +from xarray.datatree_.datatree.datatree import DataTree +from xarray.datatree_.datatree.formatting import diff_tree_repr + + +@ensure_warnings +def assert_isomorphic(a: DataTree, b: DataTree, from_root: bool = False): + """ + Two DataTrees are considered isomorphic if every node has the same number of children. + + Nothing about the data in each node is checked. + + Isomorphism is a necessary condition for two trees to be used in a nodewise binary operation, + such as tree1 + tree2. + + By default this function does not check any part of the tree above the given node. + Therefore this function can be used as default to check that two subtrees are isomorphic. + + Parameters + ---------- + a : DataTree + The first object to compare. + b : DataTree + The second object to compare. + from_root : bool, optional, default is False + Whether or not to first traverse to the root of the trees before checking for isomorphism. + If a & b have no parents then this has no effect. + + See Also + -------- + DataTree.isomorphic + assert_equals + assert_identical + """ + __tracebackhide__ = True + assert isinstance(a, type(b)) + + if isinstance(a, DataTree): + if from_root: + a = a.root + b = b.root + + assert a.isomorphic(b, from_root=from_root), diff_tree_repr(a, b, "isomorphic") + else: + raise TypeError(f"{type(a)} not of type DataTree") + + +@ensure_warnings +def assert_equal(a: DataTree, b: DataTree, from_root: bool = True): + """ + Two DataTrees are equal if they have isomorphic node structures, with matching node names, + and if they have matching variables and coordinates, all of which are equal. + + By default this method will check the whole tree above the given node. + + Parameters + ---------- + a : DataTree + The first object to compare. + b : DataTree + The second object to compare. + from_root : bool, optional, default is True + Whether or not to first traverse to the root of the trees before checking for isomorphism. + If a & b have no parents then this has no effect. + + See Also + -------- + DataTree.equals + assert_isomorphic + assert_identical + """ + __tracebackhide__ = True + assert isinstance(a, type(b)) + + if isinstance(a, DataTree): + if from_root: + a = a.root + b = b.root + + assert a.equals(b, from_root=from_root), diff_tree_repr(a, b, "equals") + else: + raise TypeError(f"{type(a)} not of type DataTree") + + +@ensure_warnings +def assert_identical(a: DataTree, b: DataTree, from_root: bool = True): + """ + Like assert_equals, but will also check all dataset attributes and the attributes on + all variables and coordinates. + + By default this method will check the whole tree above the given node. + + Parameters + ---------- + a : xarray.DataTree + The first object to compare. + b : xarray.DataTree + The second object to compare. + from_root : bool, optional, default is True + Whether or not to first traverse to the root of the trees before checking for isomorphism. + If a & b have no parents then this has no effect. + + See Also + -------- + DataTree.identical + assert_isomorphic + assert_equal + """ + + __tracebackhide__ = True + assert isinstance(a, type(b)) + if isinstance(a, DataTree): + if from_root: + a = a.root + b = b.root + + assert a.identical(b, from_root=from_root), diff_tree_repr(a, b, "identical") + else: + raise TypeError(f"{type(a)} not of type DataTree") diff --git a/xarray/datatree_/datatree/tests/__init__.py b/xarray/datatree_/datatree/tests/__init__.py new file mode 100644 index 00000000000..64961158b13 --- /dev/null +++ b/xarray/datatree_/datatree/tests/__init__.py @@ -0,0 +1,29 @@ +import importlib + +import pytest +from packaging import version + + +def _importorskip(modname, minversion=None): + try: + mod = importlib.import_module(modname) + has = True + if minversion is not None: + if LooseVersion(mod.__version__) < LooseVersion(minversion): + raise ImportError("Minimum version not satisfied") + except ImportError: + has = False + func = pytest.mark.skipif(not has, reason=f"requires {modname}") + return has, func + + +def LooseVersion(vstring): + # Our development version is something like '0.10.9+aac7bfc' + # This function just ignores the git commit id. + vstring = vstring.split("+")[0] + return version.parse(vstring) + + +has_zarr, requires_zarr = _importorskip("zarr") +has_h5netcdf, requires_h5netcdf = _importorskip("h5netcdf") +has_netCDF4, requires_netCDF4 = _importorskip("netCDF4") diff --git a/xarray/datatree_/datatree/tests/conftest.py b/xarray/datatree_/datatree/tests/conftest.py new file mode 100644 index 00000000000..3ed1325ccd5 --- /dev/null +++ b/xarray/datatree_/datatree/tests/conftest.py @@ -0,0 +1,65 @@ +import pytest +import xarray as xr + +from datatree import DataTree + + +@pytest.fixture(scope="module") +def create_test_datatree(): + """ + Create a test datatree with this structure: + + + |-- set1 + | |-- + | | Dimensions: () + | | Data variables: + | | a int64 0 + | | b int64 1 + | |-- set1 + | |-- set2 + |-- set2 + | |-- + | | Dimensions: (x: 2) + | | Data variables: + | | a (x) int64 2, 3 + | | b (x) int64 0.1, 0.2 + | |-- set1 + |-- set3 + |-- + | Dimensions: (x: 2, y: 3) + | Data variables: + | a (y) int64 6, 7, 8 + | set0 (x) int64 9, 10 + + The structure has deliberately repeated names of tags, variables, and + dimensions in order to better check for bugs caused by name conflicts. + """ + + def _create_test_datatree(modify=lambda ds: ds): + set1_data = modify(xr.Dataset({"a": 0, "b": 1})) + set2_data = modify(xr.Dataset({"a": ("x", [2, 3]), "b": ("x", [0.1, 0.2])})) + root_data = modify(xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})) + + # Avoid using __init__ so we can independently test it + root = DataTree(data=root_data) + set1 = DataTree(name="set1", parent=root, data=set1_data) + DataTree(name="set1", parent=set1) + DataTree(name="set2", parent=set1) + set2 = DataTree(name="set2", parent=root, data=set2_data) + DataTree(name="set1", parent=set2) + DataTree(name="set3", parent=root) + + return root + + return _create_test_datatree + + +@pytest.fixture(scope="module") +def simple_datatree(create_test_datatree): + """ + Invoke create_test_datatree fixture (callback). + + Returns a DataTree. + """ + return create_test_datatree() diff --git a/xarray/datatree_/datatree/tests/test_dataset_api.py b/xarray/datatree_/datatree/tests/test_dataset_api.py new file mode 100644 index 00000000000..6879b869299 --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_dataset_api.py @@ -0,0 +1,98 @@ +import numpy as np +import xarray as xr + +from datatree import DataTree +from datatree.testing import assert_equal + + +class TestDSMethodInheritance: + def test_dataset_method(self): + ds = xr.Dataset({"a": ("x", [1, 2, 3])}) + dt = DataTree(data=ds) + DataTree(name="results", parent=dt, data=ds) + + expected = DataTree(data=ds.isel(x=1)) + DataTree(name="results", parent=expected, data=ds.isel(x=1)) + + result = dt.isel(x=1) + assert_equal(result, expected) + + def test_reduce_method(self): + ds = xr.Dataset({"a": ("x", [False, True, False])}) + dt = DataTree(data=ds) + DataTree(name="results", parent=dt, data=ds) + + expected = DataTree(data=ds.any()) + DataTree(name="results", parent=expected, data=ds.any()) + + result = dt.any() + assert_equal(result, expected) + + def test_nan_reduce_method(self): + ds = xr.Dataset({"a": ("x", [1, 2, 3])}) + dt = DataTree(data=ds) + DataTree(name="results", parent=dt, data=ds) + + expected = DataTree(data=ds.mean()) + DataTree(name="results", parent=expected, data=ds.mean()) + + result = dt.mean() + assert_equal(result, expected) + + def test_cum_method(self): + ds = xr.Dataset({"a": ("x", [1, 2, 3])}) + dt = DataTree(data=ds) + DataTree(name="results", parent=dt, data=ds) + + expected = DataTree(data=ds.cumsum()) + DataTree(name="results", parent=expected, data=ds.cumsum()) + + result = dt.cumsum() + assert_equal(result, expected) + + +class TestOps: + def test_binary_op_on_int(self): + ds1 = xr.Dataset({"a": [5], "b": [3]}) + ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) + dt = DataTree(data=ds1) + DataTree(name="subnode", data=ds2, parent=dt) + + expected = DataTree(data=ds1 * 5) + DataTree(name="subnode", data=ds2 * 5, parent=expected) + + result = dt * 5 + assert_equal(result, expected) + + def test_binary_op_on_dataset(self): + ds1 = xr.Dataset({"a": [5], "b": [3]}) + ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) + dt = DataTree(data=ds1) + DataTree(name="subnode", data=ds2, parent=dt) + other_ds = xr.Dataset({"z": ("z", [0.1, 0.2])}) + + expected = DataTree(data=ds1 * other_ds) + DataTree(name="subnode", data=ds2 * other_ds, parent=expected) + + result = dt * other_ds + assert_equal(result, expected) + + def test_binary_op_on_datatree(self): + ds1 = xr.Dataset({"a": [5], "b": [3]}) + ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) + dt = DataTree(data=ds1) + DataTree(name="subnode", data=ds2, parent=dt) + + expected = DataTree(data=ds1 * ds1) + DataTree(name="subnode", data=ds2 * ds2, parent=expected) + + result = dt * dt + assert_equal(result, expected) + + +class TestUFuncs: + def test_tree(self, create_test_datatree): + dt = create_test_datatree() + expected = create_test_datatree(modify=lambda ds: np.sin(ds)) + result_tree = np.sin(dt) + assert_equal(result_tree, expected) diff --git a/xarray/datatree_/datatree/tests/test_datatree.py b/xarray/datatree_/datatree/tests/test_datatree.py new file mode 100644 index 00000000000..3815f99bcb2 --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_datatree.py @@ -0,0 +1,733 @@ +from copy import copy, deepcopy + +import numpy as np +import pytest +import xarray as xr +import xarray.testing as xrt +from xarray.tests import create_test_data, source_ndarray + +import datatree.testing as dtt +from datatree import DataTree, NotFoundInTreeError + + +class TestTreeCreation: + def test_empty(self): + dt = DataTree(name="root") + assert dt.name == "root" + assert dt.parent is None + assert dt.children == {} + xrt.assert_identical(dt.to_dataset(), xr.Dataset()) + + def test_unnamed(self): + dt = DataTree() + assert dt.name is None + + def test_bad_names(self): + with pytest.raises(TypeError): + DataTree(name=5) + + with pytest.raises(ValueError): + DataTree(name="folder/data") + + +class TestFamilyTree: + def test_setparent_unnamed_child_node_fails(self): + john = DataTree(name="john") + with pytest.raises(ValueError, match="unnamed"): + DataTree(parent=john) + + def test_create_two_children(self): + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": 0, "b": 1}) + + root = DataTree(data=root_data) + set1 = DataTree(name="set1", parent=root, data=set1_data) + DataTree(name="set1", parent=root) + DataTree(name="set2", parent=set1) + + def test_create_full_tree(self, simple_datatree): + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": 0, "b": 1}) + set2_data = xr.Dataset({"a": ("x", [2, 3]), "b": ("x", [0.1, 0.2])}) + + root = DataTree(data=root_data) + set1 = DataTree(name="set1", parent=root, data=set1_data) + DataTree(name="set1", parent=set1) + DataTree(name="set2", parent=set1) + set2 = DataTree(name="set2", parent=root, data=set2_data) + DataTree(name="set1", parent=set2) + DataTree(name="set3", parent=root) + + expected = simple_datatree + assert root.identical(expected) + + +class TestNames: + def test_child_gets_named_on_attach(self): + sue = DataTree() + mary = DataTree(children={"Sue": sue}) # noqa + assert sue.name == "Sue" + + +class TestPaths: + def test_path_property(self): + sue = DataTree() + mary = DataTree(children={"Sue": sue}) + john = DataTree(children={"Mary": mary}) # noqa + assert sue.path == "/Mary/Sue" + assert john.path == "/" + + def test_path_roundtrip(self): + sue = DataTree() + mary = DataTree(children={"Sue": sue}) + john = DataTree(children={"Mary": mary}) # noqa + assert john[sue.path] is sue + + def test_same_tree(self): + mary = DataTree() + kate = DataTree() + john = DataTree(children={"Mary": mary, "Kate": kate}) # noqa + assert mary.same_tree(kate) + + def test_relative_paths(self): + sue = DataTree() + mary = DataTree(children={"Sue": sue}) + annie = DataTree() + john = DataTree(children={"Mary": mary, "Annie": annie}) + + result = sue.relative_to(john) + assert result == "Mary/Sue" + assert john.relative_to(sue) == "../.." + assert annie.relative_to(sue) == "../../Annie" + assert sue.relative_to(annie) == "../Mary/Sue" + assert sue.relative_to(sue) == "." + + evil_kate = DataTree() + with pytest.raises( + NotFoundInTreeError, match="nodes do not lie within the same tree" + ): + sue.relative_to(evil_kate) + + +class TestStoreDatasets: + def test_create_with_data(self): + dat = xr.Dataset({"a": 0}) + john = DataTree(name="john", data=dat) + + xrt.assert_identical(john.to_dataset(), dat) + + with pytest.raises(TypeError): + DataTree(name="mary", parent=john, data="junk") # noqa + + def test_set_data(self): + john = DataTree(name="john") + dat = xr.Dataset({"a": 0}) + john.ds = dat + + xrt.assert_identical(john.to_dataset(), dat) + + with pytest.raises(TypeError): + john.ds = "junk" + + def test_has_data(self): + john = DataTree(name="john", data=xr.Dataset({"a": 0})) + assert john.has_data + + john = DataTree(name="john", data=None) + assert not john.has_data + + def test_is_hollow(self): + john = DataTree(data=xr.Dataset({"a": 0})) + assert john.is_hollow + + eve = DataTree(children={"john": john}) + assert eve.is_hollow + + eve.ds = xr.Dataset({"a": 1}) + assert not eve.is_hollow + + +class TestVariablesChildrenNameCollisions: + def test_parent_already_has_variable_with_childs_name(self): + dt = DataTree(data=xr.Dataset({"a": [0], "b": 1})) + with pytest.raises(KeyError, match="already contains a data variable named a"): + DataTree(name="a", data=None, parent=dt) + + def test_assign_when_already_child_with_variables_name(self): + dt = DataTree(data=None) + DataTree(name="a", data=None, parent=dt) + with pytest.raises(KeyError, match="names would collide"): + dt.ds = xr.Dataset({"a": 0}) + + dt.ds = xr.Dataset() + + new_ds = dt.to_dataset().assign(a=xr.DataArray(0)) + with pytest.raises(KeyError, match="names would collide"): + dt.ds = new_ds + + +class TestGet: + ... + + +class TestGetItem: + def test_getitem_node(self): + folder1 = DataTree(name="folder1") + results = DataTree(name="results", parent=folder1) + highres = DataTree(name="highres", parent=results) + assert folder1["results"] is results + assert folder1["results/highres"] is highres + + def test_getitem_self(self): + dt = DataTree() + assert dt["."] is dt + + def test_getitem_single_data_variable(self): + data = xr.Dataset({"temp": [0, 50]}) + results = DataTree(name="results", data=data) + xrt.assert_identical(results["temp"], data["temp"]) + + def test_getitem_single_data_variable_from_node(self): + data = xr.Dataset({"temp": [0, 50]}) + folder1 = DataTree(name="folder1") + results = DataTree(name="results", parent=folder1) + DataTree(name="highres", parent=results, data=data) + xrt.assert_identical(folder1["results/highres/temp"], data["temp"]) + + def test_getitem_nonexistent_node(self): + folder1 = DataTree(name="folder1") + DataTree(name="results", parent=folder1) + with pytest.raises(KeyError): + folder1["results/highres"] + + def test_getitem_nonexistent_variable(self): + data = xr.Dataset({"temp": [0, 50]}) + results = DataTree(name="results", data=data) + with pytest.raises(KeyError): + results["pressure"] + + @pytest.mark.xfail(reason="Should be deprecated in favour of .subset") + def test_getitem_multiple_data_variables(self): + data = xr.Dataset({"temp": [0, 50], "p": [5, 8, 7]}) + results = DataTree(name="results", data=data) + xrt.assert_identical(results[["temp", "p"]], data[["temp", "p"]]) + + @pytest.mark.xfail( + reason="Indexing needs to return whole tree (GH https://github.com/xarray-contrib/datatree/issues/77)" + ) + def test_getitem_dict_like_selection_access_to_dataset(self): + data = xr.Dataset({"temp": [0, 50]}) + results = DataTree(name="results", data=data) + xrt.assert_identical(results[{"temp": 1}], data[{"temp": 1}]) + + +class TestUpdate: + def test_update(self): + dt = DataTree() + dt.update({"foo": xr.DataArray(0), "a": DataTree()}) + expected = DataTree.from_dict({"/": xr.Dataset({"foo": 0}), "a": None}) + print(dt) + print(dt.children) + print(dt._children) + print(dt["a"]) + print(expected) + dtt.assert_equal(dt, expected) + + def test_update_new_named_dataarray(self): + da = xr.DataArray(name="temp", data=[0, 50]) + folder1 = DataTree(name="folder1") + folder1.update({"results": da}) + expected = da.rename("results") + xrt.assert_equal(folder1["results"], expected) + + def test_update_doesnt_alter_child_name(self): + dt = DataTree() + dt.update({"foo": xr.DataArray(0), "a": DataTree(name="b")}) + assert "a" in dt.children + child = dt["a"] + assert child.name == "a" + + def test_update_overwrite(self): + actual = DataTree.from_dict({"a": DataTree(xr.Dataset({"x": 1}))}) + actual.update({"a": DataTree(xr.Dataset({"x": 2}))}) + + expected = DataTree.from_dict({"a": DataTree(xr.Dataset({"x": 2}))}) + + print(actual) + print(expected) + + dtt.assert_equal(actual, expected) + + +class TestCopy: + def test_copy(self, create_test_datatree): + dt = create_test_datatree() + + for node in dt.root.subtree: + node.attrs["Test"] = [1, 2, 3] + + for copied in [dt.copy(deep=False), copy(dt)]: + dtt.assert_identical(dt, copied) + + for node, copied_node in zip(dt.root.subtree, copied.root.subtree): + assert node.encoding == copied_node.encoding + # Note: IndexVariable objects with string dtype are always + # copied because of xarray.core.util.safe_cast_to_index. + # Limiting the test to data variables. + for k in node.data_vars: + v0 = node.variables[k] + v1 = copied_node.variables[k] + assert source_ndarray(v0.data) is source_ndarray(v1.data) + copied_node["foo"] = xr.DataArray(data=np.arange(5), dims="z") + assert "foo" not in node + + copied_node.attrs["foo"] = "bar" + assert "foo" not in node.attrs + assert node.attrs["Test"] is copied_node.attrs["Test"] + + def test_copy_subtree(self): + dt = DataTree.from_dict({"/level1/level2/level3": xr.Dataset()}) + + actual = dt["/level1/level2"].copy() + expected = DataTree.from_dict({"/level3": xr.Dataset()}, name="level2") + + dtt.assert_identical(actual, expected) + + def test_deepcopy(self, create_test_datatree): + dt = create_test_datatree() + + for node in dt.root.subtree: + node.attrs["Test"] = [1, 2, 3] + + for copied in [dt.copy(deep=True), deepcopy(dt)]: + dtt.assert_identical(dt, copied) + + for node, copied_node in zip(dt.root.subtree, copied.root.subtree): + assert node.encoding == copied_node.encoding + # Note: IndexVariable objects with string dtype are always + # copied because of xarray.core.util.safe_cast_to_index. + # Limiting the test to data variables. + for k in node.data_vars: + v0 = node.variables[k] + v1 = copied_node.variables[k] + assert source_ndarray(v0.data) is not source_ndarray(v1.data) + copied_node["foo"] = xr.DataArray(data=np.arange(5), dims="z") + assert "foo" not in node + + copied_node.attrs["foo"] = "bar" + assert "foo" not in node.attrs + assert node.attrs["Test"] is not copied_node.attrs["Test"] + + @pytest.mark.xfail(reason="data argument not yet implemented") + def test_copy_with_data(self, create_test_datatree): + orig = create_test_datatree() + # TODO use .data_vars once that property is available + data_vars = { + k: v for k, v in orig.variables.items() if k not in orig._coord_names + } + new_data = {k: np.random.randn(*v.shape) for k, v in data_vars.items()} + actual = orig.copy(data=new_data) + + expected = orig.copy() + for k, v in new_data.items(): + expected[k].data = v + dtt.assert_identical(expected, actual) + + # TODO test parents and children? + + +class TestSetItem: + def test_setitem_new_child_node(self): + john = DataTree(name="john") + mary = DataTree(name="mary") + john["mary"] = mary + + grafted_mary = john["mary"] + assert grafted_mary.parent is john + assert grafted_mary.name == "mary" + + def test_setitem_unnamed_child_node_becomes_named(self): + john2 = DataTree(name="john2") + john2["sonny"] = DataTree() + assert john2["sonny"].name == "sonny" + + def test_setitem_new_grandchild_node(self): + john = DataTree(name="john") + mary = DataTree(name="mary", parent=john) + rose = DataTree(name="rose") + john["mary/rose"] = rose + + grafted_rose = john["mary/rose"] + assert grafted_rose.parent is mary + assert grafted_rose.name == "rose" + + def test_grafted_subtree_retains_name(self): + subtree = DataTree(name="original_subtree_name") + root = DataTree(name="root") + root["new_subtree_name"] = subtree # noqa + assert subtree.name == "original_subtree_name" + + def test_setitem_new_empty_node(self): + john = DataTree(name="john") + john["mary"] = DataTree() + mary = john["mary"] + assert isinstance(mary, DataTree) + xrt.assert_identical(mary.to_dataset(), xr.Dataset()) + + def test_setitem_overwrite_data_in_node_with_none(self): + john = DataTree(name="john") + mary = DataTree(name="mary", parent=john, data=xr.Dataset()) + john["mary"] = DataTree() + xrt.assert_identical(mary.to_dataset(), xr.Dataset()) + + john.ds = xr.Dataset() + with pytest.raises(ValueError, match="has no name"): + john["."] = DataTree() + + @pytest.mark.xfail(reason="assigning Datasets doesn't yet create new nodes") + def test_setitem_dataset_on_this_node(self): + data = xr.Dataset({"temp": [0, 50]}) + results = DataTree(name="results") + results["."] = data + xrt.assert_identical(results.to_dataset(), data) + + @pytest.mark.xfail(reason="assigning Datasets doesn't yet create new nodes") + def test_setitem_dataset_as_new_node(self): + data = xr.Dataset({"temp": [0, 50]}) + folder1 = DataTree(name="folder1") + folder1["results"] = data + xrt.assert_identical(folder1["results"].to_dataset(), data) + + @pytest.mark.xfail(reason="assigning Datasets doesn't yet create new nodes") + def test_setitem_dataset_as_new_node_requiring_intermediate_nodes(self): + data = xr.Dataset({"temp": [0, 50]}) + folder1 = DataTree(name="folder1") + folder1["results/highres"] = data + xrt.assert_identical(folder1["results/highres"].to_dataset(), data) + + def test_setitem_named_dataarray(self): + da = xr.DataArray(name="temp", data=[0, 50]) + folder1 = DataTree(name="folder1") + folder1["results"] = da + expected = da.rename("results") + xrt.assert_equal(folder1["results"], expected) + + def test_setitem_unnamed_dataarray(self): + data = xr.DataArray([0, 50]) + folder1 = DataTree(name="folder1") + folder1["results"] = data + xrt.assert_equal(folder1["results"], data) + + def test_setitem_variable(self): + var = xr.Variable(data=[0, 50], dims="x") + folder1 = DataTree(name="folder1") + folder1["results"] = var + xrt.assert_equal(folder1["results"], xr.DataArray(var)) + + def test_setitem_coerce_to_dataarray(self): + folder1 = DataTree(name="folder1") + folder1["results"] = 0 + xrt.assert_equal(folder1["results"], xr.DataArray(0)) + + def test_setitem_add_new_variable_to_empty_node(self): + results = DataTree(name="results") + results["pressure"] = xr.DataArray(data=[2, 3]) + assert "pressure" in results.ds + results["temp"] = xr.Variable(data=[10, 11], dims=["x"]) + assert "temp" in results.ds + + # What if there is a path to traverse first? + results = DataTree(name="results") + results["highres/pressure"] = xr.DataArray(data=[2, 3]) + assert "pressure" in results["highres"].ds + results["highres/temp"] = xr.Variable(data=[10, 11], dims=["x"]) + assert "temp" in results["highres"].ds + + def test_setitem_dataarray_replace_existing_node(self): + t = xr.Dataset({"temp": [0, 50]}) + results = DataTree(name="results", data=t) + p = xr.DataArray(data=[2, 3]) + results["pressure"] = p + expected = t.assign(pressure=p) + xrt.assert_identical(results.to_dataset(), expected) + + +class TestDictionaryInterface: + ... + + +class TestTreeFromDict: + def test_data_in_root(self): + dat = xr.Dataset() + dt = DataTree.from_dict({"/": dat}) + assert dt.name is None + assert dt.parent is None + assert dt.children == {} + xrt.assert_identical(dt.to_dataset(), dat) + + def test_one_layer(self): + dat1, dat2 = xr.Dataset({"a": 1}), xr.Dataset({"b": 2}) + dt = DataTree.from_dict({"run1": dat1, "run2": dat2}) + xrt.assert_identical(dt.to_dataset(), xr.Dataset()) + assert dt.name is None + xrt.assert_identical(dt["run1"].to_dataset(), dat1) + assert dt["run1"].children == {} + xrt.assert_identical(dt["run2"].to_dataset(), dat2) + assert dt["run2"].children == {} + + def test_two_layers(self): + dat1, dat2 = xr.Dataset({"a": 1}), xr.Dataset({"a": [1, 2]}) + dt = DataTree.from_dict({"highres/run": dat1, "lowres/run": dat2}) + assert "highres" in dt.children + assert "lowres" in dt.children + highres_run = dt["highres/run"] + xrt.assert_identical(highres_run.to_dataset(), dat1) + + def test_nones(self): + dt = DataTree.from_dict({"d": None, "d/e": None}) + assert [node.name for node in dt.subtree] == [None, "d", "e"] + assert [node.path for node in dt.subtree] == ["/", "/d", "/d/e"] + xrt.assert_identical(dt["d/e"].to_dataset(), xr.Dataset()) + + def test_full(self, simple_datatree): + dt = simple_datatree + paths = list(node.path for node in dt.subtree) + assert paths == [ + "/", + "/set1", + "/set1/set1", + "/set1/set2", + "/set2", + "/set2/set1", + "/set3", + ] + + def test_datatree_values(self): + dat1 = DataTree(data=xr.Dataset({"a": 1})) + expected = DataTree() + expected["a"] = dat1 + + actual = DataTree.from_dict({"a": dat1}) + + dtt.assert_identical(actual, expected) + + def test_roundtrip(self, simple_datatree): + dt = simple_datatree + roundtrip = DataTree.from_dict(dt.to_dict()) + assert roundtrip.equals(dt) + + @pytest.mark.xfail + def test_roundtrip_unnamed_root(self, simple_datatree): + # See GH81 + + dt = simple_datatree + dt.name = "root" + roundtrip = DataTree.from_dict(dt.to_dict()) + assert roundtrip.equals(dt) + + +class TestDatasetView: + def test_view_contents(self): + ds = create_test_data() + dt = DataTree(data=ds) + assert ds.identical( + dt.ds + ) # this only works because Dataset.identical doesn't check types + assert isinstance(dt.ds, xr.Dataset) + + def test_immutability(self): + # See issue https://github.com/xarray-contrib/datatree/issues/38 + dt = DataTree(name="root", data=None) + DataTree(name="a", data=None, parent=dt) + + with pytest.raises( + AttributeError, match="Mutation of the DatasetView is not allowed" + ): + dt.ds["a"] = xr.DataArray(0) + + with pytest.raises( + AttributeError, match="Mutation of the DatasetView is not allowed" + ): + dt.ds.update({"a": 0}) + + # TODO are there any other ways you can normally modify state (in-place)? + # (not attribute-like assignment because that doesn't work on Dataset anyway) + + def test_methods(self): + ds = create_test_data() + dt = DataTree(data=ds) + assert ds.mean().identical(dt.ds.mean()) + assert type(dt.ds.mean()) == xr.Dataset + + def test_arithmetic(self, create_test_datatree): + dt = create_test_datatree() + expected = create_test_datatree(modify=lambda ds: 10.0 * ds)["set1"] + result = 10.0 * dt["set1"].ds + assert result.identical(expected) + + def test_init_via_type(self): + # from datatree GH issue https://github.com/xarray-contrib/datatree/issues/188 + # xarray's .weighted is unusual because it uses type() to create a Dataset/DataArray + + a = xr.DataArray( + np.random.rand(3, 4, 10), + dims=["x", "y", "time"], + coords={"area": (["x", "y"], np.random.rand(3, 4))}, + ).to_dataset(name="data") + dt = DataTree(data=a) + + def weighted_mean(ds): + return ds.weighted(ds.area).mean(["x", "y"]) + + weighted_mean(dt.ds) + + +class TestAccess: + def test_attribute_access(self, create_test_datatree): + dt = create_test_datatree() + + # vars / coords + for key in ["a", "set0"]: + xrt.assert_equal(dt[key], getattr(dt, key)) + assert key in dir(dt) + + # dims + xrt.assert_equal(dt["a"]["y"], getattr(dt.a, "y")) + assert "y" in dir(dt["a"]) + + # children + for key in ["set1", "set2", "set3"]: + dtt.assert_equal(dt[key], getattr(dt, key)) + assert key in dir(dt) + + # attrs + dt.attrs["meta"] = "NASA" + assert dt.attrs["meta"] == "NASA" + assert "meta" in dir(dt) + + def test_ipython_key_completions(self, create_test_datatree): + dt = create_test_datatree() + key_completions = dt._ipython_key_completions_() + + node_keys = [node.path[1:] for node in dt.subtree] + assert all(node_key in key_completions for node_key in node_keys) + + var_keys = list(dt.variables.keys()) + assert all(var_key in key_completions for var_key in var_keys) + + def test_operation_with_attrs_but_no_data(self): + # tests bug from xarray-datatree GH262 + xs = xr.Dataset({"testvar": xr.DataArray(np.ones((2, 3)))}) + dt = DataTree.from_dict({"node1": xs, "node2": xs}) + dt.attrs["test_key"] = 1 # sel works fine without this line + dt.sel(dim_0=0) + + +class TestRestructuring: + def test_drop_nodes(self): + sue = DataTree.from_dict({"Mary": None, "Kate": None, "Ashley": None}) + + # test drop just one node + dropped_one = sue.drop_nodes(names="Mary") + assert "Mary" not in dropped_one.children + + # test drop multiple nodes + dropped = sue.drop_nodes(names=["Mary", "Kate"]) + assert not set(["Mary", "Kate"]).intersection(set(dropped.children)) + assert "Ashley" in dropped.children + + # test raise + with pytest.raises(KeyError, match="nodes {'Mary'} not present"): + dropped.drop_nodes(names=["Mary", "Ashley"]) + + # test ignore + childless = dropped.drop_nodes(names=["Mary", "Ashley"], errors="ignore") + assert childless.children == {} + + def test_assign(self): + dt = DataTree() + expected = DataTree.from_dict({"/": xr.Dataset({"foo": 0}), "/a": None}) + + # kwargs form + result = dt.assign(foo=xr.DataArray(0), a=DataTree()) + dtt.assert_equal(result, expected) + + # dict form + result = dt.assign({"foo": xr.DataArray(0), "a": DataTree()}) + dtt.assert_equal(result, expected) + + +class TestPipe: + def test_noop(self, create_test_datatree): + dt = create_test_datatree() + + actual = dt.pipe(lambda tree: tree) + assert actual.identical(dt) + + def test_params(self, create_test_datatree): + dt = create_test_datatree() + + def f(tree, **attrs): + return tree.assign(arr_with_attrs=xr.Variable("dim0", [], attrs=attrs)) + + attrs = {"x": 1, "y": 2, "z": 3} + + actual = dt.pipe(f, **attrs) + assert actual["arr_with_attrs"].attrs == attrs + + def test_named_self(self, create_test_datatree): + dt = create_test_datatree() + + def f(x, tree, y): + tree.attrs.update({"x": x, "y": y}) + return tree + + attrs = {"x": 1, "y": 2} + + actual = dt.pipe((f, "tree"), **attrs) + + assert actual is dt and actual.attrs == attrs + + +class TestSubset: + def test_match(self): + # TODO is this example going to cause problems with case sensitivity? + dt = DataTree.from_dict( + { + "/a/A": None, + "/a/B": None, + "/b/A": None, + "/b/B": None, + } + ) + result = dt.match("*/B") + expected = DataTree.from_dict( + { + "/a/B": None, + "/b/B": None, + } + ) + dtt.assert_identical(result, expected) + + def test_filter(self): + simpsons = DataTree.from_dict( + d={ + "/": xr.Dataset({"age": 83}), + "/Herbert": xr.Dataset({"age": 40}), + "/Homer": xr.Dataset({"age": 39}), + "/Homer/Bart": xr.Dataset({"age": 10}), + "/Homer/Lisa": xr.Dataset({"age": 8}), + "/Homer/Maggie": xr.Dataset({"age": 1}), + }, + name="Abe", + ) + expected = DataTree.from_dict( + d={ + "/": xr.Dataset({"age": 83}), + "/Herbert": xr.Dataset({"age": 40}), + "/Homer": xr.Dataset({"age": 39}), + }, + name="Abe", + ) + elders = simpsons.filter(lambda node: node["age"] > 18) + dtt.assert_identical(elders, expected) diff --git a/xarray/datatree_/datatree/tests/test_extensions.py b/xarray/datatree_/datatree/tests/test_extensions.py new file mode 100644 index 00000000000..b288998e2ce --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_extensions.py @@ -0,0 +1,40 @@ +import pytest + +from datatree import DataTree, register_datatree_accessor + + +class TestAccessor: + def test_register(self) -> None: + @register_datatree_accessor("demo") + class DemoAccessor: + """Demo accessor.""" + + def __init__(self, xarray_obj): + self._obj = xarray_obj + + @property + def foo(self): + return "bar" + + dt: DataTree = DataTree() + assert dt.demo.foo == "bar" # type: ignore + + # accessor is cached + assert dt.demo is dt.demo # type: ignore + + # check descriptor + assert dt.demo.__doc__ == "Demo accessor." # type: ignore + # TODO: typing doesn't seem to work with accessors + assert DataTree.demo.__doc__ == "Demo accessor." # type: ignore + assert isinstance(dt.demo, DemoAccessor) # type: ignore + assert DataTree.demo is DemoAccessor # type: ignore + + with pytest.warns(Warning, match="overriding a preexisting attribute"): + + @register_datatree_accessor("demo") + class Foo: + pass + + # ensure we can remove it + del DataTree.demo # type: ignore + assert not hasattr(DataTree, "demo") diff --git a/xarray/datatree_/datatree/tests/test_formatting.py b/xarray/datatree_/datatree/tests/test_formatting.py new file mode 100644 index 00000000000..0f64644c05a --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_formatting.py @@ -0,0 +1,120 @@ +from textwrap import dedent + +from xarray import Dataset + +from datatree import DataTree +from datatree.formatting import diff_tree_repr + + +class TestRepr: + def test_print_empty_node(self): + dt = DataTree(name="root") + printout = dt.__str__() + assert printout == "DataTree('root', parent=None)" + + def test_print_empty_node_with_attrs(self): + dat = Dataset(attrs={"note": "has attrs"}) + dt = DataTree(name="root", data=dat) + printout = dt.__str__() + assert printout == dedent( + """\ + DataTree('root', parent=None) + Dimensions: () + Data variables: + *empty* + Attributes: + note: has attrs""" + ) + + def test_print_node_with_data(self): + dat = Dataset({"a": [0, 2]}) + dt = DataTree(name="root", data=dat) + printout = dt.__str__() + expected = [ + "DataTree('root', parent=None)", + "Dimensions", + "Coordinates", + "a", + "Data variables", + "*empty*", + ] + for expected_line, printed_line in zip(expected, printout.splitlines()): + assert expected_line in printed_line + + def test_nested_node(self): + dat = Dataset({"a": [0, 2]}) + root = DataTree(name="root") + DataTree(name="results", data=dat, parent=root) + printout = root.__str__() + assert printout.splitlines()[2].startswith(" ") + + def test_print_datatree(self, simple_datatree): + dt = simple_datatree + print(dt) + + # TODO work out how to test something complex like this + + def test_repr_of_node_with_data(self): + dat = Dataset({"a": [0, 2]}) + dt = DataTree(name="root", data=dat) + assert "Coordinates" in repr(dt) + + +class TestDiffFormatting: + def test_diff_structure(self): + dt_1 = DataTree.from_dict({"a": None, "a/b": None, "a/c": None}) + dt_2 = DataTree.from_dict({"d": None, "d/e": None}) + + expected = dedent( + """\ + Left and right DataTree objects are not isomorphic + + Number of children on node '/a' of the left object: 2 + Number of children on node '/d' of the right object: 1""" + ) + actual = diff_tree_repr(dt_1, dt_2, "isomorphic") + assert actual == expected + + def test_diff_node_names(self): + dt_1 = DataTree.from_dict({"a": None}) + dt_2 = DataTree.from_dict({"b": None}) + + expected = dedent( + """\ + Left and right DataTree objects are not identical + + Node '/a' in the left object has name 'a' + Node '/b' in the right object has name 'b'""" + ) + actual = diff_tree_repr(dt_1, dt_2, "identical") + assert actual == expected + + def test_diff_node_data(self): + import numpy as np + + # casting to int64 explicitly ensures that int64s are created on all architectures + ds1 = Dataset({"u": np.int64(0), "v": np.int64(1)}) + ds3 = Dataset({"w": np.int64(5)}) + dt_1 = DataTree.from_dict({"a": ds1, "a/b": ds3}) + ds2 = Dataset({"u": np.int64(0)}) + ds4 = Dataset({"w": np.int64(6)}) + dt_2 = DataTree.from_dict({"a": ds2, "a/b": ds4}) + + expected = dedent( + """\ + Left and right DataTree objects are not equal + + + Data in nodes at position '/a' do not match: + + Data variables only on the left object: + v int64 1 + + Data in nodes at position '/a/b' do not match: + + Differing data variables: + L w int64 5 + R w int64 6""" + ) + actual = diff_tree_repr(dt_1, dt_2, "equals") + assert actual == expected diff --git a/xarray/datatree_/datatree/tests/test_formatting_html.py b/xarray/datatree_/datatree/tests/test_formatting_html.py new file mode 100644 index 00000000000..7c6a47ea86e --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_formatting_html.py @@ -0,0 +1,197 @@ +import pytest +import xarray as xr + +from datatree import DataTree, formatting_html + + +@pytest.fixture(scope="module", params=["some html", "some other html"]) +def repr(request): + return request.param + + +class Test_summarize_children: + """ + Unit tests for summarize_children. + """ + + func = staticmethod(formatting_html.summarize_children) + + @pytest.fixture(scope="class") + def childfree_tree_factory(self): + """ + Fixture for a child-free DataTree factory. + """ + from random import randint + + def _childfree_tree_factory(): + return DataTree( + data=xr.Dataset({"z": ("y", [randint(1, 100) for _ in range(3)])}) + ) + + return _childfree_tree_factory + + @pytest.fixture(scope="class") + def childfree_tree(self, childfree_tree_factory): + """ + Fixture for a child-free DataTree. + """ + return childfree_tree_factory() + + @pytest.fixture(scope="function") + def mock_node_repr(self, monkeypatch): + """ + Apply mocking for node_repr. + """ + + def mock(group_title, dt): + """ + Mock with a simple result + """ + return group_title + " " + str(id(dt)) + + monkeypatch.setattr(formatting_html, "node_repr", mock) + + @pytest.fixture(scope="function") + def mock_wrap_repr(self, monkeypatch): + """ + Apply mocking for _wrap_repr. + """ + + def mock(r, *, end, **kwargs): + """ + Mock by appending "end" or "not end". + """ + return r + " " + ("end" if end else "not end") + "//" + + monkeypatch.setattr(formatting_html, "_wrap_repr", mock) + + def test_empty_mapping(self): + """ + Test with an empty mapping of children. + """ + children = {} + assert self.func(children) == ( + "
" "
" + ) + + def test_one_child(self, childfree_tree, mock_wrap_repr, mock_node_repr): + """ + Test with one child. + + Uses a mock of _wrap_repr and node_repr to essentially mock + the inline lambda function "lines_callback". + """ + # Create mapping of children + children = {"a": childfree_tree} + + # Expect first line to be produced from the first child, and + # wrapped as the last child + first_line = f"a {id(children['a'])} end//" + + assert self.func(children) == ( + "
" + f"{first_line}" + "
" + ) + + def test_two_children(self, childfree_tree_factory, mock_wrap_repr, mock_node_repr): + """ + Test with two level deep children. + + Uses a mock of _wrap_repr and node_repr to essentially mock + the inline lambda function "lines_callback". + """ + + # Create mapping of children + children = {"a": childfree_tree_factory(), "b": childfree_tree_factory()} + + # Expect first line to be produced from the first child, and + # wrapped as _not_ the last child + first_line = f"a {id(children['a'])} not end//" + + # Expect second line to be produced from the second child, and + # wrapped as the last child + second_line = f"b {id(children['b'])} end//" + + assert self.func(children) == ( + "
" + f"{first_line}" + f"{second_line}" + "
" + ) + + +class Test__wrap_repr: + """ + Unit tests for _wrap_repr. + """ + + func = staticmethod(formatting_html._wrap_repr) + + def test_end(self, repr): + """ + Test with end=True. + """ + r = self.func(repr, end=True) + assert r == ( + "
" + "
" + "
" + "
" + "
" + "
" + "
    " + f"{repr}" + "
" + "
" + "
" + ) + + def test_not_end(self, repr): + """ + Test with end=False. + """ + r = self.func(repr, end=False) + assert r == ( + "
" + "
" + "
" + "
" + "
" + "
" + "
    " + f"{repr}" + "
" + "
" + "
" + ) diff --git a/xarray/datatree_/datatree/tests/test_io.py b/xarray/datatree_/datatree/tests/test_io.py new file mode 100644 index 00000000000..6fa20479f9a --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_io.py @@ -0,0 +1,120 @@ +import pytest +import zarr.errors + +from datatree.io import open_datatree +from datatree.testing import assert_equal +from datatree.tests import requires_h5netcdf, requires_netCDF4, requires_zarr + + +class TestIO: + @requires_netCDF4 + def test_to_netcdf(self, tmpdir, simple_datatree): + filepath = str( + tmpdir / "test.nc" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + original_dt.to_netcdf(filepath, engine="netcdf4") + + roundtrip_dt = open_datatree(filepath) + assert_equal(original_dt, roundtrip_dt) + + @requires_netCDF4 + def test_netcdf_encoding(self, tmpdir, simple_datatree): + filepath = str( + tmpdir / "test.nc" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + + # add compression + comp = dict(zlib=True, complevel=9) + enc = {"/set2": {var: comp for var in original_dt["/set2"].ds.data_vars}} + + original_dt.to_netcdf(filepath, encoding=enc, engine="netcdf4") + roundtrip_dt = open_datatree(filepath) + + assert roundtrip_dt["/set2/a"].encoding["zlib"] == comp["zlib"] + assert roundtrip_dt["/set2/a"].encoding["complevel"] == comp["complevel"] + + enc["/not/a/group"] = {"foo": "bar"} + with pytest.raises(ValueError, match="unexpected encoding group.*"): + original_dt.to_netcdf(filepath, encoding=enc, engine="netcdf4") + + @requires_h5netcdf + def test_to_h5netcdf(self, tmpdir, simple_datatree): + filepath = str( + tmpdir / "test.nc" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + original_dt.to_netcdf(filepath, engine="h5netcdf") + + roundtrip_dt = open_datatree(filepath) + assert_equal(original_dt, roundtrip_dt) + + @requires_zarr + def test_to_zarr(self, tmpdir, simple_datatree): + filepath = str( + tmpdir / "test.zarr" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + original_dt.to_zarr(filepath) + + roundtrip_dt = open_datatree(filepath, engine="zarr") + assert_equal(original_dt, roundtrip_dt) + + @requires_zarr + def test_zarr_encoding(self, tmpdir, simple_datatree): + import zarr + + filepath = str( + tmpdir / "test.zarr" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + + comp = {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)} + enc = {"/set2": {var: comp for var in original_dt["/set2"].ds.data_vars}} + original_dt.to_zarr(filepath, encoding=enc) + roundtrip_dt = open_datatree(filepath, engine="zarr") + + print(roundtrip_dt["/set2/a"].encoding) + assert roundtrip_dt["/set2/a"].encoding["compressor"] == comp["compressor"] + + enc["/not/a/group"] = {"foo": "bar"} + with pytest.raises(ValueError, match="unexpected encoding group.*"): + original_dt.to_zarr(filepath, encoding=enc, engine="zarr") + + @requires_zarr + def test_to_zarr_zip_store(self, tmpdir, simple_datatree): + from zarr.storage import ZipStore + + filepath = str( + tmpdir / "test.zarr.zip" + ) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + store = ZipStore(filepath) + original_dt.to_zarr(store) + + roundtrip_dt = open_datatree(store, engine="zarr") + assert_equal(original_dt, roundtrip_dt) + + @requires_zarr + def test_to_zarr_not_consolidated(self, tmpdir, simple_datatree): + filepath = tmpdir / "test.zarr" + zmetadata = filepath / ".zmetadata" + s1zmetadata = filepath / "set1" / ".zmetadata" + filepath = str(filepath) # casting to str avoids a pathlib bug in xarray + original_dt = simple_datatree + original_dt.to_zarr(filepath, consolidated=False) + assert not zmetadata.exists() + assert not s1zmetadata.exists() + + with pytest.warns(RuntimeWarning, match="consolidated"): + roundtrip_dt = open_datatree(filepath, engine="zarr") + assert_equal(original_dt, roundtrip_dt) + + @requires_zarr + def test_to_zarr_default_write_mode(self, tmpdir, simple_datatree): + simple_datatree.to_zarr(tmpdir) + + # with default settings, to_zarr should not overwrite an existing dir + with pytest.raises(zarr.errors.ContainsGroupError): + simple_datatree.to_zarr(tmpdir) diff --git a/xarray/datatree_/datatree/tests/test_mapping.py b/xarray/datatree_/datatree/tests/test_mapping.py new file mode 100644 index 00000000000..929ce7644dd --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_mapping.py @@ -0,0 +1,343 @@ +import numpy as np +import pytest +import xarray as xr + +from datatree.datatree import DataTree +from datatree.mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree +from datatree.testing import assert_equal + +empty = xr.Dataset() + + +class TestCheckTreesIsomorphic: + def test_not_a_tree(self): + with pytest.raises(TypeError, match="not a tree"): + check_isomorphic("s", 1) + + def test_different_widths(self): + dt1 = DataTree.from_dict(d={"a": empty}) + dt2 = DataTree.from_dict(d={"b": empty, "c": empty}) + expected_err_str = ( + "Number of children on node '/' of the left object: 1\n" + "Number of children on node '/' of the right object: 2" + ) + with pytest.raises(TreeIsomorphismError, match=expected_err_str): + check_isomorphic(dt1, dt2) + + def test_different_heights(self): + dt1 = DataTree.from_dict({"a": empty}) + dt2 = DataTree.from_dict({"b": empty, "b/c": empty}) + expected_err_str = ( + "Number of children on node '/a' of the left object: 0\n" + "Number of children on node '/b' of the right object: 1" + ) + with pytest.raises(TreeIsomorphismError, match=expected_err_str): + check_isomorphic(dt1, dt2) + + def test_names_different(self): + dt1 = DataTree.from_dict({"a": xr.Dataset()}) + dt2 = DataTree.from_dict({"b": empty}) + expected_err_str = ( + "Node '/a' in the left object has name 'a'\n" + "Node '/b' in the right object has name 'b'" + ) + with pytest.raises(TreeIsomorphismError, match=expected_err_str): + check_isomorphic(dt1, dt2, require_names_equal=True) + + def test_isomorphic_names_equal(self): + dt1 = DataTree.from_dict({"a": empty, "b": empty, "b/c": empty, "b/d": empty}) + dt2 = DataTree.from_dict({"a": empty, "b": empty, "b/c": empty, "b/d": empty}) + check_isomorphic(dt1, dt2, require_names_equal=True) + + def test_isomorphic_ordering(self): + dt1 = DataTree.from_dict({"a": empty, "b": empty, "b/d": empty, "b/c": empty}) + dt2 = DataTree.from_dict({"a": empty, "b": empty, "b/c": empty, "b/d": empty}) + check_isomorphic(dt1, dt2, require_names_equal=False) + + def test_isomorphic_names_not_equal(self): + dt1 = DataTree.from_dict({"a": empty, "b": empty, "b/c": empty, "b/d": empty}) + dt2 = DataTree.from_dict({"A": empty, "B": empty, "B/C": empty, "B/D": empty}) + check_isomorphic(dt1, dt2) + + def test_not_isomorphic_complex_tree(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + dt2["set1/set2/extra"] = DataTree(name="extra") + with pytest.raises(TreeIsomorphismError, match="/set1/set2"): + check_isomorphic(dt1, dt2) + + def test_checking_from_root(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + real_root = DataTree(name="real root") + dt2.name = "not_real_root" + dt2.parent = real_root + with pytest.raises(TreeIsomorphismError): + check_isomorphic(dt1, dt2, check_from_root=True) + + +class TestMapOverSubTree: + def test_no_trees_passed(self): + @map_over_subtree + def times_ten(ds): + return 10.0 * ds + + with pytest.raises(TypeError, match="Must pass at least one tree"): + times_ten("dt") + + def test_not_isomorphic(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + dt2["set1/set2/extra"] = DataTree(name="extra") + + @map_over_subtree + def times_ten(ds1, ds2): + return ds1 * ds2 + + with pytest.raises(TreeIsomorphismError): + times_ten(dt1, dt2) + + def test_no_trees_returned(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + + @map_over_subtree + def bad_func(ds1, ds2): + return None + + with pytest.raises(TypeError, match="return value of None"): + bad_func(dt1, dt2) + + def test_single_dt_arg(self, create_test_datatree): + dt = create_test_datatree() + + @map_over_subtree + def times_ten(ds): + return 10.0 * ds + + expected = create_test_datatree(modify=lambda ds: 10.0 * ds) + result_tree = times_ten(dt) + assert_equal(result_tree, expected) + + def test_single_dt_arg_plus_args_and_kwargs(self, create_test_datatree): + dt = create_test_datatree() + + @map_over_subtree + def multiply_then_add(ds, times, add=0.0): + return (times * ds) + add + + expected = create_test_datatree(modify=lambda ds: (10.0 * ds) + 2.0) + result_tree = multiply_then_add(dt, 10.0, add=2.0) + assert_equal(result_tree, expected) + + def test_multiple_dt_args(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + + @map_over_subtree + def add(ds1, ds2): + return ds1 + ds2 + + expected = create_test_datatree(modify=lambda ds: 2.0 * ds) + result = add(dt1, dt2) + assert_equal(result, expected) + + def test_dt_as_kwarg(self, create_test_datatree): + dt1 = create_test_datatree() + dt2 = create_test_datatree() + + @map_over_subtree + def add(ds1, value=0.0): + return ds1 + value + + expected = create_test_datatree(modify=lambda ds: 2.0 * ds) + result = add(dt1, value=dt2) + assert_equal(result, expected) + + def test_return_multiple_dts(self, create_test_datatree): + dt = create_test_datatree() + + @map_over_subtree + def minmax(ds): + return ds.min(), ds.max() + + dt_min, dt_max = minmax(dt) + expected_min = create_test_datatree(modify=lambda ds: ds.min()) + assert_equal(dt_min, expected_min) + expected_max = create_test_datatree(modify=lambda ds: ds.max()) + assert_equal(dt_max, expected_max) + + def test_return_wrong_type(self, simple_datatree): + dt1 = simple_datatree + + @map_over_subtree + def bad_func(ds1): + return "string" + + with pytest.raises(TypeError, match="not Dataset or DataArray"): + bad_func(dt1) + + def test_return_tuple_of_wrong_types(self, simple_datatree): + dt1 = simple_datatree + + @map_over_subtree + def bad_func(ds1): + return xr.Dataset(), "string" + + with pytest.raises(TypeError, match="not Dataset or DataArray"): + bad_func(dt1) + + @pytest.mark.xfail + def test_return_inconsistent_number_of_results(self, simple_datatree): + dt1 = simple_datatree + + @map_over_subtree + def bad_func(ds): + # Datasets in simple_datatree have different numbers of dims + # TODO need to instead return different numbers of Dataset objects for this test to catch the intended error + return tuple(ds.dims) + + with pytest.raises(TypeError, match="instead returns"): + bad_func(dt1) + + def test_wrong_number_of_arguments_for_func(self, simple_datatree): + dt = simple_datatree + + @map_over_subtree + def times_ten(ds): + return 10.0 * ds + + with pytest.raises( + TypeError, match="takes 1 positional argument but 2 were given" + ): + times_ten(dt, dt) + + def test_map_single_dataset_against_whole_tree(self, create_test_datatree): + dt = create_test_datatree() + + @map_over_subtree + def nodewise_merge(node_ds, fixed_ds): + return xr.merge([node_ds, fixed_ds]) + + other_ds = xr.Dataset({"z": ("z", [0])}) + expected = create_test_datatree(modify=lambda ds: xr.merge([ds, other_ds])) + result_tree = nodewise_merge(dt, other_ds) + assert_equal(result_tree, expected) + + @pytest.mark.xfail + def test_trees_with_different_node_names(self): + # TODO test this after I've got good tests for renaming nodes + raise NotImplementedError + + def test_dt_method(self, create_test_datatree): + dt = create_test_datatree() + + def multiply_then_add(ds, times, add=0.0): + return times * ds + add + + expected = create_test_datatree(modify=lambda ds: (10.0 * ds) + 2.0) + result_tree = dt.map_over_subtree(multiply_then_add, 10.0, add=2.0) + assert_equal(result_tree, expected) + + def test_discard_ancestry(self, create_test_datatree): + # Check for datatree GH issue https://github.com/xarray-contrib/datatree/issues/48 + dt = create_test_datatree() + subtree = dt["set1"] + + @map_over_subtree + def times_ten(ds): + return 10.0 * ds + + expected = create_test_datatree(modify=lambda ds: 10.0 * ds)["set1"] + result_tree = times_ten(subtree) + assert_equal(result_tree, expected, from_root=False) + + def test_skip_empty_nodes_with_attrs(self, create_test_datatree): + # inspired by xarray-datatree GH262 + dt = create_test_datatree() + dt["set1/set2"].attrs["foo"] = "bar" + + def check_for_data(ds): + # fails if run on a node that has no data + assert len(ds.variables) != 0 + return ds + + dt.map_over_subtree(check_for_data) + + def test_keep_attrs_on_empty_nodes(self, create_test_datatree): + # GH278 + dt = create_test_datatree() + dt["set1/set2"].attrs["foo"] = "bar" + + def empty_func(ds): + return ds + + result = dt.map_over_subtree(empty_func) + assert result["set1/set2"].attrs == dt["set1/set2"].attrs + + @pytest.mark.xfail( + reason="probably some bug in pytests handling of exception notes" + ) + def test_error_contains_path_of_offending_node(self, create_test_datatree): + dt = create_test_datatree() + dt["set1"]["bad_var"] = 0 + print(dt) + + def fail_on_specific_node(ds): + if "bad_var" in ds: + raise ValueError("Failed because 'bar_var' present in dataset") + + with pytest.raises( + ValueError, match="Raised whilst mapping function over node /set1" + ): + dt.map_over_subtree(fail_on_specific_node) + + +class TestMutableOperations: + def test_construct_using_type(self): + # from datatree GH issue https://github.com/xarray-contrib/datatree/issues/188 + # xarray's .weighted is unusual because it uses type() to create a Dataset/DataArray + + a = xr.DataArray( + np.random.rand(3, 4, 10), + dims=["x", "y", "time"], + coords={"area": (["x", "y"], np.random.rand(3, 4))}, + ).to_dataset(name="data") + b = xr.DataArray( + np.random.rand(2, 6, 14), + dims=["x", "y", "time"], + coords={"area": (["x", "y"], np.random.rand(2, 6))}, + ).to_dataset(name="data") + dt = DataTree.from_dict({"a": a, "b": b}) + + def weighted_mean(ds): + return ds.weighted(ds.area).mean(["x", "y"]) + + dt.map_over_subtree(weighted_mean) + + def test_alter_inplace_forbidden(self): + simpsons = DataTree.from_dict( + d={ + "/": xr.Dataset({"age": 83}), + "/Herbert": xr.Dataset({"age": 40}), + "/Homer": xr.Dataset({"age": 39}), + "/Homer/Bart": xr.Dataset({"age": 10}), + "/Homer/Lisa": xr.Dataset({"age": 8}), + "/Homer/Maggie": xr.Dataset({"age": 1}), + }, + name="Abe", + ) + + def fast_forward(ds: xr.Dataset, years: float) -> xr.Dataset: + """Add some years to the age, but by altering the given dataset""" + ds["age"] = ds["age"] + years + return ds + + with pytest.raises(AttributeError): + simpsons.map_over_subtree(fast_forward, years=10) + + +@pytest.mark.xfail +class TestMapOverSubTreeInplace: + def test_map_over_subtree_inplace(self): + raise NotImplementedError diff --git a/xarray/datatree_/datatree/tests/test_treenode.py b/xarray/datatree_/datatree/tests/test_treenode.py new file mode 100644 index 00000000000..f2d314c50e3 --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_treenode.py @@ -0,0 +1,380 @@ +import pytest + +from datatree.iterators import LevelOrderIter, PreOrderIter +from datatree.treenode import InvalidTreeError, NamedNode, NodePath, TreeNode + + +class TestFamilyTree: + def test_lonely(self): + root = TreeNode() + assert root.parent is None + assert root.children == {} + + def test_parenting(self): + john = TreeNode() + mary = TreeNode() + mary._set_parent(john, "Mary") + + assert mary.parent == john + assert john.children["Mary"] is mary + + def test_no_time_traveller_loops(self): + john = TreeNode() + + with pytest.raises(InvalidTreeError, match="cannot be a parent of itself"): + john._set_parent(john, "John") + + with pytest.raises(InvalidTreeError, match="cannot be a parent of itself"): + john.children = {"John": john} + + mary = TreeNode() + rose = TreeNode() + mary._set_parent(john, "Mary") + rose._set_parent(mary, "Rose") + + with pytest.raises(InvalidTreeError, match="is already a descendant"): + john._set_parent(rose, "John") + + with pytest.raises(InvalidTreeError, match="is already a descendant"): + rose.children = {"John": john} + + def test_parent_swap(self): + john = TreeNode() + mary = TreeNode() + mary._set_parent(john, "Mary") + + steve = TreeNode() + mary._set_parent(steve, "Mary") + + assert mary.parent == steve + assert steve.children["Mary"] is mary + assert "Mary" not in john.children + + def test_multi_child_family(self): + mary = TreeNode() + kate = TreeNode() + john = TreeNode(children={"Mary": mary, "Kate": kate}) + assert john.children["Mary"] is mary + assert john.children["Kate"] is kate + assert mary.parent is john + assert kate.parent is john + + def test_disown_child(self): + mary = TreeNode() + john = TreeNode(children={"Mary": mary}) + mary.orphan() + assert mary.parent is None + assert "Mary" not in john.children + + def test_doppelganger_child(self): + kate = TreeNode() + john = TreeNode() + + with pytest.raises(TypeError): + john.children = {"Kate": 666} + + with pytest.raises(InvalidTreeError, match="Cannot add same node"): + john.children = {"Kate": kate, "Evil_Kate": kate} + + john = TreeNode(children={"Kate": kate}) + evil_kate = TreeNode() + evil_kate._set_parent(john, "Kate") + assert john.children["Kate"] is evil_kate + + def test_sibling_relationships(self): + mary = TreeNode() + kate = TreeNode() + ashley = TreeNode() + TreeNode(children={"Mary": mary, "Kate": kate, "Ashley": ashley}) + assert kate.siblings["Mary"] is mary + assert kate.siblings["Ashley"] is ashley + assert "Kate" not in kate.siblings + + def test_ancestors(self): + tony = TreeNode() + michael = TreeNode(children={"Tony": tony}) + vito = TreeNode(children={"Michael": michael}) + assert tony.root is vito + assert tony.parents == (michael, vito) + assert tony.ancestors == (vito, michael, tony) + + +class TestGetNodes: + def test_get_child(self): + steven = TreeNode() + sue = TreeNode(children={"Steven": steven}) + mary = TreeNode(children={"Sue": sue}) + john = TreeNode(children={"Mary": mary}) + + # get child + assert john._get_item("Mary") is mary + assert mary._get_item("Sue") is sue + + # no child exists + with pytest.raises(KeyError): + john._get_item("Kate") + + # get grandchild + assert john._get_item("Mary/Sue") is sue + + # get great-grandchild + assert john._get_item("Mary/Sue/Steven") is steven + + # get from middle of tree + assert mary._get_item("Sue/Steven") is steven + + def test_get_upwards(self): + sue = TreeNode() + kate = TreeNode() + mary = TreeNode(children={"Sue": sue, "Kate": kate}) + john = TreeNode(children={"Mary": mary}) + + assert sue._get_item("../") is mary + assert sue._get_item("../../") is john + + # relative path + assert sue._get_item("../Kate") is kate + + def test_get_from_root(self): + sue = TreeNode() + mary = TreeNode(children={"Sue": sue}) + john = TreeNode(children={"Mary": mary}) # noqa + + assert sue._get_item("/Mary") is mary + + +class TestSetNodes: + def test_set_child_node(self): + john = TreeNode() + mary = TreeNode() + john._set_item("Mary", mary) + + assert john.children["Mary"] is mary + assert isinstance(mary, TreeNode) + assert mary.children == {} + assert mary.parent is john + + def test_child_already_exists(self): + mary = TreeNode() + john = TreeNode(children={"Mary": mary}) + mary_2 = TreeNode() + with pytest.raises(KeyError): + john._set_item("Mary", mary_2, allow_overwrite=False) + + def test_set_grandchild(self): + rose = TreeNode() + mary = TreeNode() + john = TreeNode() + + john._set_item("Mary", mary) + john._set_item("Mary/Rose", rose) + + assert john.children["Mary"] is mary + assert isinstance(mary, TreeNode) + assert "Rose" in mary.children + assert rose.parent is mary + + def test_create_intermediate_child(self): + john = TreeNode() + rose = TreeNode() + + # test intermediate children not allowed + with pytest.raises(KeyError, match="Could not reach"): + john._set_item(path="Mary/Rose", item=rose, new_nodes_along_path=False) + + # test intermediate children allowed + john._set_item("Mary/Rose", rose, new_nodes_along_path=True) + assert "Mary" in john.children + mary = john.children["Mary"] + assert isinstance(mary, TreeNode) + assert mary.children == {"Rose": rose} + assert rose.parent == mary + assert rose.parent == mary + + def test_overwrite_child(self): + john = TreeNode() + mary = TreeNode() + john._set_item("Mary", mary) + + # test overwriting not allowed + marys_evil_twin = TreeNode() + with pytest.raises(KeyError, match="Already a node object"): + john._set_item("Mary", marys_evil_twin, allow_overwrite=False) + assert john.children["Mary"] is mary + assert marys_evil_twin.parent is None + + # test overwriting allowed + marys_evil_twin = TreeNode() + john._set_item("Mary", marys_evil_twin, allow_overwrite=True) + assert john.children["Mary"] is marys_evil_twin + assert marys_evil_twin.parent is john + + +class TestPruning: + def test_del_child(self): + john = TreeNode() + mary = TreeNode() + john._set_item("Mary", mary) + + del john["Mary"] + assert "Mary" not in john.children + assert mary.parent is None + + with pytest.raises(KeyError): + del john["Mary"] + + +def create_test_tree(): + a = NamedNode(name="a") + b = NamedNode() + c = NamedNode() + d = NamedNode() + e = NamedNode() + f = NamedNode() + g = NamedNode() + h = NamedNode() + i = NamedNode() + + a.children = {"b": b, "c": c} + b.children = {"d": d, "e": e} + e.children = {"f": f, "g": g} + c.children = {"h": h} + h.children = {"i": i} + + return a, f + + +class TestIterators: + def test_preorderiter(self): + root, _ = create_test_tree() + result = [node.name for node in PreOrderIter(root)] + expected = [ + "a", + "b", + "d", + "e", + "f", + "g", + "c", + "h", + "i", + ] + assert result == expected + + def test_levelorderiter(self): + root, _ = create_test_tree() + result = [node.name for node in LevelOrderIter(root)] + expected = [ + "a", # root Node is unnamed + "b", + "c", + "d", + "e", + "h", + "f", + "g", + "i", + ] + assert result == expected + + +class TestAncestry: + def test_parents(self): + _, leaf = create_test_tree() + expected = ["e", "b", "a"] + assert [node.name for node in leaf.parents] == expected + + def test_lineage(self): + _, leaf = create_test_tree() + expected = ["f", "e", "b", "a"] + assert [node.name for node in leaf.lineage] == expected + + def test_ancestors(self): + _, leaf = create_test_tree() + ancestors = leaf.ancestors + expected = ["a", "b", "e", "f"] + for node, expected_name in zip(ancestors, expected): + assert node.name == expected_name + + def test_subtree(self): + root, _ = create_test_tree() + subtree = root.subtree + expected = [ + "a", + "b", + "d", + "e", + "f", + "g", + "c", + "h", + "i", + ] + for node, expected_name in zip(subtree, expected): + assert node.name == expected_name + + def test_descendants(self): + root, _ = create_test_tree() + descendants = root.descendants + expected = [ + "b", + "d", + "e", + "f", + "g", + "c", + "h", + "i", + ] + for node, expected_name in zip(descendants, expected): + assert node.name == expected_name + + def test_leaves(self): + tree, _ = create_test_tree() + leaves = tree.leaves + expected = [ + "d", + "f", + "g", + "i", + ] + for node, expected_name in zip(leaves, expected): + assert node.name == expected_name + + def test_levels(self): + a, f = create_test_tree() + + assert a.level == 0 + assert f.level == 3 + + assert a.depth == 3 + assert f.depth == 3 + + assert a.width == 1 + assert f.width == 3 + + +class TestRenderTree: + def test_render_nodetree(self): + sam = NamedNode() + ben = NamedNode() + mary = NamedNode(children={"Sam": sam, "Ben": ben}) + kate = NamedNode() + john = NamedNode(children={"Mary": mary, "Kate": kate}) + + printout = john.__str__() + expected_nodes = [ + "NamedNode()", + "NamedNode('Mary')", + "NamedNode('Sam')", + "NamedNode('Ben')", + "NamedNode('Kate')", + ] + for expected_node, printed_node in zip(expected_nodes, printout.splitlines()): + assert expected_node in printed_node + + +def test_nodepath(): + path = NodePath("/Mary") + assert path.root == "/" + assert path.stem == "Mary" diff --git a/xarray/datatree_/datatree/tests/test_version.py b/xarray/datatree_/datatree/tests/test_version.py new file mode 100644 index 00000000000..207d5d86d53 --- /dev/null +++ b/xarray/datatree_/datatree/tests/test_version.py @@ -0,0 +1,5 @@ +import datatree + + +def test_version(): + assert datatree.__version__ != "999" diff --git a/xarray/datatree_/datatree/treenode.py b/xarray/datatree_/datatree/treenode.py new file mode 100644 index 00000000000..01529e82bdf --- /dev/null +++ b/xarray/datatree_/datatree/treenode.py @@ -0,0 +1,679 @@ +from __future__ import annotations + +import sys +from collections import OrderedDict +from pathlib import PurePosixPath +from typing import ( + TYPE_CHECKING, + Generic, + Iterator, + Mapping, + Optional, + Tuple, + TypeVar, + Union, +) + +from xarray.core.utils import Frozen, is_dict_like + +if TYPE_CHECKING: + from xarray.core.types import T_DataArray + + +class InvalidTreeError(Exception): + """Raised when user attempts to create an invalid tree in some way.""" + + +class NotFoundInTreeError(ValueError): + """Raised when operation can't be completed because one node is part of the expected tree.""" + + +class NodePath(PurePosixPath): + """Represents a path from one node to another within a tree.""" + + def __init__(self, *pathsegments): + if sys.version_info >= (3, 12): + super().__init__(*pathsegments) + else: + super().__new__(PurePosixPath, *pathsegments) + if self.drive: + raise ValueError("NodePaths cannot have drives") + + if self.root not in ["/", ""]: + raise ValueError( + 'Root of NodePath can only be either "/" or "", with "" meaning the path is relative.' + ) + # TODO should we also forbid suffixes to avoid node names with dots in them? + + +Tree = TypeVar("Tree", bound="TreeNode") + + +class TreeNode(Generic[Tree]): + """ + Base class representing a node of a tree, with methods for traversing and altering the tree. + + This class stores no data, it has only parents and children attributes, and various methods. + + Stores child nodes in an Ordered Dictionary, which is necessary to ensure that equality checks between two trees + also check that the order of child nodes is the same. + + Nodes themselves are intrinsically unnamed (do not possess a ._name attribute), but if the node has a parent you can + find the key it is stored under via the .name property. + + The .parent attribute is read-only: to replace the parent using public API you must set this node as the child of a + new parent using `new_parent.children[name] = child_node`, or to instead detach from the current parent use + `child_node.orphan()`. + + This class is intended to be subclassed by DataTree, which will overwrite some of the inherited behaviour, + in particular to make names an inherent attribute, and allow setting parents directly. The intention is to mirror + the class structure of xarray.Variable & xarray.DataArray, where Variable is unnamed but DataArray is (optionally) + named. + + Also allows access to any other node in the tree via unix-like paths, including upwards referencing via '../'. + + (This class is heavily inspired by the anytree library's NodeMixin class.) + """ + + _parent: Optional[Tree] + _children: OrderedDict[str, Tree] + + def __init__(self, children: Optional[Mapping[str, Tree]] = None): + """Create a parentless node.""" + self._parent = None + self._children = OrderedDict() + if children is not None: + self.children = children + + @property + def parent(self) -> Tree | None: + """Parent of this node.""" + return self._parent + + def _set_parent( + self, new_parent: Tree | None, child_name: Optional[str] = None + ) -> None: + # TODO is it possible to refactor in a way that removes this private method? + + if new_parent is not None and not isinstance(new_parent, TreeNode): + raise TypeError( + "Parent nodes must be of type DataTree or None, " + f"not type {type(new_parent)}" + ) + + old_parent = self._parent + if new_parent is not old_parent: + self._check_loop(new_parent) + self._detach(old_parent) + self._attach(new_parent, child_name) + + def _check_loop(self, new_parent: Tree | None) -> None: + """Checks that assignment of this new parent will not create a cycle.""" + if new_parent is not None: + if new_parent is self: + raise InvalidTreeError( + f"Cannot set parent, as node {self} cannot be a parent of itself." + ) + + if self._is_descendant_of(new_parent): + raise InvalidTreeError( + "Cannot set parent, as intended parent is already a descendant of this node." + ) + + def _is_descendant_of(self, node: Tree) -> bool: + return any(n is self for n in node.parents) + + def _detach(self, parent: Tree | None) -> None: + if parent is not None: + self._pre_detach(parent) + parents_children = parent.children + parent._children = OrderedDict( + { + name: child + for name, child in parents_children.items() + if child is not self + } + ) + self._parent = None + self._post_detach(parent) + + def _attach(self, parent: Tree | None, child_name: Optional[str] = None) -> None: + if parent is not None: + if child_name is None: + raise ValueError( + "To directly set parent, child needs a name, but child is unnamed" + ) + + self._pre_attach(parent) + parentchildren = parent._children + assert not any( + child is self for child in parentchildren + ), "Tree is corrupt." + parentchildren[child_name] = self + self._parent = parent + self._post_attach(parent) + else: + self._parent = None + + def orphan(self) -> None: + """Detach this node from its parent.""" + self._set_parent(new_parent=None) + + @property + def children(self: Tree) -> Mapping[str, Tree]: + """Child nodes of this node, stored under a mapping via their names.""" + return Frozen(self._children) + + @children.setter + def children(self: Tree, children: Mapping[str, Tree]) -> None: + self._check_children(children) + children = OrderedDict(children) + + old_children = self.children + del self.children + try: + self._pre_attach_children(children) + for name, child in children.items(): + child._set_parent(new_parent=self, child_name=name) + self._post_attach_children(children) + assert len(self.children) == len(children) + except Exception: + # if something goes wrong then revert to previous children + self.children = old_children + raise + + @children.deleter + def children(self) -> None: + # TODO this just detaches all the children, it doesn't actually delete them... + children = self.children + self._pre_detach_children(children) + for child in self.children.values(): + child.orphan() + assert len(self.children) == 0 + self._post_detach_children(children) + + @staticmethod + def _check_children(children: Mapping[str, Tree]) -> None: + """Check children for correct types and for any duplicates.""" + if not is_dict_like(children): + raise TypeError( + "children must be a dict-like mapping from names to node objects" + ) + + seen = set() + for name, child in children.items(): + if not isinstance(child, TreeNode): + raise TypeError( + f"Cannot add object {name}. It is of type {type(child)}, " + "but can only add children of type DataTree" + ) + + childid = id(child) + if childid not in seen: + seen.add(childid) + else: + raise InvalidTreeError( + f"Cannot add same node {name} multiple times as different children." + ) + + def __repr__(self) -> str: + return f"TreeNode(children={dict(self._children)})" + + def _pre_detach_children(self: Tree, children: Mapping[str, Tree]) -> None: + """Method call before detaching `children`.""" + pass + + def _post_detach_children(self: Tree, children: Mapping[str, Tree]) -> None: + """Method call after detaching `children`.""" + pass + + def _pre_attach_children(self: Tree, children: Mapping[str, Tree]) -> None: + """Method call before attaching `children`.""" + pass + + def _post_attach_children(self: Tree, children: Mapping[str, Tree]) -> None: + """Method call after attaching `children`.""" + pass + + def _iter_parents(self: Tree) -> Iterator[Tree]: + """Iterate up the tree, starting from the current node.""" + node: Tree | None = self.parent + while node is not None: + yield node + node = node.parent + + def iter_lineage(self: Tree) -> Tuple[Tree, ...]: + """Iterate up the tree, starting from the current node.""" + from warnings import warn + + warn( + "`iter_lineage` has been deprecated, and in the future will raise an error." + "Please use `parents` from now on.", + DeprecationWarning, + ) + return tuple((self, *self.parents)) + + @property + def lineage(self: Tree) -> Tuple[Tree, ...]: + """All parent nodes and their parent nodes, starting with the closest.""" + from warnings import warn + + warn( + "`lineage` has been deprecated, and in the future will raise an error." + "Please use `parents` from now on.", + DeprecationWarning, + ) + return self.iter_lineage() + + @property + def parents(self: Tree) -> Tuple[Tree, ...]: + """All parent nodes and their parent nodes, starting with the closest.""" + return tuple(self._iter_parents()) + + @property + def ancestors(self: Tree) -> Tuple[Tree, ...]: + """All parent nodes and their parent nodes, starting with the most distant.""" + + from warnings import warn + + warn( + "`ancestors` has been deprecated, and in the future will raise an error." + "Please use `parents`. Example: `tuple(reversed(node.parents))`", + DeprecationWarning, + ) + return tuple((*reversed(self.parents), self)) + + @property + def root(self: Tree) -> Tree: + """Root node of the tree""" + node = self + while node.parent is not None: + node = node.parent + return node + + @property + def is_root(self) -> bool: + """Whether this node is the tree root.""" + return self.parent is None + + @property + def is_leaf(self) -> bool: + """ + Whether this node is a leaf node. + + Leaf nodes are defined as nodes which have no children. + """ + return self.children == {} + + @property + def leaves(self: Tree) -> Tuple[Tree, ...]: + """ + All leaf nodes. + + Leaf nodes are defined as nodes which have no children. + """ + return tuple([node for node in self.subtree if node.is_leaf]) + + @property + def siblings(self: Tree) -> OrderedDict[str, Tree]: + """ + Nodes with the same parent as this node. + """ + if self.parent: + return OrderedDict( + { + name: child + for name, child in self.parent.children.items() + if child is not self + } + ) + else: + return OrderedDict() + + @property + def subtree(self: Tree) -> Iterator[Tree]: + """ + An iterator over all nodes in this tree, including both self and all descendants. + + Iterates depth-first. + + See Also + -------- + DataTree.descendants + """ + from xarray.datatree_.datatree import iterators + + return iterators.PreOrderIter(self) + + @property + def descendants(self: Tree) -> Tuple[Tree, ...]: + """ + Child nodes and all their child nodes. + + Returned in depth-first order. + + See Also + -------- + DataTree.subtree + """ + all_nodes = tuple(self.subtree) + this_node, *descendants = all_nodes + return tuple(descendants) + + @property + def level(self: Tree) -> int: + """ + Level of this node. + + Level means number of parent nodes above this node before reaching the root. + The root node is at level 0. + + Returns + ------- + level : int + + See Also + -------- + depth + width + """ + return len(self.parents) + + @property + def depth(self: Tree) -> int: + """ + Maximum level of this tree. + + Measured from the root, which has a depth of 0. + + Returns + ------- + depth : int + + See Also + -------- + level + width + """ + return max(node.level for node in self.root.subtree) + + @property + def width(self: Tree) -> int: + """ + Number of nodes at this level in the tree. + + Includes number of immediate siblings, but also "cousins" in other branches and so-on. + + Returns + ------- + depth : int + + See Also + -------- + level + depth + """ + return len([node for node in self.root.subtree if node.level == self.level]) + + def _pre_detach(self: Tree, parent: Tree) -> None: + """Method call before detaching from `parent`.""" + pass + + def _post_detach(self: Tree, parent: Tree) -> None: + """Method call after detaching from `parent`.""" + pass + + def _pre_attach(self: Tree, parent: Tree) -> None: + """Method call before attaching to `parent`.""" + pass + + def _post_attach(self: Tree, parent: Tree) -> None: + """Method call after attaching to `parent`.""" + pass + + def get(self: Tree, key: str, default: Optional[Tree] = None) -> Optional[Tree]: + """ + Return the child node with the specified key. + + Only looks for the node within the immediate children of this node, + not in other nodes of the tree. + """ + if key in self.children: + return self.children[key] + else: + return default + + # TODO `._walk` method to be called by both `_get_item` and `_set_item` + + def _get_item(self: Tree, path: str | NodePath) -> Union[Tree, T_DataArray]: + """ + Returns the object lying at the given path. + + Raises a KeyError if there is no object at the given path. + """ + if isinstance(path, str): + path = NodePath(path) + + if path.root: + current_node = self.root + root, *parts = list(path.parts) + else: + current_node = self + parts = list(path.parts) + + for part in parts: + if part == "..": + if current_node.parent is None: + raise KeyError(f"Could not find node at {path}") + else: + current_node = current_node.parent + elif part in ("", "."): + pass + else: + if current_node.get(part) is None: + raise KeyError(f"Could not find node at {path}") + else: + current_node = current_node.get(part) + return current_node + + def _set(self: Tree, key: str, val: Tree) -> None: + """ + Set the child node with the specified key to value. + + Counterpart to the public .get method, and also only works on the immediate node, not other nodes in the tree. + """ + new_children = {**self.children, key: val} + self.children = new_children + + def _set_item( + self: Tree, + path: str | NodePath, + item: Union[Tree, T_DataArray], + new_nodes_along_path: bool = False, + allow_overwrite: bool = True, + ) -> None: + """ + Set a new item in the tree, overwriting anything already present at that path. + + The given value either forms a new node of the tree or overwrites an existing item at that location. + + Parameters + ---------- + path + item + new_nodes_along_path : bool + If true, then if necessary new nodes will be created along the given path, until the tree can reach the + specified location. + allow_overwrite : bool + Whether or not to overwrite any existing node at the location given by path. + + Raises + ------ + KeyError + If node cannot be reached, and new_nodes_along_path=False. + Or if a node already exists at the specified path, and allow_overwrite=False. + """ + if isinstance(path, str): + path = NodePath(path) + + if not path.name: + raise ValueError("Can't set an item under a path which has no name") + + if path.root: + # absolute path + current_node = self.root + root, *parts, name = path.parts + else: + # relative path + current_node = self + *parts, name = path.parts + + if parts: + # Walk to location of new node, creating intermediate node objects as we go if necessary + for part in parts: + if part == "..": + if current_node.parent is None: + # We can't create a parent if `new_nodes_along_path=True` as we wouldn't know what to name it + raise KeyError(f"Could not reach node at path {path}") + else: + current_node = current_node.parent + elif part in ("", "."): + pass + else: + if part in current_node.children: + current_node = current_node.children[part] + elif new_nodes_along_path: + # Want child classes (i.e. DataTree) to populate tree with their own types + new_node = type(self)() + current_node._set(part, new_node) + current_node = current_node.children[part] + else: + raise KeyError(f"Could not reach node at path {path}") + + if name in current_node.children: + # Deal with anything already existing at this location + if allow_overwrite: + current_node._set(name, item) + else: + raise KeyError(f"Already a node object at path {path}") + else: + current_node._set(name, item) + + def __delitem__(self: Tree, key: str): + """Remove a child node from this tree object.""" + if key in self.children: + child = self._children[key] + del self._children[key] + child.orphan() + else: + raise KeyError("Cannot delete") + + def same_tree(self, other: Tree) -> bool: + """True if other node is in the same tree as this node.""" + return self.root is other.root + + +class NamedNode(TreeNode, Generic[Tree]): + """ + A TreeNode which knows its own name. + + Implements path-like relationships to other nodes in its tree. + """ + + _name: Optional[str] + _parent: Optional[Tree] + _children: OrderedDict[str, Tree] + + def __init__(self, name=None, children=None): + super().__init__(children=children) + self._name = None + self.name = name + + @property + def name(self) -> str | None: + """The name of this node.""" + return self._name + + @name.setter + def name(self, name: str | None) -> None: + if name is not None: + if not isinstance(name, str): + raise TypeError("node name must be a string or None") + if "/" in name: + raise ValueError("node names cannot contain forward slashes") + self._name = name + + def __str__(self) -> str: + return f"NamedNode({self.name})" if self.name else "NamedNode()" + + def _post_attach(self: NamedNode, parent: NamedNode) -> None: + """Ensures child has name attribute corresponding to key under which it has been stored.""" + key = next(k for k, v in parent.children.items() if v is self) + self.name = key + + @property + def path(self) -> str: + """Return the file-like path from the root to this node.""" + if self.is_root: + return "/" + else: + root, *ancestors = tuple(reversed(self.parents)) + # don't include name of root because (a) root might not have a name & (b) we want path relative to root. + names = [*(node.name for node in ancestors), self.name] + return "/" + "/".join(names) + + def relative_to(self: NamedNode, other: NamedNode) -> str: + """ + Compute the relative path from this node to node `other`. + + If other is not in this tree, or it's otherwise impossible, raise a ValueError. + """ + if not self.same_tree(other): + raise NotFoundInTreeError( + "Cannot find relative path because nodes do not lie within the same tree" + ) + + this_path = NodePath(self.path) + if other.path in list(parent.path for parent in (self, *self.parents)): + return str(this_path.relative_to(other.path)) + else: + common_ancestor = self.find_common_ancestor(other) + path_to_common_ancestor = other._path_to_ancestor(common_ancestor) + return str( + path_to_common_ancestor / this_path.relative_to(common_ancestor.path) + ) + + def find_common_ancestor(self, other: NamedNode) -> NamedNode: + """ + Find the first common ancestor of two nodes in the same tree. + + Raise ValueError if they are not in the same tree. + """ + if self is other: + return self + + other_paths = [op.path for op in other.parents] + for parent in (self, *self.parents): + if parent.path in other_paths: + return parent + + raise NotFoundInTreeError( + "Cannot find common ancestor because nodes do not lie within the same tree" + ) + + def _path_to_ancestor(self, ancestor: NamedNode) -> NodePath: + """Return the relative path from this node to the given ancestor node""" + + if not self.same_tree(ancestor): + raise NotFoundInTreeError( + "Cannot find relative path to ancestor because nodes do not lie within the same tree" + ) + if ancestor.path not in list(a.path for a in (self, *self.parents)): + raise NotFoundInTreeError( + "Cannot find relative path to ancestor because given node is not an ancestor of this node" + ) + + parents_paths = list(parent.path for parent in (self, *self.parents)) + generation_gap = list(parents_paths).index(ancestor.path) + path_upwards = "../" * generation_gap if generation_gap > 0 else "." + return NodePath(path_upwards) diff --git a/xarray/datatree_/docs/Makefile b/xarray/datatree_/docs/Makefile new file mode 100644 index 00000000000..6e9b4058414 --- /dev/null +++ b/xarray/datatree_/docs/Makefile @@ -0,0 +1,183 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html rtdhtml dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " rtdhtml Build html using same settings used on ReadtheDocs" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +rtdhtml: + $(SPHINXBUILD) -T -j auto -E -W --keep-going -b html -d $(BUILDDIR)/doctrees -D language=en . $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/complexity.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/complexity.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/complexity" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/complexity" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/xarray/datatree_/docs/README.md b/xarray/datatree_/docs/README.md new file mode 100644 index 00000000000..ca2bf72952e --- /dev/null +++ b/xarray/datatree_/docs/README.md @@ -0,0 +1,14 @@ +# README - docs + +## Build the documentation locally + +```bash +cd docs # From project's root +make clean +rm -rf source/generated # remove autodoc artefacts, that are not removed by `make clean` +make html +``` + +## Access the documentation locally + +Open `docs/_build/html/index.html` in a web browser diff --git a/xarray/datatree_/docs/make.bat b/xarray/datatree_/docs/make.bat new file mode 100644 index 00000000000..2df9a8cbbb6 --- /dev/null +++ b/xarray/datatree_/docs/make.bat @@ -0,0 +1,242 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\complexity.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\complexity.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/xarray/datatree_/docs/source/api.rst b/xarray/datatree_/docs/source/api.rst new file mode 100644 index 00000000000..d325d24f4a4 --- /dev/null +++ b/xarray/datatree_/docs/source/api.rst @@ -0,0 +1,362 @@ +.. currentmodule:: datatree + +############# +API reference +############# + +DataTree +======== + +Creating a DataTree +------------------- + +Methods of creating a datatree. + +.. autosummary:: + :toctree: generated/ + + DataTree + DataTree.from_dict + +Tree Attributes +--------------- + +Attributes relating to the recursive tree-like structure of a ``DataTree``. + +.. autosummary:: + :toctree: generated/ + + DataTree.parent + DataTree.children + DataTree.name + DataTree.path + DataTree.root + DataTree.is_root + DataTree.is_leaf + DataTree.leaves + DataTree.level + DataTree.depth + DataTree.width + DataTree.subtree + DataTree.descendants + DataTree.siblings + DataTree.lineage + DataTree.parents + DataTree.ancestors + DataTree.groups + +Data Contents +------------- + +Interface to the data objects (optionally) stored inside a single ``DataTree`` node. +This interface echoes that of ``xarray.Dataset``. + +.. autosummary:: + :toctree: generated/ + + DataTree.dims + DataTree.sizes + DataTree.data_vars + DataTree.coords + DataTree.attrs + DataTree.encoding + DataTree.indexes + DataTree.nbytes + DataTree.ds + DataTree.to_dataset + DataTree.has_data + DataTree.has_attrs + DataTree.is_empty + DataTree.is_hollow + +Dictionary Interface +-------------------- + +``DataTree`` objects also have a dict-like interface mapping keys to either ``xarray.DataArray``s or to child ``DataTree`` nodes. + +.. autosummary:: + :toctree: generated/ + + DataTree.__getitem__ + DataTree.__setitem__ + DataTree.__delitem__ + DataTree.update + DataTree.get + DataTree.items + DataTree.keys + DataTree.values + +Tree Manipulation +----------------- + +For manipulating, traversing, navigating, or mapping over the tree structure. + +.. autosummary:: + :toctree: generated/ + + DataTree.orphan + DataTree.same_tree + DataTree.relative_to + DataTree.iter_lineage + DataTree.find_common_ancestor + DataTree.map_over_subtree + map_over_subtree + DataTree.pipe + DataTree.match + DataTree.filter + +Pathlib-like Interface +---------------------- + +``DataTree`` objects deliberately echo some of the API of `pathlib.PurePath`. + +.. autosummary:: + :toctree: generated/ + + DataTree.name + DataTree.parent + DataTree.parents + DataTree.relative_to + +Missing: + +.. + + ``DataTree.glob`` + ``DataTree.joinpath`` + ``DataTree.with_name`` + ``DataTree.walk`` + ``DataTree.rename`` + ``DataTree.replace`` + +DataTree Contents +----------------- + +Manipulate the contents of all nodes in a tree simultaneously. + +.. autosummary:: + :toctree: generated/ + + DataTree.copy + DataTree.assign_coords + DataTree.merge + DataTree.rename + DataTree.rename_vars + DataTree.rename_dims + DataTree.swap_dims + DataTree.expand_dims + DataTree.drop_vars + DataTree.drop_dims + DataTree.set_coords + DataTree.reset_coords + +DataTree Node Contents +---------------------- + +Manipulate the contents of a single DataTree node. + +.. autosummary:: + :toctree: generated/ + + DataTree.assign + DataTree.drop_nodes + +Comparisons +=========== + +Compare one ``DataTree`` object to another. + +.. autosummary:: + :toctree: generated/ + + DataTree.isomorphic + DataTree.equals + DataTree.identical + +Indexing +======== + +Index into all nodes in the subtree simultaneously. + +.. autosummary:: + :toctree: generated/ + + DataTree.isel + DataTree.sel + DataTree.drop_sel + DataTree.drop_isel + DataTree.head + DataTree.tail + DataTree.thin + DataTree.squeeze + DataTree.interp + DataTree.interp_like + DataTree.reindex + DataTree.reindex_like + DataTree.set_index + DataTree.reset_index + DataTree.reorder_levels + DataTree.query + +.. + + Missing: + ``DataTree.loc`` + + +Missing Value Handling +====================== + +.. autosummary:: + :toctree: generated/ + + DataTree.isnull + DataTree.notnull + DataTree.combine_first + DataTree.dropna + DataTree.fillna + DataTree.ffill + DataTree.bfill + DataTree.interpolate_na + DataTree.where + DataTree.isin + +Computation +=========== + +Apply a computation to the data in all nodes in the subtree simultaneously. + +.. autosummary:: + :toctree: generated/ + + DataTree.map + DataTree.reduce + DataTree.diff + DataTree.quantile + DataTree.differentiate + DataTree.integrate + DataTree.map_blocks + DataTree.polyfit + DataTree.curvefit + +Aggregation +=========== + +Aggregate data in all nodes in the subtree simultaneously. + +.. autosummary:: + :toctree: generated/ + + DataTree.all + DataTree.any + DataTree.argmax + DataTree.argmin + DataTree.idxmax + DataTree.idxmin + DataTree.max + DataTree.min + DataTree.mean + DataTree.median + DataTree.prod + DataTree.sum + DataTree.std + DataTree.var + DataTree.cumsum + DataTree.cumprod + +ndarray methods +=============== + +Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. + +.. autosummary:: + :toctree: generated/ + + DataTree.argsort + DataTree.astype + DataTree.clip + DataTree.conj + DataTree.conjugate + DataTree.round + DataTree.rank + +Reshaping and reorganising +========================== + +Reshape or reorganise the data in all nodes in the subtree. + +.. autosummary:: + :toctree: generated/ + + DataTree.transpose + DataTree.stack + DataTree.unstack + DataTree.shift + DataTree.roll + DataTree.pad + DataTree.sortby + DataTree.broadcast_like + +Plotting +======== + +I/O +=== + +Open a datatree from an on-disk store or serialize the tree. + +.. autosummary:: + :toctree: generated/ + + open_datatree + DataTree.to_dict + DataTree.to_netcdf + DataTree.to_zarr + +.. + + Missing: + ``open_mfdatatree`` + +Tutorial +======== + +Testing +======= + +Test that two DataTree objects are similar. + +.. autosummary:: + :toctree: generated/ + + testing.assert_isomorphic + testing.assert_equal + testing.assert_identical + +Exceptions +========== + +Exceptions raised when manipulating trees. + +.. autosummary:: + :toctree: generated/ + + TreeIsomorphismError + InvalidTreeError + NotFoundInTreeError + +Advanced API +============ + +Relatively advanced API for users or developers looking to understand the internals, or extend functionality. + +.. autosummary:: + :toctree: generated/ + + DataTree.variables + register_datatree_accessor + +.. + + Missing: + ``DataTree.set_close`` diff --git a/xarray/datatree_/docs/source/conf.py b/xarray/datatree_/docs/source/conf.py new file mode 100644 index 00000000000..8a9224def5b --- /dev/null +++ b/xarray/datatree_/docs/source/conf.py @@ -0,0 +1,412 @@ +# -*- coding: utf-8 -*- +# flake8: noqa +# Ignoring F401: imported but unused + +# complexity documentation build configuration file, created by +# sphinx-quickstart on Tue Jul 9 22:26:36 2013. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import inspect +import os +import sys + +import sphinx_autosummary_accessors + +import datatree + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +cwd = os.getcwd() +parent = os.path.dirname(cwd) +sys.path.insert(0, parent) + + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.linkcode", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", + "sphinx.ext.napoleon", + "sphinx_copybutton", + "sphinxext.opengraph", + "sphinx_autosummary_accessors", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "nbsphinx", + "sphinxcontrib.srclinks", +] + +extlinks = { + "issue": ("https://github.com/xarray-contrib/datatree/issues/%s", "GH#%s"), + "pull": ("https://github.com/xarray-contrib/datatree/pull/%s", "GH#%s"), +} +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates", sphinx_autosummary_accessors.templates_path] + +# Generate the API documentation when building +autosummary_generate = True + + +# Napoleon configurations + +napoleon_google_docstring = False +napoleon_numpy_docstring = True +napoleon_use_param = False +napoleon_use_rtype = False +napoleon_preprocess_types = True +napoleon_type_aliases = { + # general terms + "sequence": ":term:`sequence`", + "iterable": ":term:`iterable`", + "callable": ":py:func:`callable`", + "dict_like": ":term:`dict-like `", + "dict-like": ":term:`dict-like `", + "path-like": ":term:`path-like `", + "mapping": ":term:`mapping`", + "file-like": ":term:`file-like `", + # special terms + # "same type as caller": "*same type as caller*", # does not work, yet + # "same type as values": "*same type as values*", # does not work, yet + # stdlib type aliases + "MutableMapping": "~collections.abc.MutableMapping", + "sys.stdout": ":obj:`sys.stdout`", + "timedelta": "~datetime.timedelta", + "string": ":class:`string `", + # numpy terms + "array_like": ":term:`array_like`", + "array-like": ":term:`array-like `", + "scalar": ":term:`scalar`", + "array": ":term:`array`", + "hashable": ":term:`hashable `", + # matplotlib terms + "color-like": ":py:func:`color-like `", + "matplotlib colormap name": ":doc:`matplotlib colormap name `", + "matplotlib axes object": ":py:class:`matplotlib axes object `", + "colormap": ":py:class:`colormap `", + # objects without namespace: xarray + "DataArray": "~xarray.DataArray", + "Dataset": "~xarray.Dataset", + "Variable": "~xarray.Variable", + "DatasetGroupBy": "~xarray.core.groupby.DatasetGroupBy", + "DataArrayGroupBy": "~xarray.core.groupby.DataArrayGroupBy", + # objects without namespace: numpy + "ndarray": "~numpy.ndarray", + "MaskedArray": "~numpy.ma.MaskedArray", + "dtype": "~numpy.dtype", + "ComplexWarning": "~numpy.ComplexWarning", + # objects without namespace: pandas + "Index": "~pandas.Index", + "MultiIndex": "~pandas.MultiIndex", + "CategoricalIndex": "~pandas.CategoricalIndex", + "TimedeltaIndex": "~pandas.TimedeltaIndex", + "DatetimeIndex": "~pandas.DatetimeIndex", + "Series": "~pandas.Series", + "DataFrame": "~pandas.DataFrame", + "Categorical": "~pandas.Categorical", + "Path": "~~pathlib.Path", + # objects with abbreviated namespace (from pandas) + "pd.Index": "~pandas.Index", + "pd.NaT": "~pandas.NaT", +} + +# The suffix of source filenames. +source_suffix = ".rst" + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "Datatree" +copyright = "2021 onwards, Tom Nicholas and its Contributors" +author = "Tom Nicholas" + +html_show_sourcelink = True +srclink_project = "https://github.com/xarray-contrib/datatree" +srclink_branch = "main" +srclink_src_path = "docs/source" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = datatree.__version__ +# The full version, including alpha/beta/rc tags. +release = datatree.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ["_build"] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + + +# -- Intersphinx links --------------------------------------------------------- + +intersphinx_mapping = { + "python": ("https://docs.python.org/3.8/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "xarray": ("https://xarray.pydata.org/en/stable/", None), +} + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "sphinx_book_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + "repository_url": "https://github.com/xarray-contrib/datatree", + "repository_branch": "main", + "path_to_docs": "docs/source", + "use_repository_button": True, + "use_issues_button": True, + "use_edit_page_button": True, +} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = "datatree_doc" + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ("index", "datatree.tex", "Datatree Documentation", author, "manual") +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [("index", "datatree", "Datatree Documentation", [author], 1)] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + "index", + "datatree", + "Datatree Documentation", + author, + "datatree", + "Tree-like hierarchical data structure for xarray.", + "Miscellaneous", + ) +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# texinfo_no_detailmenu = False + + +# based on numpy doc/source/conf.py +def linkcode_resolve(domain, info): + """ + Determine the URL corresponding to Python object + """ + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + obj = getattr(obj, part) + except AttributeError: + return None + + try: + fn = inspect.getsourcefile(inspect.unwrap(obj)) + except TypeError: + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except OSError: + lineno = None + + if lineno: + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" + else: + linespec = "" + + fn = os.path.relpath(fn, start=os.path.dirname(datatree.__file__)) + + if "+" in datatree.__version__: + return f"https://github.com/xarray-contrib/datatree/blob/main/datatree/{fn}{linespec}" + else: + return ( + f"https://github.com/xarray-contrib/datatree/blob/" + f"v{datatree.__version__}/datatree/{fn}{linespec}" + ) diff --git a/xarray/datatree_/docs/source/contributing.rst b/xarray/datatree_/docs/source/contributing.rst new file mode 100644 index 00000000000..b070c07c867 --- /dev/null +++ b/xarray/datatree_/docs/source/contributing.rst @@ -0,0 +1,136 @@ +======================== +Contributing to Datatree +======================== + +Contributions are highly welcomed and appreciated. Every little help counts, +so do not hesitate! + +.. contents:: Contribution links + :depth: 2 + +.. _submitfeedback: + +Feature requests and feedback +----------------------------- + +Do you like Datatree? Share some love on Twitter or in your blog posts! + +We'd also like to hear about your propositions and suggestions. Feel free to +`submit them as issues `_ and: + +* Explain in detail how they should work. +* Keep the scope as narrow as possible. This will make it easier to implement. + +.. _reportbugs: + +Report bugs +----------- + +Report bugs for Datatree in the `issue tracker `_. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting, + specifically the Python interpreter version, installed libraries, and Datatree + version. +* Detailed steps to reproduce the bug. + +If you can write a demonstration test that currently fails but should pass +(xfail), that is a very useful commit to make as well, even if you cannot +fix the bug itself. + +.. _fixbugs: + +Fix bugs +-------- + +Look through the `GitHub issues for bugs `_. + +Talk to developers to find out how you can fix specific bugs. + +Write documentation +------------------- + +Datatree could always use more documentation. What exactly is needed? + +* More complementary documentation. Have you perhaps found something unclear? +* Docstrings. There can never be too many of them. +* Blog posts, articles and such -- they're all very appreciated. + +You can also edit documentation files directly in the GitHub web interface, +without using a local copy. This can be convenient for small fixes. + +To build the documentation locally, you first need to install the following +tools: + +- `Sphinx `__ +- `sphinx_rtd_theme `__ +- `sphinx-autosummary-accessors `__ + +You can then build the documentation with the following commands:: + + $ cd docs + $ make html + +The built documentation should be available in the ``docs/_build/`` folder. + +.. _`pull requests`: +.. _pull-requests: + +Preparing Pull Requests +----------------------- + +#. Fork the + `Datatree GitHub repository `__. It's + fine to use ``Datatree`` as your fork repository name because it will live + under your user. + +#. Clone your fork locally using `git `_ and create a branch:: + + $ git clone git@github.com:{YOUR_GITHUB_USERNAME}/Datatree.git + $ cd Datatree + + # now, to fix a bug or add feature create your own branch off "master": + + $ git checkout -b your-bugfix-feature-branch-name master + +#. Install `pre-commit `_ and its hook on the Datatree repo:: + + $ pip install --user pre-commit + $ pre-commit install + + Afterwards ``pre-commit`` will run whenever you commit. + + https://pre-commit.com/ is a framework for managing and maintaining multi-language pre-commit hooks + to ensure code-style and code formatting is consistent. + +#. Install dependencies into a new conda environment:: + + $ conda env update -f ci/environment.yml + +#. Run all the tests + + Now running tests is as simple as issuing this command:: + + $ conda activate datatree-dev + $ pytest --junitxml=test-reports/junit.xml --cov=./ --verbose + + This command will run tests via the "pytest" tool. + +#. You can now edit your local working copy and run the tests again as necessary. Please follow PEP-8 for naming. + + When committing, ``pre-commit`` will re-format the files if necessary. + +#. Commit and push once your tests pass and you are happy with your change(s):: + + $ git commit -a -m "" + $ git push -u + +#. Finally, submit a pull request through the GitHub website using this data:: + + head-fork: YOUR_GITHUB_USERNAME/Datatree + compare: your-branch-name + + base-fork: TomNicholas/datatree + base: master diff --git a/xarray/datatree_/docs/source/data-structures.rst b/xarray/datatree_/docs/source/data-structures.rst new file mode 100644 index 00000000000..02e4a31f688 --- /dev/null +++ b/xarray/datatree_/docs/source/data-structures.rst @@ -0,0 +1,197 @@ +.. currentmodule:: datatree + +.. _data structures: + +Data Structures +=============== + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xr + import datatree + + np.random.seed(123456) + np.set_printoptions(threshold=10) + + %xmode minimal + +.. note:: + + This page builds on the information given in xarray's main page on + `data structures `_, so it is suggested that you + are familiar with those first. + +DataTree +-------- + +:py:class:`DataTree` is xarray's highest-level data structure, able to organise heterogeneous data which +could not be stored inside a single :py:class:`Dataset` object. This includes representing the recursive structure of multiple +`groups`_ within a netCDF file or `Zarr Store`_. + +.. _groups: https://www.unidata.ucar.edu/software/netcdf/workshops/2011/groups-types/GroupsIntro.html +.. _Zarr Store: https://zarr.readthedocs.io/en/stable/tutorial.html#groups + +Each ``DataTree`` object (or "node") contains the same data that a single ``xarray.Dataset`` would (i.e. ``DataArray`` objects +stored under hashable keys), and so has the same key properties: + +- ``dims``: a dictionary mapping of dimension names to lengths, for the variables in this node, +- ``data_vars``: a dict-like container of DataArrays corresponding to variables in this node, +- ``coords``: another dict-like container of DataArrays, corresponding to coordinate variables in this node, +- ``attrs``: dict to hold arbitary metadata relevant to data in this node. + +A single ``DataTree`` object acts much like a single ``Dataset`` object, and has a similar set of dict-like methods +defined upon it. However, ``DataTree``'s can also contain other ``DataTree`` objects, so they can be thought of as nested dict-like +containers of both ``xarray.DataArray``'s and ``DataTree``'s. + +A single datatree object is known as a "node", and its position relative to other nodes is defined by two more key +properties: + +- ``children``: An ordered dictionary mapping from names to other ``DataTree`` objects, known as its' "child nodes". +- ``parent``: The single ``DataTree`` object whose children this datatree is a member of, known as its' "parent node". + +Each child automatically knows about its parent node, and a node without a parent is known as a "root" node +(represented by the ``parent`` attribute pointing to ``None``). +Nodes can have multiple children, but as each child node has at most one parent, there can only ever be one root node in a given tree. + +The overall structure is technically a `connected acyclic undirected rooted graph`, otherwise known as a +`"Tree" `_. + +.. note:: + + Technically a ``DataTree`` with more than one child node forms an `"Ordered Tree" `_, + because the children are stored in an Ordered Dictionary. However, this distinction only really matters for a few + edge cases involving operations on multiple trees simultaneously, and can safely be ignored by most users. + + +``DataTree`` objects can also optionally have a ``name`` as well as ``attrs``, just like a ``DataArray``. +Again these are not normally used unless explicitly accessed by the user. + + +.. _creating a datatree: + +Creating a DataTree +~~~~~~~~~~~~~~~~~~~ + +One way to create a ``DataTree`` from scratch is to create each node individually, +specifying the nodes' relationship to one another as you create each one. + +The ``DataTree`` constructor takes: + +- ``data``: The data that will be stored in this node, represented by a single ``xarray.Dataset``, or a named ``xarray.DataArray``. +- ``parent``: The parent node (if there is one), given as a ``DataTree`` object. +- ``children``: The various child nodes (if there are any), given as a mapping from string keys to ``DataTree`` objects. +- ``name``: A string to use as the name of this node. + +Let's make a single datatree node with some example data in it: + +.. ipython:: python + + from datatree import DataTree + + ds1 = xr.Dataset({"foo": "orange"}) + dt = DataTree(name="root", data=ds1) # create root node + + dt + +At this point our node is also the root node, as every tree has a root node. + +We can add a second node to this tree either by referring to the first node in the constructor of the second: + +.. ipython:: python + + ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])}) + # add a child by referring to the parent node + node2 = DataTree(name="a", parent=dt, data=ds2) + +or by dynamically updating the attributes of one node to refer to another: + +.. ipython:: python + + # add a second child by first creating a new node ... + ds3 = xr.Dataset({"zed": np.NaN}) + node3 = DataTree(name="b", data=ds3) + # ... then updating its .parent property + node3.parent = dt + +Our tree now has three nodes within it: + +.. ipython:: python + + dt + +It is at tree construction time that consistency checks are enforced. For instance, if we try to create a `cycle` the constructor will raise an error: + +.. ipython:: python + :okexcept: + + dt.parent = node3 + +Alternatively you can also create a ``DataTree`` object from + +- An ``xarray.Dataset`` using ``Dataset.to_node()`` (not yet implemented), +- A dictionary mapping directory-like paths to either ``DataTree`` nodes or data, using :py:meth:`DataTree.from_dict()`, +- A netCDF or Zarr file on disk with :py:func:`open_datatree()`. See :ref:`reading and writing files `. + + +DataTree Contents +~~~~~~~~~~~~~~~~~ + +Like ``xarray.Dataset``, ``DataTree`` implements the python mapping interface, but with values given by either ``xarray.DataArray`` objects or other ``DataTree`` objects. + +.. ipython:: python + + dt["a"] + dt["foo"] + +Iterating over keys will iterate over both the names of variables and child nodes. + +We can also access all the data in a single node through a dataset-like view + +.. ipython:: python + + dt["a"].ds + +This demonstrates the fact that the data in any one node is equivalent to the contents of a single ``xarray.Dataset`` object. +The ``DataTree.ds`` property returns an immutable view, but we can instead extract the node's data contents as a new (and mutable) +``xarray.Dataset`` object via :py:meth:`DataTree.to_dataset()`: + +.. ipython:: python + + dt["a"].to_dataset() + +Like with ``Dataset``, you can access the data and coordinate variables of a node separately via the ``data_vars`` and ``coords`` attributes: + +.. ipython:: python + + dt["a"].data_vars + dt["a"].coords + + +Dictionary-like methods +~~~~~~~~~~~~~~~~~~~~~~~ + +We can update a datatree in-place using Python's standard dictionary syntax, similar to how we can for Dataset objects. +For example, to create this example datatree from scratch, we could have written: + +# TODO update this example using ``.coords`` and ``.data_vars`` as setters, + +.. ipython:: python + + dt = DataTree(name="root") + dt["foo"] = "orange" + dt["a"] = DataTree(data=xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})) + dt["a/b/zed"] = np.NaN + dt + +To change the variables in a node of a ``DataTree``, you can use all the standard dictionary +methods, including ``values``, ``items``, ``__delitem__``, ``get`` and +:py:meth:`DataTree.update`. +Note that assigning a ``DataArray`` object to a ``DataTree`` variable using ``__setitem__`` or ``update`` will +:ref:`automatically align ` the array(s) to the original node's indexes. + +If you copy a ``DataTree`` using the :py:func:`copy` function or the :py:meth:`DataTree.copy` method it will copy the subtree, +meaning that node and children below it, but no parents above it. +Like for ``Dataset``, this copy is shallow by default, but you can copy all the underlying data arrays by calling ``dt.copy(deep=True)``. diff --git a/xarray/datatree_/docs/source/hierarchical-data.rst b/xarray/datatree_/docs/source/hierarchical-data.rst new file mode 100644 index 00000000000..d4f58847718 --- /dev/null +++ b/xarray/datatree_/docs/source/hierarchical-data.rst @@ -0,0 +1,639 @@ +.. currentmodule:: datatree + +.. _hierarchical-data: + +Working With Hierarchical Data +============================== + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xr + from datatree import DataTree + + np.random.seed(123456) + np.set_printoptions(threshold=10) + + %xmode minimal + +Why Hierarchical Data? +---------------------- + +Many real-world datasets are composed of multiple differing components, +and it can often be be useful to think of these in terms of a hierarchy of related groups of data. +Examples of data which one might want organise in a grouped or hierarchical manner include: + +- Simulation data at multiple resolutions, +- Observational data about the same system but from multiple different types of sensors, +- Mixed experimental and theoretical data, +- A systematic study recording the same experiment but with different parameters, +- Heterogenous data, such as demographic and metereological data, + +or even any combination of the above. + +Often datasets like this cannot easily fit into a single :py:class:`xarray.Dataset` object, +or are more usefully thought of as groups of related ``xarray.Dataset`` objects. +For this purpose we provide the :py:class:`DataTree` class. + +This page explains in detail how to understand and use the different features of the :py:class:`DataTree` class for your own hierarchical data needs. + +.. _node relationships: + +Node Relationships +------------------ + +.. _creating a family tree: + +Creating a Family Tree +~~~~~~~~~~~~~~~~~~~~~~ + +The three main ways of creating a ``DataTree`` object are described briefly in :ref:`creating a datatree`. +Here we go into more detail about how to create a tree node-by-node, using a famous family tree from the Simpsons cartoon as an example. + +Let's start by defining nodes representing the two siblings, Bart and Lisa Simpson: + +.. ipython:: python + + bart = DataTree(name="Bart") + lisa = DataTree(name="Lisa") + +Each of these node objects knows their own :py:class:`~DataTree.name`, but they currently have no relationship to one another. +We can connect them by creating another node representing a common parent, Homer Simpson: + +.. ipython:: python + + homer = DataTree(name="Homer", children={"Bart": bart, "Lisa": lisa}) + +Here we set the children of Homer in the node's constructor. +We now have a small family tree + +.. ipython:: python + + homer + +where we can see how these individual Simpson family members are related to one another. +The nodes representing Bart and Lisa are now connected - we can confirm their sibling rivalry by examining the :py:class:`~DataTree.siblings` property: + +.. ipython:: python + + list(bart.siblings) + +But oops, we forgot Homer's third daughter, Maggie! Let's add her by updating Homer's :py:class:`~DataTree.children` property to include her: + +.. ipython:: python + + maggie = DataTree(name="Maggie") + homer.children = {"Bart": bart, "Lisa": lisa, "Maggie": maggie} + homer + +Let's check that Maggie knows who her Dad is: + +.. ipython:: python + + maggie.parent.name + +That's good - updating the properties of our nodes does not break the internal consistency of our tree, as changes of parentage are automatically reflected on both nodes. + + These children obviously have another parent, Marge Simpson, but ``DataTree`` nodes can only have a maximum of one parent. + Genealogical `family trees are not even technically trees `_ in the mathematical sense - + the fact that distant relatives can mate makes it a directed acyclic graph. + Trees of ``DataTree`` objects cannot represent this. + +Homer is currently listed as having no parent (the so-called "root node" of this tree), but we can update his :py:class:`~DataTree.parent` property: + +.. ipython:: python + + abe = DataTree(name="Abe") + homer.parent = abe + +Abe is now the "root" of this tree, which we can see by examining the :py:class:`~DataTree.root` property of any node in the tree + +.. ipython:: python + + maggie.root.name + +We can see the whole tree by printing Abe's node or just part of the tree by printing Homer's node: + +.. ipython:: python + + abe + homer + +We can see that Homer is aware of his parentage, and we say that Homer and his children form a "subtree" of the larger Simpson family tree. + +In episode 28, Abe Simpson reveals that he had another son, Herbert "Herb" Simpson. +We can add Herbert to the family tree without displacing Homer by :py:meth:`~DataTree.assign`-ing another child to Abe: + +.. ipython:: python + + herbert = DataTree(name="Herb") + abe.assign({"Herbert": herbert}) + +.. note:: + This example shows a minor subtlety - the returned tree has Homer's brother listed as ``"Herbert"``, + but the original node was named "Herbert". Not only are names overriden when stored as keys like this, + but the new node is a copy, so that the original node that was reference is unchanged (i.e. ``herbert.name == "Herb"`` still). + In other words, nodes are copied into trees, not inserted into them. + This is intentional, and mirrors the behaviour when storing named ``xarray.DataArray`` objects inside datasets. + +Certain manipulations of our tree are forbidden, if they would create an inconsistent result. +In episode 51 of the show Futurama, Philip J. Fry travels back in time and accidentally becomes his own Grandfather. +If we try similar time-travelling hijinks with Homer, we get a :py:class:`InvalidTreeError` raised: + +.. ipython:: python + :okexcept: + + abe.parent = homer + +.. _evolutionary tree: + +Ancestry in an Evolutionary Tree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's use a different example of a tree to discuss more complex relationships between nodes - the phylogenetic tree, or tree of life. + +.. ipython:: python + + vertebrates = DataTree.from_dict( + name="Vertebrae", + d={ + "/Sharks": None, + "/Bony Skeleton/Ray-finned Fish": None, + "/Bony Skeleton/Four Limbs/Amphibians": None, + "/Bony Skeleton/Four Limbs/Amniotic Egg/Hair/Primates": None, + "/Bony Skeleton/Four Limbs/Amniotic Egg/Hair/Rodents & Rabbits": None, + "/Bony Skeleton/Four Limbs/Amniotic Egg/Two Fenestrae/Dinosaurs": None, + "/Bony Skeleton/Four Limbs/Amniotic Egg/Two Fenestrae/Birds": None, + }, + ) + + primates = vertebrates["/Bony Skeleton/Four Limbs/Amniotic Egg/Hair/Primates"] + dinosaurs = vertebrates[ + "/Bony Skeleton/Four Limbs/Amniotic Egg/Two Fenestrae/Dinosaurs" + ] + +We have used the :py:meth:`~DataTree.from_dict` constructor method as an alternate way to quickly create a whole tree, +and :ref:`filesystem paths` (to be explained shortly) to select two nodes of interest. + +.. ipython:: python + + vertebrates + +This tree shows various families of species, grouped by their common features (making it technically a `"Cladogram" `_, +rather than an evolutionary tree). + +Here both the species and the features used to group them are represented by ``DataTree`` node objects - there is no distinction in types of node. +We can however get a list of only the nodes we used to represent species by using the fact that all those nodes have no children - they are "leaf nodes". +We can check if a node is a leaf with :py:meth:`~DataTree.is_leaf`, and get a list of all leaves with the :py:class:`~DataTree.leaves` property: + +.. ipython:: python + + primates.is_leaf + [node.name for node in vertebrates.leaves] + +Pretending that this is a true evolutionary tree for a moment, we can find the features of the evolutionary ancestors (so-called "ancestor" nodes), +the distinguishing feature of the common ancestor of all vertebrate life (the root node), +and even the distinguishing feature of the common ancestor of any two species (the common ancestor of two nodes): + +.. ipython:: python + + [node.name for node in primates.ancestors] + primates.root.name + primates.find_common_ancestor(dinosaurs).name + +We can only find a common ancestor between two nodes that lie in the same tree. +If we try to find the common evolutionary ancestor between primates and an Alien species that has no relationship to Earth's evolutionary tree, +an error will be raised. + +.. ipython:: python + :okexcept: + + alien = DataTree(name="Xenomorph") + primates.find_common_ancestor(alien) + + +.. _navigating trees: + +Navigating Trees +---------------- + +There are various ways to access the different nodes in a tree. + +Properties +~~~~~~~~~~ + +We can navigate trees using the :py:class:`~DataTree.parent` and :py:class:`~DataTree.children` properties of each node, for example: + +.. ipython:: python + + lisa.parent.children["Bart"].name + +but there are also more convenient ways to access nodes. + +Dictionary-like interface +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Children are stored on each node as a key-value mapping from name to child node. +They can be accessed and altered via the :py:class:`~DataTree.__getitem__` and :py:class:`~DataTree.__setitem__` syntax. +In general :py:class:`~DataTree.DataTree` objects support almost the entire set of dict-like methods, +including :py:meth:`~DataTree.keys`, :py:class:`~DataTree.values`, :py:class:`~DataTree.items`, +:py:meth:`~DataTree.__delitem__` and :py:meth:`~DataTree.update`. + +.. ipython:: python + + vertebrates["Bony Skeleton"]["Ray-finned Fish"] + +Note that the dict-like interface combines access to child ``DataTree`` nodes and stored ``DataArrays``, +so if we have a node that contains both children and data, calling :py:meth:`~DataTree.keys` will list both names of child nodes and +names of data variables: + +.. ipython:: python + + dt = DataTree( + data=xr.Dataset({"foo": 0, "bar": 1}), + children={"a": DataTree(), "b": DataTree()}, + ) + print(dt) + list(dt.keys()) + +This also means that the names of variables and of child nodes must be different to one another. + +Attribute-like access +~~~~~~~~~~~~~~~~~~~~~ + +You can also select both variables and child nodes through dot indexing + +.. ipython:: python + + dt.foo + dt.a + +.. _filesystem paths: + +Filesystem-like Paths +~~~~~~~~~~~~~~~~~~~~~ + +Hierarchical trees can be thought of as analogous to file systems. +Each node is like a directory, and each directory can contain both more sub-directories and data. + +.. note:: + + You can even make the filesystem analogy concrete by using :py:func:`~DataTree.open_mfdatatree` or :py:func:`~DataTree.save_mfdatatree` # TODO not yet implemented - see GH issue 51 + +Datatree objects support a syntax inspired by unix-like filesystems, +where the "path" to a node is specified by the keys of each intermediate node in sequence, +separated by forward slashes. +This is an extension of the conventional dictionary ``__getitem__`` syntax to allow navigation across multiple levels of the tree. + +Like with filepaths, paths within the tree can either be relative to the current node, e.g. + +.. ipython:: python + + abe["Homer/Bart"].name + abe["./Homer/Bart"].name # alternative syntax + +or relative to the root node. +A path specified from the root (as opposed to being specified relative to an arbitrary node in the tree) is sometimes also referred to as a +`"fully qualified name" `_, +or as an "absolute path". +The root node is referred to by ``"/"``, so the path from the root node to its grand-child would be ``"/child/grandchild"``, e.g. + +.. ipython:: python + + # absolute path will start from root node + lisa["/Homer/Bart"].name + +Relative paths between nodes also support the ``"../"`` syntax to mean the parent of the current node. +We can use this with ``__setitem__`` to add a missing entry to our evolutionary tree, but add it relative to a more familiar node of interest: + +.. ipython:: python + + primates["../../Two Fenestrae/Crocodiles"] = DataTree() + print(vertebrates) + +Given two nodes in a tree, we can also find their relative path: + +.. ipython:: python + + bart.relative_to(lisa) + +You can use this filepath feature to build a nested tree from a dictionary of filesystem-like paths and corresponding ``xarray.Dataset`` objects in a single step. +If we have a dictionary where each key is a valid path, and each value is either valid data or ``None``, +we can construct a complex tree quickly using the alternative constructor :py:meth:`DataTree.from_dict()`: + +.. ipython:: python + + d = { + "/": xr.Dataset({"foo": "orange"}), + "/a": xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])}), + "/a/b": xr.Dataset({"zed": np.NaN}), + "a/c/d": None, + } + dt = DataTree.from_dict(d) + dt + +.. note:: + + Notice that using the path-like syntax will also create any intermediate empty nodes necessary to reach the end of the specified path + (i.e. the node labelled `"c"` in this case.) + This is to help avoid lots of redundant entries when creating deeply-nested trees using :py:meth:`DataTree.from_dict`. + +.. _iterating over trees: + +Iterating over trees +~~~~~~~~~~~~~~~~~~~~ + +You can iterate over every node in a tree using the subtree :py:class:`~DataTree.subtree` property. +This returns an iterable of nodes, which yields them in depth-first order. + +.. ipython:: python + + for node in vertebrates.subtree: + print(node.path) + +A very useful pattern is to use :py:class:`~DataTree.subtree` conjunction with the :py:class:`~DataTree.path` property to manipulate the nodes however you wish, +then rebuild a new tree using :py:meth:`DataTree.from_dict()`. + +For example, we could keep only the nodes containing data by looping over all nodes, +checking if they contain any data using :py:class:`~DataTree.has_data`, +then rebuilding a new tree using only the paths of those nodes: + +.. ipython:: python + + non_empty_nodes = {node.path: node.ds for node in dt.subtree if node.has_data} + DataTree.from_dict(non_empty_nodes) + +You can see this tree is similar to the ``dt`` object above, except that it is missing the empty nodes ``a/c`` and ``a/c/d``. + +(If you want to keep the name of the root node, you will need to add the ``name`` kwarg to :py:class:`from_dict`, i.e. ``DataTree.from_dict(non_empty_nodes, name=dt.root.name)``.) + +.. _manipulating trees: + +Manipulating Trees +------------------ + +Subsetting Tree Nodes +~~~~~~~~~~~~~~~~~~~~~ + +We can subset our tree to select only nodes of interest in various ways. + +Similarly to on a real filesystem, matching nodes by common patterns in their paths is often useful. +We can use :py:meth:`DataTree.match` for this: + +.. ipython:: python + + dt = DataTree.from_dict( + { + "/a/A": None, + "/a/B": None, + "/b/A": None, + "/b/B": None, + } + ) + result = dt.match("*/B") + result + +We can also subset trees by the contents of the nodes. +:py:meth:`DataTree.filter` retains only the nodes of a tree that meet a certain condition. +For example, we could recreate the Simpson's family tree with the ages of each individual, then filter for only the adults: +First lets recreate the tree but with an `age` data variable in every node: + +.. ipython:: python + + simpsons = DataTree.from_dict( + d={ + "/": xr.Dataset({"age": 83}), + "/Herbert": xr.Dataset({"age": 40}), + "/Homer": xr.Dataset({"age": 39}), + "/Homer/Bart": xr.Dataset({"age": 10}), + "/Homer/Lisa": xr.Dataset({"age": 8}), + "/Homer/Maggie": xr.Dataset({"age": 1}), + }, + name="Abe", + ) + simpsons + +Now let's filter out the minors: + +.. ipython:: python + + simpsons.filter(lambda node: node["age"] > 18) + +The result is a new tree, containing only the nodes matching the condition. + +(Yes, under the hood :py:meth:`~DataTree.filter` is just syntactic sugar for the pattern we showed you in :ref:`iterating over trees` !) + +.. _Tree Contents: + +Tree Contents +------------- + +Hollow Trees +~~~~~~~~~~~~ + +A concept that can sometimes be useful is that of a "Hollow Tree", which means a tree with data stored only at the leaf nodes. +This is useful because certain useful tree manipulation operations only make sense for hollow trees. + +You can check if a tree is a hollow tree by using the :py:class:`~DataTree.is_hollow` property. +We can see that the Simpson's family is not hollow because the data variable ``"age"`` is present at some nodes which +have children (i.e. Abe and Homer). + +.. ipython:: python + + simpsons.is_hollow + +.. _tree computation: + +Computation +----------- + +`DataTree` objects are also useful for performing computations, not just for organizing data. + +Operations and Methods on Trees +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To show how applying operations across a whole tree at once can be useful, +let's first create a example scientific dataset. + +.. ipython:: python + + def time_stamps(n_samples, T): + """Create an array of evenly-spaced time stamps""" + return xr.DataArray( + data=np.linspace(0, 2 * np.pi * T, n_samples), dims=["time"] + ) + + + def signal_generator(t, f, A, phase): + """Generate an example electrical-like waveform""" + return A * np.sin(f * t.data + phase) + + + time_stamps1 = time_stamps(n_samples=15, T=1.5) + time_stamps2 = time_stamps(n_samples=10, T=1.0) + + voltages = DataTree.from_dict( + { + "/oscilloscope1": xr.Dataset( + { + "potential": ( + "time", + signal_generator(time_stamps1, f=2, A=1.2, phase=0.5), + ), + "current": ( + "time", + signal_generator(time_stamps1, f=2, A=1.2, phase=1), + ), + }, + coords={"time": time_stamps1}, + ), + "/oscilloscope2": xr.Dataset( + { + "potential": ( + "time", + signal_generator(time_stamps2, f=1.6, A=1.6, phase=0.2), + ), + "current": ( + "time", + signal_generator(time_stamps2, f=1.6, A=1.6, phase=0.7), + ), + }, + coords={"time": time_stamps2}, + ), + } + ) + voltages + +Most xarray computation methods also exist as methods on datatree objects, +so you can for example take the mean value of these two timeseries at once: + +.. ipython:: python + + voltages.mean(dim="time") + +This works by mapping the standard :py:meth:`xarray.Dataset.mean()` method over the dataset stored in each node of the +tree one-by-one. + +The arguments passed to the method are used for every node, so the values of the arguments you pass might be valid for one node and invalid for another + +.. ipython:: python + :okexcept: + + voltages.isel(time=12) + +Notice that the error raised helpfully indicates which node of the tree the operation failed on. + +Arithmetic Methods on Trees +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Arithmetic methods are also implemented, so you can e.g. add a scalar to every dataset in the tree at once. +For example, we can advance the timeline of the Simpsons by a decade just by + +.. ipython:: python + + simpsons + 10 + +See that the same change (fast-forwarding by adding 10 years to the age of each character) has been applied to every node. + +Mapping Custom Functions Over Trees +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can map custom computation over each node in a tree using :py:meth:`DataTree.map_over_subtree`. +You can map any function, so long as it takes `xarray.Dataset` objects as one (or more) of the input arguments, +and returns one (or more) xarray datasets. + +.. note:: + + Functions passed to :py:func:`map_over_subtree` cannot alter nodes in-place. + Instead they must return new `xarray.Dataset` objects. + +For example, we can define a function to calculate the Root Mean Square of a timeseries + +.. ipython:: python + + def rms(signal): + return np.sqrt(np.mean(signal**2)) + +Then calculate the RMS value of these signals: + +.. ipython:: python + + voltages.map_over_subtree(rms) + +.. _multiple trees: + +We can also use the :py:func:`map_over_subtree` decorator to promote a function which accepts datasets into one which +accepts datatrees. + +Operating on Multiple Trees +--------------------------- + +The examples so far have involved mapping functions or methods over the nodes of a single tree, +but we can generalize this to mapping functions over multiple trees at once. + +Comparing Trees for Isomorphism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For it to make sense to map a single non-unary function over the nodes of multiple trees at once, +each tree needs to have the same structure. Specifically two trees can only be considered similar, or "isomorphic", +if they have the same number of nodes, and each corresponding node has the same number of children. +We can check if any two trees are isomorphic using the :py:meth:`DataTree.isomorphic` method. + +.. ipython:: python + :okexcept: + + dt1 = DataTree.from_dict({"a": None, "a/b": None}) + dt2 = DataTree.from_dict({"a": None}) + dt1.isomorphic(dt2) + + dt3 = DataTree.from_dict({"a": None, "b": None}) + dt1.isomorphic(dt3) + + dt4 = DataTree.from_dict({"A": None, "A/B": xr.Dataset({"foo": 1})}) + dt1.isomorphic(dt4) + +If the trees are not isomorphic a :py:class:`~TreeIsomorphismError` will be raised. +Notice that corresponding tree nodes do not need to have the same name or contain the same data in order to be considered isomorphic. + +Arithmetic Between Multiple Trees +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Arithmetic operations like multiplication are binary operations, so as long as we have two isomorphic trees, +we can do arithmetic between them. + +.. ipython:: python + + currents = DataTree.from_dict( + { + "/oscilloscope1": xr.Dataset( + { + "current": ( + "time", + signal_generator(time_stamps1, f=2, A=1.2, phase=1), + ), + }, + coords={"time": time_stamps1}, + ), + "/oscilloscope2": xr.Dataset( + { + "current": ( + "time", + signal_generator(time_stamps2, f=1.6, A=1.6, phase=0.7), + ), + }, + coords={"time": time_stamps2}, + ), + } + ) + currents + + currents.isomorphic(voltages) + +We could use this feature to quickly calculate the electrical power in our signal, P=IV. + +.. ipython:: python + + power = currents * voltages + power diff --git a/xarray/datatree_/docs/source/index.rst b/xarray/datatree_/docs/source/index.rst new file mode 100644 index 00000000000..a88a5747ada --- /dev/null +++ b/xarray/datatree_/docs/source/index.rst @@ -0,0 +1,61 @@ +.. currentmodule:: datatree + +Datatree +======== + +**Datatree is a prototype implementation of a tree-like hierarchical data structure for xarray.** + +Why Datatree? +~~~~~~~~~~~~~ + +Datatree was born after the xarray team recognised a `need for a new hierarchical data structure `_, +that was more flexible than a single :py:class:`xarray.Dataset` object. +The initial motivation was to represent netCDF files / Zarr stores with multiple nested groups in a single in-memory object, +but :py:class:`~datatree.DataTree` objects have many other uses. + +You might want to use datatree for: + +- Organising many related datasets, e.g. results of the same experiment with different parameters, or simulations of the same system using different models, +- Analysing similar data at multiple resolutions simultaneously, such as when doing a convergence study, +- Comparing heterogenous but related data, such as experimental and theoretical data, +- I/O with nested data formats such as netCDF / Zarr groups. + +Development Roadmap +~~~~~~~~~~~~~~~~~~~ + +Datatree currently lives in a separate repository to the main xarray package. +This allows the datatree developers to make changes to it, experiment, and improve it faster. + +Eventually we plan to fully integrate datatree upstream into xarray's main codebase, at which point the `github.com/xarray-contrib/datatree `_ repository will be archived. +This should not cause much disruption to code that depends on datatree - you will likely only have to change the import line (i.e. from ``from datatree import DataTree`` to ``from xarray import DataTree``). + +However, until this full integration occurs, datatree's API should not be considered to have the same `level of stability as xarray's `_. + +User Feedback +~~~~~~~~~~~~~ + +We really really really want to hear your opinions on datatree! +At this point in development, user feedback is critical to help us create something that will suit everyone's needs. +Please raise any thoughts, issues, suggestions or bugs, no matter how small or large, on the `github issue tracker `_. + +.. toctree:: + :maxdepth: 2 + :caption: Documentation Contents + + Installation + Quick Overview + Tutorial + Data Model + Hierarchical Data + Reading and Writing Files + API Reference + Terminology + Contributing Guide + What's New + GitHub repository + +Feedback +-------- + +If you encounter any errors, problems with **Datatree**, or have any suggestions, please open an issue +on `GitHub `_. diff --git a/xarray/datatree_/docs/source/installation.rst b/xarray/datatree_/docs/source/installation.rst new file mode 100644 index 00000000000..b2682743ade --- /dev/null +++ b/xarray/datatree_/docs/source/installation.rst @@ -0,0 +1,38 @@ +.. currentmodule:: datatree + +============ +Installation +============ + +Datatree can be installed in three ways: + +Using the `conda `__ package manager that comes with the +Anaconda/Miniconda distribution: + +.. code:: bash + + $ conda install xarray-datatree --channel conda-forge + +Using the `pip `__ package manager: + +.. code:: bash + + $ python -m pip install xarray-datatree + +To install a development version from source: + +.. code:: bash + + $ git clone https://github.com/xarray-contrib/datatree + $ cd datatree + $ python -m pip install -e . + + +You will just need xarray as a required dependency, with netcdf4, zarr, and h5netcdf as optional dependencies to allow file I/O. + +.. note:: + + Datatree is very much still in the early stages of development. There may be functions that are present but whose + internals are not yet implemented, or significant changes to the API in future. + That said, if you try it out and find some behaviour that looks like a bug to you, please report it on the + `issue tracker `_! diff --git a/xarray/datatree_/docs/source/io.rst b/xarray/datatree_/docs/source/io.rst new file mode 100644 index 00000000000..2f2dabf9948 --- /dev/null +++ b/xarray/datatree_/docs/source/io.rst @@ -0,0 +1,54 @@ +.. currentmodule:: datatree + +.. _io: + +Reading and Writing Files +========================= + +.. note:: + + This page builds on the information given in xarray's main page on + `reading and writing files `_, + so it is suggested that you are familiar with those first. + + +netCDF +------ + +Groups +~~~~~~ + +Whilst netCDF groups can only be loaded individually as Dataset objects, a whole file of many nested groups can be loaded +as a single :py:class:`DataTree` object. +To open a whole netCDF file as a tree of groups use the :py:func:`open_datatree` function. +To save a DataTree object as a netCDF file containing many groups, use the :py:meth:`DataTree.to_netcdf` method. + + +.. _netcdf.group.warning: + +.. warning:: + ``DataTree`` objects do not follow the exact same data model as netCDF files, which means that perfect round-tripping + is not always possible. + + In particular in the netCDF data model dimensions are entities that can exist regardless of whether any variable possesses them. + This is in contrast to `xarray's data model `_ + (and hence :ref:`datatree's data model `) in which the dimensions of a (Dataset/Tree) + object are simply the set of dimensions present across all variables in that dataset. + + This means that if a netCDF file contains dimensions but no variables which possess those dimensions, + these dimensions will not be present when that file is opened as a DataTree object. + Saving this DataTree object to file will therefore not preserve these "unused" dimensions. + +Zarr +---- + +Groups +~~~~~~ + +Nested groups in zarr stores can be represented by loading the store as a :py:class:`DataTree` object, similarly to netCDF. +To open a whole zarr store as a tree of groups use the :py:func:`open_datatree` function. +To save a DataTree object as a zarr store containing many groups, use the :py:meth:`DataTree.to_zarr()` method. + +.. note:: + Note that perfect round-tripping should always be possible with a zarr store (:ref:`unlike for netCDF files `), + as zarr does not support "unused" dimensions. diff --git a/xarray/datatree_/docs/source/quick-overview.rst b/xarray/datatree_/docs/source/quick-overview.rst new file mode 100644 index 00000000000..4743b0899fa --- /dev/null +++ b/xarray/datatree_/docs/source/quick-overview.rst @@ -0,0 +1,84 @@ +.. currentmodule:: datatree + +############## +Quick overview +############## + +DataTrees +--------- + +:py:class:`DataTree` is a tree-like container of :py:class:`xarray.DataArray` objects, organised into multiple mutually alignable groups. +You can think of it like a (recursive) ``dict`` of :py:class:`xarray.Dataset` objects. + +Let's first make some example xarray datasets (following on from xarray's +`quick overview `_ page): + +.. ipython:: python + + import numpy as np + import xarray as xr + + data = xr.DataArray(np.random.randn(2, 3), dims=("x", "y"), coords={"x": [10, 20]}) + ds = xr.Dataset(dict(foo=data, bar=("x", [1, 2]), baz=np.pi)) + ds + + ds2 = ds.interp(coords={"x": [10, 12, 14, 16, 18, 20]}) + ds2 + + ds3 = xr.Dataset( + dict(people=["alice", "bob"], heights=("people", [1.57, 1.82])), + coords={"species": "human"}, + ) + ds3 + +Now we'll put this data into a multi-group tree: + +.. ipython:: python + + from datatree import DataTree + + dt = DataTree.from_dict({"simulation/coarse": ds, "simulation/fine": ds2, "/": ds3}) + dt + +This creates a datatree with various groups. We have one root group, containing information about individual people. +(This root group can be named, but here is unnamed, so is referred to with ``"/"``, same as the root of a unix-like filesystem.) +The root group then has one subgroup ``simulation``, which contains no data itself but does contain another two subgroups, +named ``fine`` and ``coarse``. + +The (sub-)sub-groups ``fine`` and ``coarse`` contain two very similar datasets. +They both have an ``"x"`` dimension, but the dimension is of different lengths in each group, which makes the data in each group unalignable. +In the root group we placed some completely unrelated information, showing how we can use a tree to store heterogenous data. + +The constraints on each group are therefore the same as the constraint on dataarrays within a single dataset. + +We created the sub-groups using a filesystem-like syntax, and accessing groups works the same way. +We can access individual dataarrays in a similar fashion + +.. ipython:: python + + dt["simulation/coarse/foo"] + +and we can also pull out the data in a particular group as a ``Dataset`` object using ``.ds``: + +.. ipython:: python + + dt["simulation/coarse"].ds + +Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by + +.. ipython:: python + + avg = dt["simulation"].mean(dim="x") + avg + +Here the ``"x"`` dimension used is always the one local to that sub-group. + +You can do almost everything you can do with ``Dataset`` objects with ``DataTree`` objects +(including indexing and arithmetic), as operations will be mapped over every sub-group in the tree. +This allows you to work with multiple groups of non-alignable variables at once. + +.. note:: + + If all of your variables are mutually alignable + (i.e. they live on the same grid, such that every common dimension name maps to the same length), + then you probably don't need :py:class:`DataTree`, and should consider just sticking with ``xarray.Dataset``. diff --git a/xarray/datatree_/docs/source/terminology.rst b/xarray/datatree_/docs/source/terminology.rst new file mode 100644 index 00000000000..e481a01a6b2 --- /dev/null +++ b/xarray/datatree_/docs/source/terminology.rst @@ -0,0 +1,34 @@ +.. currentmodule:: datatree + +.. _terminology: + +This page extends `xarray's page on terminology `_. + +Terminology +=========== + +.. glossary:: + + DataTree + A tree-like collection of ``Dataset`` objects. A *tree* is made up of one or more *nodes*, + each of which can store the same information as a single ``Dataset`` (accessed via `.ds`). + This data is stored in the same way as in a ``Dataset``, i.e. in the form of data variables + (see **Variable** in the `corresponding xarray terminology page `_), + dimensions, coordinates, and attributes. + + The nodes in a tree are linked to one another, and each node is it's own instance of ``DataTree`` object. + Each node can have zero or more *children* (stored in a dictionary-like manner under their corresponding *names*), + and those child nodes can themselves have children. + If a node is a child of another node that other node is said to be its *parent*. Nodes can have a maximum of one parent, + and if a node has no parent it is said to be the *root* node of that *tree*. + + Subtree + A section of a *tree*, consisting of a *node* along with all the child nodes below it + (and the child nodes below them, i.e. all so-called *descendant* nodes). + Excludes the parent node and all nodes above. + + Group + Another word for a subtree, reflecting how the hierarchical structure of a ``DataTree`` allows for grouping related data together. + Analogous to a single + `netCDF group `_ or + `Zarr group `_. diff --git a/xarray/datatree_/docs/source/tutorial.rst b/xarray/datatree_/docs/source/tutorial.rst new file mode 100644 index 00000000000..6e33bd36f91 --- /dev/null +++ b/xarray/datatree_/docs/source/tutorial.rst @@ -0,0 +1,7 @@ +.. currentmodule:: datatree + +======== +Tutorial +======== + +Coming soon! diff --git a/xarray/datatree_/docs/source/whats-new.rst b/xarray/datatree_/docs/source/whats-new.rst new file mode 100644 index 00000000000..2f6e4f88fe5 --- /dev/null +++ b/xarray/datatree_/docs/source/whats-new.rst @@ -0,0 +1,426 @@ +.. currentmodule:: datatree + +What's New +========== + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xray + import xarray + import xarray as xr + import datatree + + np.random.seed(123456) + +.. _whats-new.v0.0.14: + +v0.0.14 (unreleased) +-------------------- + +New Features +~~~~~~~~~~~~ + +Breaking changes +~~~~~~~~~~~~~~~~ + +- Renamed `DataTree.lineage` to `DataTree.parents` to match `pathlib` vocabulary + (:issue:`283`, :pull:`286`) +- Minimum required version of xarray is now 2023.12.0, i.e. the latest version. + This is required to prevent recent changes to xarray's internals from breaking datatree. + (:issue:`293`, :pull:`294`) + By `Tom Nicholas `_. +- Change default write mode of :py:meth:`DataTree.to_zarr` to ``'w-'`` to match ``xarray`` + default and prevent accidental directory overwrites. (:issue:`274`, :pull:`275`) + By `Sam Levang `_. + +Deprecations +~~~~~~~~~~~~ + +- Renamed `DataTree.lineage` to `DataTree.parents` to match `pathlib` vocabulary + (:issue:`283`, :pull:`286`). `lineage` is now deprecated and use of `parents` is encouraged. + By `Etienne Schalk `_. + +Bug fixes +~~~~~~~~~ +- Keep attributes on nodes containing no data in :py:func:`map_over_subtree`. (:issue:`278`, :pull:`279`) + By `Sam Levang `_. + +Documentation +~~~~~~~~~~~~~ +- Use ``napoleon`` instead of ``numpydoc`` to align with xarray documentation + (:issue:`284`, :pull:`298`). + By `Etienne Schalk `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +.. _whats-new.v0.0.13: + +v0.0.13 (27/10/2023) +-------------------- + +New Features +~~~~~~~~~~~~ + +- New :py:meth:`DataTree.match` method for glob-like pattern matching of node paths. (:pull:`267`) + By `Tom Nicholas `_. +- New :py:meth:`DataTree.is_hollow` property for checking if data is only contained at the leaf nodes. (:pull:`272`) + By `Tom Nicholas `_. +- Indicate which node caused the problem if error encountered while applying user function using :py:func:`map_over_subtree` + (:issue:`190`, :pull:`264`). Only works when using python 3.11 or later. + By `Tom Nicholas `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +- Nodes containing only attributes but no data are now ignored by :py:func:`map_over_subtree` (:issue:`262`, :pull:`263`) + By `Tom Nicholas `_. +- Disallow altering of given dataset inside function called by :py:func:`map_over_subtree` (:pull:`269`, reverts part of :pull:`194`). + By `Tom Nicholas `_. + +Bug fixes +~~~~~~~~~ + +- Fix unittests on i386. (:pull:`249`) + By `Antonio Valentino `_. +- Ensure nodepath class is compatible with python 3.12 (:pull:`260`) + By `Max Grover `_. + +Documentation +~~~~~~~~~~~~~ + +- Added new sections to page on ``Working with Hierarchical Data`` (:pull:`180`) + By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +* No longer use the deprecated `distutils` package. + +.. _whats-new.v0.0.12: + +v0.0.12 (03/07/2023) +-------------------- + +New Features +~~~~~~~~~~~~ + +- Added a :py:func:`DataTree.level`, :py:func:`DataTree.depth`, and :py:func:`DataTree.width` property (:pull:`208`). + By `Tom Nicholas `_. +- Allow dot-style (or "attribute-like") access to child nodes and variables, with ipython autocomplete. (:issue:`189`, :pull:`98`) + By `Tom Nicholas `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +- Dropped support for python 3.8 (:issue:`212`, :pull:`214`) + By `Tom Nicholas `_. + +Bug fixes +~~~~~~~~~ + +- Allow for altering of given dataset inside function called by :py:func:`map_over_subtree` (:issue:`188`, :pull:`194`). + By `Tom Nicholas `_. +- copy subtrees without creating ancestor nodes (:pull:`201`) + By `Justus Magin `_. + +Documentation +~~~~~~~~~~~~~ + +Internal Changes +~~~~~~~~~~~~~~~~ + +.. _whats-new.v0.0.11: + +v0.0.11 (01/09/2023) +-------------------- + +Big update with entirely new pages in the docs, +new methods (``.drop_nodes``, ``.filter``, ``.leaves``, ``.descendants``), and bug fixes! + +New Features +~~~~~~~~~~~~ + +- Added a :py:meth:`DataTree.drop_nodes` method (:issue:`161`, :pull:`175`). + By `Tom Nicholas `_. +- New, more specific exception types for tree-related errors (:pull:`169`). + By `Tom Nicholas `_. +- Added a new :py:meth:`DataTree.descendants` property (:pull:`170`). + By `Tom Nicholas `_. +- Added a :py:meth:`DataTree.leaves` property (:pull:`177`). + By `Tom Nicholas `_. +- Added a :py:meth:`DataTree.filter` method (:pull:`184`). + By `Tom Nicholas `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +- :py:meth:`DataTree.copy` copy method now only copies the subtree, not the parent nodes (:pull:`171`). + By `Tom Nicholas `_. +- Grafting a subtree onto another tree now leaves name of original subtree object unchanged (:issue:`116`, :pull:`172`, :pull:`178`). + By `Tom Nicholas `_. +- Changed the :py:meth:`DataTree.assign` method to just work on the local node (:pull:`181`). + By `Tom Nicholas `_. + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +- Fix bug with :py:meth:`DataTree.relative_to` method (:issue:`133`, :pull:`160`). + By `Tom Nicholas `_. +- Fix links to API docs in all documentation (:pull:`183`). + By `Tom Nicholas `_. + +Documentation +~~~~~~~~~~~~~ + +- Changed docs theme to match xarray's main documentation. (:pull:`173`) + By `Tom Nicholas `_. +- Added ``Terminology`` page. (:pull:`174`) + By `Tom Nicholas `_. +- Added page on ``Working with Hierarchical Data`` (:pull:`179`) + By `Tom Nicholas `_. +- Added context content to ``Index`` page (:pull:`182`) + By `Tom Nicholas `_. +- Updated the README (:pull:`187`) + By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + + +.. _whats-new.v0.0.10: + +v0.0.10 (12/07/2022) +-------------------- + +Adds accessors and a `.pipe()` method. + +New Features +~~~~~~~~~~~~ + +- Add the ability to register accessors on ``DataTree`` objects, by using ``register_datatree_accessor``. (:pull:`144`) + By `Tom Nicholas `_. +- Allow method chaining with a new :py:meth:`DataTree.pipe` method (:issue:`151`, :pull:`156`). + By `Justus Magin `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +- Allow ``Datatree`` objects as values in :py:meth:`DataTree.from_dict` (:pull:`159`). + By `Justus Magin `_. + +Documentation +~~~~~~~~~~~~~ + +- Added ``Reading and Writing Files`` page. (:pull:`158`) + By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Avoid reading from same file twice with fsspec3 (:pull:`130`) + By `William Roberts `_. + + +.. _whats-new.v0.0.9: + +v0.0.9 (07/14/2022) +------------------- + +New Features +~~~~~~~~~~~~ + +Breaking changes +~~~~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +Documentation +~~~~~~~~~~~~~ +- Switch docs theme (:pull:`123`). + By `JuliusBusecke `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + + +.. _whats-new.v0.0.7: + +v0.0.7 (07/11/2022) +------------------- + +New Features +~~~~~~~~~~~~ + +- Improve the HTML repr by adding tree-style lines connecting groups and sub-groups (:pull:`109`). + By `Benjamin Woods `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +- The ``DataTree.ds`` attribute now returns a view onto an immutable Dataset-like object, instead of an actual instance + of ``xarray.Dataset``. This make break existing ``isinstance`` checks or ``assert`` comparisons. (:pull:`99`) + By `Tom Nicholas `_. + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +- Modifying the contents of a ``DataTree`` object via the ``DataTree.ds`` attribute is now forbidden, which prevents + any possibility of the contents of a ``DataTree`` object and its ``.ds`` attribute diverging. (:issue:`38`, :pull:`99`) + By `Tom Nicholas `_. +- Fixed a bug so that names of children now always match keys under which parents store them (:pull:`99`). + By `Tom Nicholas `_. + +Documentation +~~~~~~~~~~~~~ + +- Added ``Data Structures`` page describing the internal structure of a ``DataTree`` object, and its relation to + ``xarray.Dataset`` objects. (:pull:`103`) + By `Tom Nicholas `_. +- API page updated with all the methods that are copied from ``xarray.Dataset``. (:pull:`41`) + By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Refactored ``DataTree`` class to store a set of ``xarray.Variable`` objects instead of a single ``xarray.Dataset``. + This approach means that the ``DataTree`` class now effectively copies and extends the internal structure of + ``xarray.Dataset``. (:pull:`41`) + By `Tom Nicholas `_. +- Refactored to use intermediate ``NamedNode`` class, separating implementation of methods requiring a ``name`` + attribute from those not requiring it. + By `Tom Nicholas `_. +- Made ``testing.test_datatree.create_test_datatree`` into a pytest fixture (:pull:`107`). + By `Benjamin Woods `_. + + + +.. _whats-new.v0.0.6: + +v0.0.6 (06/03/2022) +------------------- + +Various small bug fixes, in preparation for more significant changes in the next version. + +Bug fixes +~~~~~~~~~ + +- Fixed bug with checking that assigning parent or new children did not create a loop in the tree (:pull:`105`) + By `Tom Nicholas `_. +- Do not call ``__exit__`` on Zarr store when opening (:pull:`90`) + By `Matt McCormick `_. +- Fix netCDF encoding for compression (:pull:`95`) + By `Joe Hamman `_. +- Added validity checking for node names (:pull:`106`) + By `Tom Nicholas `_. + +.. _whats-new.v0.0.5: + +v0.0.5 (05/05/2022) +------------------- + +- Major refactor of internals, moving from the ``DataTree.children`` attribute being a ``Tuple[DataTree]`` to being a + ``OrderedDict[str, DataTree]``. This was necessary in order to integrate better with xarray's dictionary-like API, + solve several issues, simplify the code internally, remove dependencies, and enable new features. (:pull:`76`) + By `Tom Nicholas `_. + +New Features +~~~~~~~~~~~~ + +- Syntax for accessing nodes now supports file-like paths, including parent nodes via ``"../"``, relative paths, the + root node via ``"/"``, and the current node via ``"."``. (Internally it actually uses ``pathlib`` now.) + By `Tom Nicholas `_. +- New path-like API methods, such as ``.relative_to``, ``.find_common_ancestor``, and ``.same_tree``. +- Some new dictionary-like methods, such as ``DataTree.get`` and ``DataTree.update``. (:pull:`76`) + By `Tom Nicholas `_. +- New HTML repr, which will automatically display in a jupyter notebook. (:pull:`78`) + By `Tom Nicholas `_. +- New delitem method so you can delete nodes. (:pull:`88`) + By `Tom Nicholas `_. +- New ``to_dict`` method. (:pull:`82`) + By `Tom Nicholas `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + +- Node names are now optional, which means that the root of the tree can be unnamed. This has knock-on effects for + a lot of the API. +- The ``__init__`` signature for ``DataTree`` has changed, so that ``name`` is now an optional kwarg. +- Files will now be loaded as a slightly different tree, because the root group no longer needs to be given a default + name. +- Removed tag-like access to nodes. +- Removes the option to delete all data in a node by assigning None to the node (in favour of deleting data by replacing + the node's ``.ds`` attribute with an empty Dataset), or to create a new empty node in the same way (in favour of + assigning an empty DataTree object instead). +- Removes the ability to create a new node by assigning a ``Dataset`` object to ``DataTree.__setitem__``. +- Several other minor API changes such as ``.pathstr`` -> ``.path``, and ``from_dict``'s dictionary argument now being + required. (:pull:`76`) + By `Tom Nicholas `_. + +Deprecations +~~~~~~~~~~~~ + +- No longer depends on the anytree library (:pull:`76`) + By `Tom Nicholas `_. + +Bug fixes +~~~~~~~~~ + +- Fixed indentation issue with the string repr (:pull:`86`) + By `Tom Nicholas `_. + +Documentation +~~~~~~~~~~~~~ + +- Quick-overview page updated to match change in path syntax (:pull:`76`) + By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Basically every file was changed in some way to accommodate (:pull:`76`). +- No longer need the utility functions for string manipulation that were defined in ``utils.py``. +- A considerable amount of code copied over from the internals of anytree (e.g. in ``render.py`` and ``iterators.py``). + The Apache license for anytree has now been bundled with datatree. (:pull:`76`). + By `Tom Nicholas `_. + +.. _whats-new.v0.0.4: + +v0.0.4 (31/03/2022) +------------------- + +- Ensure you get the pretty tree-like string representation by default in ipython (:pull:`73`). + By `Tom Nicholas `_. +- Now available on conda-forge (as xarray-datatree)! (:pull:`71`) + By `Anderson Banihirwe `_. +- Allow for python 3.8 (:pull:`70`). + By `Don Setiawan `_. + +.. _whats-new.v0.0.3: + +v0.0.3 (30/03/2022) +------------------- + +- First released version available on both pypi (as xarray-datatree)! diff --git a/xarray/datatree_/pyproject.toml b/xarray/datatree_/pyproject.toml new file mode 100644 index 00000000000..40f7d5a59b3 --- /dev/null +++ b/xarray/datatree_/pyproject.toml @@ -0,0 +1,61 @@ +[project] +name = "xarray-datatree" +description = "Hierarchical tree-like data structures for xarray" +readme = "README.md" +authors = [ + {name = "Thomas Nicholas", email = "thomas.nicholas@columbia.edu"} +] +license = {text = "Apache-2"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +requires-python = ">=3.9" +dependencies = [ + "xarray >=2023.12.0", + "packaging", +] +dynamic = ["version"] + +[project.urls] +Home = "https://github.com/xarray-contrib/datatree" +Documentation = "https://xarray-datatree.readthedocs.io/en/stable/" + +[build-system] +requires = [ + "setuptools>=61.0.0", + "wheel", + "setuptools_scm[toml]>=7.0", + "check-manifest" +] + +[tool.setuptools_scm] +write_to = "datatree/_version.py" +write_to_template = ''' +# Do not change! Do not track in version control! +__version__ = "{version}" +''' + +[tool.setuptools.packages.find] +exclude = ["docs", "tests", "tests.*", "docs.*"] + +[tool.setuptools.package-data] +datatree = ["py.typed"] + +[tool.isort] +profile = "black" +skip_gitignore = true +float_to_top = true +default_section = "THIRDPARTY" +known_first_party = "datatree" + +[mypy] +files = "datatree/**/*.py" +show_error_codes = true diff --git a/xarray/datatree_/readthedocs.yml b/xarray/datatree_/readthedocs.yml new file mode 100644 index 00000000000..9b04939c898 --- /dev/null +++ b/xarray/datatree_/readthedocs.yml @@ -0,0 +1,7 @@ +version: 2 +conda: + environment: ci/doc.yml +build: + os: 'ubuntu-20.04' + tools: + python: 'mambaforge-4.10'