From 598ffdad4e79f69f1d194b81e7920e1417999c58 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Thu, 27 Jul 2023 17:31:54 -0400 Subject: [PATCH 01/11] feat: add a schema for serialization Signed-off-by: Henry Schreiner [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pyproject.toml | 4 + src/uhi/resources/histogram.json | 200 +++++++++++++++++++++++++++++++ src/uhi/schema.py | 27 +++++ tests/resources/reg.json | 31 +++++ 4 files changed, 262 insertions(+) create mode 100644 src/uhi/resources/histogram.json create mode 100644 src/uhi/schema.py create mode 100644 tests/resources/reg.json diff --git a/pyproject.toml b/pyproject.toml index 1ba50a0..07be65e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,10 @@ Documentation = "https://uhi.readthedocs.io/en/latest/" Changelog = "https://github.com/scikit-hep/uhi/releases" [project.optional-dependencies] +schema = [ + "fastjsonschema", + "importlib-resources; python_version<'3.9'", +] docs = [ "sphinx>=4.0", "furo", diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json new file mode 100644 index 0000000..1b8124e --- /dev/null +++ b/src/uhi/resources/histogram.json @@ -0,0 +1,200 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/scikit-hep/uhi/.json", + "type": "object", + "patternProperties": { + ".+": { + "type": "object", + "required": ["axes", "storage"], + "properties": { + "title": { "type": "string" }, + "name": { "type": "string" }, + "metadata": { "type": "object" }, + "axes": { + "type": "array", + "items": { + "oneOf": [ + { "$ref": "#/$defs/regular_axis" }, + { "$ref": "#/$defs/variable_axis" }, + { "$ref": "#/$defs/str_category_axis" }, + { "$ref": "#/$defs/int_category_axis" }, + { "$ref": "#/$defs/boolean_axis" } + ] + } + }, + "storage": { + "oneOf": [ + { "$ref": "#/$defs/int_storage" }, + { "$ref": "#/$defs/double_storage" }, + { "$ref": 
"#/$defs/weighted_storage" }, + { "$ref": "#/$defs/mean_storage" }, + { "$ref": "#/$defs/weighted_mean_storage" } + ] + } + } + } + }, + "$defs": { + "regular_axis": { + "type": "object", + "required": ["type", "lower", "upper", "bins", "underflow", "overflow"], + "properties": { + "type": { "type": "string", "pattern": "regular" }, + "lower": { "type": "number" }, + "upper": { "type": "number" }, + "bins": { "type": "number" }, + "underflow": { "type": "boolean" }, + "overflow": { "type": "boolean" }, + "circular": { "type": "boolean" }, + "name": { "type": "string" }, + "title": { "type": "string" }, + "metadata": { "type": "object" } + } + }, + "variable_axis": { + "type": "object", + "required": ["type", "edges", "underflow", "overflow"], + "properties": { + "type": { "type": "string", "pattern": "variable" }, + "edges": { + "oneOf": [ + { + "type": "array", + "items": { "type": "number" } + }, + { + "type": "string" + } + ] + }, + "underflow": { "type": "boolean" }, + "overflow": { "type": "boolean" }, + "circular": { "type": "boolean" }, + "name": { "type": "string" }, + "title": { "type": "string" }, + "metadata": { "type": "object" } + } + }, + "str_category_axis": { + "type": "object", + "required": ["type", "categories", "flow"], + "properties": { + "type": { "type": "string", "pattern": "str_category" }, + "categories": { "type": "array", "items": { "type": "string" } }, + "flow": { "type": "boolean" }, + "name": { "type": "string" }, + "title": { "type": "string" }, + "metadata": { "type": "object" } + } + }, + "int_category_axis": { + "type": "object", + "required": ["type", "categories", "flow"], + "properties": { + "type": { "type": "string", "pattern": "int_category" }, + "categories": { "type": "array", "items": { "type": "number" } }, + "flow": { "type": "boolean" }, + "name": { "type": "string" }, + "title": { "type": "string" }, + "metadata": { "type": "object" } + } + }, + "boolean_axis": { + "type": "object", + "required": ["type"], + 
"properties": { + "type": { "type": "string", "pattern": "boolean" }, + "name": { "type": "string" }, + "title": { "type": "string" }, + "metadata": { "type": "object" } + } + }, + "int_storage": { + "type": "object", + "properties": { + "type": { "type": "string", "pattern": "int" }, + "data": { + "oneOf": [ + { "type": "string" }, + { "type": "array", "items": { "type": "number" } } + ] + } + } + }, + "double_storage": { + "type": "object", + "properties": { + "type": { "type": "string", "pattern": "double" }, + "data": { + "oneOf": [ + { "type": "string" }, + { "type": "array", "items": { "type": "number" } } + ] + } + } + }, + "weighted_storage": { + "type": "object", + "properties": { + "type": { "type": "string", "pattern": "int" }, + "data": { + "oneOf": [ + { "type": "string" }, + { + "type": "object", + "properties": { + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + }, + "mean_storage": { + "type": "object", + "properties": { + "type": { "type": "string", "pattern": "int" }, + "data": { + "oneOf": [ + { "type": "string" }, + { + "type": "object", + "properties": { + "counts": { "type": "array", "items": { "type": "number" } }, + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + }, + "weighted_mean_storage": { + "type": "object", + "properties": { + "type": { "type": "string", "pattern": "int" }, + "data": { + "oneOf": [ + { "type": "string" }, + { + "type": "object", + "properties": { + "sum_of_weights": { + "type": "array", + "items": { "type": "number" } + }, + "sum_of_weights_squared": { + "type": "array", + "items": { "type": "number" } + }, + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + } + } +} diff --git a/src/uhi/schema.py b/src/uhi/schema.py 
new file mode 100644 index 0000000..0af6d36 --- /dev/null +++ b/src/uhi/schema.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import json +import sys + +import fastjsonschema + +if sys.version_info < (3, 9): + import importlib_resources as resources +else: + from importlib import resources + +histogram_file = resources.files("uhi") / "resources/histogram.json" + +with histogram_file.open(encoding="utf-8") as f: + histogram_schema = fastjsonschema.compile(json.load(f)) + + +def validate(path: str) -> None: + with open(path, encoding="utf-8") as f: + example = json.load(f) + + histogram_schema(example) + + +if __name__ == "__main__": + validate(*sys.argv[1:]) diff --git a/tests/resources/reg.json b/tests/resources/reg.json new file mode 100644 index 0000000..8b928d4 --- /dev/null +++ b/tests/resources/reg.json @@ -0,0 +1,31 @@ +{ + "one": { + "title": "One", + "metadata": {}, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 5, + "bins": 3, + "underflow": true, + "overflow": true, + "circular": false + } + ], + "storage": { "type": "int", "data": [1, 2, 3, 4, 5] } + }, + "two": { + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 5, + "bins": 5, + "underflow": true, + "overflow": true + } + ], + "storage": { "type": "double", "data": "something" } + } +} From 68c0c5e256bc475d30995376ee17a649efd12196 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Fri, 28 Jul 2023 11:04:06 -0400 Subject: [PATCH 02/11] fix: drop title Signed-off-by: Henry Schreiner --- .pre-commit-config.yaml | 1 - docs/index.rst | 1 + docs/serialization.md | 7 +++++++ pyproject.toml | 4 ++++ src/uhi/resources/histogram.json | 9 +-------- src/uhi/schema.py | 6 ++++-- 6 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 docs/serialization.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8847338..1250540 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,6 @@ repos: - importlib_metadata - 
importlib_resources - - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: diff --git a/docs/index.rst b/docs/index.rst index 55cc258..4a9dfd6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ to plot a histogram, including error bars. indexing.rst indexing+.rst plotting.rst + serialization.md changelog.md diff --git a/docs/serialization.md b/docs/serialization.md new file mode 100644 index 0000000..3e3507c --- /dev/null +++ b/docs/serialization.md @@ -0,0 +1,7 @@ +# Serialization + +A standard form of serialization is described as follows: + +```{literalinclude} ../src/uhi/resources/histogram.json +:language: json +``` diff --git a/pyproject.toml b/pyproject.toml index 07be65e..8eefe76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,10 @@ show_error_codes = true warn_unreachable = true enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] +[[tool.mypy.overrides]] +module = ["fastjsonschema"] +ignore_missing_imports = true + [tool.ruff] select = [ "E", "F", "W", # flake8 diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index 1b8124e..1c5d7fa 100644 --- a/src/uhi/resources/histogram.json +++ b/src/uhi/resources/histogram.json @@ -1,14 +1,12 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://github.com/scikit-hep/uhi/.json", + "$id": "https://raw.githubusercontent.com/scikit-hep/uhi/henryiii/feat/schema/src/uhi/resources/histogram.json", "type": "object", "patternProperties": { ".+": { "type": "object", "required": ["axes", "storage"], "properties": { - "title": { "type": "string" }, - "name": { "type": "string" }, "metadata": { "type": "object" }, "axes": { "type": "array", @@ -47,7 +45,6 @@ "overflow": { "type": "boolean" }, "circular": { "type": "boolean" }, "name": { "type": "string" }, - "title": { "type": "string" }, "metadata": { "type": "object" } } }, @@ -71,7 +68,6 @@ "overflow": { "type": "boolean" }, "circular": { "type": 
"boolean" }, "name": { "type": "string" }, - "title": { "type": "string" }, "metadata": { "type": "object" } } }, @@ -83,7 +79,6 @@ "categories": { "type": "array", "items": { "type": "string" } }, "flow": { "type": "boolean" }, "name": { "type": "string" }, - "title": { "type": "string" }, "metadata": { "type": "object" } } }, @@ -95,7 +90,6 @@ "categories": { "type": "array", "items": { "type": "number" } }, "flow": { "type": "boolean" }, "name": { "type": "string" }, - "title": { "type": "string" }, "metadata": { "type": "object" } } }, @@ -105,7 +99,6 @@ "properties": { "type": { "type": "string", "pattern": "boolean" }, "name": { "type": "string" }, - "title": { "type": "string" }, "metadata": { "type": "object" } } }, diff --git a/src/uhi/schema.py b/src/uhi/schema.py index 0af6d36..3d1662e 100644 --- a/src/uhi/schema.py +++ b/src/uhi/schema.py @@ -2,6 +2,7 @@ import json import sys +from pathlib import Path import fastjsonschema @@ -16,8 +17,9 @@ histogram_schema = fastjsonschema.compile(json.load(f)) -def validate(path: str) -> None: - with open(path, encoding="utf-8") as f: +def validate(path: str | Path) -> None: + path = Path(path) + with path.open(encoding="utf-8") as f: example = json.load(f) histogram_schema(example) From a22613ba280ebe694ebeee3f0f564cab6983d894 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Fri, 28 Jul 2023 12:40:11 -0400 Subject: [PATCH 03/11] docs: add text for schema Signed-off-by: Henry Schreiner --- docs/serialization.md | 69 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/docs/serialization.md b/docs/serialization.md index 3e3507c..135d2f8 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -1,6 +1,73 @@ # Serialization -A standard form of serialization is described as follows: +Histogram serialization has to cover a wide range of formats. 
As such, we +describe a form for serialization that covers the metadata structure as +JSON-like, with a provided JSON-schema. The data (bins and/or variable edges) +is stored out-of-band in a binary format based on what type of data file you +are in. For very small (primarily 1D) histograms, data is allowed inline as +well. + +The following formats are being targeted: + +``` +┌────────┐ ┌────────┐ ┌───────┐ +│ ROOT │ │ HDF5 │ │ ZIP │ +└────────┘ └────────┘ └───────┘ +``` + +Other formats can be used as well, assuming they support out-of-band data and +text attributes or files for the metadata. + +This structure was based heavily on boost-histogram, but it is intended to be +general, and can be expanded in the future as needed. As such, the following +limitations are required: + +* Serialization followed by deserialisation may cause axis changes. Axis types + may change to an equivalent but less performant axis, growth status will be + lost, etc. +* Metadata must be expressible as JSON. It should also be reasonably sized; some + formats like HDF5 may limit the size of attributes to 64K. +* Floating point variations could be introduced in storages, as the storage + format uses a stable but different representation. +* Axis `name` is only part of the metadata, and is not standardized. This is + due to lack of support from boost-histogram. + +The following axes types are supported: + +* `"regular"`: A regularly spaced set of even bins. Boost-histogram's "integer" + axes maps to this axis as well. Has `upper`, `lower`, `bins`, `underflow`, + `overflow`, and `circular` properties. `circular` defaults to False if not + present. +* `"variable"`: A continuous axis defined by bins+1 edges. Has `edges`, which + is either an in-line list of numbers or a string pointing to an out-of-band data source. + Also has `underflow`, `overflow`, and `circular` properties. `circular` + defaults to False if not present. +* `"int_category"`: A list of integer bins, non-continuous. 
Has `categories`, + which is an in-line list of integers. Also has `flow`. +* `"str_category"`: A list of string bins. Has `categories`, + which is an in-line list of strings. Also has `flow`. +* `"bool"`: A true/false axis. + +Axes with gaps are currently not supported. + +All axes support `metadata`. + +The following storages are supported: + +* `"int"`: A collection of integers. Boost-histogram's Int64 and AtomicInt64 + map to this, and sometimes Unlimited. +* `"double"`: A collection of 64-bit floating point values. Boost-histogram's + Double storage maps to this, and sometimes Unlimited. +* `"weighted"`: A collection of two arrays of 64-bit floating point values, + `"value"` and `"variance"`. Boost-histogram's Weight storage maps to this. +* `"mean"`: A collection of three arrays of 64-bit floating point values, + "count", "value", and "variance". Boost-histogram's Mean storage maps to + this. +* `"weighted_mean"`: A collection of four arrays of 64-bit floating point + values, `"sum_of_weights"`, `"sum_of_weights_squared"`, `"values"`, and + `"variances"`. Boost-histogram's WeighedMean storage maps to this. 
+ +The full schema is below: ```{literalinclude} ../src/uhi/resources/histogram.json :language: json From ab100c7cf7c7f5aa92c120a4bae1880832180c2b Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Fri, 28 Jul 2023 17:57:02 -0400 Subject: [PATCH 04/11] tests: add an integration test for validation Signed-off-by: Henry Schreiner Update ci.yml --- .github/workflows/ci.yml | 2 +- docs/serialization.md | 28 ++++++++++++++++++++++++++++ noxfile.py | 2 +- tests/test_histogram_schema.py | 11 +++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 tests/test_histogram_schema.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46445d2..f14237e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install package - run: python -m pip install .[test] + run: python -m pip install .[test,schema] - name: Test run: python -m pytest -ra diff --git a/docs/serialization.md b/docs/serialization.md index 135d2f8..a14057b 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -1,5 +1,8 @@ # Serialization + +## Introduction + Histogram serialization has to cover a wide range of formats. As such, we describe a form for serialization that covers the metadata structure as JSON-like, with a provided JSON-schema. The data (bins and/or variable edges) @@ -18,6 +21,8 @@ The following formats are being targeted: Other formats can be used as well, assuming they support out-of-band data and text attributes or files for the metadata. +## Drawbacks + This structure was based heavily on boost-histogram, but it is intended to be general, and can be expanded in the future as needed. As such, the following limitations are required: @@ -32,6 +37,8 @@ limitations are required: * Axis `name` is only part of the metadata, and is not standardized. This is due to lack of support from boost-histogram. 
+## Design + The following axes types are supported: * `"regular"`: A regularly spaced set of even bins. Boost-histogram's "integer" @@ -67,6 +74,27 @@ The following storages are supported: values, `"sum_of_weights"`, `"sum_of_weights_squared"`, `"values"`, and `"variances"`. Boost-histogram's WeighedMean storage maps to this. +## CLI/API + +You can test a JSON file against the schema by running: + +```console +$ python -m uhi.schema some/file.json +``` + +Or with code: + +```python +import uhi.schema + +uhi.schema.validate("some/file.json") +``` + +Eventually this should also be usable for JSON's inside zip, HDF5 attributes, +and maybe more. + +## Full schema + The full schema is below: ```{literalinclude} ../src/uhi/resources/histogram.json diff --git a/noxfile.py b/noxfile.py index 0f6205f..2a576a9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -28,7 +28,7 @@ def tests(session): """ Run the unit and regular tests. """ - session.install("-e.[test]") + session.install("-e.[test,schema]") session.run("pytest", *session.posargs) diff --git a/tests/test_histogram_schema.py b/tests/test_histogram_schema.py new file mode 100644 index 0000000..4f02c4d --- /dev/null +++ b/tests/test_histogram_schema.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from pathlib import Path + +import uhi.schema + +DIR = Path(__file__).parent.resolve() + + +def test_example_1() -> None: + uhi.schema.validate(DIR / "resources/reg.json") From cb5afce63a33078e99b021e027b25284eebfd191 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Sat, 29 Jul 2023 22:45:30 -0400 Subject: [PATCH 05/11] chore: update and tighten defs Signed-off-by: Henry Schreiner --- src/uhi/resources/histogram.json | 38 +++++++++++++++++++++----------- tests/resources/reg.json | 3 +-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index 1c5d7fa..1eb9abd 100644 --- a/src/uhi/resources/histogram.json +++ 
b/src/uhi/resources/histogram.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema#", "$id": "https://raw.githubusercontent.com/scikit-hep/uhi/henryiii/feat/schema/src/uhi/resources/histogram.json", "type": "object", "patternProperties": { @@ -29,7 +29,8 @@ { "$ref": "#/$defs/weighted_mean_storage" } ] } - } + }, + "additionalProperties": false } }, "$defs": { @@ -46,7 +47,8 @@ "circular": { "type": "boolean" }, "name": { "type": "string" }, "metadata": { "type": "object" } - } + }, + "additionalProperties": false }, "variable_axis": { "type": "object", @@ -69,7 +71,8 @@ "circular": { "type": "boolean" }, "name": { "type": "string" }, "metadata": { "type": "object" } - } + }, + "additionalProperties": false }, "str_category_axis": { "type": "object", @@ -80,7 +83,8 @@ "flow": { "type": "boolean" }, "name": { "type": "string" }, "metadata": { "type": "object" } - } + }, + "additionalProperties": false }, "int_category_axis": { "type": "object", @@ -91,7 +95,8 @@ "flow": { "type": "boolean" }, "name": { "type": "string" }, "metadata": { "type": "object" } - } + }, + "additionalProperties": false }, "boolean_axis": { "type": "object", @@ -100,7 +105,8 @@ "type": { "type": "string", "pattern": "boolean" }, "name": { "type": "string" }, "metadata": { "type": "object" } - } + }, + "additionalProperties": false }, "int_storage": { "type": "object", @@ -112,7 +118,8 @@ { "type": "array", "items": { "type": "number" } } ] } - } + }, + "additionalProperties": false }, "double_storage": { "type": "object", @@ -124,7 +131,8 @@ { "type": "array", "items": { "type": "number" } } ] } - } + }, + "additionalProperties": false }, "weighted_storage": { "type": "object", @@ -138,10 +146,12 @@ "properties": { "values": { "type": "array", "items": { "type": "number" } }, "variances": { "type": "array", "items": { "type": "number" } } - } + }, + "additionalProperties": false } ] - } + }, + 
"additionalProperties": false } }, "mean_storage": { @@ -183,11 +193,13 @@ }, "values": { "type": "array", "items": { "type": "number" } }, "variances": { "type": "array", "items": { "type": "number" } } - } + }, + "additionalProperties": false } ] } - } + }, + "additionalProperties": false } } } diff --git a/tests/resources/reg.json b/tests/resources/reg.json index 8b928d4..0e7643e 100644 --- a/tests/resources/reg.json +++ b/tests/resources/reg.json @@ -1,6 +1,5 @@ { "one": { - "title": "One", "metadata": {}, "axes": [ { @@ -26,6 +25,6 @@ "overflow": true } ], - "storage": { "type": "double", "data": "something" } + "storage": { "type": "double", "data": "some/path/depends/on/format" } } } From cc6028a348661d8f75df2185eb531e29d72ee886 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Mon, 31 Jul 2023 12:10:14 -0400 Subject: [PATCH 06/11] feat: add rendered version & descriptions Signed-off-by: Henry Schreiner --- docs/conf.py | 1 + docs/serialization.md | 6 ++ pyproject.toml | 1 + src/uhi/resources/histogram.json | 127 ++++++++++++++++++++++++------- 4 files changed, 106 insertions(+), 29 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 9c5a56a..6fd87d6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,6 +32,7 @@ # ones. extensions = [ "myst_parser", + "sphinx-jsonschema", "sphinx.ext.napoleon", "sphinx_copybutton", "sphinx_github_changelog", diff --git a/docs/serialization.md b/docs/serialization.md index a14057b..cad7a32 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -93,6 +93,12 @@ uhi.schema.validate("some/file.json") Eventually this should also be usable for JSON's inside zip, HDF5 attributes, and maybe more. 
+## Rendered schema + +```{jsonschema} ../src/uhi/resources/histogram.json +``` + + ## Full schema The full schema is below: diff --git a/pyproject.toml b/pyproject.toml index 8eefe76..28173b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ docs = [ "sphinx>=4.0", "furo", "sphinx-copybutton>=0.3.1", + "sphinx-jsonschema", "myst-parser", "sphinx_github_changelog", ] diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index 1eb9abd..b8a5c54 100644 --- a/src/uhi/resources/histogram.json +++ b/src/uhi/resources/histogram.json @@ -1,15 +1,20 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "$id": "https://raw.githubusercontent.com/scikit-hep/uhi/henryiii/feat/schema/src/uhi/resources/histogram.json", + "title": "Histogram", "type": "object", "patternProperties": { ".+": { "type": "object", "required": ["axes", "storage"], "properties": { - "metadata": { "type": "object" }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + }, "axes": { "type": "array", + "description": "A list of the axes of the histogram.", "items": { "oneOf": [ { "$ref": "#/$defs/regular_axis" }, @@ -21,6 +26,7 @@ } }, "storage": { + "description": "The storage of the bins of the histogram.", "oneOf": [ { "$ref": "#/$defs/int_storage" }, { "$ref": "#/$defs/double_storage" }, @@ -36,22 +42,39 @@ "$defs": { "regular_axis": { "type": "object", + "description": "An evenly spaced set of continuous bins.", "required": ["type", "lower", "upper", "bins", "underflow", "overflow"], "properties": { "type": { "type": "string", "pattern": "regular" }, - "lower": { "type": "number" }, - "upper": { "type": "number" }, - "bins": { "type": "number" }, - "underflow": { "type": "boolean" }, - "overflow": { "type": "boolean" }, - "circular": { "type": "boolean" }, - "name": { "type": "string" }, - "metadata": { "type": "object" } + "lower": { "type": "number", "description": "Lower edge of the axis." 
}, + "upper": { "type": "number", "description": "Upper edge of the axis." }, + "bins": { + "type": "integer", + "minimum": 0, + "description": "Number of bins in the axis." + }, + "underflow": { + "type": "boolean", + "description": "True if there is a bin for underflow." + }, + "overflow": { + "type": "boolean", + "description": "True if there is a bin for overflow." + }, + "circular": { + "type": "boolean", + "description": "True if the axis wraps around." + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } }, "additionalProperties": false }, "variable_axis": { "type": "object", + "description": "A variably spaced set of continuous bins.", "required": ["type", "edges", "underflow", "overflow"], "properties": { "type": { "type": "string", "pattern": "variable" }, @@ -59,63 +82,93 @@ "oneOf": [ { "type": "array", - "items": { "type": "number" } + "items": { "type": "number", "minItems": 2, "uniqueItems": true } }, { - "type": "string" + "type": "string", + "description": "A path (URI?) to the edges data." } ] }, "underflow": { "type": "boolean" }, "overflow": { "type": "boolean" }, "circular": { "type": "boolean" }, - "name": { "type": "string" }, - "metadata": { "type": "object" } + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } }, "additionalProperties": false }, "str_category_axis": { "type": "object", + "description": "A set of string categorical bins.", "required": ["type", "categories", "flow"], "properties": { "type": { "type": "string", "pattern": "str_category" }, - "categories": { "type": "array", "items": { "type": "string" } }, - "flow": { "type": "boolean" }, - "name": { "type": "string" }, - "metadata": { "type": "object" } + "categories": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true + }, + "flow": { + "type": "boolean", + "description": "True if flow bin (at the overflow position) present." 
+ }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } }, "additionalProperties": false }, "int_category_axis": { "type": "object", + "description": "A set of integer categorical bins in any order.", "required": ["type", "categories", "flow"], "properties": { "type": { "type": "string", "pattern": "int_category" }, - "categories": { "type": "array", "items": { "type": "number" } }, - "flow": { "type": "boolean" }, - "name": { "type": "string" }, - "metadata": { "type": "object" } + "categories": { + "type": "array", + "items": { "type": "integer" }, + "uniqueItems": true + }, + "flow": { + "type": "boolean", + "description": "True if flow bin (at the overflow position) present." + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } }, "additionalProperties": false }, "boolean_axis": { "type": "object", + "description": "A simple true/false axis with no flow.", "required": ["type"], "properties": { "type": { "type": "string", "pattern": "boolean" }, - "name": { "type": "string" }, - "metadata": { "type": "object" } + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } }, "additionalProperties": false }, "int_storage": { "type": "object", + "description": "A storage holding integer counts.", "properties": { "type": { "type": "string", "pattern": "int" }, "data": { "oneOf": [ - { "type": "string" }, - { "type": "array", "items": { "type": "number" } } + { + "type": "string", + "description": "A path (URI?) to the integer bin data." + }, + { "type": "array", "items": { "type": "integer" } } ] } }, @@ -123,11 +176,15 @@ }, "double_storage": { "type": "object", + "description": "A storage holding floating point counts.", "properties": { "type": { "type": "string", "pattern": "double" }, "data": { "oneOf": [ - { "type": "string" }, + { + "type": "string", + "description": "A path (URI?) to the floating point bin data." 
+ }, { "type": "array", "items": { "type": "number" } } ] } @@ -136,11 +193,15 @@ }, "weighted_storage": { "type": "object", + "description": "A storage holding floating point counts and variances.", "properties": { "type": { "type": "string", "pattern": "int" }, "data": { "oneOf": [ - { "type": "string" }, + { + "type": "string", + "description": "A path (URI?) to the floating point bin data; outer dimension is [value, variance]" + }, { "type": "object", "properties": { @@ -156,11 +217,15 @@ }, "mean_storage": { "type": "object", + "description": "A storage holding 'profile'-style floating point counts, values, and variances.", "properties": { "type": { "type": "string", "pattern": "int" }, "data": { "oneOf": [ - { "type": "string" }, + { + "type": "string", + "description": "A path (URI?) to the floating point bin data; outer dimension is [counts, value, variance]" + }, { "type": "object", "properties": { @@ -175,11 +240,15 @@ }, "weighted_mean_storage": { "type": "object", + "description": "A storage holding 'profile'-style floating point ∑weights, ∑weights², values, and variances.", "properties": { "type": { "type": "string", "pattern": "int" }, "data": { "oneOf": [ - { "type": "string" }, + { + "type": "string", + "description": "A path (URI?) 
to the floating point bin data; outer dimension is [∑weights, ∑weights², value, variance]" + }, { "type": "object", "properties": { From da4d16a312f0d7fe71f11e64cf7b9851f5c013a2 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Mon, 31 Jul 2023 12:22:16 -0400 Subject: [PATCH 07/11] docs: mention warning about JSON Signed-off-by: Henry Schreiner --- docs/serialization.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/serialization.md b/docs/serialization.md index cad7a32..7878541 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -76,7 +76,7 @@ The following storages are supported: ## CLI/API -You can test a JSON file against the schema by running: +You can currently test a JSON file against the schema by running: ```console $ python -m uhi.schema some/file.json @@ -93,6 +93,16 @@ uhi.schema.validate("some/file.json") Eventually this should also be usable for JSON's inside zip, HDF5 attributes, and maybe more. +```{warning} + +Currently, this spec describes **how to prepare the metadata** for one of the +targetted backends. It does not yet cover backend specific details, like how to +define and use the binary resource locator strings or how to store the data. +JSON is not a target spec, but just part of the ZIP spec, meaning the files +that currently "pass" the tool above would be valid inside a `.zip` file +eventually, but are not valid by themselves. 
+``` + ## Rendered schema ```{jsonschema} ../src/uhi/resources/histogram.json From 0fcb3c85229784e7e4c31b952b6f025e9c192c7e Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Mon, 31 Jul 2023 16:41:47 -0400 Subject: [PATCH 08/11] fix: better required/additional props Signed-off-by: Henry Schreiner --- docs/serialization.md | 2 +- src/uhi/resources/histogram.json | 64 ++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/docs/serialization.md b/docs/serialization.md index 7878541..40511de 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -96,7 +96,7 @@ and maybe more. ```{warning} Currently, this spec describes **how to prepare the metadata** for one of the -targetted backends. It does not yet cover backend specific details, like how to +targeted backends. It does not yet cover backend specific details, like how to define and use the binary resource locator strings or how to store the data. JSON is not a target spec, but just part of the ZIP spec, meaning the files that currently "pass" the tool above would be valid inside a `.zip` file diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index b8a5c54..b1d6df4 100644 --- a/src/uhi/resources/histogram.json +++ b/src/uhi/resources/histogram.json @@ -3,10 +3,12 @@ "$id": "https://raw.githubusercontent.com/scikit-hep/uhi/henryiii/feat/schema/src/uhi/resources/histogram.json", "title": "Histogram", "type": "object", + "additionalProperties": false, "patternProperties": { ".+": { "type": "object", "required": ["axes", "storage"], + "additionalProperties": false, "properties": { "metadata": { "type": "object", @@ -35,8 +37,7 @@ { "$ref": "#/$defs/weighted_mean_storage" } ] } - }, - "additionalProperties": false + } } }, "$defs": { @@ -44,6 +45,7 @@ "type": "object", "description": "An evenly spaced set of continuous bins.", "required": ["type", "lower", "upper", "bins", "underflow", "overflow"], + "additionalProperties": false, 
"properties": { "type": { "type": "string", "pattern": "regular" }, "lower": { "type": "number", "description": "Lower edge of the axis." }, @@ -69,13 +71,13 @@ "type": "object", "description": "Arbitrary metadata dictionary." } - }, - "additionalProperties": false + } }, "variable_axis": { "type": "object", "description": "A variably spaced set of continuous bins.", "required": ["type", "edges", "underflow", "overflow"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "variable" }, "edges": { @@ -97,13 +99,13 @@ "type": "object", "description": "Arbitrary metadata dictionary." } - }, - "additionalProperties": false + } }, "str_category_axis": { "type": "object", "description": "A set of string categorical bins.", "required": ["type", "categories", "flow"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "str_category" }, "categories": { @@ -119,13 +121,13 @@ "type": "object", "description": "Arbitrary metadata dictionary." } - }, - "additionalProperties": false + } }, "int_category_axis": { "type": "object", "description": "A set of integer categorical bins in any order.", "required": ["type", "categories", "flow"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "int_category" }, "categories": { @@ -141,25 +143,26 @@ "type": "object", "description": "Arbitrary metadata dictionary." } - }, - "additionalProperties": false + } }, "boolean_axis": { "type": "object", "description": "A simple true/false axis with no flow.", "required": ["type"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "boolean" }, "metadata": { "type": "object", "description": "Arbitrary metadata dictionary." 
} - }, - "additionalProperties": false + } }, "int_storage": { "type": "object", "description": "A storage holding integer counts.", + "required": ["type", "data"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "int" }, "data": { @@ -171,12 +174,13 @@ { "type": "array", "items": { "type": "integer" } } ] } - }, - "additionalProperties": false + } }, "double_storage": { "type": "object", "description": "A storage holding floating point counts.", + "required": ["type", "data"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "double" }, "data": { @@ -188,12 +192,13 @@ { "type": "array", "items": { "type": "number" } } ] } - }, - "additionalProperties": false + } }, "weighted_storage": { "type": "object", "description": "A storage holding floating point counts and variances.", + "required": ["type", "data"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "int" }, "data": { @@ -204,20 +209,22 @@ }, { "type": "object", + "required": ["values", "variances"], + "additionalProperties": false, "properties": { "values": { "type": "array", "items": { "type": "number" } }, "variances": { "type": "array", "items": { "type": "number" } } - }, - "additionalProperties": false + } } ] - }, - "additionalProperties": false + } } }, "mean_storage": { "type": "object", "description": "A storage holding 'profile'-style floating point counts, values, and variances.", + "required": ["type", "data"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "int" }, "data": { @@ -228,6 +235,8 @@ }, { "type": "object", + "required": ["counts", "values", "variances"], + "additionalProperties": false, "properties": { "counts": { "type": "array", "items": { "type": "number" } }, "values": { "type": "array", "items": { "type": "number" } }, @@ -241,6 +250,8 @@ "weighted_mean_storage": { "type": "object", "description": "A storage holding 
'profile'-style floating point ∑weights, ∑weights², values, and variances.", + "required": ["type", "data"], + "additionalProperties": false, "properties": { "type": { "type": "string", "pattern": "int" }, "data": { @@ -251,6 +262,13 @@ }, { "type": "object", + "required": [ + "sum_of_weights", + "sum_of_weights_squared", + "values", + "variances" + ], + "additionalProperties": false, "properties": { "sum_of_weights": { "type": "array", @@ -262,13 +280,11 @@ }, "values": { "type": "array", "items": { "type": "number" } }, "variances": { "type": "array", "items": { "type": "number" } } - }, - "additionalProperties": false + } } ] } - }, - "additionalProperties": false + } } } } From 35309652ee4365c9eeb8e1c0f711e672213e2e4e Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Mon, 31 Jul 2023 17:18:27 -0400 Subject: [PATCH 09/11] fix: use const instead of pattern Signed-off-by: Henry Schreiner --- src/uhi/resources/histogram.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index b1d6df4..6dab3ba 100644 --- a/src/uhi/resources/histogram.json +++ b/src/uhi/resources/histogram.json @@ -47,7 +47,7 @@ "required": ["type", "lower", "upper", "bins", "underflow", "overflow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "regular" }, + "type": { "type": "string", "const": "regular" }, "lower": { "type": "number", "description": "Lower edge of the axis." }, "upper": { "type": "number", "description": "Upper edge of the axis." 
}, "bins": { @@ -79,7 +79,7 @@ "required": ["type", "edges", "underflow", "overflow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "variable" }, + "type": { "type": "string", "const": "variable" }, "edges": { "oneOf": [ { @@ -107,7 +107,7 @@ "required": ["type", "categories", "flow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "str_category" }, + "type": { "type": "string", "const": "str_category" }, "categories": { "type": "array", "items": { "type": "string" }, @@ -129,7 +129,7 @@ "required": ["type", "categories", "flow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "int_category" }, + "type": { "type": "string", "const": "int_category" }, "categories": { "type": "array", "items": { "type": "integer" }, @@ -151,7 +151,7 @@ "required": ["type"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "boolean" }, + "type": { "type": "string", "const": "boolean" }, "metadata": { "type": "object", "description": "Arbitrary metadata dictionary." 
@@ -164,7 +164,7 @@ "required": ["type", "data"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "int" }, + "type": { "type": "string", "const": "int" }, "data": { "oneOf": [ { @@ -182,7 +182,7 @@ "required": ["type", "data"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "double" }, + "type": { "type": "string", "const": "double" }, "data": { "oneOf": [ { @@ -200,7 +200,7 @@ "required": ["type", "data"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "int" }, + "type": { "type": "string", "const": "weighted" }, "data": { "oneOf": [ { @@ -226,7 +226,7 @@ "required": ["type", "data"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "int" }, + "type": { "type": "string", "const": "mean" }, "data": { "oneOf": [ { @@ -253,7 +253,7 @@ "required": ["type", "data"], "additionalProperties": false, "properties": { - "type": { "type": "string", "pattern": "int" }, + "type": { "type": "string", "const": "weighted_mean" }, "data": { "oneOf": [ { From 020c8f532fc541e2b3108d93f338cfde4886b49e Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 1 Aug 2023 10:37:18 -0400 Subject: [PATCH 10/11] fix: address review comments Signed-off-by: Henry Schreiner --- docs/serialization.md | 14 +++++++------- src/uhi/resources/histogram.json | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/serialization.md b/docs/serialization.md index 40511de..cd2bddf 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -21,7 +21,7 @@ The following formats are being targeted: Other formats can be used as well, assuming they support out-of-band data and text attributes or files for the metadata. -## Drawbacks +## Caveats This structure was based heavily on boost-histogram, but it is intended to be general, and can be expanded in the future as needed.
As such, the following @@ -32,8 +32,8 @@ limitations are required: lost, etc. * Metadata must be expressible as JSON. It should also be reasonably sized; some formats like HDF5 may limit the size of attributes to 64K. -* Floating point variations could be introduced in storages, as the storage - format uses a stable but different representation. +* Floating point errors could be incurred on conversion, as the storage format + uses a stable but different representation. * Axis `name` is only part of the metadata, and is not standardized. This is due to lack of support from boost-histogram. @@ -49,15 +49,15 @@ The following axes types are supported: is either an in-line list of numbers or a string pointing to an out-of-band data source. Also has `underflow`, `overflow`, and `circular` properties. `circular` defaults to False if not present. -* `"int_category"`: A list of integer bins, non-continuous. Has `categories`, +* `"category_int"`: A list of integer bins, non-continuous. Has `categories`, which is an in-line list of integers. Also has `flow`. -* `"str_category"`: A list of string bins. Has `categories`, +* `"category_str"`: A list of string bins. Has `categories`, which is an in-line list of strings. Also has `flow`. -* `"bool"`: A true/false axis. +* `"boolean"`: A true/false axis. Axes with gaps are currently not supported. -All axes support `metadata`. +All axes support `metadata`, a string-keyed dictionary of arbitrary, JSON-like data.
The following storages are supported: diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json index 6dab3ba..e62ecbb 100644 --- a/src/uhi/resources/histogram.json +++ b/src/uhi/resources/histogram.json @@ -21,8 +21,8 @@ "oneOf": [ { "$ref": "#/$defs/regular_axis" }, { "$ref": "#/$defs/variable_axis" }, - { "$ref": "#/$defs/str_category_axis" }, - { "$ref": "#/$defs/int_category_axis" }, + { "$ref": "#/$defs/category_str_axis" }, + { "$ref": "#/$defs/category_int_axis" }, { "$ref": "#/$defs/boolean_axis" } ] } @@ -101,13 +101,13 @@ } } }, - "str_category_axis": { + "category_str_axis": { "type": "object", "description": "A set of string categorical bins.", "required": ["type", "categories", "flow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "const": "str_category" }, + "type": { "type": "string", "const": "category_str" }, "categories": { "type": "array", "items": { "type": "string" }, @@ -123,13 +123,13 @@ } } }, - "int_category_axis": { + "category_int_axis": { "type": "object", "description": "A set of integer categorical bins in any order.", "required": ["type", "categories", "flow"], "additionalProperties": false, "properties": { - "type": { "type": "string", "const": "int_category" }, + "type": { "type": "string", "const": "category_int" }, "categories": { "type": "array", "items": { "type": "integer" }, From 84272b6af0e6d99e6d3a215e049d97eb718824ab Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 17 Oct 2023 11:29:35 -0400 Subject: [PATCH 11/11] chore: add checks for schema Signed-off-by: Henry Schreiner --- .pre-commit-config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1250540..8c6796d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -71,3 +71,9 @@ repos: hooks: - id: check-readthedocs - id: check-github-workflows + - id: check-metaschema + files: ^src/uhi/resources/histogram.json$ + - id: 
check-jsonschema + name: Validate Histogram examples + args: [--schemafile, src/uhi/resources/histogram.json] + files: ^tests/resources/.*\.json