From e77af4608e82324fdd0cf3e85043cb75178e5b46 Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Thu, 11 Jul 2024 16:54:04 +0200 Subject: [PATCH] implement histograms with variable binning --- src/lgdo/lh5/_serializers/read/composite.py | 16 ++- src/lgdo/types/histogram.py | 135 ++++++++++++++------ tests/lh5/test_lh5_write.py | 46 +++++++ tests/types/test_histogram.py | 103 +++++++++++++-- 4 files changed, 242 insertions(+), 58 deletions(-) diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py index 16697714..7d3efd6e 100644 --- a/src/lgdo/lh5/_serializers/read/composite.py +++ b/src/lgdo/lh5/_serializers/read/composite.py @@ -418,11 +418,17 @@ def _h5_read_histogram( decompress, ) isdensity = struct.isdensity.value - binning = [ - (a.binedges.first, a.binedges.last, a.binedges.step, a.closedleft) - for _, a in struct.binning.items() - ] - binning = [tuple(v.value for v in b) for b in binning] + binning = [] + for _, a in struct.binning.items(): + be = a.binedges + if isinstance(be, Struct): + b = (None, be.first.value, be.last.value, be.step.value, a.closedleft.value) + elif isinstance(be, Array): + b = (be, None, None, None, a.closedleft.value) + else: + msg = "unexpected binning of histogram" + raise LH5DecodeError(msg, h5g) + binning.append(Histogram.Axis(*b)) weights = struct.weights.view_as("np") histogram = Histogram(weights, binning, isdensity) diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index 14203ec6..e2017df6 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -15,9 +15,10 @@ class Histogram(Struct): class Axis(Struct): def __init__( self, - first: float, - last: float, - step: float, + edges: np.ndarray | Array | None, + first: float | None, + last: float | None, + step: float | None, closedleft: bool = True, binedge_attrs: dict[str, Any] | None = None, ) -> None: @@ -39,38 +40,67 @@ def __init__( attributes that will be added to the ``binedges`` LGDO that is part of the axis struct. """ - super().__init__( - { - "binedges": Struct( - { - "first": Scalar(first), - "last": Scalar(last), - "step": Scalar(step), - }, - binedge_attrs, - ), - "closedleft": Scalar(closedleft), - }, - ) + if edges is not None and ( + first is not None or last is not None or step is not None + ): + msg = "can only construct Axis either from edges or from range" + raise ValueError(msg) + if edges is None and (first is None or last is None or step is None): + msg = "did not pass all range parameters" + raise ValueError(msg) + + if edges is None: + edges = Struct( + { + "first": Scalar(first), + "last": Scalar(last), + "step": Scalar(step), + }, + binedge_attrs, + ) + else: + if not isinstance(edges, Array): + edges = Array(edges, attrs=binedge_attrs) + elif binedge_attrs is not None: + msg = "passed both binedge types.Array instance and binedge_attrs" + raise ValueError(msg) + + if len(edges.nda.shape) != 1: + msg = "must pass an array<1>{real} as edges vector" + raise ValueError(msg) + + super().__init__({"binedges": edges, "closedleft": Scalar(closedleft)}) @classmethod def from_edges(cls, edges: np.ndarray) -> Histogram.Axis: edge_diff = np.diff(edges) if np.any(~np.isclose(edge_diff, edge_diff[0])): - msg = "only evenly distributed edges can be converted" - raise ValueError(msg) - return cls(edges[0], edges[-1], edge_diff[0], True) + return cls(edges, None, None, None, True) + return cls(None, edges[0], edges[-1], edge_diff[0], True) + + @property + def is_range(self) -> bool: + return isinstance(self["binedges"], Struct) @property def first(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["first"].value @property def last(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["last"].value @property def step(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["step"].value @property @@ -79,16 +109,35 @@ def closedleft(self) -> bool: @property def nbins(self) -> int: - bins = (self.last - self.first) / self.step - bins_int = int(np.rint(bins)) - assert np.isclose(bins, bins_int) - return bins_int + if self.is_range: + bins = (self.last - self.first) / self.step + bins_int = int(np.rint(bins)) + assert np.isclose(bins, bins_int) + return bins_int + return len(self["binedges"].nda) - 1 + + @property + def edges(self) -> np.ndarray: + if self.is_range: + return np.linspace(self.first, self.last, self.nbins + 1) + return self["binedges"].nda def __str__(self) -> str: - return ( - f"first={self.first}, last={self.last}, step={self.step}, " - + f"closedleft={self.closedleft}" - ) + thr_orig = np.get_printoptions()["threshold"] + np.set_printoptions(threshold=8) + + if self.is_range: + string = f"first={self.first}, last={self.last}, step={self.step}" + else: + string = f"edges={self.edges}" + string += f", closedleft={self.closedleft}" + + attrs = self["binedges"].getattrs() + if attrs: + string += f" with attrs={attrs}" + + np.set_printoptions(threshold=thr_orig) + return string def __init__( self, @@ -96,7 +145,7 @@ def __init__( binning: None | list[Histogram.Axis] | list[np.ndarray] - | list[tuple[float, float, float, bool]] = None, + | list[tuple[float, float, float]] = None, isdensity: bool = False, attrs: dict[str, Any] | None = None, ) -> None: @@ -112,7 +161,7 @@ def __init__( binning * has to by None if a :class:`hist.Hist` has been passed as ``weights`` * can be a list of pre-initialized :class:`Histogram.Axis` - * can be a list of tuples, representing arguments to :meth:`Histogram.Axis.__init__()` + * can be a list of tuples, representing a range, ``(first, last, step)`` * can be a list of numpy arrays, as returned by :func:`numpy.histogramdd`. """ if isinstance(weights, hist.Hist): @@ -127,15 +176,15 @@ def __init__( msg = "flow bins of hist.Hist cannot be represented" raise ValueError(msg) weights_view = weights.view(flow=False) - if not isinstance(weights_view, np.ndarray): - msg = "only numpy-backed storages can be used in a hist.Hist" + if type(weights_view) is not np.ndarray: + msg = "only simple numpy-backed storages can be used in a hist.Hist" raise ValueError(msg) w = Array(weights_view) b = [] for ax in weights.axes: - if not isinstance(ax, hist.axis.Regular): - msg = "only regular axes of hist.Hist can be converted" + if not isinstance(ax, (hist.axis.Regular, hist.axis.Variable)): + msg = "only regular or variable axes of hist.Hist can be converted" raise ValueError(msg) b.append(Histogram.Axis.from_edges(ax.edges)) b = self._create_binning(b) @@ -150,7 +199,7 @@ def __init__( elif all(isinstance(ax, np.ndarray) for ax in binning): b = [Histogram.Axis.from_edges(ax) for ax in binning] elif all(isinstance(ax, tuple) for ax in binning): - b = [Histogram.Axis(*ax) for ax in binning] + b = [Histogram.Axis(None, *ax, True) for ax in binning] else: msg = "invalid binning object passed" raise ValueError(msg) @@ -254,23 +303,27 @@ def view_as( if not a.closedleft: msg = "hist.Hist cannot represent right-closed intervals" raise ValueError(msg) - hist_axes.append( - hist.axis.Regular( + if a.is_range: + hist_ax = hist.axis.Regular( bins=a.nbins, start=a.first, stop=a.last, underflow=False, overflow=False, ) - ) + else: + hist_ax = hist.axis.Variable( + a.edges, + underflow=False, + overflow=False, + ) + hist_axes.append(hist_ax) return hist.Hist(*hist_axes, data=self.weights.view_as("np")) if library == "np": - edges = [] - for a in self.binning: - edges.append(np.linspace(a.first, a.last, a.nbins)) - return self.weights.view_as("np"), tuple(edges) + edges = tuple([a.edges for a in self.binning]) + return self.weights.view_as("np"), edges msg = f"{library!r} is not a supported third-party format." raise TypeError(msg) diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index e4152b29..7955e641 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -454,3 +454,49 @@ def test_write_histogram(caplog, tmptestdir): write_start=1, group="my_group", ) + + +def test_write_histogram_variable(caplog, tmptestdir): + caplog.set_level(logging.DEBUG) + caplog.clear() + + # Start with an types.Histogram + if os.path.exists(f"{tmptestdir}/write_histogram_test.lh5"): + os.remove(f"{tmptestdir}/write_histogram_test.lh5") + + h1 = types.Histogram( + np.array([[1, 1], [1, 1]]), (np.array([0, 1.2, 2]), np.array([2.1, 2.5, 2.3])) + ) + h2 = types.Histogram( + np.array([[10, 10], [10, 10]]), + (np.array([2, 3.5, 4]), np.array([5, 6.5, 7])), + isdensity=True, + ) # Same field name, different values + store = lh5.LH5Store() + store.write( + h1, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + group="my_group", + wo_mode="write_safe", + ) + store.write( + h2, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + wo_mode="overwrite", + group="my_group", + ) + + # Now, check that the data were overwritten + h3, _ = store.read( + "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" + ) + assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) + assert np.array_equal(h3.binning[0].edges, np.array([2, 3.5, 4])) + with pytest.raises(TypeError): + x = h3.binning[0].first + with pytest.raises(TypeError): + x = h3.binning[1].last # noqa: F841 + assert not h3.binning[0].is_range + assert h3.isdensity diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index 5637dc20..f3623aed 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -4,10 +4,10 @@ import numpy as np import pytest -from lgdo import Histogram, Scalar +from lgdo import Array, Histogram, Scalar -def test_init_hist(): +def test_init_hist_regular(): h = hist.Hist( hist.axis.Regular(bins=10, start=0, stop=1, name="x"), hist.axis.Regular(bins=10, start=0, stop=1, name="y"), @@ -19,7 +19,7 @@ def test_init_hist(): assert len(h.binning) == 2 h = hist.Hist(hist.axis.Integer(start=0, stop=10)) - with pytest.raises(ValueError, match="only regular axes"): + with pytest.raises(ValueError, match="only regular or variable axes"): h = Histogram(h) h = hist.Hist(hist.axis.Regular(bins=10, start=0, stop=1)) @@ -40,6 +40,25 @@ def test_init_hist(): h.fill([1, 2, 3]) assert np.sum(hi.weights.nda) == 6 + h = hist.Hist( + hist.axis.Regular(bins=10, start=0, stop=10), storage=hist.storage.Mean() + ) + h.fill([1, 2, 3], sample=[4, 4, 4]) + with pytest.raises(ValueError, match="simple numpy-backed storages"): + hi = Histogram(h) + + +def test_init_hist_variable(): + h = hist.Hist( + hist.axis.Variable((0, 0.5, 5), name="x"), + hist.axis.Variable((0, 0.5, 5), name="y"), + ) + rng = np.random.default_rng() + h.fill(rng.uniform(size=500), rng.uniform(size=500)) + h = Histogram(h) + + assert len(h.binning) == 2 + def test_init_np(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) @@ -60,8 +79,7 @@ def test_init_np(): np.array([[1, 1], [1, 1]]), (np.array([0, 1, 2]), np.array([0, 1])) ) - with pytest.raises(ValueError, match="only evenly"): - h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) def test_datatype_name(): @@ -84,6 +102,7 @@ def test_axes(): assert ax0.last == 2 assert ax0.step == 1 assert isinstance(ax0.nbins, int) + assert len(ax0.edges) == 3 assert str(ax0) == "first=0, last=2, step=1, closedleft=True" ax1 = h.binning[1] @@ -92,11 +111,17 @@ def test_axes(): assert np.isclose(ax1.step, 0.1) assert isinstance(ax0.nbins, int) - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [(1, 3, 1), (4, 6, 1)], + ) ax0 = h.binning[0] str(h) - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [Histogram.Axis(None, 1, 3, 1), Histogram.Axis(None, 4, 6, 1, False)], + ) ax0 = h.binning[0] str(h) @@ -105,32 +130,86 @@ def test_axes(): h = Histogram( np.array([[1, 1], [1, 1]]), - [Histogram.Axis(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], + [Histogram.Axis(None, 1, 3, 1, True), Histogram.Axis(None, 4, 6, 1, False)], ) with pytest.raises(ValueError, match="invalid binning object"): h = Histogram( np.array([[1, 1], [1, 1]]), - [(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], + [(1, 3, 1), Histogram.Axis(None, 4, 6, 1, False)], + ) + + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + with pytest.raises(TypeError, match="range"): + x = h.binning[0].first + with pytest.raises(TypeError, match="range"): + x = h.binning[0].last + with pytest.raises(TypeError, match="range"): + x = h.binning[0].step # noqa: F841 + assert h.binning[0].nbins == 3 + assert str(h.binning[0]) == "edges=[0. 1. 2.5 3. ], closedleft=True" + + Histogram.Axis( + np.array([0, 1, 2.5, 3]), None, None, None, binedge_attrs={"units": "m"} + ) + + with pytest.raises(ValueError, match="binedge_attrs"): + Histogram.Axis( + Array(np.array([0, 1, 2.5, 3])), + None, + None, + None, + binedge_attrs={"units": "m"}, ) + with pytest.raises(ValueError, match=r"array<1>\{real\}"): + Histogram.Axis(Array(np.array([[0, 1], [2.5, 3]])), None, None, None) + + ax = Histogram.Axis( + np.array([0, 1, 2.5, 3]), + None, + None, + None, + binedge_attrs={"units": "m"}, + ) + assert ( + str(ax) == "edges=[0. 1. 2.5 3. ], closedleft=True with attrs={'units': 'm'}" + ) + + with pytest.raises(ValueError, match="either from edges or from range"): + Histogram.Axis(np.array([0, 1, 2.5, 3]), 0, 1, None) + with pytest.raises(ValueError, match="all range parameters"): + Histogram.Axis(None, 0, 1, None) def test_view_as_hist(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) - h.view_as("hist") + hi = h.view_as("hist") + assert isinstance(hi.axes[0], hist.axis.Regular) h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),), isdensity=True) with pytest.raises(ValueError, match="cannot represent density"): h.view_as("hist") - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [Histogram.Axis(None, 1, 3, 1, True), Histogram.Axis(None, 4, 6, 1, False)], + ) with pytest.raises(ValueError, match="cannot represent right-closed"): h.view_as("hist") + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + hi = h.view_as("hist") + assert isinstance(hi.axes[0], hist.axis.Variable) + def test_view_as_np(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) - h.view_as("np") + assert h.binning[0].is_range + assert h.binning[0].nbins == 2 + nda, axes = h.view_as("np") + assert isinstance(nda, np.ndarray) + assert len(axes) == 1 + assert np.array_equal(axes[0], np.array([0, 1, 2])) def test_not_like_table():