From d1444a4f9edfd6572617bae9d0fc41cf898831c5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:18:07 +0100 Subject: [PATCH 1/2] chore: update pre-commit hooks (#21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.6 → v0.5.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.6...v0.5.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7bbd928..98bd64b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: args: [--prose-wrap=always] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.5.6" + rev: "v0.5.7" hooks: - id: ruff args: ["--fix", "--show-fixes"] From bcc7a947eb3d46141e719499ebc2b0ce4ee6056d Mon Sep 17 00:00:00 2001 From: Saransh Date: Thu, 15 Aug 2024 05:50:18 +0530 Subject: [PATCH 2/2] refactor!: match API with Scikit-HEP --- .pre-commit-config.yaml | 1 + README.md | 82 +++ src/cuda_histogram/axis/__init__.py | 160 ++++-- src/cuda_histogram/hist.py | 781 ++++++---------------------- 4 files changed, 342 insertions(+), 682 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 98bd64b..f25e20b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ repos: rev: "1.18.0" hooks: - id: blacken-docs + args: ["-E"] additional_dependencies: [black==24.*] - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/README.md b/README.md index 5ff37b5..8594c6c 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,88 @@ The package is under active development at the moment. +## Example + +```py +In [1]: import cuda_histogram; import cupy as cp + +In [2]: a = cuda_histogram.axis.Regular(10, 0, 1) + +In [3]: b = cuda_histogram.axis.Variable([0, 2, 3, 6]) + +In [4]: c = cuda_histogram.Hist(a, b) + +In [5]: a, b, c +Out[5]: +(Regular(10, 0, 1), + Variable([0. 2. 3. 6.]), + Hist(Regular(10, 0, 1), Variable([0. 2. 3. 6.]))) + +In [6]: c.fill(cp.random.normal(size=1_000_000), cp.random.normal(size=1_000_000)) + +In [7]: c.values(), type(c.values()) +Out[7]: +(array([[28493., 1282., 96.], + [29645., 1366., 91.], + [30465., 1397., 80.], + [31537., 1473., 81.], + [32608., 1454., 102.], + [33015., 1440., 83.], + [33992., 1482., 87.], + [34388., 1482., 111.], + [34551., 1517., 90.], + [35177., 1515., 85.]]), + cupy.ndarray) + +In [8]: c[0, 0], type(c[0, 0]) +Out[8]: (array(28493.), cupy.ndarray) + +In [9]: c[0:2, 0], type(c[0, 0]) # should ideally return a reduced histogram +Out[9]: (array([28493., 29645.]), cupy.ndarray) + +In [10]: c.to_boost() +Out[10]: +Histogram( + Regular(10, 0, 1), + Variable([0, 2, 3, 6]), + storage=Double()) # Sum: 339185.0 (945991.0 with flow) + +In [11]: c.to_boost().values(), type(c.to_boost().values()) +Out[11]: +(array([[28493., 1282., 96.], + [29645., 1366., 91.], + [30465., 1397., 80.], + [31537., 1473., 81.], + [32608., 1454., 102.], + [33015., 1440., 83.], + [33992., 1482., 87.], + [34388., 1482., 111.], + [34551., 1517., 90.], + [35177., 1515., 85.]]), + numpy.ndarray) + +In [12]: c.to_hist() +Out[12]: +Hist( + Regular(10, 0, 1, label='Axis 0'), + Variable([0, 2, 3, 6], label='Axis 1'), + storage=Double()) # Sum: 339185.0 (945991.0 with flow) + +In [13]: c.to_hist().values(), type(c.to_hist().values()) +Out[13]: +(array([[28493., 1282., 96.], + [29645., 1366., 91.], + [30465., 1397., 80.], + [31537., 1473., 81.], + [32608., 1454., 102.], + [33015., 1440., 83.], + [33992., 1482., 87.], + [34388., 1482., 111.], + [34551., 1517., 90.], + [35177., 1515., 85.]]), + numpy.ndarray) +``` + [actions-badge]: https://github.com/Saransh-cpp/cuda-histogram/workflows/CI/badge.svg [actions-link]: https://github.com/Saransh-cpp/cuda-histogram/actions diff --git a/src/cuda_histogram/axis/__init__.py b/src/cuda_histogram/axis/__init__.py index 0613cc2..50bced5 100644 --- a/src/cuda_histogram/axis/__init__.py +++ b/src/cuda_histogram/axis/__init__.py @@ -4,13 +4,13 @@ import numbers import re import warnings +from typing import Iterable import awkward import cupy import numpy as np __all__: list[str] = [ - "Bin", "Regular", "Variable", "Cat", @@ -32,21 +32,11 @@ ) -def _overflow_behavior(overflow): - if overflow == "none": +def _overflow_behavior(overflow: bool): + if not overflow: return slice(1, -2) - elif overflow == "under": - return slice(None, -2) - elif overflow == "over": - return slice(1, -1) - elif overflow == "all": - return slice(None, -1) - elif overflow == "allnan": - return slice(None) - elif overflow == "justnan": - return slice(-1, None) else: - raise ValueError(f"Unrecognized overflow behavior: {overflow}") + return slice(None, None) @functools.total_ordering @@ -397,9 +387,6 @@ class DenseAxis(Axis): **_ireduce(slice)** - return a slice or list of indices, input slice to be interpred as values **reduced(islice)** - return a new axis with binning corresponding to the index slice (from _ireduce) - - TODO: hasoverflow(), not all dense axes might have an overflow concept, - currently it is implicitly assumed they do (as the only dense type is a numeric axis) """ @@ -408,10 +395,6 @@ class Bin(DenseAxis): Parameters ---------- - name : str - is used as a keyword in histogram filling, immutable - label : str - describes the meaning of the axis, can be changed n_or_arr : int or list or np.ndarray Integer number of bins, if uniform binning. Otherwise, a list or numpy 1D array of bin boundaries. @@ -419,14 +402,17 @@ class Bin(DenseAxis): lower boundary of bin range, if uniform binning hi : float, optional upper boundary of bin range, if uniform binning + name : str + is used as a keyword in histogram filling, immutable + label : str + describes the meaning of the axis, can be changed This axis will generate frequencies for n+3 bins, special bin indices: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` Bin boundaries are [lo, hi) """ - def __init__(self, name, label, n_or_arr, lo=None, hi=None): - super().__init__(name, label) + def __init__(self, n_or_arr, lo=None, hi=None, *, name="", label=""): self._lazy_intervals = None if isinstance(n_or_arr, (list, np.ndarray, cupy.ndarray)): self._uniform = False @@ -440,10 +426,6 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None): self._interval_bins = cupy.r_[-cupy.inf, self._bins, cupy.nan] self._bin_names = np.full(self._interval_bins[:-1].size, None) elif isinstance(n_or_arr, numbers.Integral): - if lo is None or hi is None: - raise TypeError( - "Interpreting n_or_arr as uniform binning, please specify lo and hi values" - ) self._uniform = True self._lo = lo self._hi = hi @@ -455,10 +437,16 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None): cupy.nan, ] self._bin_names = np.full(self._interval_bins[:-1].size, None) - else: - raise TypeError( - f"Cannot understand n_or_arr (nbins or binning array) type {n_or_arr!r}" - ) + self._label = label + self._name = name + + def __repr__(self): + class_name = self.__class__.__name__ + return ( + f"{class_name}({self._bins[:-1]})" + if not self._uniform + else f"{class_name}{self._bins, self._lo, self._hi}" + ) @property def _intervals(self): @@ -480,11 +468,6 @@ def __getstate__(self): return self.__dict__ def __setstate__(self, d): - if "_intervals" in d: # convert old hists to new serialization format - _old_intervals = d.pop("_intervals") - interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi] - d["_interval_bins"] = cupy.array(interval_bins) - d["_bin_names"] = np.array([interval._label for interval in _old_intervals]) if "_interval_bins" in d and "_bin_names" not in d: d["_bin_names"] = np.full(d["_interval_bins"][:-1].size, None) self.__dict__ = d @@ -501,7 +484,9 @@ def index(self, identifier): Returns an integer corresponding to the index in the axis where the histogram would be filled. The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` """ - isarray = isinstance(identifier, (awkward.Array, cupy.ndarray, np.ndarray)) + isarray = isinstance( + identifier, (awkward.Array, cupy.ndarray, np.ndarray, list) + ) if isarray or isinstance(identifier, numbers.Number): identifier = awkward.to_cupy(identifier) # cupy.asarray(identifier) if self._uniform: @@ -638,10 +623,8 @@ def _ireduce(self, the_slice): def reduced(self, islice): """Return a new axis with reduced binning - The new binning corresponds to the slice made on this axis. Overflow will be taken care of by ``Hist.__getitem__`` - Parameters ---------- islice : slice @@ -669,22 +652,23 @@ def reduced(self, islice): # TODO: remove this once satisfied it works rbins = (hi - lo) * self._bins / (self._hi - self._lo) assert abs(bins - rbins) < 1e-14, "%d %f %r" % (bins, rbins, self) - return Bin(self._name, self._label, bins, lo, hi) + return Regular(bins, lo, hi, name=self._name, label=self._label) else: lo = None if islice.start is None else islice.start - 1 hi = -1 if islice.stop is None else islice.stop bins = self._bins[slice(lo, hi)] - return Bin(self._name, self._label, bins) + return Variable(bins, name=self._name, label=self._label) @property def size(self): - """Number of bins, including overflow (i.e. ``n + 3``)""" - if self._uniform: - return self._bins + 3 - # (inf added at constructor) - return len(self._bins) + 1 + """Number of bins""" + return ( + self._bins + if isinstance(self._bins, (int, np.integer, cupy.integer)) + else len(self._bins) + ) - def edges(self, overflow="none"): + def edges(self, flow=False): """Bin boundaries Parameters @@ -700,9 +684,9 @@ def edges(self, overflow="none"): out = cupy.r_[ 2 * out[0] - out[1], out, 2 * out[-1] - out[-2], 3 * out[-1] - 2 * out[-2] ] - return out[_overflow_behavior(overflow)] + return out[_overflow_behavior(flow)] - def centers(self, overflow="none"): + def centers(self, flow=False): """Bin centers Parameters @@ -711,9 +695,81 @@ def centers(self, overflow="none"): Create overflow and/or underflow bins by adding a bin of same width to each end. See `Hist.sum` description for the allowed values. """ - edges = self.edges(overflow) + edges = self.edges(flow) return (edges[:-1] + edges[1:]) / 2 - def identifiers(self, overflow="none"): + def identifiers(self, flow=False): """List of `Interval` identifiers""" - return self._intervals[_overflow_behavior(overflow)] + return self._intervals[_overflow_behavior(flow)] + + +class Regular(Bin): + """A binned axis with name, label, and binning. + + Parameters + ---------- + name : str + is used as a keyword in histogram filling, immutable + label : str + describes the meaning of the axis, can be changed + n_or_arr : int or list or np.ndarray + Integer number of bins, if uniform binning. Otherwise, a list or + numpy 1D array of bin boundaries. + lo : float, optional + lower boundary of bin range, if uniform binning + hi : float, optional + upper boundary of bin range, if uniform binning + + This axis will generate frequencies for n+3 bins, special bin indices: + ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + Bin boundaries are [lo, hi) + """ + + def __init__( + self, + bins: int, + start: float, + stop: float, + *, + name: str = "", + label: str = "", + ) -> None: + super().__init__( + bins, + start, + stop, + name=name, + label=label, + ) + + +class Variable(Bin): + """A binned axis with name, label, and binning. + + Parameters + ---------- + name : str + is used as a keyword in histogram filling, immutable + label : str + describes the meaning of the axis, can be changed + n_or_arr : int or list or np.ndarray + Integer number of bins, if uniform binning. Otherwise, a list or + numpy 1D array of bin boundaries. + lo : float, optional + lower boundary of bin range, if uniform binning + hi : float, optional + upper boundary of bin range, if uniform binning + + This axis will generate frequencies for n+3 bins, special bin indices: + ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + Bin boundaries are [lo, hi) + """ + + def __init__( + self, + edges: Iterable[float], + *, + name: str = "", + label: str = "", + ) -> None: + super().__init__(edges, name=name, label=label) diff --git a/src/cuda_histogram/hist.py b/src/cuda_histogram/hist.py index 1f28c2e..527d2ed 100644 --- a/src/cuda_histogram/hist.py +++ b/src/cuda_histogram/hist.py @@ -1,23 +1,17 @@ from __future__ import annotations -import copy -import numbers -import warnings -from abc import ABCMeta, abstractmethod from collections import namedtuple -from collections.abc import Sequence -import awkward import cupy import numpy as np from cuda_histogram.axis import ( Axis, - Bin, Cat, DenseAxis, - Interval, + Regular, SparseAxis, + Variable, _overflow_behavior, ) @@ -55,177 +49,55 @@ def _assemble_blocks(array, ndslice, depth=0): return slist -class AccumulatorABC(metaclass=ABCMeta): - """Abstract base class for an accumulator - - Accumulators are abstract objects that enable the reduce stage of the typical map-reduce - scaleout that we do in Coffea. One concrete example is a histogram. The idea is that an - accumulator definition holds enough information to be able to create an empty accumulator - (the ``identity()`` method) and add two compatible accumulators together (the ``add()`` method). - The former is not strictly necessary, but helps with book-keeping. Here we show an example usage - of a few accumulator types. An arbitrary-depth nesting of dictionary accumulators is supported, much - like the behavior of directories in ROOT hadd. - - After defining an accumulator:: - - from coffea.processor import dict_accumulator, column_accumulator, defaultdict_accumulator - from cuda_histogram import Hist, Bin - import numpy as np - - adef = dict_accumulator({ - 'cutflow': defaultdict_accumulator(int), - 'pt': Hist("counts", Bin("pt", "$p_T$", 100, 0, 100)), - 'final_pt': column_accumulator(np.zeros(shape=(0,))), - }) - - Notice that this function does not mutate ``adef``:: - - def fill(n): - ptvals = np.random.exponential(scale=30, size=n) - cut = ptvals > 200. - acc = adef.identity() - acc['cutflow']['pt>200'] += cut.sum() - acc['pt'].fill(pt=ptvals) - acc['final_pt'] += column_accumulator(ptvals[cut]) - return acc - - As such, we can execute it several times in parallel and reduce the result:: - - import concurrent.futures - with concurrent.futures.ThreadPoolExecutor() as executor: - outputs = executor.map(fill, [2000, 2000]) - - combined = sum(outputs, adef.identity()) - - - Derived classes must implement - - ``identity()``: returns a new object of same type as self, - such that ``self + self.identity() == self`` - - ``add(other)``: adds an object of same type as self to self - - Concrete implementations are then provided for ``__add__``, ``__radd__``, and ``__iadd__``. +class Hist: """ + Construct a new histogram. - @abstractmethod - def identity(self): - """Identity of the accumulator - - A value such that any other value added to it will return - the other value - """ - - @abstractmethod - def add(self, other): - """Add another accumulator to this one in-place""" - - def __add__(self, other): - ret = self.identity() - ret.add(self) - ret.add(other) - return ret - - def __radd__(self, other): - ret = self.identity() - ret.add(other) - ret.add(self) - return ret - - def __iadd__(self, other): - self.add(other) - return self - - -class Hist(AccumulatorABC): - """ - Specify a multidimensional histogram. + If you pass in a single argument, this will be treated as a + histogram and this will convert the histogram to this type of + histogram. Parameters ---------- - label : str - A description of the meaning of the sum of weights - ``*axes`` - positional list of `Cat` or `Bin` objects, denoting the axes of the histogram - axes : collections.abc.Sequence - list of `Cat` or `Bin` objects, denoting the axes of the histogram (overridden by ``*axes``) - dtype : str - Underlying numpy dtype to use for storing sum of weights - - Examples - -------- - - Creating a histogram with a sparse axis, and two dense axes:: - - import cuda_histogram as chist - - h = chist.Hist("Observed bird count", - chist.Cat("species", "Bird species"), - chist.Bin("x", "x coordinate [m]", 20, -5, 5), - chist.Bin("y", "y coordinate [m]", 20, -5, 5), - ) - - # or - - h = chist.Hist(label="Observed bird count", - axes=(chist.Cat("species", "Bird species"), - chist.Bin("x", "x coordinate [m]", 20, -5, 5), - chist.Bin("y", "y coordinate [m]", 20, -5, 5), - ) - ) - - # or - - h = chist.Hist(axes=[chist.Cat("species", "Bird species"), - chist.Bin("x", "x coordinate [m]", 20, -5, 5), - chist.Bin("y", "y coordinate [m]", 20, -5, 5), - ], - label="Observed bird count", - ) - - which produces: - - >>> h - - + *args : Axis + Provide 1 or more axis instances. + label: str = None + Histogram's label + name: str = None + Histogram's name """ - #: Default numpy dtype to store sum of weights DEFAULT_DTYPE = "d" - def __init__(self, label, *axes, **kwargs): - if not isinstance(label, str): - raise TypeError("label must be a string") + def __init__( + self, + *axes, + label: str | None = None, + name: str | None = None, + ) -> None: self._label = label - self._dtype = kwargs.get("dtype", Hist.DEFAULT_DTYPE) + self._name = name + self._dtype = Hist.DEFAULT_DTYPE self._axes = axes - if len(axes) == 0 and "axes" in kwargs: - if not isinstance(kwargs["axes"], Sequence): - raise TypeError("axes must be a sequence type! (tuple, list, etc.)") - self._axes = tuple(kwargs["axes"]) - elif len(axes) != 0 and "axes" in kwargs: - warnings.warn( - "axes defined by both positional arguments and keyword argument, using positional arguments" - ) if not all(isinstance(ax, Axis) for ax in self._axes): - del self._axes raise TypeError("All axes must be derived from Axis class") - # if we stably partition axes to sparse, then dense, some things simplify - # ..but then the user would then see the order change under them self._dense_shape = tuple( - [ax.size for ax in self._axes if isinstance(ax, DenseAxis)] + [ + ax.size + 3 if isinstance(ax, Regular) else ax.size + 1 + for ax in self._axes + if isinstance(ax, DenseAxis) + ] ) - if np.prod(self._dense_shape) > 10000000: - warnings.warn("Allocating a large (>10M bin) histogram!", RuntimeWarning) self._sumw = {} # Storage of sumw2 starts at first use of weight keyword in fill() self._sumw2 = None def __repr__(self): - return "<{} ({}) instance at 0x{:0x}>".format( - self.__class__.__name__, - ",".join(d.name for d in self.axes()), - id(self), - ) + repr_str = "Hist(" + for ax in self._axes: + repr_str += f"{ax!r}, " + return f"{repr_str[:-2]})" @property def label(self): @@ -236,31 +108,14 @@ def label(self): def label(self, label): self._label = label - def copy(self, content=True): - """Create a deep copy - - Parameters - ---------- - content : bool - If set false, only the histogram definition is copied, resetting - the sum of weights to zero - """ - out = Hist(self._label, *self._axes, dtype=self._dtype) - if self._sumw2 is not None: - out._sumw2 = {} - if content: - out._sumw = copy.deepcopy(self._sumw) - out._sumw2 = copy.deepcopy(self._sumw2) - return out - - def identity(self): - """The identity (zero value) of this accumulator""" - return self.copy(content=False) + @property + def name(self): + """A label describing the meaning of the sum of weights""" + return self._name - def clear(self): - """Clear all content in this histogram""" - self._sumw = {} - self._sumw2 = None + @label.setter + def name(self, name): # noqa: F811 + self._name = name def axis(self, axis_name): """Get an ``Axis`` object""" @@ -272,11 +127,6 @@ def axes(self): """Get all axes in this histogram""" return self._axes - @property - def fields(self): - """This is a stub for histbook compatibility""" - return [ax.name for ax in self._axes] - def dim(self): """Dimension of this histogram (number of axes)""" return len(self._axes) @@ -285,25 +135,10 @@ def dense_dim(self): """Dense dimension of this histogram (number of non-sparse axes)""" return len(self._dense_shape) - def sparse_dim(self): - """Sparse dimension of this histogram (number of sparse axes)""" - return self.dim() - self.dense_dim() - - def dense_axes(self): - """All dense axes""" - return [ax for ax in self._axes if isinstance(ax, DenseAxis)] - def sparse_axes(self): """All sparse axes""" return [ax for ax in self._axes if isinstance(ax, SparseAxis)] - def sparse_nbins(self): - """Total number of sparse bins""" - return len(self._sumw) - - def _idense(self, axis): - return self.dense_axes().index(axis) - def _isparse(self, axis): return self.sparse_axes().index(axis) @@ -312,97 +147,62 @@ def _init_sumw2(self): for key in self._sumw: self._sumw2[key] = self._sumw[key].copy() - def compatible(self, other): - """Checks if this histogram is compatible with another, i.e. they have identical binning""" - if self.dim() != other.dim(): - return False - if {d.name for d in self.sparse_axes()} != { - d.name for d in other.sparse_axes() - }: - return False - if not all(d1 == d2 for d1, d2 in zip(self.dense_axes(), other.dense_axes())): # noqa: SIM103 - return False - return True - - def add(self, other): - """Add another histogram into this one, in-place""" - if not self.compatible(other): - raise ValueError( - f"Cannot add this histogram with histogram {other!r} of dissimilar dimensions" - ) - - raxes = other.sparse_axes() - - def add_dict(left, right): - for rkey in right: - lkey = tuple( - self.axis(rax).index(rax[ridx]) for rax, ridx in zip(raxes, rkey) - ) - if lkey in left: - left[lkey] += right[rkey] - else: - left[lkey] = copy.deepcopy(right[rkey]) - - if self._sumw2 is None and other._sumw2 is None: - pass - elif self._sumw2 is None: - self._init_sumw2() - add_dict(self._sumw2, other._sumw2) - elif other._sumw2 is None: - add_dict(self._sumw2, other._sumw) - else: - add_dict(self._sumw2, other._sumw2) - add_dict(self._sumw, other._sumw) - return self - + # fix this def __getitem__(self, keys): - if not isinstance(keys, tuple): - keys = (keys,) - if len(keys) > self.dim(): - raise IndexError("Too many indices for this histogram") - elif len(keys) < self.dim(): - if Ellipsis in keys: - idx = keys.index(Ellipsis) - slices = (slice(None),) * (self.dim() - len(keys) + 1) - keys = keys[:idx] + slices + keys[idx + 1 :] - else: - slices = (slice(None),) * (self.dim() - len(keys)) - keys += slices - sparse_idx = [] - dense_idx = [] - new_dims = [] - for s, ax in zip(keys, self._axes): - if isinstance(ax, SparseAxis): - sparse_idx.append(ax._ireduce(s)) - new_dims.append(ax) - else: - islice = ax._ireduce(s) - dense_idx.append(islice) - new_dims.append(ax.reduced(islice)) - dense_idx = tuple(dense_idx) - - def dense_op(array): - as_numpy = array.get() - blocked = np.block(_assemble_blocks(as_numpy, dense_idx)) - return cupy.asarray(blocked) - - out = Hist(self._label, *new_dims, dtype=self._dtype) - if self._sumw2 is not None: - out._init_sumw2() - for sparse_key in self._sumw: - if not all(k in idx for k, idx in zip(sparse_key, sparse_idx)): - continue - if sparse_key in out._sumw: - out._sumw[sparse_key] += dense_op(self._sumw[sparse_key]) - if self._sumw2 is not None: - out._sumw2[sparse_key] += dense_op(self._sumw2[sparse_key]) - else: - out._sumw[sparse_key] = dense_op(self._sumw[sparse_key]).copy() - if self._sumw2 is not None: - out._sumw2[sparse_key] = dense_op(self._sumw2[sparse_key]).copy() - return out - - def fill(self, **values): + if isinstance(keys, slice) and not all( + isinstance(s, int) or s is None for s in [keys.start, keys.stop, keys.step] + ): + raise ValueError("use to_boost/to_hist to access other UHI functionalities") + if not isinstance(keys, slice) and not isinstance(keys, (int, tuple)): + raise ValueError("use to_boost/to_hist to access other UHI functionalities") + return self.values()[keys] + # if not isinstance(keys, tuple): + # keys = (keys,) + # if len(keys) > self.dim(): + # raise IndexError("Too many indices for this histogram") + # elif len(keys) < self.dim(): + # if Ellipsis in keys: + # idx = keys.index(Ellipsis) + # slices = (slice(None),) * (self.dim() - len(keys) + 1) + # keys = keys[:idx] + slices + keys[idx + 1 :] + # else: + # slices = (slice(None),) * (self.dim() - len(keys)) + # keys += slices + # sparse_idx = [] + # dense_idx = [] + # new_dims = [] + # for s, ax in zip(keys, self._axes): + # if isinstance(ax, SparseAxis): + # sparse_idx.append(ax._ireduce(s)) + # new_dims.append(ax) + # else: + # islice = ax._ireduce(s) + # dense_idx.append(islice) + # new_dims.append(ax.reduced(islice)) + # dense_idx = tuple(dense_idx) + + # def dense_op(array): + # as_numpy = array.get() + # blocked = np.block(_assemble_blocks(as_numpy, dense_idx)) + # return cupy.asarray(blocked) + + # out = Hist(*new_dims, label=self._label) + # if self._sumw2 is not None: + # out._init_sumw2() + # for sparse_key in self._sumw: + # if not all(k in idx for k, idx in zip(sparse_key, sparse_idx)): + # continue + # if sparse_key in out._sumw: + # out._sumw[sparse_key] += dense_op(self._sumw[sparse_key]) + # if self._sumw2 is not None: + # out._sumw2[sparse_key] += dense_op(self._sumw2[sparse_key]) + # else: + # out._sumw[sparse_key] = dense_op(self._sumw[sparse_key]).copy() + # if self._sumw2 is not None: + # out._sumw2[sparse_key] = dense_op(self._sumw2[sparse_key]).copy() + # return out + + def fill(self, *args, weight=None): """Fill sum of weights from columns Parameters @@ -427,26 +227,23 @@ def fill(self, **values): >>> h.fill(species='ducks', x=np.random.normal(size=10), y=np.random.normal(size=10), weight=np.ones(size=10) * 3) """ - weight = values.pop("weight", None) - if isinstance(weight, (awkward.Array, cupy.ndarray, np.ndarray)): - weight = cupy.array(weight) - if isinstance(weight, numbers.Number): - weight = cupy.atleast_1d(weight) - if not all(d.name in values for d in self._axes): - missing = ", ".join(d.name for d in self._axes if d.name not in values) - raise ValueError( - f"Not all axes specified for {self!r}. Missing: {missing}" - ) - if not all(name in self._axes for name in values): - extra = ", ".join(name for name in values if name not in self._axes) - raise ValueError( - f"Unrecognized axes specified for {self!r}. Extraneous: {extra}" - ) + if not all(isinstance(a, (cupy.ndarray, str)) for a in args): + raise TypeError("pass CuPy arrays") + if weight is not None and not isinstance(weight, cupy.ndarray): + raise TypeError("pass CuPy arrays") + + if len(self._axes) != len(args): + raise ValueError("mismatching dimensions for provided values and axes") if weight is not None and self._sumw2 is None: self._init_sumw2() - sparse_key = tuple(d.index(values[d.name]) for d in self.sparse_axes()) + sparse_key = tuple( + d.index(value) + for d, value in zip(self._axes, args) + if isinstance(d, SparseAxis) + ) + if sparse_key not in self._sumw: self._sumw[sparse_key] = cupy.zeros( shape=self._dense_shape, dtype=self._dtype @@ -458,8 +255,8 @@ def fill(self, **values): if self.dense_dim() > 0: dense_indices = tuple( - cupy.asarray(d.index(values[d.name])) - for d in self._axes + cupy.asarray(d.index(value)) + for d, value in zip(self._axes, args) if isinstance(d, DenseAxis) ) xy = cupy.atleast_1d( @@ -492,322 +289,33 @@ def fill(self, **values): if self._sumw2 is not None: self._sumw2[sparse_key] += 1.0 - def sum(self, *axes, **kwargs): - """Integrates out a set of axes, producing a new histogram - - Parameters - ---------- - ``*axes`` - Positional list of axes to integrate out (either a string or an Axis object) - - overflow : {'none', 'under', 'over', 'all', 'allnan'}, optional - How to treat the overflow bins in the sum. Only applies to dense axes. - 'all' includes both under- and over-flow but not nan-flow bins. - Default is 'none'. - """ - overflow = kwargs.pop("overflow", "none") - axes = [self.axis(ax) for ax in axes] - reduced_dims = [ax for ax in self._axes if ax not in axes] - out = Hist(self._label, *reduced_dims, dtype=self._dtype) - if self._sumw2 is not None: - out._init_sumw2() - - sparse_drop = [] - dense_slice = [slice(None)] * self.dense_dim() - dense_sum_dim = [] - for axis in axes: - if isinstance(axis, DenseAxis): - idense = self._idense(axis) - dense_sum_dim.append(idense) - dense_slice[idense] = _overflow_behavior(overflow) - elif isinstance(axis, SparseAxis): - isparse = self._isparse(axis) - sparse_drop.append(isparse) - dense_slice = tuple(dense_slice) - dense_sum_dim = tuple(dense_sum_dim) - - def dense_op(array): - if len(dense_sum_dim) > 0: - return np.sum(array[dense_slice], axis=dense_sum_dim) - return array - - for key in self._sumw: - new_key = tuple(k for i, k in enumerate(key) if i not in sparse_drop) - if new_key in out._sumw: - out._sumw[new_key] += dense_op(self._sumw[key]) - if self._sumw2 is not None: - out._sumw2[new_key] += dense_op(self._sumw2[key]) - else: - out._sumw[new_key] = dense_op(self._sumw[key]).copy() - if self._sumw2 is not None: - out._sumw2[new_key] = dense_op(self._sumw2[key]).copy() - return out - - def project(self, *axes, **kwargs): - """Project histogram onto a subset of its axes - - Parameters - ---------- - ``*axes`` : str or Axis - Positional list of axes to project on to - overflow : str - Controls behavior of integration over remaining axes. - See `sum` description for meaning of allowed values - Default is to *not include* overflow bins - """ - overflow = kwargs.pop("overflow", "none") - axes = [self.axis(ax) for ax in axes] - toremove = [ax for ax in self.axes() if ax not in axes] - return self.sum(*toremove, overflow=overflow) - - def integrate(self, axis_name, int_range=None, overflow="none"): - """Integrates current histogram along one dimension - - Parameters - ---------- - axis_name : str or Axis - Which dimension to reduce on - int_range : slice - Any slice, list, string, or other object that the axis will understand - Default is to integrate over the whole range - overflow : str - See `sum` description for meaning of allowed values - Default is to *not include* overflow bins - - """ - if int_range is None: - int_range = slice(None) - axis = self.axis(axis_name) - full_slice = tuple( - slice(None) if ax != axis else int_range for ax in self._axes - ) - if isinstance(int_range, Interval): - # Handle overflow intervals nicely - if int_range.nan(): - overflow = "justnan" - elif int_range.lo == -np.inf: - overflow = "under" - elif int_range.hi == np.inf: - overflow = "over" - return self[full_slice].sum( - axis.name, overflow=overflow - ) # slice may make new axis, use name - - def remove(self, bins, axis): - """Remove bins from a sparse axis - - Parameters - ---------- - bins : iterable - A list of bin identifiers to remove - axis : str or Axis - Axis name or SparseAxis instance - - Returns a *copy* of the histogram with specified bins removed, not an in-place operation - """ - axis = self.axis(axis) - if not isinstance(axis, SparseAxis): - raise NotImplementedError( - "Hist.remove() only supports removing items from a sparse axis." - ) - bins = [axis.index(binid) for binid in bins] - keep = [binid.name for binid in self.identifiers(axis) if binid not in bins] - full_slice = tuple(slice(None) if ax != axis else keep for ax in self._axes) - return self[full_slice] - - def group(self, old_axes, new_axis, mapping, overflow="none"): - """Group a set of slices on old axes into a single new axis - - Parameters - ---------- - old_axes - Axis or tuple of axes which are being grouped - new_axis - A new sparse dimension definition, e.g. a `Cat` instance - mapping : dict - A mapping ``{'new_bin': (slice, ...), ...}`` where each - slice is on the axes being re-binned. In the case of - a single axis for ``old_axes``, ``{'new_bin': slice, ...}`` - is admissible. - overflow : str - See `sum` description for meaning of allowed values - Default is to *not include* overflow bins - - Returns a new histogram object - """ - if not isinstance(new_axis, SparseAxis): - raise TypeError( - "New axis must be a sparse axis. Note: Hist.group() signature has changed to group(old_axes, new_axis, ...)!" - ) - if new_axis in self.axes() and self.axis(new_axis) is new_axis: - raise RuntimeError( - "new_axis is already in the list of axes. Note: Hist.group() signature has changed to group(old_axes, new_axis, ...)!" - ) - if not isinstance(old_axes, tuple): - old_axes = (old_axes,) - old_axes = [self.axis(ax) for ax in old_axes] - old_indices = [i for i, ax in enumerate(self._axes) if ax in old_axes] - new_dims = [new_axis] + [ax for ax in self._axes if ax not in old_axes] - out = Hist(self._label, *new_dims, dtype=self._dtype) - if self._sumw2 is not None: - out._init_sumw2() - for new_cat in mapping: - the_slice = mapping[new_cat] - if not isinstance(the_slice, tuple): - the_slice = (the_slice,) - if len(the_slice) != len(old_axes): - raise Exception("Slicing does not match number of axes being rebinned") - full_slice = [slice(None)] * self.dim() - for idx, s in zip(old_indices, the_slice): - full_slice[idx] = s - full_slice = tuple(full_slice) - reduced_hist = self[full_slice].sum( - *tuple(ax.name for ax in old_axes), overflow=overflow - ) # slice may change old axis binning - new_idx = new_axis.index(new_cat) - for key in reduced_hist._sumw: - new_key = (new_idx, *key) - out._sumw[new_key] = reduced_hist._sumw[key] - if self._sumw2 is not None: - out._sumw2[new_key] = reduced_hist._sumw2[key] - return out - - def rebin(self, old_axis, new_axis): - """Rebin a dense axis - - This function will construct the mapping from old to new axis, and - constructs a new histogram, rebinning the sum of weights along that dimension. - - Note - ---- - No interpolation is performed, so the user must be sure the old - and new axes have compatible bin boundaries, e.g. that they evenly - divide each other. - - Parameters - ---------- - old_axis : str or Axis - Axis to rebin - new_axis : str or Axis or int - A DenseAxis object defining the new axis (e.g. a `Bin` instance). - If a number N is supplied, the old axis edges are downsampled by N, - resulting in a histogram with ``old_nbins // N`` bins. - - Returns a new `Hist` object. - """ - old_axis = self.axis(old_axis) - if isinstance(new_axis, numbers.Integral): - new_axis = Bin(old_axis.name, old_axis.label, old_axis.edges()[::new_axis]) - new_dims = [ax if ax != old_axis else new_axis for ax in self._axes] - out = Hist(self._label, *new_dims, dtype=self._dtype) - if self._sumw2 is not None: - out._init_sumw2() - - # would have been nice to use ufunc.reduceat, but we should support arbitrary reshuffling - idense = self._idense(old_axis) - - def view_ax(idx): - fullindex = [slice(None)] * self.dense_dim() - fullindex[idense] = idx - return tuple(fullindex) - - binmap = [new_axis.index(i) for i in old_axis.identifiers(overflow="allnan")] - - def dense_op(array): - anew = np.zeros(out._dense_shape, dtype=out._dtype) - for iold, inew in enumerate(binmap): - anew[view_ax(inew)] += array[view_ax(iold)] - return anew - - for key in self._sumw: - out._sumw[key] = dense_op(self._sumw[key]) - if self._sumw2 is not None: - out._sumw2[key] = dense_op(self._sumw2[key]) - return out + def _view_dim(self, arr, flow): + if self.dense_dim() == 0: + return arr + else: + return arr[tuple(_overflow_behavior(flow) for _ in range(self.dense_dim()))] - def values(self, sumw2=False, overflow="none"): + def values(self, flow=False): """Extract the sum of weights arrays from this histogram Parameters ---------- - sumw2 : bool - If True, frequencies is a tuple of arrays (sum weights, sum squared weights) - overflow + flow : bool See `sum` description for meaning of allowed values - - Returns a mapping ``{(sparse identifier, ...): np.array(...), ...}`` - where each array has dimension `dense_dim` and shape matching - the number of bins per axis, plus 0-3 overflow bins depending - on the ``overflow`` argument. """ - def view_dim(arr): - if self.dense_dim() == 0: - return arr - else: - return arr[ - tuple(_overflow_behavior(overflow) for _ in range(self.dense_dim())) - ] - - out = {} - for sparse_key in self._sumw: - id_key = tuple(ax[k] for ax, k in zip(self.sparse_axes(), sparse_key)) - if sumw2: - if self._sumw2 is not None: - w2 = view_dim(self._sumw2[sparse_key]) - else: - w2 = view_dim(self._sumw[sparse_key]) - out[id_key] = (view_dim(self._sumw[sparse_key]), w2) - else: - out[id_key] = view_dim(self._sumw[sparse_key]) - return out - - def scale(self, factor, axis=None): - """Scale histogram in-place by factor - - Parameters - ---------- - factor : float or dict - A number or mapping of identifier to number - axis : optional - Which (sparse) axis the dict applies to, may be a tuples of axes. - The dict keys must follow the same structure. - - Examples - -------- - This function is useful to quickly reweight according to some - weight mapping along a sparse axis, such as the ``species`` axis - in the `Hist` example: + return ( + self._view_dim(cupy.zeros(shape=self._dense_shape), flow) + if self._sumw == {} + else self._view_dim(next(iter(self._sumw.values())), flow) + ) - >>> h.scale({'ducks': 0.3, 'geese': 1.2}, axis='species') - >>> h.scale({('ducks',): 0.5}, axis=('species',)) - >>> h.scale({('geese', 'honk'): 5.0}, axis=('species', 'vocalization')) - """ - if self._sumw2 is None: - self._init_sumw2() - if isinstance(factor, numbers.Number) and axis is None: - for key in self._sumw: - self._sumw[key] *= factor - self._sumw2[key] *= factor**2 - elif isinstance(factor, dict): - if not isinstance(axis, tuple): - axis = (axis,) - factor = {(k,): v for k, v in factor.items()} - axis = tuple(map(self.axis, axis)) - isparse = list(map(self._isparse, axis)) - factor = { - tuple(a.index(e) for a, e in zip(axis, k)): v for k, v in factor.items() - } - for key in self._sumw: - factor_key = tuple(key[i] for i in isparse) - if factor_key in factor: - self._sumw[key] *= factor[factor_key] - self._sumw2[key] *= factor[factor_key] ** 2 - elif isinstance(factor, np.ndarray): - axis = self.axis(axis) - raise NotImplementedError("Scale dense dimension by a factor") - else: - raise TypeError("Could not interpret scale factor") + def variance(self, flow=False): + return ( + None + if self._sumw2 is None + else self._view_dim(next(iter(self._sumw2.values())), flow) + ) def identifiers(self, axis, overflow="none"): """Return a list of identifiers for an axis @@ -842,7 +350,7 @@ def to_boost(self): newaxes = [] for axis in self.axes(): - if isinstance(axis, Bin) and axis._uniform: + if isinstance(axis, Regular): newaxis = boost_histogram.axis.Regular( axis._bins, axis._lo, @@ -853,7 +361,7 @@ def to_boost(self): newaxis.name = axis.name newaxis.label = axis.label newaxes.append(newaxis) - elif isinstance(axis, Bin) and not axis._uniform: + elif isinstance(axis, Variable): newaxis = boost_histogram.axis.Variable( axis.edges(), underflow=True, @@ -889,19 +397,32 @@ def expandkey(key): else: yield slice(None) + view = out.view(flow=True) + nonan = [slice(None, -1, None)] * (len(newaxes)) if self._sumw2 is None: - values = self.values(overflow="all") - for sparse_key, sumw in values.items(): - index = tuple(expandkey(sparse_key)) - view = out.view(flow=True) - view[index] = sumw.get() + view[:] = self.values(flow=True)[(*nonan,)].get() else: - values = self.values(sumw2=True, overflow="all") - for sparse_key, (sumw, sumw2) in values.items(): - index = tuple(expandkey(sparse_key)) - view = out.view(flow=True) - view[index].value = sumw.get() - view[index].variance = sumw2.get() + view[:] = cupy.stack( + ( + self.values(flow=True)[(*nonan,)], + self.variance(flow=True)[(*nonan,)], + ), + axis=len(newaxes), + ).get() + + # if self._sumw2 is None: + # values = self.values(overflow="all") + # for sparse_key, sumw in values.items(): + # index = tuple(expandkey(sparse_key)) + # view = out.view(flow=True) + # view[index] = sumw.get() + # else: + # values = self.values(sumw2=True, overflow="all") + # for sparse_key, (sumw, sumw2) in values.items(): + # index = tuple(expandkey(sparse_key)) + # view = out.view(flow=True) + # view[index].value = sumw.get() + # view[index].variance = sumw2.get() return out