Skip to content

Commit

Permalink
refactor!: match API with Scikit-HEP
Browse files Browse the repository at this point in the history
  • Loading branch information
Saransh-cpp committed Aug 15, 2024
1 parent fabeebd commit 450a210
Show file tree
Hide file tree
Showing 4 changed files with 342 additions and 682 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ repos:
rev: "1.18.0"
hooks:
- id: blacken-docs
args: ["-E"]
additional_dependencies: [black==24.*]

- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
82 changes: 82 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,88 @@

The package is under active development at the moment.

## Example

```py
In [1]: import cuda_histogram; import cupy as cp

In [2]: a = cuda_histogram.axis.Regular(10, 0, 1)

In [3]: b = cuda_histogram.axis.Variable([0, 2, 3, 6])

In [4]: c = cuda_histogram.Hist(a, b)

In [5]: a, b, c
Out[5]:
(Regular(10, 0, 1),
Variable([0. 2. 3. 6.]),
Hist(Regular(10, 0, 1), Variable([0. 2. 3. 6.])))

In [6]: c.fill(cp.random.normal(size=1_000_000), cp.random.normal(size=1_000_000))

In [7]: c.values(), type(c.values())
Out[7]:
(array([[28493., 1282., 96.],
[29645., 1366., 91.],
[30465., 1397., 80.],
[31537., 1473., 81.],
[32608., 1454., 102.],
[33015., 1440., 83.],
[33992., 1482., 87.],
[34388., 1482., 111.],
[34551., 1517., 90.],
[35177., 1515., 85.]]),
cupy.ndarray)

In [8]: c[0, 0], type(c[0, 0])
Out[8]: (array(28493.), cupy.ndarray)

In [9]: c[0:2, 0], type(c[0:2, 0]) # should ideally return a reduced histogram
Out[9]: (array([28493., 29645.]), cupy.ndarray)

In [10]: c.to_boost()
Out[10]:
Histogram(
Regular(10, 0, 1),
Variable([0, 2, 3, 6]),
storage=Double()) # Sum: 339185.0 (945991.0 with flow)

In [11]: c.to_boost().values(), type(c.to_boost().values())
Out[11]:
(array([[28493., 1282., 96.],
[29645., 1366., 91.],
[30465., 1397., 80.],
[31537., 1473., 81.],
[32608., 1454., 102.],
[33015., 1440., 83.],
[33992., 1482., 87.],
[34388., 1482., 111.],
[34551., 1517., 90.],
[35177., 1515., 85.]]),
numpy.ndarray)

In [12]: c.to_hist()
Out[12]:
Hist(
Regular(10, 0, 1, label='Axis 0'),
Variable([0, 2, 3, 6], label='Axis 1'),
storage=Double()) # Sum: 339185.0 (945991.0 with flow)

In [13]: c.to_hist().values(), type(c.to_hist().values())
Out[13]:
(array([[28493., 1282., 96.],
[29645., 1366., 91.],
[30465., 1397., 80.],
[31537., 1473., 81.],
[32608., 1454., 102.],
[33015., 1440., 83.],
[33992., 1482., 87.],
[34388., 1482., 111.],
[34551., 1517., 90.],
[35177., 1515., 85.]]),
numpy.ndarray)
```

<!-- prettier-ignore-start -->
[actions-badge]: https://github.com/Saransh-cpp/cuda-histogram/workflows/CI/badge.svg
[actions-link]: https://github.com/Saransh-cpp/cuda-histogram/actions
Expand Down
160 changes: 108 additions & 52 deletions src/cuda_histogram/axis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import numbers
import re
import warnings
from typing import Iterable

import awkward
import cupy
import numpy as np

__all__: list[str] = [
"Bin",
"Regular",
"Variable",
"Cat",
Expand All @@ -32,21 +32,11 @@
)


def _overflow_behavior(overflow):
if overflow == "none":
def _overflow_behavior(overflow: bool):
if not overflow:
return slice(1, -2)
elif overflow == "under":
return slice(None, -2)
elif overflow == "over":
return slice(1, -1)
elif overflow == "all":
return slice(None, -1)
elif overflow == "allnan":
return slice(None)
elif overflow == "justnan":
return slice(-1, None)
else:
raise ValueError(f"Unrecognized overflow behavior: {overflow}")
return slice(None, None)


@functools.total_ordering
Expand Down Expand Up @@ -397,9 +387,6 @@ class DenseAxis(Axis):
**_ireduce(slice)** - return a slice or list of indices, input slice to be interpreted as values
**reduced(islice)** - return a new axis with binning corresponding to the index slice (from _ireduce)
TODO: hasoverflow(), not all dense axes might have an overflow concept,
currently it is implicitly assumed they do (as the only dense type is a numeric axis)
"""


Expand All @@ -408,25 +395,24 @@ class Bin(DenseAxis):
Parameters
----------
name : str
is used as a keyword in histogram filling, immutable
label : str
describes the meaning of the axis, can be changed
n_or_arr : int or list or np.ndarray
Integer number of bins, if uniform binning. Otherwise, a list or
numpy 1D array of bin boundaries.
lo : float, optional
lower boundary of bin range, if uniform binning
hi : float, optional
upper boundary of bin range, if uniform binning
name : str
is used as a keyword in histogram filling, immutable
label : str
describes the meaning of the axis, can be changed
This axis will generate frequencies for n+3 bins, special bin indices:
``0 = underflow, n+1 = overflow, n+2 = nanflow``
Bin boundaries are [lo, hi)
"""

def __init__(self, name, label, n_or_arr, lo=None, hi=None):
super().__init__(name, label)
def __init__(self, n_or_arr, lo=None, hi=None, *, name="", label=""):
self._lazy_intervals = None
if isinstance(n_or_arr, (list, np.ndarray, cupy.ndarray)):
self._uniform = False
Expand All @@ -440,10 +426,6 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None):
self._interval_bins = cupy.r_[-cupy.inf, self._bins, cupy.nan]
self._bin_names = np.full(self._interval_bins[:-1].size, None)
elif isinstance(n_or_arr, numbers.Integral):
if lo is None or hi is None:
raise TypeError(
"Interpreting n_or_arr as uniform binning, please specify lo and hi values"
)
self._uniform = True
self._lo = lo
self._hi = hi
Expand All @@ -455,10 +437,16 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None):
cupy.nan,
]
self._bin_names = np.full(self._interval_bins[:-1].size, None)
else:
raise TypeError(
f"Cannot understand n_or_arr (nbins or binning array) type {n_or_arr!r}"
)
self._label = label
self._name = name

def __repr__(self):
class_name = self.__class__.__name__
return (
f"{class_name}({self._bins[:-1]})"
if not self._uniform
else f"{class_name}{self._bins, self._lo, self._hi}"
)

@property
def _intervals(self):
Expand All @@ -480,11 +468,6 @@ def __getstate__(self):
return self.__dict__

def __setstate__(self, d):
if "_intervals" in d: # convert old hists to new serialization format
_old_intervals = d.pop("_intervals")
interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi]
d["_interval_bins"] = cupy.array(interval_bins)
d["_bin_names"] = np.array([interval._label for interval in _old_intervals])
if "_interval_bins" in d and "_bin_names" not in d:
d["_bin_names"] = np.full(d["_interval_bins"][:-1].size, None)
self.__dict__ = d
Expand All @@ -501,7 +484,9 @@ def index(self, identifier):
Returns an integer corresponding to the index in the axis where the histogram would be filled.
The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow``
"""
isarray = isinstance(identifier, (awkward.Array, cupy.ndarray, np.ndarray))
isarray = isinstance(
identifier, (awkward.Array, cupy.ndarray, np.ndarray, list)
)
if isarray or isinstance(identifier, numbers.Number):
identifier = awkward.to_cupy(identifier) # cupy.asarray(identifier)
if self._uniform:
Expand Down Expand Up @@ -638,10 +623,8 @@ def _ireduce(self, the_slice):

def reduced(self, islice):
"""Return a new axis with reduced binning
The new binning corresponds to the slice made on this axis.
Overflow will be taken care of by ``Hist.__getitem__``
Parameters
----------
islice : slice
Expand Down Expand Up @@ -669,22 +652,23 @@ def reduced(self, islice):
# TODO: remove this once satisfied it works
rbins = (hi - lo) * self._bins / (self._hi - self._lo)
assert abs(bins - rbins) < 1e-14, "%d %f %r" % (bins, rbins, self)
return Bin(self._name, self._label, bins, lo, hi)
return Regular(bins, lo, hi, name=self._name, label=self._label)
else:
lo = None if islice.start is None else islice.start - 1
hi = -1 if islice.stop is None else islice.stop
bins = self._bins[slice(lo, hi)]
return Bin(self._name, self._label, bins)
return Variable(bins, name=self._name, label=self._label)

@property
def size(self):
"""Number of bins, including overflow (i.e. ``n + 3``)"""
if self._uniform:
return self._bins + 3
# (inf added at constructor)
return len(self._bins) + 1
"""Number of bins"""
return (
self._bins
if isinstance(self._bins, (int, np.integer, cupy.integer))
else len(self._bins)
)

def edges(self, overflow="none"):
def edges(self, flow=False):
"""Bin boundaries
Parameters
Expand All @@ -700,9 +684,9 @@ def edges(self, overflow="none"):
out = cupy.r_[
2 * out[0] - out[1], out, 2 * out[-1] - out[-2], 3 * out[-1] - 2 * out[-2]
]
return out[_overflow_behavior(overflow)]
return out[_overflow_behavior(flow)]

def centers(self, overflow="none"):
def centers(self, flow=False):
"""Bin centers
Parameters
Expand All @@ -711,9 +695,81 @@ def centers(self, overflow="none"):
Create overflow and/or underflow bins by adding a bin of same width to each end.
See `Hist.sum` description for the allowed values.
"""
edges = self.edges(overflow)
edges = self.edges(flow)
return (edges[:-1] + edges[1:]) / 2

def identifiers(self, overflow="none"):
def identifiers(self, flow=False):
"""List of `Interval` identifiers"""
return self._intervals[_overflow_behavior(overflow)]
return self._intervals[_overflow_behavior(flow)]


class Regular(Bin):
    """A uniformly binned axis with an optional name and label.

    Parameters
    ----------
    bins : int
        Number of bins (uniform binning).
    start : float
        Lower boundary of the bin range.
    stop : float
        Upper boundary of the bin range.
    name : str, optional (keyword-only)
        Is used as a keyword in histogram filling, immutable.
    label : str, optional (keyword-only)
        Describes the meaning of the axis, can be changed.

    This axis will generate frequencies for n+3 bins, special bin indices:
    ``0 = underflow, n+1 = overflow, n+2 = nanflow``
    Bin boundaries are [start, stop)
    """

    def __init__(
        self,
        bins: int,
        start: float,
        stop: float,
        *,
        name: str = "",
        label: str = "",
    ) -> None:
        # Bin.__init__ receives these positionally as (n_or_arr, lo, hi).
        super().__init__(
            bins,
            start,
            stop,
            name=name,
            label=label,
        )


class Variable(Bin):
    """A non-uniformly binned axis with an optional name and label.

    Parameters
    ----------
    edges : iterable of float
        Bin boundaries.
        NOTE(review): ``Bin.__init__`` only recognises ``list``,
        ``numpy.ndarray`` and ``cupy.ndarray`` as edge containers; confirm
        how other iterables (e.g. tuples) are meant to be handled.
    name : str, optional (keyword-only)
        Is used as a keyword in histogram filling, immutable.
    label : str, optional (keyword-only)
        Describes the meaning of the axis, can be changed.

    This axis will generate frequencies for n+3 bins, special bin indices:
    ``0 = underflow, n+1 = overflow, n+2 = nanflow``
    """

    def __init__(
        self,
        edges: Iterable[float],
        *,
        name: str = "",
        label: str = "",
    ) -> None:
        # Bin.__init__ receives the edges as its first argument (n_or_arr).
        super().__init__(edges, name=name, label=label)
Loading

0 comments on commit 450a210

Please sign in to comment.