Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New histogram collection: VectorizedHistCollection #173

Merged
merged 30 commits into from
Aug 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
4f90f39
Remove pileup sel
Jul 10, 2019
13fc486
fixed pep8
kreczko May 3, 2019
b9bbcba
added vectorized version of all2017.yml
kreczko May 31, 2019
b5732c7
added test for boost histogram
kreczko Jul 9, 2019
90c67ba
added aghast and boost_histogram to requirements
kreczko Jul 9, 2019
5946a24
added draft and first tests for vectorized Histogram collection
kreczko Jul 9, 2019
01e5951
logger.warn (deprecated) -> logger.warning
kreczko Jul 9, 2019
aecf64c
fixed "len" for 1-dim collections
kreczko Jul 9, 2019
c01c49d
added VectorizedHistCollection.add
kreczko Jul 10, 2019
15bdaf3
added Bin and Hist proxy objects for histogram collection
kreczko Jul 17, 2019
f6326ca
implemented vectorized filling of histograms
kreczko Jul 17, 2019
e653da7
added VectorizedHistCollection.inner_fill()
kreczko Jul 17, 2019
8429436
fix pep8 issues
kreczko Jul 17, 2019
5133fa6
added filters to demo config
kreczko Jul 18, 2019
e1e8702
improved VectorizedHistCollection to handle numpy arrays
kreczko Jul 18, 2019
23d2b11
augmented demo analyzer with VectorizedHistCollection
kreczko Jul 18, 2019
8cde7ef
added hash to inner histogram name for VectorizedHistCollection if no…
kreczko Jul 18, 2019
ff54270
VectorizedHistCollection: making sure inner histogram is also filled
kreczko Jul 18, 2019
e643ec9
removed obsolete test
kreczko Jul 18, 2019
114f53c
CI: removed python 2.7 from tests
kreczko Jul 18, 2019
cefda36
fixed boost histogram tests
kreczko Jul 18, 2019
09178f4
added test for split_input and added awkward.JaggedArray check
kreczko Jul 18, 2019
88168fd
removed unused variable in _create_hist_names
kreczko Jul 19, 2019
51d44d3
fixed pep8 error in test_vectorized
kreczko Jul 19, 2019
838701e
removed cmsl1t.PY3 variable
kreczko Aug 2, 2019
841cea0
VectorizedHistCollection: count tuple as a valid iterable
kreczko Aug 2, 2019
da13c48
VectorizedHistCollection.add --> VectorizedHistCollection.insert
kreczko Aug 2, 2019
c1c9a7c
VectorizedHistCollection: replaced defaultdict with super() calls & u…
kreczko Aug 2, 2019
23147d3
added tests for different types of weights for VectorizedHistCollection
kreczko Aug 2, 2019
cb28ed4
extended VectorizedHistProxy to allow event weights and per-object we…
kreczko Aug 2, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ cache:
language: python

python:
- "2.7"
- "3.6"

env:
Expand Down
7 changes: 4 additions & 3 deletions cmsl1t/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from __future__ import absolute_import
import logging
import os
from os import path
import logging


__version__ = '0.5.1'

Expand All @@ -22,8 +23,8 @@
logger.addHandler(ch)

if 'PROJECT_ROOT' not in os.environ:
logger.warn("Could not find environmental variable 'PROJECT_ROOT'")
logger.warn("You should to run 'source setup.sh' first!")
logger.warning("Could not find environmental variable 'PROJECT_ROOT'")
logger.warning("You should to run 'source setup.sh' first!")
HERE = path.dirname(path.abspath(__file__))
PROJECT_ROOT = path.abspath(path.join(HERE, path.pardir))
else:
Expand Down
20 changes: 14 additions & 6 deletions cmsl1t/analyzers/demo_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,38 @@
import numpy as np

from .BaseAnalyzer import BaseAnalyzer
from cmsl1t.collections import EfficiencyCollection
from cmsl1t.collections import EfficiencyCollection, VectorizedHistCollection


class Analyzer(BaseAnalyzer):

def __init__(self, **kwargs):
super(Analyzer, self).__init__(**kwargs)

self.met_calcs = dict(
RecalcL1EmuMETNot28=dict(
self.met_calcs = {
self.name + '_' + 'RecalcL1EmuMETNot28': dict(
title="Emulated MET, |ieta|<28",
attr='l1MetNot28'),
RecalcL1EmuMETNot28HF=dict(
self.name + '_' + 'RecalcL1EmuMETNot28HF': dict(
title="Emulated MET, |ieta|!=28",
attr='l1MetNot28HF'),
)
}

def prepare_for_events(self, reader):
bins = np.arange(0, 200, 25)
thresholds = [70, 90, 110]
puBins = list(range(0, 50, 10)) + [999]

self.hists = VectorizedHistCollection(innerBins=puBins, innerLabel='pu')

self.efficiencies = EfficiencyCollection(pileupBins=puBins)
add_met_variable = partial(
self.efficiencies.add_variable,
bins=bins, thresholds=thresholds)
list(map(add_met_variable, self.met_calcs))

for met, config in self.met_calcs.items():
self.hists.insert(met, bins=bins, title=config['title'])
return True

def reload_histograms(self, input_file):
Expand All @@ -43,16 +48,19 @@ def reload_histograms(self, input_file):
def fill_histograms(self, entry, event):
pileup = event['Vertex_nVtx']
self.efficiencies.set_pileup(pileup)
self.hists.inner_fill(pileup)

offlineMetBE = event.Sums_caloMetBE
for name, config in self.met_calcs.items():
onlineMet = event[config['attr']]
onlineMet = onlineMet.mag
self.efficiencies.fill_array(name, offlineMetBE, onlineMet)
self.hists[pileup][name].fill(offlineMetBE)
return True

def write_histograms(self):
self.efficiencies.to_root(self.get_histogram_filename())
self.efficiencies.to_root(self.get_histogram_filename().replace('.root', '_efficiencies.root'))
self.hists.to_root(self.get_histogram_filename())
return True

def make_plots(self):
Expand Down
6 changes: 3 additions & 3 deletions cmsl1t/analyzers/jetMet_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def __init__(self, **kwargs):

lumiMuDict = dict()
run_lumi_csv = os.path.join(cmsl1t.PROJECT_ROOT, 'run_lumi.csv')
with open(run_lumi_csv) as runLumiFile:
with open(run_lumi_csv, 'rb') as runLumiFile:
reader = csv.reader(runLumiFile, delimiter=',')
for line in reader:
lumiMuDict[(int(line[1]), int(line[2]))] = float(line[3])
Expand Down Expand Up @@ -361,8 +361,8 @@ def fill_histograms(self, entry, event):
# pileup = self._lumiMu[(event['run'], event['lumi'])]
pileup = 51
# print pileup
if pileup >= 60 or pileup < 50:
return True
# if pileup >= 60 or pileup < 50:
# return True
kreczko marked this conversation as resolved.
Show resolved Hide resolved

for name in self._sumTypes:
if 'pfMET' in name and not pfMetFilter(event):
Expand Down
2 changes: 2 additions & 0 deletions cmsl1t/collections/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from .by_pileup import HistogramsByPileUpCollection
from .resolution import ResolutionCollection
from .efficiency import EfficiencyCollection
from .vectorized import VectorizedHistCollection

__all__ = [
'BaseHistCollection',
'HistogramsByPileUpCollection',
'ResolutionCollection',
'EfficiencyCollection',
'VectorizedHistCollection',
]
12 changes: 6 additions & 6 deletions cmsl1t/collections/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
logger = logging.getLogger(__name__)


def create_n_dim_dict(dimensions, initiaValue=0):
def create_n_dim_dict(dimensions, initialValue=0):
if dimensions < 1:
return initiaValue
factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initiaValue=initiaValue)
return initialValue
factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initialValue=initialValue)
return defaultdict(factory)


Expand All @@ -40,20 +40,20 @@ def create_n_dim_dict(dimensions, initiaValue=0):

def len_n_dim_dict(dictionary, dimensions):
if dimensions <= 1:
return len(dictionary)
return len(dictionary.keys())
kreczko marked this conversation as resolved.
Show resolved Hide resolved
return sum(len_n_dim_dict(v, dimensions - 1)
for v in six.itervalues(dictionary))


class BaseHistCollection(defaultdict):

def __init__(self, dimensions, initiaValue=0):
def __init__(self, dimensions, initialValue=0):
'''
For each dimension create a dictionary
'''
# TODO: add possibility for different lambda expresions for each
# dimension. This will allow to have custom dicts in certain dimensions
factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initiaValue=initiaValue)
factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initialValue=initialValue)
if sys.version_info[0] < 3:
defaultdict.__init__(self, factory)
else:
Expand Down
2 changes: 1 addition & 1 deletion cmsl1t/collections/by_pileup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def add(self, hist_name, bins=[]):
'No bins specified for histogram {0}'.format(hist_name))

if hist_name in self[self._pileupBins[0]].keys():
logger.warn('Histogram {0} already exists!'.format(hist_name))
logger.warning('Histogram {0} already exists!'.format(hist_name))
return
hist_names = []
add_name = hist_names.append
Expand Down
6 changes: 3 additions & 3 deletions cmsl1t/collections/efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def add_variable(self, variable, bins, thresholds):
"""
# TODO: this will no longer work since 1st dimension is pileup
if variable in self.keys():
logger.warn('Variable {0} already exists!')
logger.warning('Variable {0} already exists!')
return
self._thresholds[variable] = thresholds
hist_names = []
Expand All @@ -123,7 +123,7 @@ def fill(self, hist_name, recoValue, l1Value, w=1.0):
logger.error('Histogram {0} does not exist'.format(hist_name))
return
if hist_name not in self._thresholds:
logger.warn('No valid current thresholds.')
logger.warning('No valid current thresholds.')
for threshold in self._thresholds[hist_name]:
h[threshold].fill(recoValue, l1Value, w)

Expand All @@ -136,7 +136,7 @@ def fill_array(self, hist_name, recoValue, l1Value, w=None):
logger.error('Histogram {0} does not exist'.format(hist_name))
return
if hist_name not in self._thresholds:
logger.warn('No valid current thresholds.')
logger.warning('No valid current thresholds.')
for threshold in self._thresholds[hist_name]:
h[threshold].fill_array(recoValue, l1Value, w)

Expand Down
4 changes: 2 additions & 2 deletions cmsl1t/collections/resolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ def fill(self, hist_name, x, w=1.0):
logger.error('Histogram {0} does not exist'.format(hist_name))
return
if not self._currentRegions:
logger.warn(
logger.warning(
'No valid current regions. Did you set_region_by_eta()?')
for region in self._currentRegions:
h[region].fill(x, w)

def add_variable(self, variable, bins=[]):
from rootpy.plotting import Hist
if variable in self.keys():
logger.warn('Variable {0} already exists!')
logger.warning('Variable {0} already exists!')
return
hist_names = []
add_name = hist_names.append
Expand Down
171 changes: 171 additions & 0 deletions cmsl1t/collections/vectorized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import awkward
from collections import defaultdict
import logging
import numpy as np
import random
from rootpy.plotting import Hist

from . import BaseHistCollection
from ..utils.iterators import pairwise
from ..io import to_root

logger = logging.getLogger(__name__)


def extend(arr1, starts, stops):
    """Repeat every entry of ``arr1`` along the first axis.

    Entry ``i`` is repeated ``stops[i] - starts[i]`` times, i.e. once per
    element delimited by the ``starts``/``stops`` offsets.
    """
    return np.repeat(arr1, stops - starts, axis=0)


def split_input(inner_indices, x, w):
    """Group values and weights by inner index.

    If ``x`` exposes ``starts``/``stops``/``content`` its flat ``content`` is
    used and ``inner_indices`` is repeated to match it. Weights exposing
    ``starts`` are flattened the same way; weights shorter than the content
    are repeated per entry of ``x`` when ``x`` has ``starts``.

    Yields one ``(index, values, weights)`` tuple per unique inner index.
    """
    x_has_offsets = hasattr(x, 'starts')
    if x_has_offsets:
        inner_indices = extend(inner_indices, x.starts, x.stops)
        content = x.content
    else:
        content = x

    weights = w.content if hasattr(w, 'starts') else w

    if np.size(weights) < np.size(content) and x_has_offsets:
        weights = extend(weights, x.starts, x.stops)

    array_like = (tuple, list, np.ndarray, np.generic)
    for index in np.unique(inner_indices):
        selected = inner_indices == index
        if not isinstance(selected, array_like):
            # comparison collapsed to a plain Python value: wrap it for indexing
            selected = np.array(selected)
        yield index, content[selected], weights[selected]


class VectorizedHistCollection(BaseHistCollection):
    """Two-level histogram collection: inner bin index -> histogram name -> histogram.

    The first level is keyed by the 1-based inner bin index (as produced by
    ``np.digitize`` on ``innerBins``); the second level is keyed by the full
    histogram name built by ``get_hist_name``. Item access with inner values
    returns a ``VectorizedBinProxy`` rather than the raw slice.
    """

    def __init__(self, innerBins, innerLabel='inner', **kwargs):
        """Set up the collection and the histogram of the inner variable.

        :param innerBins: bin edges of the inner variable
        :param innerLabel: label used in histogram names
        :param kwargs: optional ``dimensions`` (default 2), ``name`` (default:
            8 random hex characters) and ``execute_before_write`` (callables
            invoked with the collection before it is written)
        """
        # if we want to generalize to N dim, innerBins needs to be an array of innerBins
        # TODO: last dimension should probably be a normal dictionary
        dimensions = kwargs.pop('dimensions', 2)
        self._name = kwargs.pop('name', str(hex(random.getrandbits(128)))[2:10])
        self._execute_before_write = kwargs.pop('execute_before_write', [])
        super(VectorizedHistCollection, self).__init__(dimensions)

        self._innerBins = innerBins
        self._innerLabel = innerLabel
        self._innerHist = Hist(innerBins, name=innerLabel + '_' + self._name)

    def __getitem__(self, key):
        """Return a ``VectorizedBinProxy`` for the inner bins matching ``key``."""
        if not isinstance(key, (tuple, list, np.ndarray, np.generic)):
            key = np.array(key)
        real_keys = self._get_inner_indices(key)
        return VectorizedBinProxy(self, real_keys)

    def _get_inner_indices(self, values):
        '''
        Returns the pileup bin corresponding to the provided pileup value.
         - bin 0 is underflow
         - bin len(innerBins) is overflow

        :Example:
            >>> hists = VectorizedHistCollection(innerBins=[0,10,15,20,30,999])
            >>> hists._get_inner_indices([1, 11, 1111]) # returns [1, 2, 6]
        '''
        return np.digitize(values, self._innerBins)

    def insert(self, name, bins, hist_type=Hist, **kwargs):
        """Create one histogram called ``name`` per inner bin.

        :param name: base name; the inner bin range is appended to it
        :param bins: bin edges of the new histograms
        :param hist_type: callable invoked as
            ``hist_type(bins, name=..., title=...)``
        :param kwargs: optional ``title`` (defaults to ``name``)

        Nothing is created if ``bins`` is empty (logged as error) or if the
        histogram already exists (logged as warning).
        """
        title = kwargs.pop('title', name)
        bins = np.asarray(bins)
        if bins.size == 0:
            logger.error(
                'No bins specified for histogram {0}'.format(name))
            # a histogram without bin edges cannot be built
            return

        # access the raw slices; self[...] would return a proxy instead
        get_slice = super(VectorizedHistCollection, self).__getitem__
        hist_names = list(self._create_hist_names(name))

        # slices are keyed by the full histogram name, not by the bare name
        if hist_names and hist_names[0] in get_slice(1):
            logger.warning('Histogram {0} already exists!'.format(name))
            return

        created = []
        for inner_index, hist_name in enumerate(hist_names, start=1):
            current_slice = get_slice(inner_index)
            if hist_name not in current_slice:
                current_slice[hist_name] = hist_type(bins, name=hist_name, title=title)
                created.append(hist_name)
        logger.debug('Created {0} histograms: {1}'.format(
            len(created), ', '.join(created)))

    def _create_hist_names(self, name):
        """Yield the full histogram name for each pair of adjacent inner bin edges."""
        for lowerEdge, upperEdge in pairwise(self._innerBins):
            yield f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}"

    def get_hist_name(self, name, innerIndex):
        """Return the full histogram name of ``name`` in inner bin ``innerIndex``.

        :raises IndexError: if ``innerIndex`` is the underflow (0) or overflow
            (``len(innerBins)``) bin, for which no histogram exists
        """
        # index 0 would otherwise wrap around to the last edge via negative indexing
        if not 0 < innerIndex < len(self._innerBins):
            raise IndexError(
                'Inner bin index {0} is outside of the valid range [1, {1}]'.format(
                    innerIndex, len(self._innerBins) - 1))
        lowerEdge, upperEdge = self._innerBins[innerIndex - 1], self._innerBins[innerIndex]
        return f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}"

    def inner_fill(self, x, w=None):
        """Fill the histogram of the inner variable with ``x`` (unit weights by default)."""
        if w is None:
            w = np.ones(np.size(x))
        self._innerHist.fill_array(x, w)

    def to_root(self, output_file):
        """Run the ``execute_before_write`` hooks, then write everything to ``output_file``."""
        for func in self._execute_before_write:
            func(self)
        # resolves to the module-level ``to_root`` imported from ..io
        to_root([self, self._innerHist], output_file)


class VectorizedBinProxy(object):
    """Handle on the inner bins of a collection selected by a set of inner indices."""

    # instances are mutable (see ``flatten``) and define ``__eq__``: keep them unhashable
    __hash__ = None

    def __init__(self, collection, inner_indices):
        """Store the owning ``collection`` and the selected ``inner_indices``."""
        self.collection = collection
        self._inner_indices = inner_indices

    def __getitem__(self, key):
        """Return a ``VectorizedHistProxy`` for histogram ``key`` in the selected bins."""
        # TODO, if key != string, return a BinProxy of the bin above
        return VectorizedHistProxy(self, key)

    def _require_same_collection(self, other, msg):
        """Raise ``ValueError(msg)`` unless ``other`` refers to the very same collection."""
        # identity, not equality: the collection is a dict, so ``!=`` would
        # compare contents and treat two distinct but equal collections as one
        if self.collection is not other.collection:
            logger.error(msg)
            raise ValueError(msg)

    def __add__(self, other):
        """Return a new proxy holding the indices of both operands.

        Neither operand is modified.

        :raises ValueError: if the proxies belong to different collections
        """
        if not isinstance(other, VectorizedBinProxy):
            return NotImplemented
        self._require_same_collection(
            other, 'Cannot add VectorizedBinProxy for two different collections')
        combined = np.append(self._inner_indices, other._inner_indices)
        return VectorizedBinProxy(self.collection, combined)

    def __eq__(self, other):
        """Compare the selected inner indices of two proxies of the same collection.

        :raises ValueError: if the proxies belong to different collections
        """
        if not isinstance(other, VectorizedBinProxy):
            return NotImplemented
        self._require_same_collection(
            other, 'Cannot compare VectorizedBinProxy for two different collections')
        return np.asarray(self._inner_indices).tolist() == np.asarray(other._inner_indices).tolist()

    def flatten(self):
        """Reduce the selection in place to its sorted unique indices and return ``self``."""
        self._inner_indices = np.unique(self._inner_indices)
        return self


class VectorizedHistProxy(object):
    """Handle on one named histogram across the inner bins selected by a bin proxy."""

    def __init__(self, bin_proxy, hist_name):
        """Store the ``bin_proxy`` (selected inner bins) and the base ``hist_name``."""
        self._bin_proxy = bin_proxy
        self._hist_name = hist_name

    def _get_hist(self, inner_index):
        """Return the histogram stored for ``inner_index``.

        :raises KeyError: if the histogram has not been created
        """
        collection = self._bin_proxy.collection
        hist_name = collection.get_hist_name(self._hist_name, inner_index)
        # bypass the collection's own __getitem__, which returns a proxy
        inner_slice = defaultdict.__getitem__(collection, inner_index)
        # check first: a lookup on the slice would silently insert its default value
        if hist_name not in inner_slice:
            msg = 'Histogram {0} does not exist'.format(hist_name)
            logger.error(msg)
            raise KeyError(msg)
        return inner_slice[hist_name]

    def fill(self, x, w=None):
        """Fill the histogram of each selected inner bin with its share of ``x``.

        :param x: values to fill; scalar, sequence, numpy array or
            ``awkward.JaggedArray``
        :param w: optional weights; ``None`` means unit weights, a scalar is
            applied to every value

        Values falling into the underflow or overflow inner bin are skipped,
        since no histograms exist for those bins.
        """
        if not isinstance(x, awkward.JaggedArray):
            # lists/tuples do not support boolean-mask indexing; scalars become 1-element arrays
            x = np.atleast_1d(np.asarray(x))
        n_entries = np.size(x.content) if hasattr(x, 'content') else np.size(x)

        if w is None:
            w = np.ones(n_entries)
        elif not isinstance(w, awkward.JaggedArray):
            w = np.asarray(w)
            if w.ndim == 0:
                # one weight for everything
                w = np.full(n_entries, w)

        inner_indices = np.atleast_1d(self._bin_proxy._inner_indices)
        n_edges = len(self._bin_proxy.collection._innerBins)
        for i, x_i, w_i in split_input(inner_indices, x, w):
            if i < 1 or i >= n_edges:
                logger.debug(
                    'Skipping {0} value(s) outside of the inner bins (index {1})'.format(np.size(x_i), i))
                continue
            self._get_hist(i).fill_array(x_i, w_i)

# class VectorizedEfficiencyProxy(object):

# def split_input():
# a = np.array([1, 12, 1, 10, 50, 10])
# b = np.array([10, 20, 30, 40, 50, 60])
# arg = a.argsort(kind='stable')
# offsets, = np.where(np.r_[True, np.diff(a[arg]) > 0])
# output = awkward.JaggedArray.fromoffsets(offsets.flatten(), awkward.IndexedArray(arg, b))
2 changes: 1 addition & 1 deletion cmsl1t/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def reduce_scope_for_analyzer(self, analyzer_name):
forbidden_local_settings = ['name', 'input_files']
for s in forbidden_local_settings:
if s in analyzer:
logger.warn('Setting {0} is forbidden in analysis::analyzers::{1}'.format(s, analyzer_name))
logger.warning('Setting {0} is forbidden in analysis::analyzers::{1}'.format(s, analyzer_name))
analyzer.pop(s)

global_settings = dict(
Expand Down
Loading