From 4f90f39a259264a9e69de5fffc2998a9d9a626fa Mon Sep 17 00:00:00 2001 From: hwtest Date: Wed, 10 Jul 2019 15:19:42 +0200 Subject: [PATCH 01/30] Remove pileup sel --- cmsl1t/analyzers/jetMet_analyzer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cmsl1t/analyzers/jetMet_analyzer.py b/cmsl1t/analyzers/jetMet_analyzer.py index d0f963032eb..eb248b56518 100644 --- a/cmsl1t/analyzers/jetMet_analyzer.py +++ b/cmsl1t/analyzers/jetMet_analyzer.py @@ -357,12 +357,10 @@ def fill_histograms(self, entry, event): if self._doGen: genNVtx = event.Generator_nVtx - # TODO: vectorize - # pileup = self._lumiMu[(event['run'], event['lumi'])] - pileup = 51 + pileup = self._lumiMu[(event['run'], event['lumi'])] # print pileup - if pileup >= 60 or pileup < 50: - return True + # if pileup >= 60 or pileup < 50: + # return True for name in self._sumTypes: if 'pfMET' in name and not pfMetFilter(event): From 13fc486241f6faa93ff3e77fcb1d658fa1c58958 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 3 May 2019 10:02:55 +0100 Subject: [PATCH 02/30] fixed pep8 --- cmsl1t/analyzers/jetMet_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmsl1t/analyzers/jetMet_analyzer.py b/cmsl1t/analyzers/jetMet_analyzer.py index eb248b56518..0b352b0579c 100644 --- a/cmsl1t/analyzers/jetMet_analyzer.py +++ b/cmsl1t/analyzers/jetMet_analyzer.py @@ -119,7 +119,7 @@ def __init__(self, **kwargs): lumiMuDict = dict() run_lumi_csv = os.path.join(cmsl1t.PROJECT_ROOT, 'run_lumi.csv') - with open(run_lumi_csv) as runLumiFile: + with open(run_lumi_csv, 'rb') as runLumiFile: reader = csv.reader(runLumiFile, delimiter=',') for line in reader: lumiMuDict[(int(line[1]), int(line[2]))] = float(line[3]) From b9bbcba8453c91a3eafc4681a0a5bf4c4dbd3e0f Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 31 May 2019 14:58:49 +0100 Subject: [PATCH 03/30] added vectorized version of all2017.yml --- cmsl1t/analyzers/jetMet_analyzer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmsl1t/analyzers/jetMet_analyzer.py b/cmsl1t/analyzers/jetMet_analyzer.py index 0b352b0579c..a04f1ebf580 100644 --- a/cmsl1t/analyzers/jetMet_analyzer.py +++ b/cmsl1t/analyzers/jetMet_analyzer.py @@ -357,7 +357,9 @@ def fill_histograms(self, entry, event): if self._doGen: genNVtx = event.Generator_nVtx - pileup = self._lumiMu[(event['run'], event['lumi'])] + # TODO: vectorize + # pileup = self._lumiMu[(event['run'], event['lumi'])] + pileup = 51 # print pileup # if pileup >= 60 or pileup < 50: # return True From b5732c7385d54e15e8145a40a939567f2d920991 Mon Sep 17 00:00:00 2001 From: kreczko Date: Tue, 9 Jul 2019 10:44:59 +0100 Subject: [PATCH 04/30] added test for boost histogram --- test/collections/test_boost_histogram.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 test/collections/test_boost_histogram.py diff --git a/test/collections/test_boost_histogram.py b/test/collections/test_boost_histogram.py new file mode 100644 index 00000000000..183d06d02ea --- /dev/null +++ b/test/collections/test_boost_histogram.py @@ -0,0 +1,29 @@ +# import aghast +import awkward +import boost.histogram as bh +import numpy as np + + +def test_fill(): + pileup_bins = [0, 10, 15, 20, 30, 999] + jet_pt_bins = [35, 90, 120] + hist = bh.histogram( + bh.axis.variable(pileup_bins), + bh.axis.variable(jet_pt_bins, bh.storage.weight()), + ) + + ets = awkward.fromiter([ + np.random.poisson(30, 5), + np.random.poisson(30, 2), + np.random.poisson(30, 3), + ]) + repeat = ets.stops - ets.starts + + weights = np.ones(len(ets)) + weights = np.repeat(weights, repeat, axis=0) + pileup = np.random.poisson(50, len(ets)) + pileup = np.repeat(pileup, repeat, axis=0) + # expand pileup to size ets + assert len(pileup) == len(ets.content) + # hist.fill(pileup, ets.content, bh.weight(weights)) + hist(pileup, ets.content) From 90c67baf0d9e979dbf6142010e372a47d80735cd Mon Sep 17 00:00:00 2001 From: kreczko Date: Tue, 9 Jul 2019 10:45:23 +0100 Subject: [PATCH 05/30] added aghast and boost_histogram to requirements --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 946323ba768..14aa53a4ea7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +aghast +git+https://github.com/scikit-hep/boost-histogram.git@develop numpy matplotlib pandas==0.23 From 5946a249d3c2cd437515ddeff33517a059ae1294 Mon Sep 17 00:00:00 2001 From: kreczko Date: Tue, 9 Jul 2019 10:49:19 +0100 Subject: [PATCH 06/30] added draft and first tests for vectorized Histogram collection --- cmsl1t/collections/__init__.py | 2 ++ cmsl1t/collections/vectorized.py | 28 ++++++++++++++++++++++++++++ test/collections/test_vectorized.py | 19 +++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 cmsl1t/collections/vectorized.py create mode 100644 test/collections/test_vectorized.py diff --git a/cmsl1t/collections/__init__.py b/cmsl1t/collections/__init__.py index daab1251d3d..30722f35d3e 100644 --- a/cmsl1t/collections/__init__.py +++ b/cmsl1t/collections/__init__.py @@ -5,10 +5,12 @@ from .by_pileup import HistogramsByPileUpCollection from .resolution import ResolutionCollection from .efficiency import EfficiencyCollection +from .vectorized import VectorizedHistCollection __all__ = [ 'BaseHistCollection', 'HistogramsByPileUpCollection', 'ResolutionCollection', 'EfficiencyCollection', + 'VectorizedHistCollection', ] diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py new file mode 100644 index 00000000000..58f9ce82b26 --- /dev/null +++ b/cmsl1t/collections/vectorized.py @@ -0,0 +1,28 @@ +import numbda + +from . import BaseHistCollection + + +@numba.jit(nopython=True) +def extend(arr1, starts, stops): + repeat = stops - starts + return np.repeat(arr1, repeat, axis=0) + + +class VectorizedHistCollection(object): + + def __init__(self, innerBins): + self._innerBins = innerBins + self._innerHist = Hist(100, 0, 100, name='inner') + + def _get_inner_indices(self, values): + ''' + Returns the pileup bin corresponding to the provided pileup value. + - bin 0 is underflow + - bin len(innerBins) is overflow + + :Example: + >>> hists = VectorizedHistCollection(innerBins=[0,10,15,20,30,999]) + >>> hists._get_inner_indices([1, 11, 1111]) # returns [0, 1, 5] + ''' + return np.digitize(values, self._innerBins) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py new file mode 100644 index 00000000000..a0eef2bfcb9 --- /dev/null +++ b/test/collections/test_vectorized.py @@ -0,0 +1,19 @@ +import pytest +import numpy as np +from rootpy.plotting import Hist + +from cmsl1t.collections import VectorizedHistCollection + + +@pytest.mark.parametrize( + "values,expected", + [ + ([1, 12, 1, 50], [1, 2, 1, 5]), + ([1, 11, 1111], [1, 2, 6]), + ([-10, 1111, 20], [0, 6, 4]), + ]) +def test_inner_index(values, expected): + innerBins = np.array([0, 10, 15, 20, 30, 999]) + coll = VectorizedHistCollection(innerBins) + + np.testing.assert_array_equal(coll._get_inner_indices(values), expected) From 01e5951d9716f08e2f256d1dfd6d2d5a41c81b56 Mon Sep 17 00:00:00 2001 From: kreczko Date: Tue, 9 Jul 2019 13:27:53 +0100 Subject: [PATCH 07/30] logger.warn (deprecated) -> logger.warning --- cmsl1t/__init__.py | 4 ++-- cmsl1t/collections/by_pileup.py | 2 +- cmsl1t/collections/efficiency.py | 6 +++--- cmsl1t/collections/resolution.py | 4 ++-- cmsl1t/config.py | 2 +- cmsl1t/playground/eventreader.py | 2 +- cmsl1t/playground/resolution.py | 2 +- cmsl1t/producers/met.py | 2 +- cmsl1t/producers/met_vectorized.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cmsl1t/__init__.py b/cmsl1t/__init__.py index 910dca48cd5..8e8605ed081 100644 --- a/cmsl1t/__init__.py +++ b/cmsl1t/__init__.py @@ -22,8 +22,8 @@ logger.addHandler(ch) if 'PROJECT_ROOT' not in os.environ: - logger.warn("Could not find environmental variable 'PROJECT_ROOT'") - logger.warn("You should to run 'source setup.sh' first!") + logger.warning("Could not find environmental variable 'PROJECT_ROOT'") + logger.warning("You should to run 'source setup.sh' first!") HERE = path.dirname(path.abspath(__file__)) PROJECT_ROOT = path.abspath(path.join(HERE, path.pardir)) else: diff --git a/cmsl1t/collections/by_pileup.py b/cmsl1t/collections/by_pileup.py index 42d9f2c15f7..e5c6011182b 100644 --- a/cmsl1t/collections/by_pileup.py +++ b/cmsl1t/collections/by_pileup.py @@ -39,7 +39,7 @@ def add(self, hist_name, bins=[]): 'No bins specified for histogram {0}'.format(hist_name)) if hist_name in self[self._pileupBins[0]].keys(): - logger.warn('Histogram {0} already exists!'.format(hist_name)) + logger.warning('Histogram {0} already exists!'.format(hist_name)) return hist_names = [] add_name = hist_names.append diff --git a/cmsl1t/collections/efficiency.py b/cmsl1t/collections/efficiency.py index f897c7d1f93..e4d05e9b578 100644 --- a/cmsl1t/collections/efficiency.py +++ b/cmsl1t/collections/efficiency.py @@ -100,7 +100,7 @@ def add_variable(self, variable, bins, thresholds): """ # TODO: this will no longer work since 1st dimension is pileup if variable in self.keys(): - logger.warn('Variable {0} already exists!') + logger.warning('Variable {0} already exists!') return self._thresholds[variable] = thresholds hist_names = [] @@ -123,7 +123,7 @@ def fill(self, hist_name, recoValue, l1Value, w=1.0): logger.error('Histogram {0} does not exist'.format(hist_name)) return if hist_name not in self._thresholds: - logger.warn('No valid current thresholds.') + logger.warning('No valid current thresholds.') for threshold in self._thresholds[hist_name]: h[threshold].fill(recoValue, l1Value, w) @@ -136,7 +136,7 @@ def fill_array(self, hist_name, recoValue, l1Value, w=None): logger.error('Histogram {0} does not exist'.format(hist_name)) return if hist_name not in self._thresholds: - logger.warn('No valid current thresholds.') + logger.warning('No valid current thresholds.') for threshold in self._thresholds[hist_name]: h[threshold].fill_array(recoValue, l1Value, w) diff --git a/cmsl1t/collections/resolution.py b/cmsl1t/collections/resolution.py index 776ace482d2..acad3b9b8e8 100644 --- a/cmsl1t/collections/resolution.py +++ b/cmsl1t/collections/resolution.py @@ -55,7 +55,7 @@ def fill(self, hist_name, x, w=1.0): logger.error('Histogram {0} does not exist'.format(hist_name)) return if not self._currentRegions: - logger.warn( + logger.warning( 'No valid current regions. Did you set_region_by_eta()?') for region in self._currentRegions: h[region].fill(x, w) @@ -63,7 +63,7 @@ def fill(self, hist_name, x, w=1.0): def add_variable(self, variable, bins=[]): from rootpy.plotting import Hist if variable in self.keys(): - logger.warn('Variable {0} already exists!') + logger.warning('Variable {0} already exists!') return hist_names = [] add_name = hist_names.append diff --git a/cmsl1t/config.py b/cmsl1t/config.py index ac154578ff0..300b936ef05 100644 --- a/cmsl1t/config.py +++ b/cmsl1t/config.py @@ -314,7 +314,7 @@ def reduce_scope_for_analyzer(self, analyzer_name): forbidden_local_settings = ['name', 'input_files'] for s in forbidden_local_settings: if s in analyzer: - logger.warn('Setting {0} is forbidden in analysis::analyzers::{1}'.format(s, analyzer_name)) + logger.warning('Setting {0} is forbidden in analysis::analyzers::{1}'.format(s, analyzer_name)) analyzer.pop(s) global_settings = dict( diff --git a/cmsl1t/playground/eventreader.py b/cmsl1t/playground/eventreader.py index 9b9480cdc94..987601796db 100644 --- a/cmsl1t/playground/eventreader.py +++ b/cmsl1t/playground/eventreader.py @@ -367,7 +367,7 @@ def __init__(self, files, events=-1, load_trees=['event', 'upgrade']): try: chain = TreeChain(path, input_files, cache=True, events=events) except RuntimeError: - logger.warn("Cannot find tree: {0} in input file".format(path)) + logger.warning("Cannot find tree: {0} in input file".format(path)) continue self._names.append(name) self._trees.append(chain) diff --git a/cmsl1t/playground/resolution.py b/cmsl1t/playground/resolution.py index a7969f4d2c1..0b76f0909bd 100644 --- a/cmsl1t/playground/resolution.py +++ b/cmsl1t/playground/resolution.py @@ -36,7 +36,7 @@ def add_hist_set(self, prefix, regions=geo.eta_regions, bins=[]): for region in regions: name = prefix + region if name in self._hists: - logger.warn('Overwriting existing histogram {0}'.format(name)) + logger.warning('Overwriting existing histogram {0}'.format(name)) del self._hists[name] logger.debug('Adding histogram {0}'.format(name)) self._hists[name] = Hist(bins, name=name) diff --git a/cmsl1t/producers/met.py b/cmsl1t/producers/met.py index ea494fc2aa4..658a317db7d 100644 --- a/cmsl1t/producers/met.py +++ b/cmsl1t/producers/met.py @@ -76,7 +76,7 @@ def __init__(self, inputs, outputs, **kwargs): self._method = Producer.METHODS[params['method']] else: msg = 'Could not find specified MET method, using default.' - logger.warn(msg) + logger.warning(msg) self._method = Producer.METHODS['default'] def produce(self, event): diff --git a/cmsl1t/producers/met_vectorized.py b/cmsl1t/producers/met_vectorized.py index 95ecaf03f38..573d2e45da2 100644 --- a/cmsl1t/producers/met_vectorized.py +++ b/cmsl1t/producers/met_vectorized.py @@ -79,7 +79,7 @@ def __init__(self, inputs, outputs, **kwargs): self._method = Producer.METHODS[params['method']] else: msg = 'Could not find specified MET method, using default.' - logger.warn(msg) + logger.warning(msg) self._method = Producer.METHODS['default'] def produce(self, event): From aecf64c6dd98ee0334826952cef7a59457b82751 Mon Sep 17 00:00:00 2001 From: kreczko Date: Tue, 9 Jul 2019 13:28:38 +0100 Subject: [PATCH 08/30] fixed "len" for 1-dim collections --- cmsl1t/collections/base.py | 12 ++++++------ test/collections/test_baseHistcollection.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/cmsl1t/collections/base.py b/cmsl1t/collections/base.py index 479ff9fd8b3..4b355d61199 100644 --- a/cmsl1t/collections/base.py +++ b/cmsl1t/collections/base.py @@ -20,10 +20,10 @@ logger = logging.getLogger(__name__) -def create_n_dim_dict(dimensions, initiaValue=0): +def create_n_dim_dict(dimensions, initialValue=0): if dimensions < 1: - return initiaValue - factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initiaValue=initiaValue) + return initialValue + factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initialValue=initialValue) return defaultdict(factory) @@ -40,20 +40,20 @@ def create_n_dim_dict(dimensions, initiaValue=0): def len_n_dim_dict(dictionary, dimensions): if dimensions <= 1: - return len(dictionary) + return len(dictionary.keys()) return sum(len_n_dim_dict(v, dimensions - 1) for v in six.itervalues(dictionary)) class BaseHistCollection(defaultdict): - def __init__(self, dimensions, initiaValue=0): + def __init__(self, dimensions, initialValue=0): ''' For each dimension create a dictionary ''' # TODO: add possibility for different lambda expresions for each # dimension. This will allow to have custom dicts in certain dimensions - factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initiaValue=initiaValue) + factory = partial(create_n_dim_dict, dimensions=dimensions - 1, initialValue=initialValue) if sys.version_info[0] < 3: defaultdict.__init__(self, factory) else: diff --git a/test/collections/test_baseHistcollection.py b/test/collections/test_baseHistcollection.py index 783c75555df..1807aa8ab60 100644 --- a/test/collections/test_baseHistcollection.py +++ b/test/collections/test_baseHistcollection.py @@ -1,6 +1,8 @@ -from cmsl1t.collections import BaseHistCollection -import unittest from collections import defaultdict +import pytest +import unittest + +from cmsl1t.collections import BaseHistCollection class TestBaseHistCollection(unittest.TestCase): @@ -9,6 +11,7 @@ def test_dimensions(self): dimensions = 4 initial_value = 0 hists = BaseHistCollection(dimensions, initial_value) + self.assertEqual(len(hists), 0) self.assertIs(type(hists[1]), defaultdict) self.assertIs(type(hists[1][2][3][4]), type(initial_value)) self.assertEqual(hists[1][2][3][4], initial_value) @@ -23,3 +26,9 @@ def test_dimensions(self): # length_from_iterator = len(list(six.itervalues(hists))) # self.assertEqual(length_from_iterator, 3) + + +@pytest.mark.parametrize("dimensions", [1, 2, 3]) +def test_empty(dimensions): + c = BaseHistCollection(dimensions) + assert len(c) == 0 From c01c49db2ebbaf6e2f4c55826c51600760e41d3e Mon Sep 17 00:00:00 2001 From: kreczko Date: Wed, 10 Jul 2019 16:33:41 +0100 Subject: [PATCH 09/30] added VectorizedHistCollection.add --- cmsl1t/__init__.py | 6 +++- cmsl1t/collections/vectorized.py | 53 ++++++++++++++++++++++++++--- test/collections/test_vectorized.py | 29 +++++++++++++--- 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/cmsl1t/__init__.py b/cmsl1t/__init__.py index 8e8605ed081..ee760307d2c 100644 --- a/cmsl1t/__init__.py +++ b/cmsl1t/__init__.py @@ -1,7 +1,9 @@ from __future__ import absolute_import +import logging import os from os import path -import logging +import sys + __version__ = '0.5.1' @@ -28,3 +30,5 @@ PROJECT_ROOT = path.abspath(path.join(HERE, path.pardir)) else: PROJECT_ROOT = os.environ['PROJECT_ROOT'] + +PY3 = sys.version_info[0] == 3 diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 58f9ce82b26..ba72a4f9274 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -1,6 +1,14 @@ -import numbda +from collections import defaultdict +import logging +import numba +import numpy as np +from rootpy.plotting import Hist from . import BaseHistCollection +from ..utils.iterators import pairwise +from .. import PY3 + +logger = logging.getLogger(__name__) @numba.jit(nopython=True) @@ -9,12 +17,24 @@ def extend(arr1, starts, stops): return np.repeat(arr1, repeat, axis=0) -class VectorizedHistCollection(object): +class VectorizedHistCollection(BaseHistCollection): + + def __init__(self, innerBins, innerLabel='inner', **kwargs): + # if we want to generalize to N dim, innerBins needs to be an array of innerBins + dimensions = kwargs.pop('dimensions', 2) + if PY3: + super(VectorizedHistCollection, self).__init__(dimensions) + else: + BaseHistCollection.__init__(self, dimensions) - def __init__(self, innerBins): self._innerBins = innerBins + self._innerLabel = innerLabel self._innerHist = Hist(100, 0, 100, name='inner') + def __getitem__(self, key): + real_key = self._get_inner_indices(key) + return defaultdict.__getitem__(self, real_key) + def _get_inner_indices(self, values): ''' Returns the pileup bin corresponding to the provided pileup value. @@ -23,6 +43,31 @@ def _get_inner_indices(self, values): :Example: >>> hists = VectorizedHistCollection(innerBins=[0,10,15,20,30,999]) - >>> hists._get_inner_indices([1, 11, 1111]) # returns [0, 1, 5] + >>> hists._get_inner_indices([1, 11, 1111]) # returns [1, 2, 6] ''' return np.digitize(values, self._innerBins) + + def add(self, name, bins, hist_type=Hist): + + bins = np.asarray(bins) + if bins.size == 0: + logger.error( + 'No bins specified for histogram {0}'.format(hist_name)) + + if name in self[1]: + logger.warning('Histogram {0} already exists!'.format(hist_name)) + return + names = [] + add_name = names.append + print(self) + + for i, (lowerEdge, upperEdge) in enumerate(pairwise(self._innerBins)): + hist_name = f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" + if i + 1 not in self or hist_name not in self[i + 1]: + add_name(hist_name) + self[i + 1][hist_name] = Hist(bins, name=hist_name) + logger.debug('Created {0} histograms: {1}'.format( + len(names), ', '.join(names))) + + def fill(self): + pass diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index a0eef2bfcb9..7a11625e557 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -1,3 +1,4 @@ +import awkward import pytest import numpy as np from rootpy.plotting import Hist @@ -5,6 +6,14 @@ from cmsl1t.collections import VectorizedHistCollection +@pytest.fixture +def collection(): + innerBins = np.array([0, 10, 15, 20, 30, 999]) + coll = VectorizedHistCollection(innerBins) + # fill for [35, 90, 120] + return coll + + @pytest.mark.parametrize( "values,expected", [ @@ -12,8 +21,20 @@ ([1, 11, 1111], [1, 2, 6]), ([-10, 1111, 20], [0, 6, 4]), ]) -def test_inner_index(values, expected): - innerBins = np.array([0, 10, 15, 20, 30, 999]) - coll = VectorizedHistCollection(innerBins) +def test_inner_index(collection, values, expected): + np.testing.assert_array_equal(collection._get_inner_indices(values), expected) + + +def test_add(collection): + assert len(collection) == 0 + collection.add('test', bins=[35, 90, 120]) + assert len(collection) == len(collection._innerBins) - 1 - np.testing.assert_array_equal(coll._get_inner_indices(values), expected) +# def test_fill(collection): +# innerValues = [1, 12, 1, 50] +# outerValues = awkward.fromiter([ +# [60, 50, 40, 30, 20], +# [32, 23], +# [56, 34, 31], +# ]) +# collection.add('test', bins=[35, 90, 120]) From 15bdaf3341f3fb42415c77c3b3696c95440e3a85 Mon Sep 17 00:00:00 2001 From: kreczko Date: Wed, 17 Jul 2019 15:03:16 +0100 Subject: [PATCH 10/30] added Bin and Hist proxy objects for histogram collection --- cmsl1t/collections/vectorized.py | 63 +++++++++++++++++++++++++---- test/collections/test_vectorized.py | 26 ++++++++++++ 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index ba72a4f9274..d1d440b4665 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -32,8 +32,14 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): self._innerHist = Hist(100, 0, 100, name='inner') def __getitem__(self, key): - real_key = self._get_inner_indices(key) - return defaultdict.__getitem__(self, real_key) + if not isinstance(key, (list, np.ndarray, np.generic)): + key = np.array([key]) + real_keys = self._get_inner_indices(key) + # Python tries to copy the whole nested default dict ... which is infinite + # print(key, real_keys) + # return object() + return VectorizedBinProxy(self, real_keys) + return [defaultdict.__getitem__(self, k) for k in real_keys.tolist()] def _get_inner_indices(self, values): ''' @@ -54,20 +60,61 @@ def add(self, name, bins, hist_type=Hist): logger.error( 'No bins specified for histogram {0}'.format(hist_name)) - if name in self[1]: + if name in defaultdict.__getitem__(self, 1): logger.warning('Histogram {0} already exists!'.format(hist_name)) return names = [] add_name = names.append - print(self) for i, (lowerEdge, upperEdge) in enumerate(pairwise(self._innerBins)): hist_name = f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" - if i + 1 not in self or hist_name not in self[i + 1]: + if i + 1 not in self or hist_name not in defaultdict.__getitem__(self, i + 1): add_name(hist_name) - self[i + 1][hist_name] = Hist(bins, name=hist_name) + defaultdict.__getitem__(self, i + 1)[hist_name] = Hist(bins, name=hist_name) logger.debug('Created {0} histograms: {1}'.format( len(names), ', '.join(names))) - def fill(self): - pass + def fill(self, x, w=None): + if w is None: + w = np.ones() + + + +class VectorizedBinProxy(object): + + def __init__(self, collection, inner_indices): + self.collection = collection + self._inner_indices = inner_indices + + def __getitem__(self, key): + # TODO, if key != string, return a BinProxy + return VectorizedHistProxy(self, key) + + def __add__(self, other): + if self.collection != other.collection: + msg = 'Cannot add VectorizedBinProxy for two different collections' + logger.error(msg) + raise ValueError(msg) + self._inner_indices = np.append(self._inner_indices, other._inner_indices) + return self + + def __eq__(self, other): + if self.collection != other.collection: + msg = 'Cannot compare VectorizedBinProxy for two different collections' + logger.error(msg) + raise ValueError(msg) + return self._inner_indices.tolist() == other._inner_indices.tolist() + + def flatten(self): + self._inner_indices = np.unique(self._inner_indices) + return self + +class VectorizedHistProxy(object): + + def __init__(self, bin_proxy, hist_name): + self._bin_proxy = bin_proxy.flatten() + self._hist_name = hist_name + + def fill(self, x, w=None): + if w is None: + w = np.ones(x) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 7a11625e557..2f05e53c857 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -4,6 +4,7 @@ from rootpy.plotting import Hist from cmsl1t.collections import VectorizedHistCollection +from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy @pytest.fixture @@ -30,6 +31,29 @@ def test_add(collection): collection.add('test', bins=[35, 90, 120]) assert len(collection) == len(collection._innerBins) - 1 + +def test_access(collection): + collection.add('test', bins=[35, 90, 120]) + innerValues = [1, 12, 1, 50] + assert collection[innerValues] == collection[1] + collection[12] + collection[1] + collection[50] + # assert type(collection[innerValues]) == Hist + assert type(collection[innerValues]['test']) == VectorizedHistProxy + + +def test_copy(collection): + proxy = VectorizedBinProxy(collection, [1, 12, 1, 50]) + + +@pytest.mark.parametrize( + "values,expected", + [ + ([1, 12, 1, 50], [1, 12, 50]), + ([1, 30, 12, 1, 50], [1, 12, 30, 50]), + ]) +def test_bin_proxy_flatten(collection, values, expected): + proxy = VectorizedBinProxy(collection, values) + assert proxy.flatten()._inner_indices.tolist() == expected + # def test_fill(collection): # innerValues = [1, 12, 1, 50] # outerValues = awkward.fromiter([ @@ -38,3 +62,5 @@ def test_add(collection): # [56, 34, 31], # ]) # collection.add('test', bins=[35, 90, 120]) +# weights = np.ones(len(outerValues.content)) +# collection[innerValues][hist_name].fill(outerValues, weights) From f6326ca599aa302387720a3deb4d919c5de45b43 Mon Sep 17 00:00:00 2001 From: kreczko Date: Wed, 17 Jul 2019 16:53:27 +0100 Subject: [PATCH 11/30] implemented vectorized filling of histograms --- cmsl1t/collections/vectorized.py | 43 +++++++++++++----- test/collections/test_vectorized.py | 69 ++++++++++++++++++++++++----- 2 files changed, 90 insertions(+), 22 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index d1d440b4665..7fb0fb3af79 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) -@numba.jit(nopython=True) def extend(arr1, starts, stops): repeat = stops - starts return np.repeat(arr1, repeat, axis=0) @@ -21,6 +20,7 @@ class VectorizedHistCollection(BaseHistCollection): def __init__(self, innerBins, innerLabel='inner', **kwargs): # if we want to generalize to N dim, innerBins needs to be an array of innerBins + # TODO: last dimension should probably be a normal dictionary dimensions = kwargs.pop('dimensions', 2) if PY3: super(VectorizedHistCollection, self).__init__(dimensions) @@ -35,11 +35,8 @@ def __getitem__(self, key): if not isinstance(key, (list, np.ndarray, np.generic)): key = np.array([key]) real_keys = self._get_inner_indices(key) - # Python tries to copy the whole nested default dict ... which is infinite - # print(key, real_keys) - # return object() return VectorizedBinProxy(self, real_keys) - return [defaultdict.__getitem__(self, k) for k in real_keys.tolist()] + # return [defaultdict.__getitem__(self, k) for k in real_keys.tolist()] def _get_inner_indices(self, values): ''' @@ -66,28 +63,35 @@ def add(self, name, bins, hist_type=Hist): names = [] add_name = names.append - for i, (lowerEdge, upperEdge) in enumerate(pairwise(self._innerBins)): - hist_name = f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" + for i, hist_name in enumerate(self._create_hist_names(name)): if i + 1 not in self or hist_name not in defaultdict.__getitem__(self, i + 1): add_name(hist_name) defaultdict.__getitem__(self, i + 1)[hist_name] = Hist(bins, name=hist_name) logger.debug('Created {0} histograms: {1}'.format( len(names), ', '.join(names))) + def _create_hist_names(self, name): + for i, (lowerEdge, upperEdge) in enumerate(pairwise(self._innerBins)): + yield f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" + + def get_hist_name(self, name, innerIndex): + lowerEdge, upperEdge = self._innerBins[innerIndex - 1], self._innerBins[innerIndex] + return f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" + def fill(self, x, w=None): if w is None: w = np.ones() - class VectorizedBinProxy(object): def __init__(self, collection, inner_indices): self.collection = collection self._inner_indices = inner_indices + # self._inner_values = inner_values def __getitem__(self, key): - # TODO, if key != string, return a BinProxy + # TODO, if key != string, return a BinProxy of the bin above return VectorizedHistProxy(self, key) def __add__(self, other): @@ -109,12 +113,29 @@ def flatten(self): self._inner_indices = np.unique(self._inner_indices) return self + class VectorizedHistProxy(object): def __init__(self, bin_proxy, hist_name): - self._bin_proxy = bin_proxy.flatten() + self._bin_proxy = bin_proxy self._hist_name = hist_name + def _split_input(self, x, w): + inner_indices = self._bin_proxy._inner_indices + # TODO: what if x is not jagged + inner_indices = extend(inner_indices, x.starts, x.stops) + for u in np.unique(inner_indices): + mask = inner_indices == u + yield u, x.content[mask], w[mask] + + def _get_hist(self, inner_index): + hist_name = self._bin_proxy.collection.get_hist_name(self._hist_name, inner_index) + return defaultdict.__getitem__(self._bin_proxy.collection, inner_index)[hist_name] + def fill(self, x, w=None): if w is None: - w = np.ones(x) + # TODO: what if x is not jagged + w = np.ones(len(x.content)) + for i, x_i, w_i in self._split_input(x, w): + hist = self._get_hist(i) + hist.fill_array(x_i, w_i) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 2f05e53c857..8ce24536d46 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -4,7 +4,7 @@ from rootpy.plotting import Hist from cmsl1t.collections import VectorizedHistCollection -from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy +from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy, extend @pytest.fixture @@ -54,13 +54,60 @@ def test_bin_proxy_flatten(collection, values, expected): proxy = VectorizedBinProxy(collection, values) assert proxy.flatten()._inner_indices.tolist() == expected -# def test_fill(collection): -# innerValues = [1, 12, 1, 50] -# outerValues = awkward.fromiter([ -# [60, 50, 40, 30, 20], -# [32, 23], -# [56, 34, 31], -# ]) -# collection.add('test', bins=[35, 90, 120]) -# weights = np.ones(len(outerValues.content)) -# collection[innerValues][hist_name].fill(outerValues, weights) + +@pytest.mark.parametrize( + "bins, x, expected", + [ + ( + np.array([1, 12, 1, 50]), + np.array([10, 20, 30, 40]), + [np.array([10, 30]), np.array([20]), np.array([40])] + ), + ( + np.array([1, 1, 1, 2, 1, 2]), + np.array([10, 20, 30, 40, 50, 60]), + [np.array([10, 20, 30, 50]), np.array([40, 60])] + ), + ]) +def test_split(bins, x, expected): + unique_bins = np.unique(bins) + result = [] + for b in unique_bins: + result.append(x[bins == b]) + for chunk, exp in zip(result, expected): + assert chunk.tolist() == exp.tolist() + + +def test_fill(collection): + innerValues = [1, 12, 1, 50] + outerValues = awkward.fromiter([ + [60, 50, 40, 30, 20], + [32, 23], + [56, 34, 31], + [], + ]) + expected = [ + [4.0, 4.0, 0.0, 0.0], + [2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], + ] + + hist_name = 'test' + collection.add(hist_name, bins=[35, 90, 120]) + weights = np.ones(len(outerValues.content)) + collection[innerValues][hist_name].fill(outerValues, weights) + for i in range(len(np.unique(innerValues))): + hist = collection[innerValues][hist_name]._get_hist(i + 1) + assert list(hist.y(overflow=True)) == expected[i] + + +def test_extend(): + innerValues = [1, 12, 1, 50] + outerValues = awkward.fromiter([ + [60, 50, 40, 30, 20], + [32, 23], + [56, 34, 31], + [], + ]) + innerValues = extend(innerValues, outerValues.starts, outerValues.stops) + assert len(innerValues) == len(outerValues.content) From e653da76fdd3ce0ca3574cd0e6e8e81d6e9712b4 Mon Sep 17 00:00:00 2001 From: kreczko Date: Wed, 17 Jul 2019 17:00:34 +0100 Subject: [PATCH 12/30] added VectorizedHistCollection.inner_fill() --- cmsl1t/collections/vectorized.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 7fb0fb3af79..ddc4d41245f 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -78,9 +78,10 @@ def get_hist_name(self, name, innerIndex): lowerEdge, upperEdge = self._innerBins[innerIndex - 1], self._innerBins[innerIndex] return f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" - def fill(self, x, w=None): + def inner_fill(self, x, w=None): if w is None: - w = np.ones() + w = np.ones(len(x)) + self._innerHist.fill_array(x, w) class VectorizedBinProxy(object): From 84294368adcfb490ef1052e73bcc2a6e0ee00415 Mon Sep 17 00:00:00 2001 From: kreczko Date: Wed, 17 Jul 2019 17:02:18 +0100 Subject: [PATCH 13/30] fix pep8 issues --- cmsl1t/collections/vectorized.py | 5 ++--- test/collections/test_vectorized.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index ddc4d41245f..588a7a77c89 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -1,6 +1,5 @@ from collections import defaultdict import logging -import numba import numpy as np from rootpy.plotting import Hist @@ -55,10 +54,10 @@ def add(self, name, bins, hist_type=Hist): bins = np.asarray(bins) if bins.size == 0: logger.error( - 'No bins specified for histogram {0}'.format(hist_name)) + 'No bins specified for histogram {0}'.format(name)) if name in defaultdict.__getitem__(self, 1): - logger.warning('Histogram {0} already exists!'.format(hist_name)) + logger.warning('Histogram {0} already exists!'.format(name)) return names = [] add_name = names.append diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 8ce24536d46..b61fae8598f 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -1,7 +1,6 @@ import awkward import pytest import numpy as np -from rootpy.plotting import Hist from cmsl1t.collections import VectorizedHistCollection from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy, extend From 5133fa6ea0d09441582424b32278f5a1e4e0dd1d Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 13:12:22 +0100 Subject: [PATCH 14/30] added filters to demo config --- config/demo.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/demo.yaml b/config/demo.yaml index ceadd10ad1a..70037422dc2 100644 --- a/config/demo.yaml +++ b/config/demo.yaml @@ -52,6 +52,7 @@ analysis: outputs: - l1MetNot28HF method: l1MetNot28HF + filters: [] output: # template is a list here that is joined (os.path.join) in the config parser From e1e8702581447714dde19afea03477226706aec0 Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 14:09:55 +0100 Subject: [PATCH 15/30] improved VectorizedHistCollection to handle numpy arrays --- cmsl1t/collections/vectorized.py | 33 +++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 588a7a77c89..e41444e2ab2 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -49,8 +49,8 @@ def _get_inner_indices(self, values): ''' return np.digitize(values, self._innerBins) - def add(self, name, bins, hist_type=Hist): - + def add(self, name, bins, hist_type=Hist, **kwargs): + title = kwargs.pop('title', name) bins = np.asarray(bins) if bins.size == 0: logger.error( @@ -65,7 +65,7 @@ def add(self, name, bins, hist_type=Hist): for i, hist_name in enumerate(self._create_hist_names(name)): if i + 1 not in self or hist_name not in defaultdict.__getitem__(self, i + 1): add_name(hist_name) - defaultdict.__getitem__(self, i + 1)[hist_name] = Hist(bins, name=hist_name) + defaultdict.__getitem__(self, i + 1)[hist_name] = hist_type(bins, name=hist_name, title=title) logger.debug('Created {0} histograms: {1}'.format( len(names), ', '.join(names))) @@ -79,7 +79,7 @@ def get_hist_name(self, name, innerIndex): def inner_fill(self, x, w=None): if w is None: - w = np.ones(len(x)) + w = np.ones(np.size(x)) self._innerHist.fill_array(x, w) @@ -122,20 +122,35 @@ def __init__(self, bin_proxy, hist_name): def _split_input(self, x, w): inner_indices = self._bin_proxy._inner_indices - # TODO: what if x is not jagged - inner_indices = extend(inner_indices, x.starts, x.stops) + content = x + if hasattr(x, 'starts'): + inner_indices = extend(inner_indices, x.starts, x.stops) + content = x.content + for u in np.unique(inner_indices): mask = inner_indices == u - yield u, x.content[mask], w[mask] + if not isinstance(mask, (list, np.ndarray)): + mask = np.array([mask]) + yield u, content[mask], w[mask] def _get_hist(self, inner_index): hist_name = self._bin_proxy.collection.get_hist_name(self._hist_name, inner_index) return defaultdict.__getitem__(self._bin_proxy.collection, inner_index)[hist_name] def fill(self, x, w=None): + if not isinstance(x, (list, np.ndarray)): + x = np.array([x]) if w is None: - # TODO: what if x is not jagged - w = np.ones(len(x.content)) + n = np.size(x.content) if hasattr(x, 'content') else np.size(x) + w = np.ones(n) for i, x_i, w_i in self._split_input(x, w): hist = self._get_hist(i) hist.fill_array(x_i, w_i) + + +# def split_input(): +# a = np.array([1, 12, 1, 10, 50, 10]) +# b = np.array([10, 20, 30, 40, 50, 60]) +# arg = a.argsort(kind='stable') +# offsets, = np.where(np.r_[True, np.diff(a[arg]) > 0]) +# output = awkward.JaggedArray.fromoffsets(offsets.flatten(), awkward.IndexedArray(arg, b)) From 23d2b11d32ef79268858e07f6a996a2b01f4952c Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 14:47:47 +0100 Subject: [PATCH 16/30] augmented demo analyzer with VectorizedHistCollection --- cmsl1t/analyzers/demo_analyzer.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cmsl1t/analyzers/demo_analyzer.py b/cmsl1t/analyzers/demo_analyzer.py index a8ff18b0be6..1a0a8a062e8 100644 --- a/cmsl1t/analyzers/demo_analyzer.py +++ b/cmsl1t/analyzers/demo_analyzer.py @@ -6,7 +6,7 @@ import numpy as np from .BaseAnalyzer import BaseAnalyzer -from cmsl1t.collections import EfficiencyCollection +from cmsl1t.collections import EfficiencyCollection, VectorizedHistCollection class Analyzer(BaseAnalyzer): @@ -14,25 +14,30 @@ class Analyzer(BaseAnalyzer): def __init__(self, **kwargs): super(Analyzer, self).__init__(**kwargs) - self.met_calcs = dict( - RecalcL1EmuMETNot28=dict( + self.met_calcs = { + self.name + '_' + 'RecalcL1EmuMETNot28': dict( title="Emulated MET, |ieta|<28", attr='l1MetNot28'), - RecalcL1EmuMETNot28HF=dict( + self.name + '_' + 'RecalcL1EmuMETNot28HF': dict( title="Emulated MET, |ieta|!=28", attr='l1MetNot28HF'), - ) + } def prepare_for_events(self, reader): bins = np.arange(0, 200, 25) thresholds = [70, 90, 110] puBins = list(range(0, 50, 10)) + [999] + self.hists = VectorizedHistCollection(innerBins=puBins, innerLabel='pu') + self.efficiencies = EfficiencyCollection(pileupBins=puBins) add_met_variable = partial( self.efficiencies.add_variable, bins=bins, thresholds=thresholds) list(map(add_met_variable, self.met_calcs)) + + for met, config in self.met_calcs.items(): + self.hists.add(met, bins=bins, title=config['title']) return True def reload_histograms(self, input_file): @@ -43,16 +48,19 @@ def reload_histograms(self, input_file): def fill_histograms(self, entry, event): pileup = event['Vertex_nVtx'] self.efficiencies.set_pileup(pileup) + self.hists.inner_fill(pileup) offlineMetBE = event.Sums_caloMetBE for name, config in self.met_calcs.items(): onlineMet = event[config['attr']] onlineMet = onlineMet.mag self.efficiencies.fill_array(name, offlineMetBE, onlineMet) + self.hists[pileup][name].fill(offlineMetBE) return True def write_histograms(self): - self.efficiencies.to_root(self.get_histogram_filename()) + self.efficiencies.to_root(self.get_histogram_filename().replace('.root', '_efficiencies.root')) + self.hists.to_root(self.get_histogram_filename()) return True def make_plots(self): From 8cde7ef3f3a9ee34480e31f742b134f8d30123f5 Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 14:48:29 +0100 Subject: [PATCH 17/30] added hash to inner histogram name for VectorizedHistCollection if no name is given (to avoid name clashes) --- cmsl1t/collections/vectorized.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index e41444e2ab2..589d016419f 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -1,6 +1,7 @@ from collections import defaultdict import logging import numpy as np +import random from rootpy.plotting import Hist from . import BaseHistCollection @@ -21,6 +22,7 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): # if we want to generalize to N dim, innerBins needs to be an array of innerBins # TODO: last dimension should probably be a normal dictionary dimensions = kwargs.pop('dimensions', 2) + name = kwargs.pop('name', str(hex(random.getrandbits(128)))[2:10]) if PY3: super(VectorizedHistCollection, self).__init__(dimensions) else: @@ -28,7 +30,7 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): self._innerBins = innerBins self._innerLabel = innerLabel - self._innerHist = Hist(100, 0, 100, name='inner') + self._innerHist = Hist(100, 0, 100, name=innerLabel + '_' + name) def __getitem__(self, key): if not isinstance(key, (list, np.ndarray, np.generic)): @@ -147,6 +149,7 @@ def fill(self, x, w=None): hist = self._get_hist(i) hist.fill_array(x_i, w_i) +# class VectorizedEfficiencyProxy(object): # def split_input(): # a = np.array([1, 12, 1, 10, 50, 10]) From ff54270ae9c5e5c36f05c7c347a49a4e57b92ed1 Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 14:56:07 +0100 Subject: [PATCH 18/30] VectorizedHistCollection: making sure inner histogram is also filled --- cmsl1t/collections/vectorized.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 589d016419f..c071660512d 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -7,6 +7,7 @@ from . import BaseHistCollection from ..utils.iterators import pairwise from .. import PY3 +from ..io import to_root logger = logging.getLogger(__name__) @@ -22,7 +23,8 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): # if we want to generalize to N dim, innerBins needs to be an array of innerBins # TODO: last dimension should probably be a normal dictionary dimensions = kwargs.pop('dimensions', 2) - name = kwargs.pop('name', str(hex(random.getrandbits(128)))[2:10]) + self._name = kwargs.pop('name', str(hex(random.getrandbits(128)))[2:10]) + self._execute_before_write = kwargs.pop('execute_before_write', []) if PY3: super(VectorizedHistCollection, self).__init__(dimensions) else: @@ -30,7 +32,7 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): self._innerBins = innerBins self._innerLabel = innerLabel - self._innerHist = Hist(100, 0, 100, name=innerLabel + '_' + name) + self._innerHist = Hist(100, 0, 100, name=innerLabel + '_' + self._name) def __getitem__(self, key): if not isinstance(key, (list, np.ndarray, np.generic)): @@ -84,6 +86,11 @@ def inner_fill(self, x, w=None): w = np.ones(np.size(x)) self._innerHist.fill_array(x, w) + def to_root(self, output_file): + for func in self._execute_before_write: + func(self) + to_root([self, self._innerHist], output_file) + class VectorizedBinProxy(object): From e643ec92f49f8380ce1616b669b55f2632da392a Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 15:08:08 +0100 Subject: [PATCH 19/30] removed obsolete test --- test/collections/test_vectorized.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index b61fae8598f..90806d9e20d 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -39,8 +39,8 @@ def test_access(collection): assert type(collection[innerValues]['test']) == VectorizedHistProxy -def test_copy(collection): - proxy = VectorizedBinProxy(collection, [1, 12, 1, 50]) +# def test_copy(collection): +# proxy = VectorizedBinProxy(collection, [1, 12, 1, 50]) @pytest.mark.parametrize( From 114f53ce00239d864c7b0688be69da8e5900f269 Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 16:33:15 +0100 Subject: [PATCH 20/30] CI: removed python 2.7 from tests --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 64dacb89a93..58de9dfc00c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ cache: language: python python: - - "2.7" - "3.6" env: From cefda36fb450004f368b4d47bd7ce2bcd80de943 Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 17:01:56 +0100 Subject: [PATCH 21/30] fixed boost histogram tests --- test/collections/test_boost_histogram.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/collections/test_boost_histogram.py b/test/collections/test_boost_histogram.py index 183d06d02ea..6b30dd0e176 100644 --- a/test/collections/test_boost_histogram.py +++ b/test/collections/test_boost_histogram.py @@ -25,5 +25,7 @@ def test_fill(): pileup = np.repeat(pileup, repeat, axis=0) # expand pileup to size ets assert len(pileup) == len(ets.content) + # weights are not yet supported # hist.fill(pileup, ets.content, bh.weight(weights)) - hist(pileup, ets.content) + hist.fill(pileup, ets.content) + # hist(pileup, ets.content) From 09178f4a377bce14849ad2bdea5f693775db448d Mon Sep 17 00:00:00 2001 From: kreczko Date: Thu, 18 Jul 2019 17:26:49 +0100 Subject: [PATCH 22/30] added test for split_input and added awkward.JaggedArray check --- cmsl1t/collections/vectorized.py | 31 +++++++++++++++-------------- test/collections/test_vectorized.py | 25 ++++++++++++++++++++++- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index c071660512d..e79588ff9d3 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -1,3 +1,4 @@ +import awkward from collections import defaultdict import logging import numpy as np @@ -17,6 +18,19 @@ def extend(arr1, starts, stops): return np.repeat(arr1, repeat, axis=0) +def split_input(inner_indices, x, w): + content = x + if hasattr(x, 'starts'): + inner_indices = extend(inner_indices, x.starts, x.stops) + content = x.content + + for u in np.unique(inner_indices): + mask = inner_indices == u + if not isinstance(mask, (list, np.ndarray)): + mask = np.array([mask]) + yield u, content[mask], w[mask] + + class VectorizedHistCollection(BaseHistCollection): def __init__(self, innerBins, innerLabel='inner', **kwargs): @@ -129,30 +143,17 @@ def __init__(self, bin_proxy, hist_name): self._bin_proxy = bin_proxy self._hist_name = hist_name - def _split_input(self, x, w): - inner_indices = self._bin_proxy._inner_indices - content = x - if hasattr(x, 'starts'): - inner_indices = extend(inner_indices, x.starts, x.stops) - content = x.content - - for u in np.unique(inner_indices): - mask = inner_indices == u - if not isinstance(mask, (list, np.ndarray)): - mask = np.array([mask]) - yield u, content[mask], w[mask] - def _get_hist(self, inner_index): hist_name = self._bin_proxy.collection.get_hist_name(self._hist_name, inner_index) return defaultdict.__getitem__(self._bin_proxy.collection, inner_index)[hist_name] def fill(self, x, w=None): - if not isinstance(x, (list, np.ndarray)): + if not isinstance(x, (list, np.ndarray, awkward.JaggedArray)): x = np.array([x]) if w is None: n = np.size(x.content) if hasattr(x, 'content') else np.size(x) w = np.ones(n) - for i, x_i, w_i in self._split_input(x, w): + for i, x_i, w_i in split_input(self._bin_proxy._inner_indices, x, w): hist = self._get_hist(i) hist.fill_array(x_i, w_i) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 90806d9e20d..b04445e7516 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -3,7 +3,7 @@ import numpy as np from cmsl1t.collections import VectorizedHistCollection -from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy, extend +from cmsl1t.collections.vectorized import VectorizedBinProxy, VectorizedHistProxy, extend, split_input @pytest.fixture @@ -110,3 +110,26 @@ def test_extend(): ]) innerValues = extend(innerValues, outerValues.starts, outerValues.stops) assert len(innerValues) == len(outerValues.content) + +def test_split_input(): + innerValues = [1, 12, 1, 50] + outerValues = awkward.fromiter([ + [60, 50, 40, 30, 20], + [32, 23], + [56, 34, 31], + [], + ]) + weights = np.ones(len(outerValues.content)) + + expected = [ + (1, [60, 50, 40, 30, 20, 56, 34, 31], list(np.ones(8))), + [12, [32, 23], list(np.ones(2))], + ] + results = list(split_input(innerValues, outerValues, weights)) + assert len(results) == len(expected) + for r, e in zip(results, expected): + i, o, w = r + i_e, o_e, w_e = e + assert i == i_e + assert o.tolist() == o_e + assert w.tolist() == w_e From 88168fdb057289d2612924471b230e185ccf7312 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 19 Jul 2019 10:33:17 +0100 Subject: [PATCH 23/30] removed unused variable in _create_hist_names --- cmsl1t/collections/vectorized.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index e79588ff9d3..d337c52f7c4 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -88,7 +88,7 @@ def add(self, name, bins, hist_type=Hist, **kwargs): len(names), ', '.join(names))) def _create_hist_names(self, name): - for i, (lowerEdge, upperEdge) in enumerate(pairwise(self._innerBins)): + for lowerEdge, upperEdge in pairwise(self._innerBins): yield f"{name}_{self._innerLabel}{lowerEdge}To{upperEdge}" def get_hist_name(self, name, innerIndex): From 51d44d3be7663276e1bbeb9bbae29448a11a6e33 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 19 Jul 2019 10:34:23 +0100 Subject: [PATCH 24/30] fixed pep8 error in test_vectorized --- test/collections/test_vectorized.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index b04445e7516..3005c9141d0 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -111,6 +111,7 @@ def test_extend(): innerValues = extend(innerValues, outerValues.starts, outerValues.stops) assert len(innerValues) == len(outerValues.content) + def test_split_input(): innerValues = [1, 12, 1, 50] outerValues = awkward.fromiter([ From 838701e0518196fa7245f0958016c98ee0e02a99 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 11:00:51 +0100 Subject: [PATCH 25/30] removed cmsl1t.PY3 variable --- cmsl1t/__init__.py | 3 --- cmsl1t/collections/vectorized.py | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cmsl1t/__init__.py b/cmsl1t/__init__.py index ee760307d2c..3cd437f6817 100644 --- a/cmsl1t/__init__.py +++ b/cmsl1t/__init__.py @@ -2,7 +2,6 @@ import logging import os from os import path -import sys __version__ = '0.5.1' @@ -30,5 +29,3 @@ PROJECT_ROOT = path.abspath(path.join(HERE, path.pardir)) else: PROJECT_ROOT = os.environ['PROJECT_ROOT'] - -PY3 = sys.version_info[0] == 3 diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index d337c52f7c4..1318ec1af1e 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -7,7 +7,6 @@ from . import BaseHistCollection from ..utils.iterators import pairwise -from .. import PY3 from ..io import to_root logger = logging.getLogger(__name__) @@ -39,10 +38,7 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): dimensions = kwargs.pop('dimensions', 2) self._name = kwargs.pop('name', str(hex(random.getrandbits(128)))[2:10]) self._execute_before_write = kwargs.pop('execute_before_write', []) - if PY3: - super(VectorizedHistCollection, self).__init__(dimensions) - else: - BaseHistCollection.__init__(self, dimensions) + super(VectorizedHistCollection, self).__init__(dimensions) self._innerBins = innerBins self._innerLabel = innerLabel From 841cea0a92030061e453926892f23954696523e0 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 11:03:31 +0100 Subject: [PATCH 26/30] VectorizedHistCollection: count tuple as a valid iterable --- cmsl1t/collections/vectorized.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 1318ec1af1e..b145ddc7e89 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -25,8 +25,8 @@ def split_input(inner_indices, x, w): for u in np.unique(inner_indices): mask = inner_indices == u - if not isinstance(mask, (list, np.ndarray)): - mask = np.array([mask]) + if not isinstance(mask, (tuple, list, np.ndarray, np.generic)): + mask = np.array(mask) yield u, content[mask], w[mask] @@ -45,8 +45,8 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): self._innerHist = Hist(100, 0, 100, name=innerLabel + '_' + self._name) def __getitem__(self, key): - if not isinstance(key, (list, np.ndarray, np.generic)): - key = np.array([key]) + if not isinstance(key, (tuple, list, np.ndarray, np.generic)): + key = np.array(key) real_keys = self._get_inner_indices(key) return VectorizedBinProxy(self, real_keys) # return [defaultdict.__getitem__(self, k) for k in real_keys.tolist()] @@ -144,8 +144,8 @@ def _get_hist(self, inner_index): return defaultdict.__getitem__(self._bin_proxy.collection, inner_index)[hist_name] def fill(self, x, w=None): - if not isinstance(x, (list, np.ndarray, awkward.JaggedArray)): - x = np.array([x]) + if not isinstance(x, (tuple, list, np.ndarray, awkward.JaggedArray)): + x = np.array(x) if w is None: n = np.size(x.content) if hasattr(x, 'content') else np.size(x) w = np.ones(n) From da13c487238a0bbfad9c20c5a0846571591230c2 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 11:20:39 +0100 Subject: [PATCH 27/30] VectorizedHistCollection.add --> VectorizedHistCollection.insert --- cmsl1t/analyzers/demo_analyzer.py | 2 +- cmsl1t/collections/vectorized.py | 2 +- test/collections/test_vectorized.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmsl1t/analyzers/demo_analyzer.py b/cmsl1t/analyzers/demo_analyzer.py index 1a0a8a062e8..3af964fea69 100644 --- a/cmsl1t/analyzers/demo_analyzer.py +++ b/cmsl1t/analyzers/demo_analyzer.py @@ -37,7 +37,7 @@ def prepare_for_events(self, reader): list(map(add_met_variable, self.met_calcs)) for met, config in self.met_calcs.items(): - self.hists.add(met, bins=bins, title=config['title']) + self.hists.insert(met, bins=bins, title=config['title']) return True def reload_histograms(self, input_file): diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index b145ddc7e89..ce1ee07675f 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -63,7 +63,7 @@ def _get_inner_indices(self, values): ''' return np.digitize(values, self._innerBins) - def add(self, name, bins, hist_type=Hist, **kwargs): + def insert(self, name, bins, hist_type=Hist, **kwargs): title = kwargs.pop('title', name) bins = np.asarray(bins) if bins.size == 0: diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 3005c9141d0..2ca73e169fc 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -27,12 +27,12 @@ def test_inner_index(collection, values, expected): def test_add(collection): assert len(collection) == 0 - collection.add('test', bins=[35, 90, 120]) + collection.insert('test', bins=[35, 90, 120]) assert len(collection) == len(collection._innerBins) - 1 def test_access(collection): - collection.add('test', bins=[35, 90, 120]) + collection.insert('test', bins=[35, 90, 120]) innerValues = [1, 12, 1, 50] assert collection[innerValues] == collection[1] + collection[12] + collection[1] + collection[50] # assert type(collection[innerValues]) == Hist @@ -92,7 +92,7 @@ def test_fill(collection): ] hist_name = 'test' - collection.add(hist_name, bins=[35, 90, 120]) + collection.insert(hist_name, bins=[35, 90, 120]) weights = np.ones(len(outerValues.content)) collection[innerValues][hist_name].fill(outerValues, weights) for i in range(len(np.unique(innerValues))): From c1c9a7c3c9f685f2b1393f1b517155336c99061c Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 14:31:03 +0100 Subject: [PATCH 28/30] VectorizedHistCollection: replaced defaultdict with super() calls & using innerBins for innerHist --- cmsl1t/collections/vectorized.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index ce1ee07675f..7a776db00cb 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -42,7 +42,7 @@ def __init__(self, innerBins, innerLabel='inner', **kwargs): self._innerBins = innerBins self._innerLabel = innerLabel - self._innerHist = Hist(100, 0, 100, name=innerLabel + '_' + self._name) + self._innerHist = Hist(innerBins, name=innerLabel + '_' + self._name) def __getitem__(self, key): if not isinstance(key, (tuple, list, np.ndarray, np.generic)): @@ -70,16 +70,17 @@ def insert(self, name, bins, hist_type=Hist, **kwargs): logger.error( 'No bins specified for histogram {0}'.format(name)) - if name in defaultdict.__getitem__(self, 1): + if name in super(VectorizedHistCollection, self).__getitem__(1): logger.warning('Histogram {0} already exists!'.format(name)) return names = [] add_name = names.append for i, hist_name in enumerate(self._create_hist_names(name)): - if i + 1 not in self or hist_name not in defaultdict.__getitem__(self, i + 1): + __current_slice = super(VectorizedHistCollection, self).__getitem__(i + 1) + if i + 1 not in self or hist_name not in __current_slice: add_name(hist_name) - defaultdict.__getitem__(self, i + 1)[hist_name] = hist_type(bins, name=hist_name, title=title) + __current_slice[hist_name] = hist_type(bins, name=hist_name, title=title) logger.debug('Created {0} histograms: {1}'.format( len(names), ', '.join(names))) From 23147d36ee1f14814f4d15eb53da3c21ef02ed6e Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 15:17:08 +0100 Subject: [PATCH 29/30] added tests for different types of weights for VectorizedHistCollection --- test/collections/test_vectorized.py | 62 +++++++++++++++++++---------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/test/collections/test_vectorized.py b/test/collections/test_vectorized.py index 2ca73e169fc..13e6c17260a 100644 --- a/test/collections/test_vectorized.py +++ b/test/collections/test_vectorized.py @@ -7,13 +7,43 @@ @pytest.fixture -def collection(): - innerBins = np.array([0, 10, 15, 20, 30, 999]) - coll = VectorizedHistCollection(innerBins) - # fill for [35, 90, 120] +def scalarBins(): + return [0, 10, 15, 20, 30, 999] + + +@pytest.fixture +def collection(scalarBins): + coll = VectorizedHistCollection(scalarBins) return coll +@pytest.fixture +def scalarDistribution(): + return [1, 12, 1, 50] + + +@pytest.fixture +def vectorDistribution(): + return awkward.fromiter([ + [60, 50, 40, 30, 20], + [32, 23], + [56, 34, 31], + [], + ]) + + +@pytest.fixture(params=['event_weights', 'vector_weights', 'flat_vector_weights']) +def weights(vectorDistribution, request): + if request.param == 'event_weights': + return np.ones(np.size(vectorDistribution)) + if request.param == 'vector_weights': + return awkward.JaggedArray.fromoffsets( + (vectorDistribution.starts, vectorDistribution.stops), + np.ones(np.size(vectorDistribution.content)) + ) + return np.ones(np.size(vectorDistribution.content)) + + @pytest.mark.parametrize( "values,expected", [ @@ -33,10 +63,10 @@ def test_add(collection): def test_access(collection): collection.insert('test', bins=[35, 90, 120]) - innerValues = [1, 12, 1, 50] - assert collection[innerValues] == collection[1] + collection[12] + collection[1] + collection[50] + values = [1, 12, 1, 50] + assert collection[values] == collection[1] + collection[12] + collection[1] + collection[50] # assert type(collection[innerValues]) == Hist - assert type(collection[innerValues]['test']) == VectorizedHistProxy + assert type(collection[values]['test']) == VectorizedHistProxy # def test_copy(collection): @@ -77,26 +107,18 @@ def test_split(bins, x, expected): assert chunk.tolist() == exp.tolist() -def test_fill(collection): - innerValues = [1, 12, 1, 50] - outerValues = awkward.fromiter([ - [60, 50, 40, 30, 20], - [32, 23], - [56, 34, 31], - [], - ]) +def test_fill(collection, scalarDistribution, vectorDistribution, weights): expected = [ [4.0, 4.0, 0.0, 0.0], [2.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], ] - hist_name = 'test' collection.insert(hist_name, bins=[35, 90, 120]) - weights = np.ones(len(outerValues.content)) - collection[innerValues][hist_name].fill(outerValues, weights) - for i in range(len(np.unique(innerValues))): - hist = collection[innerValues][hist_name]._get_hist(i + 1) + # event_weights = np.ones(np.size(vectorDistribution.content)) + collection[scalarDistribution][hist_name].fill(vectorDistribution, weights) + for i in range(len(np.unique(scalarDistribution))): + hist = collection[scalarDistribution][hist_name]._get_hist(i + 1) assert list(hist.y(overflow=True)) == expected[i] From cb28ed4a4a20d2078b3b0f2bfd23e3f493959b17 Mon Sep 17 00:00:00 2001 From: kreczko Date: Fri, 2 Aug 2019 15:21:45 +0100 Subject: [PATCH 30/30] extended VectorizedHistProxy to allow event weights and per-object weigths --- cmsl1t/collections/vectorized.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmsl1t/collections/vectorized.py b/cmsl1t/collections/vectorized.py index 7a776db00cb..31fea7e29f8 100644 --- a/cmsl1t/collections/vectorized.py +++ b/cmsl1t/collections/vectorized.py @@ -19,15 +19,21 @@ def extend(arr1, starts, stops): def split_input(inner_indices, x, w): content = x + weights = w if hasattr(x, 'starts'): inner_indices = extend(inner_indices, x.starts, x.stops) content = x.content + if hasattr(w, 'starts'): + weights = w.content + + if np.size(weights) < np.size(content) and hasattr(x, 'starts'): + weights = extend(weights, x.starts, x.stops) for u in np.unique(inner_indices): mask = inner_indices == u if not isinstance(mask, (tuple, list, np.ndarray, np.generic)): mask = np.array(mask) - yield u, content[mask], w[mask] + yield u, content[mask], weights[mask] class VectorizedHistCollection(BaseHistCollection): @@ -147,6 +153,7 @@ def _get_hist(self, inner_index): def fill(self, x, w=None): if not isinstance(x, (tuple, list, np.ndarray, awkward.JaggedArray)): x = np.array(x) + if w is None: n = np.size(x.content) if hasattr(x, 'content') else np.size(x) w = np.ones(n)