From 37b54a81b0a344736c767507e9aac8522892dac5 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 5 Mar 2018 18:30:22 +0100 Subject: [PATCH 1/5] Start optimising get_matching_variables --- root_pandas/readwrite.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 5fc2083..c710293 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -7,7 +7,7 @@ from numpy.lib.recfunctions import append_fields from pandas import DataFrame, RangeIndex from root_numpy import root2array, list_trees -from fnmatch import fnmatch +import fnmatch from root_numpy import list_branches from root_numpy.extern.six import string_types import itertools @@ -59,17 +59,23 @@ def get_nonscalar_columns(array): def get_matching_variables(branches, patterns, fail=True): + # Convert branches to a set to make x "in branches" O(1) on average + branches = set(branches) selected = [] - - for p in patterns: + for pattern in patterns: found = False - for b in branches: - if fnmatch(b, p): + # Avoid using fnmatch if the pattern can only match literally + if re.findall(r'(\*)|(\?)|(\[.*\])|(\[\!.*\])', pattern): + for match in fnmatch.filter(branches, pattern): found = True - if fnmatch(b, p) and b not in selected: - selected.append(b) + if match not in selected: + selected.append(match) + elif pattern in branches: + found = True + if pattern not in selected: + selected.append(pattern) if not found and fail: - raise ValueError("Pattern '{}' didn't match any branch".format(p)) + raise ValueError("Pattern '{}' didn't match any branch".format(pattern)) return selected From cde75c81ab117c6c5415079b9980944048902bd2 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 5 Mar 2018 18:44:34 +0100 Subject: [PATCH 2/5] Further optimise get_matching_variables --- root_pandas/readwrite.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index c710293..5e5c7ae 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -61,10 +61,13 @@ def get_nonscalar_columns(array): def get_matching_variables(branches, patterns, fail=True): # Convert branches to a set to make x "in branches" O(1) on average branches = set(branches) - selected = [] - for pattern in patterns: + patterns = set(patterns) + # Find any trivial matches + selected = list(branches.intersection(patterns)) + # Any matches that weren't trivial need to be looped over... + for pattern in patterns.difference(selected): found = False - # Avoid using fnmatch if the pattern can only match literally + # Avoid using fnmatch if the pattern if possible if re.findall(r'(\*)|(\?)|(\[.*\])|(\[\!.*\])', pattern): for match in fnmatch.filter(branches, pattern): found = True From 568987879b31c6a949db6bd56f64f2bf1ebcbfd9 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 5 Mar 2018 18:47:42 +0100 Subject: [PATCH 3/5] Add regression test for #59 --- tests/test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test.py b/tests/test.py index 4b3b99f..8a05d93 100644 --- a/tests/test.py +++ b/tests/test.py @@ -230,6 +230,18 @@ def test_nonscalar_columns(): os.remove(path) +def test_get_matching_variables_performance(): + """Performance regression test for #59""" + import random + import string + import root_pandas.readwrite + for n in [10, 100, 1000, 10000, 100000]: + branches = [' '.join(random.choices(string.ascii_letters, k=100)) for i in range(n)] + patterns = [' '.join(random.choices(string.ascii_letters, k=100)) for i in range(n)] + root_pandas.readwrite.get_matching_variables(branches, patterns, fail=False) + root_pandas.readwrite.get_matching_variables(branches, branches, fail=False) + + def test_noexpand_prefix(): xs = np.array([1, 2, 3]) df = pd.DataFrame({'x': xs}) From 2b54a53c00823942298783f5ffc1fd93aa3eda96 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 5 Mar 2018 19:02:51 +0100 Subject: [PATCH 4/5] Use random.sample as random.choices is too new --- tests/test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 8a05d93..792f672 100644 --- a/tests/test.py +++ b/tests/test.py @@ -235,9 +235,9 @@ def test_get_matching_variables_performance(): import random import string import root_pandas.readwrite - for n in [10, 100, 1000, 10000, 100000]: - branches = [' '.join(random.choices(string.ascii_letters, k=100)) for i in range(n)] - patterns = [' '.join(random.choices(string.ascii_letters, k=100)) for i in range(n)] + for n in [10, 100, 1000, 10000]: + branches = [' '.join(random.sample(string.ascii_letters*100, k=100)) for i in range(n)] + patterns = [' '.join(random.sample(string.ascii_letters*100, k=100)) for i in range(n)] root_pandas.readwrite.get_matching_variables(branches, patterns, fail=False) root_pandas.readwrite.get_matching_variables(branches, branches, fail=False) From fec37ddeb8ebf0ee284ceadef2645c94eb7be752 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 5 Mar 2018 19:19:30 +0100 Subject: [PATCH 5/5] Improve coverage of get_matching_variables_performance --- root_pandas/readwrite.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 5e5c7ae..dc7873f 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -74,9 +74,7 @@ def get_matching_variables(branches, patterns, fail=True): if match not in selected: selected.append(match) elif pattern in branches: - found = True - if pattern not in selected: - selected.append(pattern) + raise NotImplementedError('I think this is impossible?') if not found and fail: raise ValueError("Pattern '{}' didn't match any branch".format(pattern)) return selected