diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 5d4bf0e..6e51655 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -6,6 +6,9 @@ from __future__ import annotations
 
 from odapt._version import __version__
 
-from odapt.operations import hadd  # noqa: F401
+from odapt.operations import (
+    hadd,  # noqa: F401
+    merge,  # noqa: F401
+)
 
 __all__ = ["__version__"]
diff --git a/src/odapt/operations/__init__.py b/src/odapt/operations/__init__.py
index d477324..bc23cd6 100644
--- a/src/odapt/operations/__init__.py
+++ b/src/odapt/operations/__init__.py
@@ -1,3 +1,4 @@
 from __future__ import annotations
 
-from odapt.operations.hadd import hadd, main  # noqa: F401
+from odapt.operations.hadd import hadd_1d, hadd_2d, hadd_3d  # noqa: F401
+from odapt.operations.merge import hadd_and_merge  # noqa: F401
diff --git a/src/odapt/operations/hadd.py b/src/odapt/operations/hadd.py
index 2551e00..8d345e3 100644
--- a/src/odapt/operations/hadd.py
+++ b/src/odapt/operations/hadd.py
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-import argparse
-from pathlib import Path
-
 import numpy as np
 import uproot
 
@@ -38,7 +35,14 @@ def hadd_1d(destination, file, key, first, *, n_key=None):
             hist.values(flow=True),
             *member_data,
             hist.variances(flow=True),
-            hist.member("fXaxis"),
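+            # Build a fresh TAxis for the output instead of reusing the input
+            # histogram's axis object; passing fXbins carries over the bin
+            # edges (including flow), so variable-width binning is preserved.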
+            uproot.writing.identify.to_TAxis(
+                "fXaxis",
+                "",
+                hist.member("fN"),
+                hist.axis(axis="x").low,
+                hist.axis(axis="x").high,
+                fXbins=hist.member("fXaxis").edges(flow=True),
+            ),
         )
         if hist.member("fN") == outfile[key].member("fN"):
             member_data = np.array(
@@ -67,7 +71,14 @@ def hadd_1d(destination, file, key, first, *, n_key=None):
                     member_data,
                 ),
                 outfile[key].variances(flow=True) + hist.variances(flow=True),
-                hist.member("fXaxis"),
+                uproot.writing.identify.to_TAxis(
+                    "fXaxis",
+                    "",
+                    hist.member("fN"),
+                    hist.axis(axis="x").low,
+                    hist.axis(axis="x").high,
+                    fXbins=hist.member("fXaxis").edges(flow=True),
+                ),
             )
     outfile.close()
     return h_sum
@@ -115,8 +126,21 @@ def hadd_2d(destination, file, key, first, *, n_key=None):
             np.ravel(hist.values(flow=True), order="C"),
             *member_data,
             np.ravel(hist.variances(flow=True), order="C"),
-            hist.member("fXaxis"),
-            hist.member("fYaxis"),
+            uproot.writing.identify.to_TAxis(
+                "fXaxis",
+                "",
+                hist.member("fXaxis").member("fNbins"),
+                hist.axis(axis="x").low,
+                hist.axis(axis="x").high,
+                fXbins=hist.member("fXaxis").edges(flow=True),
+            ),
+            uproot.writing.identify.to_TAxis(
+                "fYaxis",
+                "",
+                hist.member("fYaxis").member("fNbins"),
+                hist.axis(axis="y").low,
+                hist.axis(axis="y").high,
+            ),
         )
         if hist.member("fN") == outfile[key].member("fN"):
             member_data = np.array(
@@ -131,13 +155,11 @@ def hadd_2d(destination, file, key, first, *, n_key=None):
                     hist.member("fTsumwxy"),
                 ]
             )
-
             h_sum = uproot.writing.identify.to_TH2x(
                 hist.member("fName"),
                 hist.member("fTitle"),
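+                # Sum the two flattened buffers instead of ravelling the sum:
+                # the result is explicitly 1-D and C-ordered, the layout that
+                # to_TH2x expects for its bin-content argument.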
-                np.ravel(
-                    outfile[key].values(flow=True) + hist.values(flow=True), order="C"
-                ),
+                np.ravel(outfile[key].values(flow=True), order="C")
+                + np.ravel(hist.values(flow=True), order="C"),
                 *np.add(
                     np.array(
                         [
@@ -156,8 +178,21 @@ def hadd_2d(destination, file, key, first, *, n_key=None):
                 np.ravel(
                     outfile[key].variances(flow=True) + hist.variances(flow=True), order="C"
                 ),
-                hist.member("fXaxis"),
-                hist.member("fYaxis"),
+                uproot.writing.identify.to_TAxis(
+                    "fXaxis",
+                    "",
+                    hist.member("fXaxis").member("fNbins"),
+                    hist.axis(axis="x").low,
+                    hist.axis(axis="x").high,
+                    fXbins=hist.member("fXaxis").edges(flow=True),
+                ),
+                uproot.writing.identify.to_TAxis(
+                    "fYaxis",
+                    "",
+                    hist.member("fYaxis").member("fNbins"),
+                    hist.axis(axis="y").low,
+                    hist.axis(axis="y").high,
+                ),
             )
     outfile.close()
     return h_sum
@@ -209,9 +244,28 @@ def hadd_3d(destination, file, key, first, *, n_key=None):
             np.ravel(hist.values(flow=True), order="C"),
             *member_data,
             np.ravel(hist.variances(flow=True), order="C"),
-            hist.member("fXaxis"),
-            hist.member("fYaxis"),
-            hist.member("fZaxis"),
+            uproot.writing.identify.to_TAxis(
+                "fXaxis",
+                "",
+                hist.member("fXaxis").member("fNbins"),
+                hist.axis(axis="x").low,
+                hist.axis(axis="x").high,
+                fXbins=hist.member("fXaxis").edges(flow=True),
+            ),
+            uproot.writing.identify.to_TAxis(
+                "fYaxis",
+                "",
+                hist.member("fYaxis").member("fNbins"),
+                hist.axis(axis="y").low,
+                hist.axis(axis="y").high,
+            ),
+            uproot.writing.identify.to_TAxis(
+                "fZaxis",
+                "",
+                hist.member("fZaxis").member("fNbins"),
+                hist.axis(axis="z").low,
+                hist.axis(axis="z").high,
+            ),
         )
         if hist.member("fN") == outfile[key].member("fN"):
             member_data = np.add(
@@ -255,13 +309,35 @@ def hadd_3d(destination, file, key, first, *, n_key=None):
                     outfile[key].values(flow=True) + hist.values(flow=True), order="C"
                 ),
                 *member_data,
-                np.ravel(
-                    (outfile[key].variances(flow=True) + hist.variances(flow=True)),
-                    order="C",
+                (
+                    np.ravel(outfile[key].variances(flow=True), order="C")
+                    + np.ravel(
+                        hist.variances(flow=True),
+                        order="C",
+                    )
+                ),
+                uproot.writing.identify.to_TAxis(
+                    "fXaxis",
+                    "",
+                    hist.member("fXaxis").member("fNbins"),
+                    hist.axis(axis="x").low,
+                    hist.axis(axis="x").high,
+                    fXbins=hist.member("fXaxis").edges(flow=True),
+                ),
+                uproot.writing.identify.to_TAxis(
+                    "fYaxis",
+                    "",
+                    hist.member("fYaxis").member("fNbins"),
+                    hist.axis(axis="y").low,
+                    hist.axis(axis="y").high,
+                ),
+                uproot.writing.identify.to_TAxis(
+                    "fZaxis",
+                    "",
+                    hist.member("fZaxis").member("fNbins"),
+                    hist.axis(axis="z").low,
+                    hist.axis(axis="z").high,
                 ),
-                hist.member("fXaxis"),
-                hist.member("fYaxis"),
-                hist.member("fZaxis"),
             )
     outfile.close()
     return h_sum
@@ -273,227 +349,3 @@ def hadd_3d(destination, file, key, first, *, n_key=None):
             " and ",
             outfile[key].member("fN"),
         ) from None
-
-
-def hadd(
-    destination,
-    files,
-    *,
-    force=True,
-    append=False,
-    compression="lz4",
-    compression_level=1,
-    skip_bad_files=False,
-    union=True,
-    same_names=False,
-):
-    """
-    Args:
-        destination (path-like): Name of the output file or file path.
-        files (Str or list of str): List of local ROOT files to read histograms from.
-            May contain glob patterns.
-        force (bool): If True, overwrites destination file if it exists. Force and append
-            cannot both be True.
-        append (bool): If True, appends histograms to an existing file. Force and append
-            cannot both be True.
-        compression (str): Sets compression level for root file to write to. Can be one of
-            "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".
-        compression_level (int): Use a compression level particular to the chosen compressor.
-            By default the compression level is 1.
-        skip_bad_files (bool): If True, skips corrupt or non-existent files without exiting.
-        max_opened_files (int): Limits the number of files to be open at the same time. If 0,
-            this gets set to system limit.
-        union (bool): If True, adds the histograms that have the same name and copies all others
-            to the new file.
-        same_names (bool): If True, only adds together histograms which have the same name (key). If False,
-            histograms are added together based on TTree structure (bins must be equal).
-
-    Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to
-    a new or existing ROOT file.
-
-    >>> odapt.add_histograms("destination.root", ["file1_to_hadd.root", "file2_to_hadd.root"])
-
-    """
-    if compression in ("ZLIB", "zlib"):
-        compression_code = uproot.const.kZLIB
-    elif compression in ("LZMA", "lzma"):
-        compression_code = uproot.const.kLZMA
-    elif compression in ("LZ4", "lz4"):
-        compression_code = uproot.const.kLZ4
-    elif compression in ("ZSTD", "zstd"):
-        compression_code = uproot.const.kZSTD
-    else:
-        msg = f"unrecognized compression algorithm: {compression}. Only ZLIB, LZMA, LZ4, and ZSTD are accepted."
-        raise ValueError(msg)
-    p = Path(destination)
-    if Path.is_file(p):
-        if not force and not append:
-            raise FileExistsError
-        if force and append:
-            msg = "Cannot append to a new file. Either force or append can be true."
-            raise ValueError(msg)
-        file_out = uproot.recreate(
-            destination,
-            compression=uproot.compression.Compression.from_code_pair(
-                compression_code, compression_level
-            ),
-        )
-    else:
-        if append:
-            raise FileNotFoundError(
-                "File %s" + destination + " not found. File must exist to append."
-            )
-        file_out = uproot.recreate(
-            destination,
-            compression=uproot.compression.Compression.from_code_pair(
-                compression_code, compression_level
-            ),
-        )
-
-    if not isinstance(files, list):
-        path = Path(files)
-        files = sorted(path.glob("**/*.root"))
-
-    if len(files) <= 1:
-        msg = "Cannot hadd one file. Use root_to_root to copy a ROOT file."
-        raise ValueError(msg) from None
-
-    with uproot.open(files[0]) as file:
-        keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False)
-        if same_names:
-            if union:
-                for i, _value in enumerate(files[1:]):
-                    with uproot.open(files[i]) as file:
-                        keys = np.union1d(
-                            keys,
-                            file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False),
-                        )
-            else:
-                for i, _value in enumerate(files[1:]):
-                    with uproot.open(files[i]) as file:
-                        keys = np.intersect1d(
-                            keys,
-                            file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False),
-                        )
-        else:
-            keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False)
-
-    first = True
-    for input_file in files:
-        p = Path(input_file)
-        if Path.is_file(p):
-            file_out = uproot.update(destination)
-        else:
-            file_out = uproot.recreate(
-                destination,
-                compression=uproot.compression.Compression.from_code_pair(
-                    compression_code, compression_level
-                ),
-            )
-
-        try:
-            file = uproot.open(input_file)
-        except FileNotFoundError:
-            if skip_bad_files:
-                continue
-            msg = "File: {input_file} does not exist or is corrupt."
-            raise FileNotFoundError(msg) from None
-        if same_names:
-            for key in keys:
-                try:
-                    file[key]
-                except ValueError:
-                    if not union:
-                        continue
-                    msg = "Union key filter error."
-                    raise ValueError(msg) from None
-                if len(file[key].axes) == 1:
-                    h_sum = hadd_1d(destination, file, key, first)
-
-                elif len(file[key].axes) == 2:
-                    h_sum = hadd_2d(destination, file, key, first)
-
-                else:
-                    h_sum = hadd_3d(destination, file, key, first)
-
-        else:
-            n_keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False)
-            for i, _value in enumerate(keys):
-                if len(file[n_keys[i]].axes) == 1:
-                    h_sum = hadd_1d(destination, file, keys[i], first, n_key=n_keys[i])
-
-                elif len(file[n_keys[i]].axes) == 2:
-                    h_sum = hadd_2d(destination, file, keys[i], first, n_key=n_keys[i])
-
-                else:
-                    h_sum = hadd_3d(destination, file, keys[i], first, n_key=n_keys[i])
-
-            if h_sum is not None:
-                file_out[keys[i]] = h_sum
-
-        first = False
-        file.close()
-
-
-def main():
-    """
-    Implementation of cmd-line executables.
-    """
-    argparser = argparse.ArgumentParser(description="Hadd ROOT histograms with Uproot")
-    argparser.add_argument("destination", type=str, help="path of output file")
-    argparser.add_argument(
-        "input_files",
-        type=str,
-        nargs="+",
-        help="list or directory (glob syntax accepted) of input files",
-    )
-    argparser.add_argument(
-        "-f",
-        "--force",
-        action="store_true",
-        default=True,
-        help="force overwrite of output file",
-    )
-    argparser.add_argument(
-        "-a", "--append", action="store", default=False, help="append to existing file"
-    )
-    argparser.add_argument(
-        "-c",
-        "--compression",
-        action="store",
-        default="lz4",
-        help="set compression level between 1-9",
-    )
-    argparser.add_argument(
-        "-c[0-9]",
-        "--compression_level",
-        action="store",
-        default=1,
-        help="set compression level between 1-9",
-    )
-    argparser.add_argument(
-        "-k",
-        "--skip_bad_files",
-        action="store",
-        default=False,
-        help="corrupt or non-existent input files are ignored",
-    )
-    argparser.add_argument(
-        "-u",
-        action="union",
-        default=True,
-        help="all histograms get copied to new file, only those with same name get added",
-    )
-
-    args = argparser.parse_args()
-
-    hadd(
-        args.destination,
-        args.input_file,
-        force=args.force,
-        append=args.append,
-        compression=args.compression,
-        compression_level=args.compression_level,
-        skip_bad_files=args.skip_bad_files,
-        union=args.union,
-    )
diff --git a/src/odapt/operations/merge.py b/src/odapt/operations/merge.py
new file mode 100644
index 0000000..e09261b
--- /dev/null
+++ b/src/odapt/operations/merge.py
@@ -0,0 +1,315 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import awkward as ak
+import uproot
+
+from odapt.operations.hadd import hadd_1d, hadd_2d, hadd_3d
+
+
+def hadd_and_merge(
+    destination,
+    files,
+    *,
+    fieldname_separator="_",
+    branch_types=None,
+    title="",
+    field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner,
+    initial_basket_capacity=10,
+    resize_factor=10.0,
+    counter_name=lambda counted: "n" + counted,
+    step_size="100 MB",
+    force=True,
+    append=False,
+    compression="LZ4",
+    compression_level=1,
+    skip_bad_files=False,
+):
+    """
+    Args:
+        destination (path-like): Name of the output file or file path.
+        files (str or list of str): Local ROOT files to read histograms and TTrees from.
+            May contain glob patterns.
+        fieldname_separator (str): Character that separates a record-style branch name
+            from its field names; branches sharing the prefix are zipped back together.
+        branch_types (dict or pairs of str → NumPy dtype/Awkward type): Name and type specification for the TBranches.
+        title (str): Title of the output TTree.
+        field_name (callable of str → str): Function to generate TBranch names for columns
+            of an Awkward record array or a Pandas DataFrame.
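+        counter_name (callable of str → str): Function to generate names for the counter
+            branches that are written alongside jagged branches.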
+        initial_basket_capacity (int): Number of TBaskets that can be written to the TTree
+            without rewriting the TTree metadata to make room.
+        resize_factor (float): When the TTree metadata needs to be rewritten, this specifies how
+            many more TBasket slots to allocate as a multiplicative factor.
+        step_size (int or str): If an integer, the maximum number of entries to include in each
+            iteration step; if a string, the maximum memory size to include. The string must be
+            a number followed by a memory unit, such as "100 MB". Recommended to be >100 kB.
+        force (bool): If True, overwrites destination file if it exists. Force and append
+            cannot both be True.
+        append (bool): If True, appends histograms to an existing file. Force and append
+            cannot both be True.
+        compression (str): Sets compression level for root file to write to. Can be one of
+            "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".
+        compression_level (int): Use a compression level particular to the chosen compressor.
+            By default the compression level is 1.
+        skip_bad_files (bool): If True, skips corrupt or non-existent files without exiting.
+
+    Merges TTrees and adds the values in histograms from local ROOT files, then writes
+    them to a new ROOT file.
+
+    >>> odapt.hadd_and_merge("destination.root", ["file1_to_hadd.root", "file2_to_hadd.root"])
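+
+    A second, purely illustrative call (the file names are placeholders) that bounds
+    the memory used per chunk and renames the regenerated counter branches:
+
+    >>> odapt.hadd_and_merge(
+    ...     "destination.root",
+    ...     ["file1_to_hadd.root", "file2_to_hadd.root"],
+    ...     step_size="50 MB",
+    ...     counter_name=lambda counted: "N" + counted,
+    ... )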
+
+    """
+    # Translate the user-facing compression name into uproot's integer code;
+    # upper- and lower-case spellings are accepted.
+    if compression in ("LZMA", "lzma"):
+        compression_code = uproot.const.kLZMA
+    elif compression in ("ZLIB", "zlib"):
+        compression_code = uproot.const.kZLIB
+    elif compression in ("LZ4", "lz4"):
+        compression_code = uproot.const.kLZ4
+    elif compression in ("ZSTD", "zstd"):
+        compression_code = uproot.const.kZSTD
+    else:
+        msg = f"unrecognized compression algorithm: {compression}. Only ZLIB, LZMA, LZ4, and ZSTD are accepted."
+        raise ValueError(msg)
+    path = Path(destination)
+    if Path.is_file(path):
+        if not force and not append:
+            msg = f"File {destination} already exists. Set force=True to overwrite it."
+            raise FileExistsError(msg)
+        if force and append:
+            msg = "force and append cannot both be True."
+            raise ValueError(msg)
+        if append:
+            out_file = uproot.update(
+                destination,
+                compression=uproot.compression.Compression.from_code_pair(
+                    compression_code, compression_level
+                ),
+            )
+            first = False
+        else:
+            out_file = uproot.recreate(
+                destination,
+                compression=uproot.compression.Compression.from_code_pair(
+                    compression_code, compression_level
+                ),
+            )
+            first = True
+    else:
+        if append:
+            msg = f"File {destination} not found. File must exist to append."
+            raise FileNotFoundError(msg)
+        out_file = uproot.recreate(
+            destination,
+            compression=uproot.compression.Compression.from_code_pair(
+                compression_code, compression_level
+            ),
+        )
+        first = True
+
+    if not isinstance(files, list):
+        path = Path(files)
+        files = sorted(path.glob("**/*.root"))
+
+    if len(files) <= 1:
+        msg = "Only one file was input. Use root_to_root to copy a ROOT file."
+        raise ValueError(msg) from None
+
+    try:
+        f = uproot.open(files[0])
+    except FileNotFoundError:
+        if skip_bad_files:
+            for file in files:
+                try:
+                    f = uproot.open(file)
+                    break
+                except FileNotFoundError:
+                    continue
+        else:
+            msg = f"File {files[0]} does not exist or is corrupt."
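+            raise FileNotFoundError(msg) from None
+    # Histograms at the top level of the first file seed the output; the same
+    # keys in later files are added onto them by the hadd_* helpers.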
+    hist_keys = f.keys(
+        filter_classname=["TH*", "TProfile"], cycle=False, recursive=False
+    )
+    for key in f.keys(cycle=False, recursive=False):
+        if key in hist_keys:
+            if len(f[key].axes) == 1:
+                h_sum = hadd_1d(destination, f, key, True)
+                out_file[key] = h_sum
+            elif len(f[key].axes) == 2:
+                out_file[key] = hadd_2d(destination, f, key, True)
+            else:
+                out_file[key] = hadd_3d(destination, f, key, True)
+
+    trees = f.keys(filter_classname="TTree", cycle=False, recursive=False)
+
+    for t in trees:
+        tree = f[t]
+        histograms = tree.keys(filter_typename=["TH*", "TProfile"], recursive=False)
+        # Group jagged branches that share the same counter (fLeafCount) so
+        # they can be zipped back into one record-style field; standalone
+        # counter branches are dropped and regenerated on write via counter_name.
+        groups = []
+        count_branches = []
+        temp_branches = [branch.name for branch in tree.branches]
+        temp_branches1 = [branch.name for branch in tree.branches]
+        cur_group = 0
+        for branch in temp_branches:
+            if len(tree[branch].member("fLeaves")) > 1:
+                msg = "Cannot handle split objects."
+                raise NotImplementedError(msg)
+            if tree[branch].member("fLeaves")[0].member("fLeafCount") is None:
+                continue
+            groups.append([])
+            groups[cur_group].append(branch)
+            for branch1 in temp_branches1:
+                if tree[branch].member("fLeaves")[0].member("fLeafCount") is tree[
+                    branch1
+                ].member("fLeaves")[0].member("fLeafCount") and (
+                    tree[branch].name != tree[branch1].name
+                ):
+                    groups[cur_group].append(branch1)
+                    temp_branches.remove(branch1)
+            count_branches.append(tree[branch].count_branch.name)
+            temp_branches.remove(tree[branch].count_branch.name)
+            temp_branches.remove(branch)
+            cur_group += 1
+
+        writable_hists = {}
+        for key in histograms:
+            if len(f[key].axes) == 1:
+                writable_hists[key] = hadd_1d(destination, f, key, True)
+
+            elif len(f[key].axes) == 2:
+                writable_hists[key] = hadd_2d(destination, f, key, True)
+
+            else:
+                writable_hists[key] = hadd_3d(destination, f, key, True)
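+
+        # Stream the tree in chunks bounded by step_size; the first chunk
+        # fixes branch_types and creates the output TTree, later chunks only
+        # extend it.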
+        first = True
+        for chunk in uproot.iterate(tree, step_size=step_size, how=dict):
+            for key in count_branches:
+                del chunk[key]
+            for group in groups:
+                if (len(group)) > 1:
+                    chunk.update(
+                        {
+                            group[0][0 : (group[0].index(fieldname_separator))]: ak.zip(
+                                {
+                                    name[
+                                        group[0].index(fieldname_separator) + 1 :
+                                    ]: array
+                                    for name, array in zip(
+                                        ak.fields(chunk), ak.unzip(chunk)
+                                    )
+                                    if name in group
+                                }
+                            )
+                        }
+                    )
+                    for key in group:
+                        del chunk[key]
+
+            if branch_types is None:
+                branch_types = {name: array.type for name, array in chunk.items()}
+
+            if first:
+                out_file.mktree(
+                    tree.name,
+                    branch_types,
+                    title=title,
+                    counter_name=counter_name,
+                    field_name=field_name,
+                    initial_basket_capacity=initial_basket_capacity,
+                    resize_factor=resize_factor,
+                )
+                try:
+                    out_file[tree.name].extend(chunk)
+                except AssertionError:
+                    msg = "TTrees must have the same structure to be merged"
+                    raise AssertionError(msg) from None
+                first = False
+
+            else:
+                try:
+                    out_file[tree.name].extend(chunk)
+                except AssertionError:
+                    msg = "TTrees must have the same structure to be merged"
+                    raise AssertionError(msg) from None
+
+        for key in histograms:
+            out_file[key] = writable_hists[key]
+
+    f.close()
+
+    for file in files[1:]:
+        try:
+            f = uproot.open(file)
+        except FileNotFoundError:
+            if skip_bad_files:
+                continue
+            msg = f"File {file} does not exist or is corrupt."
+            raise FileNotFoundError(msg) from None
+
+        for key in f.keys(cycle=False, recursive=False):
+            if key in hist_keys:
+                if len(f[key].axes) == 1:
+                    h_sum = hadd_1d(destination, f, key, False)
+                elif len(f[key].axes) == 2:
+                    h_sum = hadd_2d(destination, f, key, False)
+                else:
+                    h_sum = hadd_3d(destination, f, key, False)
+
+                out_file[key] = h_sum
+
+        # The grouping and histogram lists computed from the first file are
+        # reused here; the inputs are assumed to have identical TTree structure.
+        for t in trees:
+            tree = f[t]
+            writable_hists = {}
+            for key in histograms:
+                if len(f[key].axes) == 1:
+                    writable_hists[key] = hadd_1d(destination, f, key, False)
+
+                elif len(f[key].axes) == 2:
+                    writable_hists[key] = hadd_2d(destination, f, key, False)
+
+                else:
+                    writable_hists[key] = hadd_3d(destination, f, key, False)
+
+            for chunk in uproot.iterate(tree, step_size=step_size, how=dict):
+                for group in groups:
+                    if len(group) > 1:
+                        chunk.update(
+                            {
+                                group[0][
+                                    0 : (group[0].index(fieldname_separator))
+                                ]: ak.zip(
+                                    {
+                                        name[
+                                            group[0].index(fieldname_separator) + 1 :
+                                        ]: array
+                                        for name, array in zip(
+                                            ak.fields(chunk), ak.unzip(chunk)
+                                        )
+                                        if name in group
+                                    }
+                                )
+                            }
+                        )
+                        for key in group:
+                            del chunk[key]
+                for key in count_branches:
+                    del chunk[key]
+                try:
+                    out_file[tree.name].extend(chunk)
+
+                except AssertionError:
+                    msg = "TTrees must have the same structure to be merged"
+                    raise AssertionError(msg) from None
+
+            for key in histograms:
+                out_file[key] = writable_hists[key]
+
+        f.close()
diff --git a/tests/test_ttree_merge.py b/tests/test_ttree_merge.py
new file mode 100644
index 0000000..c854ae0
--- /dev/null
+++ b/tests/test_ttree_merge.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import awkward as ak
+import numpy as np
+import pytest
+import uproot
+
+from odapt import merge
+
+skhep_testdata = pytest.importorskip("skhep_testdata")
+
+
+def test_simple():
+    merge.hadd_and_merge(
+        "od_test_simple.root",
+        [
+            skhep_testdata.data_path("uproot-HZZ.root"),
+            skhep_testdata.data_path("uproot-HZZ.root"),
+        ],
+        counter_name=lambda counted: "N" + counted,
+    )
+    odapt_file = uproot.open("od_test_simple.root")
+    hadd_file = uproot.open(
+        "/Users/zobil/Documents/odapt/src/odapt/operations/HZZ-hadd.root"
+    )
+    assert ak.all(odapt_file.keys() == hadd_file.keys())
+    for key in odapt_file["events"]:
+        assert ak.all(
+            odapt_file["events"].arrays()[key] == hadd_file["events"].arrays()[key]
+        )
+
+
+def test_hists():
+    merge.hadd_and_merge(
+        "od_test_hists.root",
+        [
+            skhep_testdata.data_path("uproot-hepdata-example.root"),
+            skhep_testdata.data_path("uproot-hepdata-example.root"),
+        ],
+        step_size=100,
+        counter_name=lambda counted: "N" + counted,
+    )
+    odapt_file = uproot.open("od_test_hists.root")
+    hadd_file = uproot.open(
+        "/Users/zobil/Documents/odapt/src/odapt/operations/hadd-hepdata.root"
+    )
+
+    assert ak.all(odapt_file["hpx"].values() == hadd_file["hpx"].values())
+
+
+def test_force():
+    uproot.recreate("od_test_force.root")
+    with pytest.raises(FileExistsError) as excinfo:
+        merge.hadd_and_merge(
+            "od_test_force.root",
+            [
+                skhep_testdata.data_path("uproot-HZZ.root"),
+                skhep_testdata.data_path("uproot-HZZ.root"),
+            ],
+            force=False,
+        )
+    assert "already exists" in str(excinfo.value)
+    try:
+        merge.hadd_and_merge(
+            "od_test_force.root",
+            [
+                skhep_testdata.data_path("uproot-HZZ.root"),
+                skhep_testdata.data_path("uproot-HZZ.root"),
+            ],
+            force=True,
+        )
+    except FileExistsError:
+        pytest.fail("Error with force argument")
+
+
+def test_skip_bad_files():
+    merge.hadd_and_merge(
+        "od_test_skip_files.root",
+        [
+            skhep_testdata.data_path("uproot-HZZ.root"),
+            skhep_testdata.data_path("uproot-HZZ.root"),
+            "nonexistent_file.root",
+        ],
+        skip_bad_files=True,
+    )
+
+    with pytest.raises(FileNotFoundError) as excinfo:
+        merge.hadd_and_merge(
+            "od_test_skip_files.root",
+            [
+                skhep_testdata.data_path("uproot-HZZ.root"),
+                skhep_testdata.data_path("uproot-HZZ.root"),
+                "nonexistent_file.root",
+            ],
+            skip_bad_files=False,
+        )
+    assert "does not exist or is corrupt." in str(excinfo.value)
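+
+
+# Not collected by pytest (no "test_" prefix); it depends on a large local
+# reference file, so it is meant to be run by hand.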
+def realistic_data():
+    merge.hadd_and_merge(
+        "test_existing_file.root",
+        [
+            skhep_testdata.data_path("nanoAOD_2015_CMS_Open_Data_ttbar.root"),
+            skhep_testdata.data_path("nanoAOD_2015_CMS_Open_Data_ttbar.root"),
+            skhep_testdata.data_path("nanoAOD_2015_CMS_Open_Data_ttbar.root"),
+        ],
+        step_size="100 MB",
+    )
+
+    odapt_file = uproot.open("test_existing_file.root")
+    hadd_file = uproot.open(
+        "/Users/zobil/Documents/odapt/tests/samples/test_existing.root"
+    )
+    for key in hadd_file["Events"]:
+        assert np.equal(
+            odapt_file["Events"].arrays()[key].to_numpy(),
+            hadd_file["Events"].arrays()[key].to_numpy(),
+        ).all()