Skip to content

Commit

Permalink
proof of concept edm4hep schema, only does PFOs
Browse files Browse the repository at this point in the history
  • Loading branch information
lgray committed Jun 4, 2023
1 parent 3aaddc5 commit 09bb414
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/coffea/nanoevents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from coffea.nanoevents.schemas import (
BaseSchema,
DelphesSchema,
EDM4HEPSchema,
NanoAODSchema,
PDUNESchema,
PFNanoAODSchema,
Expand All @@ -21,4 +22,5 @@
"PHYSLITESchema",
"DelphesSchema",
"PDUNESchema",
"EDM4HEPSchema",
]
12 changes: 12 additions & 0 deletions src/coffea/nanoevents/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from coffea.nanoevents.schemas import (
BaseSchema,
DelphesSchema,
EDM4HEPSchema,
NanoAODSchema,
PHYSLITESchema,
TreeMakerSchema,
Expand Down Expand Up @@ -57,6 +58,11 @@ def _remove_not_interpretable(branch):
f"Skipping {branch.name} as it is it cannot be represented as an Awkward array"
)
return False
except uproot.interpretation.identify.UnknownInterpretation:
warnings.warn(
f"Skipping {branch.name} as it is it cannot be interpreted by Uproot"
)
return False
else:
return True

Expand Down Expand Up @@ -291,6 +297,12 @@ def from_root(
elif schemaclass is TreeMakerSchema:
from coffea.nanoevents.methods import base, vector

behavior = {}
behavior.update(base.behavior)
behavior.update(vector.behavior)
elif schemaclass is EDM4HEPSchema:
from coffea.nanoevents.methods import base, vector

behavior = {}
behavior.update(base.behavior)
behavior.update(vector.behavior)
Expand Down
2 changes: 2 additions & 0 deletions src/coffea/nanoevents/schemas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .base import BaseSchema
from .delphes import DelphesSchema
from .edm4hep import EDM4HEPSchema
from .nanoaod import NanoAODSchema, PFNanoAODSchema
from .pdune import PDUNESchema
from .physlite import PHYSLITESchema
Expand All @@ -13,4 +14,5 @@
"PHYSLITESchema",
"DelphesSchema",
"PDUNESchema",
"EDM4HEPSchema",
]
183 changes: 183 additions & 0 deletions src/coffea/nanoevents/schemas/edm4hep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import re

from coffea.nanoevents.schemas.base import BaseSchema, nest_jagged_forms, zip_forms

# Matches any branch name containing at least one "#" or "/" character.
# _build_collections uses it (via .match) to exclude such names when picking
# the top-level object names to treat as composite collections.
_base_collection = re.compile(r".*[\#\/]+.*")
# Matches any name containing an underscore immediately followed by a digit.
# NOTE(review): despite the identifier, the pattern has no end anchor, so the
# "_<digit>" does not have to be at the end of the name -- confirm intent.
_trailing_under = re.compile(r".*_[0-9]")


class EDM4HEPSchema(BaseSchema):
    """EDM4HEP schema builder (proof of concept).

    The schema is built from the branch forms found in the supplied file,
    keyed by branch name.  At present only the ``PandoraPFOs`` collection is
    restructured; every other branch form is handed back as it came in.

    For ``PandoraPFOs`` the split branches named
    ``PandoraPFOs/PandoraPFOs.<component>`` are zipped into one record stored
    under the key ``PandoraPFOs``:

    - If the components include ``energy`` and ``momentum.x/y/z``, the record
      has the fields ``x``, ``y``, ``z`` (from ``momentum.*``), ``t`` (from
      ``energy``), ``charge`` and ``pdgId`` (from the ``type`` branch), and is
      tagged with the ``LorentzVector`` behavior.
    - If the components are exactly ``fCoordinates.fX/fY/fZ``, the record has
      the fields ``x``, ``y``, ``z`` and is tagged with the ``ThreeVector``
      behavior.
    - Any other set of components raises ``ValueError``.

    A second pass, inherited from the ``<Object>_<variable>`` handling of
    other schemas, merges branches named ``<collection>_<variable>`` into
    ``<collection>`` and nests ``<collection>_<sub>Counts`` sub-collections.
    """

    __dask_capable__ = True

    # Components that must all be present for a split object to be treated as
    # a four-momentum.
    _momentum_fields = {"energy", "momentum.x", "momentum.y", "momentum.z"}

    def __init__(self, base_form, *args, **kwargs):
        """Build the schema and replace the base form's fields in place.

        Parameters
        ----------
        base_form
            Form description forwarded to ``BaseSchema.__init__``; after that
            call ``self._form`` is expected to carry parallel ``"fields"``
            and ``"contents"`` lists.
        *args, **kwargs
            Forwarded unchanged to ``BaseSchema.__init__``.
        """
        super().__init__(base_form, *args, **kwargs)
        # Pair each field name with its form so collections can be looked up
        # and popped by branch name.
        old_style_form = dict(zip(self._form["fields"], self._form["contents"]))
        output = self._build_collections(old_style_form)
        self._form["fields"] = list(output)
        self._form["contents"] = list(output.values())

    def _build_collections(self, branch_forms):
        """Group split branches into collections.

        Parameters
        ----------
        branch_forms : dict
            Mapping of branch name to form.  It is modified in place
            (consumed branches are popped, collections are inserted).

        Returns
        -------
        dict
            The same ``branch_forms`` mapping after restructuring.

        Raises
        ------
        ValueError
            If the split branches of a handled object are neither a
            four-momentum nor an ``fCoordinates`` triple.
        KeyError
            If a four-momentum object lacks its ``charge`` or ``type`` branch
            (those are popped without a prior presence check).
        NotImplementedError
            If extra variables are to be merged into an existing collection
            whose form class does not start with ``ListOffsetArray``.
        """
        # Top-level object names: no "#" or "/" and no "_<digit>" in the name.
        composite_objects = [
            k
            for k in branch_forms
            if not _base_collection.match(k) and not _trailing_under.match(k)
        ]

        composite_behavior = {  # Dictionary for overriding the default behavior
            "Tracks": "LorentzVector"
        }
        for objname in composite_objects:
            # Proof of concept: only particle-flow objects are handled so far.
            if objname != "PandoraPFOs":
                continue

            # Grab the * from "objname/objname.*": the prefix to strip is
            # "<objname>/<objname>.", i.e. 2 * len(objname) + 2 characters.
            prefix_len = 2 * len(objname) + 2
            components = {
                k[prefix_len:] for k in branch_forms if k.startswith(objname + "/")
            }

            if all(comp in components for comp in self._momentum_fields):
                form = zip_forms(
                    {
                        "x": branch_forms.pop(f"{objname}/{objname}.momentum.x"),
                        "y": branch_forms.pop(f"{objname}/{objname}.momentum.y"),
                        "z": branch_forms.pop(f"{objname}/{objname}.momentum.z"),
                        "t": branch_forms.pop(f"{objname}/{objname}.energy"),
                        "charge": branch_forms.pop(f"{objname}/{objname}.charge"),
                        "pdgId": branch_forms.pop(f"{objname}/{objname}.type"),
                    },
                    objname,
                    composite_behavior.get(objname, "LorentzVector"),
                )
                branch_forms[objname] = form
            elif components == {
                "fCoordinates.fX",
                "fCoordinates.fY",
                "fCoordinates.fZ",
            }:
                form = zip_forms(
                    {
                        "x": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fX"),
                        "y": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fY"),
                        "z": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fZ"),
                    },
                    objname,
                    composite_behavior.get(objname, "ThreeVector"),
                )
                branch_forms[objname] = form
            else:
                raise ValueError(
                    f"Unrecognized class with split branches: {components}"
                )

        # Generating collection from branch name
        collections = [k for k in branch_forms if k == "PandoraPFOs"]
        # NOTE(review): the collection name is everything before the last "_".
        # "PandoraPFOs" contains no underscore, so the derived name is the
        # empty string and the loop below looks for branches starting with
        # "_" -- confirm this is the intended prefix.
        collections = {
            "_".join(k.split("_")[:-1])
            for k in collections
            if k.split("_")[-1] != "AK8"
            # Excluding per-event variables with AK8 variants like Mjj and MT
        }

        subcollections = []

        for cname in collections:
            items = sorted(k for k in branch_forms if k.startswith(cname + "_"))
            if not items:
                continue

            # Special pattern parsing for <collection>_<subcollection>Counts branches
            countitems = [x for x in items if x.endswith("Counts")]
            subcols = {x[:-6] for x in countitems}  # Set of subcollection names
            for subcol in subcols:
                # Keep the Counts branch with the parent collection; the
                # subcollection's own branch is nested in afterwards.
                items = [
                    k for k in items if not k.startswith(subcol) or k.endswith("Counts")
                ]
                subname = subcol[len(cname) + 1 :]
                subcollections.append(
                    {
                        "colname": cname,
                        "subcol": subcol,
                        "countname": subname + "Counts",
                        "subname": subname,
                    }
                )

            if cname not in branch_forms:
                # Field name is the full remainder after "<cname>_".  (A
                # single-character index here would make fields that share a
                # first letter overwrite each other.)
                collection = zip_forms(
                    {k[len(cname) + 1 :]: branch_forms.pop(k) for k in items}, cname
                )
                branch_forms[cname] = collection
            else:
                collection = branch_forms[cname]
                if not collection["class"].startswith("ListOffsetArray"):
                    raise NotImplementedError(
                        f"{cname} isn't a jagged array, not sure what to do"
                    )
                for item in items:
                    item_name = item[len(cname) + 1 :]
                    collection["content"]["fields"].append(item_name)
                    collection["content"]["contents"].append(
                        branch_forms.pop(item)["content"]
                    )

        for sub in subcollections:
            nest_jagged_forms(
                branch_forms[sub["colname"]],
                branch_forms.pop(sub["subcol"]),
                sub["countname"],
                sub["subname"],
            )

        return branch_forms

    @property
    def behavior(self):
        """Behaviors necessary to implement this schema"""
        from coffea.nanoevents.methods import base, vector

        behavior = {}
        behavior.update(base.behavior)
        behavior.update(vector.behavior)
        return behavior

0 comments on commit 09bb414

Please sign in to comment.