From 876c7c8e04a49841bb55d9041d91fe1f966464ea Mon Sep 17 00:00:00 2001
From: Orion Eiger
Date: Tue, 14 Nov 2023 13:45:22 -0800
Subject: [PATCH 01/18] Add new graphs to combined provenance graph

---
 .../pipe/base/quantum_provenance_graph.py | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 python/lsst/pipe/base/quantum_provenance_graph.py

diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py
new file mode 100644
index 000000000..9c13cbc4c
--- /dev/null
+++ b/python/lsst/pipe/base/quantum_provenance_graph.py
@@ -0,0 +1,147 @@
+# This file is part of pipe_base.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""An under-construction version of QuantumGraph and various helper
+classes.
+"""
+
+from __future__ import annotations
+
+__all__ = (
+    "QuantumProvenanceGraph",
+    "QuantumKey",
+    "DatasetKey",
+    "PrerequisiteDatasetKey",
+)
+
+import itertools
+from typing import TYPE_CHECKING, ClassVar, Literal, NamedTuple
+
+import networkx
+from lsst.daf.butler import Butler, DataIdValue
+from lsst.resources import ResourcePathExpression
+from lsst.utils.logging import getLogger
+
+from .graph import QuantumGraph
+
+if TYPE_CHECKING:
+    pass
+
+_LOG = getLogger(__name__)
+
+
+class QuantumKey(NamedTuple):
+    """Identifier type for quantum keys in a `QuantumProvenanceGraph`."""
+
+    task_label: str
+    """Label of the task in the pipeline."""
+
+    data_id_values: tuple[DataIdValue, ...]
+    """Data ID values of the quantum.
+
+    Note that keys are fixed given `task_label`, so using only the values here
+    speeds up comparisons.
+    """
+
+    is_task: ClassVar[Literal[True]] = True
+    """Whether this node represents a quantum rather
+    than a dataset (always `True`).
+    """
+
+
+class DatasetKey(NamedTuple):
+    """Identifier type for dataset keys in a `QuantumProvenanceGraph`."""
+
+    parent_dataset_type_name: str
+    """Name of the dataset type (never a component)."""
+
+    data_id_values: tuple[DataIdValue, ...]
+    """Data ID values of the dataset.
+
+    Note that keys are fixed given `parent_dataset_type_name`, so using only
+    the values here speeds up comparisons.
+    """
+
+    is_task: ClassVar[Literal[False]] = False
+    """Whether this node represents a quantum rather
+    than a dataset (always `False`).
+    """
+
+    is_prerequisite: ClassVar[Literal[False]] = False
+
+
+class PrerequisiteDatasetKey(NamedTuple):
+    """Identifier type for prerequisite dataset keys in a
+    `QuantumProvenanceGraph`.
+
+    Unlike regular datasets, prerequisites are not actually required to come
+    from a find-first search of `input_collections`, so we don't want to
+    assume that the same data ID implies the same dataset. Happily we also
+    don't need to search for them by data ID in the graph, so we can use the
+    dataset ID (UUID) instead.
+    """
+
+    parent_dataset_type_name: str
+    """Name of the dataset type (never a component)."""
+
+    dataset_id_bytes: bytes
+    """Dataset ID (UUID) as raw bytes."""
+
+    is_task: ClassVar[Literal[False]] = False
+    """Whether this node represents a quantum rather
+    than a dataset (always `False`).
+    """
+
+    is_prerequisite: ClassVar[Literal[True]] = True
+
+
+class QuantumProvenanceGraph:
+    """A set of already-run, merged quantum graphs with provenance
+    information.
+    """
+
+    def __init__(self):
+        self._xgraph = networkx.DiGraph()
+
+    def add_new_graph(self, qgraph: QuantumGraph | ResourcePathExpression, butler: Butler) -> None:
+        if not isinstance(qgraph, QuantumGraph):
+            qgraph = QuantumGraph.loadUri(qgraph)
+        assert qgraph.metadata is not None, "Saved QGs always have metadata."
+        qgraph.metadata["output_run"]
+        for node in qgraph:
+            quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.values_tuple())
+            self._xgraph.add_node(quantum_key)
+            for ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
+                dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.values_tuple())
+                self._xgraph.add_edge(quantum_key, dataset_key)
+            for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
+                dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.values_tuple())
+                if dataset_key in self._xgraph:
+                    self._xgraph.add_edge(dataset_key, quantum_key)
+
+    def resolve_duplicates():
+        pass

From 01f64f0fa7c3f4692077a633a109719e209716c0 Mon Sep 17 00:00:00 2001
From: Orion Eiger
Date: Wed, 15 Nov 2023 14:06:15 -0800
Subject: [PATCH 02/18] Walk multiple quantum graphs, annotating the status
 of specific dataset- and quantum-run collection pairs

---
 .../pipe/base/quantum_provenance_graph.py | 408 +++++++++++++++++-
 1 file changed, 396 insertions(+), 12 deletions(-)

diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py
index 9c13cbc4c..ccceef57b 100644
--- a/python/lsst/pipe/base/quantum_provenance_graph.py
+++ b/python/lsst/pipe/base/quantum_provenance_graph.py
@@ -25,8 +25,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-"""An under-construction version of QuantumGraph and various helper
-classes.
+"""A set of already-run, merged quantum graphs with provenance information
+which can be used to compose a report on the status of multi-attempt
+processing.
 """
 
 from __future__ import annotations
@@ -38,8 +39,12 @@
     "PrerequisiteDatasetKey",
 )
 
+import dataclasses
 import itertools
-from typing import TYPE_CHECKING, ClassVar, Literal, NamedTuple
+import logging
+import uuid
+from collections.abc import Iterator, Sequence, Set
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple
 
 import networkx
 from lsst.daf.butler import Butler, DataIdValue
@@ -72,6 +77,12 @@
     than a dataset (always `True`).
""" + def to_summary_dict(self, xgraph: networkx.DiGraph) -> dict[str, Any]: + return { + "task": self.task_label, + "data_id": xgraph.nodes[self]["data_id"], + } + class DatasetKey(NamedTuple): """Identifier type for dataset keys in a `QuantumProvenanceGraph`.""" @@ -119,29 +130,402 @@ class PrerequisiteDatasetKey(NamedTuple): is_prerequisite: ClassVar[Literal[True]] = True +@dataclasses.dataclass +class QuantumRun: + """Information about a quantum in a given run collection.""" + + id: uuid.UUID + """The quantum graph node ID associated with the dataId in a specific run. + """ + + status: Literal["failed", "not_attempted", "successful", "logs_missing"] = "not_attempted" + """The status of the quantum in that run. + """ + + +@dataclasses.dataclass +class DatasetRun: + """Information about a dataset in a given run collection.""" + + id: uuid.UUID + """The dataset ID associated with the dataset in a specific run. + """ + + produced: bool = False + """Whether the specific run produced the dataset. + """ + + published: bool = False + """Whether this dataset was published in the final output collection. + """ + + +@dataclasses.dataclass(frozen=True, order=True) +class ResolvedDatasetKey: + """A combination of a dataset key and a particular dataset run to be used + for recording specific instances of issues. + """ + + key: DatasetKey + run: str + id: uuid.UUID + + def to_summary_dict(self, xgraph: networkx.DiGraph) -> dict[str, Any]: + return { + "dataset_type": self.key.parent_dataset_type_name, + "data_id": xgraph.nodes[self.key]["data_id"], + "uuid": self.id, + "run": self.run, + } + + class QuantumProvenanceGraph: """A set of already-run, merged quantum graphs with provenance information. + + Step through all the quantum graphs associated with certain tasks or + processing steps. For each graph/attempt, the status of each quantum and + dataset is recorded in `QuantumProvenanceGraph.add_new_graph` and duplicate + outcomes of dataIds are resolved in + `QuantumProvenanceGraph.resolve_duplicates`. At the end of this process, we + can combine all attempts into a final summary graph which can be converted + into a report on the production over multiple processing and recovery + attempts in `name functions later`. This serves to answer the question + "What happened to this data ID?" in a wholistic sense. """ def __init__(self): + # The graph we annotate as we step through all the graphs associated + # with the processing to create the `QuantumProvenanceGraph`. self._xgraph = networkx.DiGraph() - - def add_new_graph(self, qgraph: QuantumGraph | ResourcePathExpression, butler: Butler) -> None: + # The nodes representing quanta in `_xgraph` grouped by task label. + self._quanta: dict[str, set[QuantumKey]] = {} + # The nodes representing datasets in `_xgraph` grouped by dataset type + # name. + self._datasets: dict[str, set[DatasetKey]] = {} + self._published_failures: set[ResolvedDatasetKey] = set() + self._ignored_successes: set[ResolvedDatasetKey] = set() + self._rejected_successes: set[ResolvedDatasetKey] = set() + self._heterogeneous_quanta: set[QuantumKey] = set() + self._no_work_datasets: set[ResolvedDatasetKey] = set() + + @property + def published_failures(self) -> Set[ResolvedDatasetKey]: + """Datasets that appeared in the final output collection even though + the quantum that produced them failed. 
+ """ + return self._published_failures + + @property + def ignored_successes(self) -> Set[ResolvedDatasetKey]: + """Dataset types and data ids that were produced by one or more + successful quanta but not included in the final output collection. + """ + # Note: we want to make this a set[DatasetKey] instead. + return self._ignored_successes + + @property + def rejected_successes(self) -> Set[ResolvedDatasetKey]: + """Datasets from successful quanta that were not published, where + another dataset with the same data id was published. + """ + return self._rejected_successes + + @property + def heterogeneous_quanta(self) -> Set[QuantumKey]: + """Quanta whose published outputs came from multiple runs.""" + return self._heterogeneous_quanta + + def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: + """Add a new quantum graph to the `QuantumProvenanceGraph`. + + Step through the quantum graph. Annotate a mirror networkx.DiGraph + (QuantumProvenanceGraph._xgraph) with all of the relevant information: + quanta, dataset types and their associated run collection (these unique + quanta- and dataset type-run collection combinations are encapsulated + in the dataclasses `DatasetRun` and `QuantumRun`). For each new + quantum, annotate the status of the `QuantumRun` by inspecting the + graph. If a DatasetType was produced, annotate this in the run by + setting `DatasetRun.produced = True`. Then, we can resolve newly- + successful quanta (failed in previous runs) with + `QuantumProvenanceGraph.resolve_duplicates`. + + Parameters + ---------- + butler : `lsst.daf.butler.Butler` + The Butler used for this report. This should match the Butler used + for the run associated with the executed quantum graph. + + qgraph : `QuantumGraph` | `ResourcePathExpression` + Either the associated quantum graph object or the uri of the + location of said quantum graph. + """ + # first we load the quantum graph and associated output run collection if not isinstance(qgraph, QuantumGraph): qgraph = QuantumGraph.loadUri(qgraph) assert qgraph.metadata is not None, "Saved QGs always have metadata." - qgraph.metadata["output_run"] + output_run = qgraph.metadata["output_run"] + new_quanta = [] for node in qgraph: - quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.values_tuple()) + # make a key to add to the mirror graph with specific quanta for + # nodes. + quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.required_values) self._xgraph.add_node(quantum_key) + self._xgraph.nodes[quantum_key]["data_id"] = node.quantum.dataId + new_quanta.append(quantum_key) + self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key) + # associate run collections with specific quanta. this is important + # if the same quanta are processed in multiple runs as in recovery + # workflows. + quantum_runs = self._xgraph.nodes[quantum_key].setdefault("runs", {}) + # the QuantumRun here is the specific quantum-run collection + # combination. + quantum_runs[output_run] = QuantumRun(node.nodeId) for ref in itertools.chain.from_iterable(node.quantum.outputs.values()): - dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.values_tuple()) + dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) + # add datasets to the nodes of the mirror graph, with edges on + # the quanta. 
                 self._xgraph.add_edge(quantum_key, dataset_key)
+                self._xgraph.nodes[dataset_key]["data_id"] = ref.dataId
+                self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key)
+                dataset_runs = self._xgraph.nodes[dataset_key].setdefault("runs", {})
+                # make a DatasetRun for the specific dataset-run collection
+                # combination.
+                dataset_runs[output_run] = DatasetRun(ref.id)
+                # save metadata and logs for easier status interpretation
+                if dataset_key.parent_dataset_type_name.endswith("_metadata"):
+                    self._xgraph.nodes[quantum_key]["metadata"] = dataset_key
+                if dataset_key.parent_dataset_type_name.endswith("_log"):
+                    self._xgraph.nodes[quantum_key]["log"] = dataset_key
             for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
-                dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.values_tuple())
+                dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values)
                 if dataset_key in self._xgraph:
+                    # add another edge if the input datasetType and quantum
+                    # are in the graph
                     self._xgraph.add_edge(dataset_key, quantum_key)
+        for dataset_type_name in self._datasets:
+            for ref in butler.registry.queryDatasets(dataset_type_name, collections=output_run):
+                # find the datasets in the butler
+                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
+                dataset_run = self._xgraph.nodes[dataset_key]["runs"][output_run]  # dataset run (singular)
+                # if the dataset is in the output run collection, we
+                # produced it!
+                dataset_run.produced = True
+        for quantum_key in new_quanta:
+            quantum_run: QuantumRun = self._xgraph.nodes[quantum_key]["runs"][output_run]
+            metadata_key = self._xgraph.nodes[quantum_key]["metadata"]
+            log_key = self._xgraph.nodes[quantum_key]["log"]
+            metadata_dataset_run: DatasetRun = self._xgraph.nodes[metadata_key]["runs"][output_run]
+            log_dataset_run: DatasetRun = self._xgraph.nodes[log_key]["runs"][output_run]
+            if metadata_dataset_run.produced:  # check with Jim about this condition
+                # if we do have metadata:
+                if log_dataset_run.produced:
+                    # if we also have logs, this is a success
+                    # this includes No Work Found (the only things produced
+                    # were metadata and logs).
+                    quantum_run.status = "successful"
+                else:
+                    # if we have metadata and no logs, this is a very rare
+                    # case. either the task ran successfully and the
+                    # datastore died immediately afterwards, or some
+                    # supporting infrastructure for transferring the logs to
+                    # the datastore failed.
+                    quantum_run.status = "logs_missing"
+            else:
+                # missing metadata means that the task did not finish.
+                if log_dataset_run.produced:
+                    # if we do have logs, the task not finishing is a
+                    # failure in the task itself. This includes all payload
+                    # errors and some other errors.
+                    quantum_run.status = "failed"
+                else:
+                    # we are missing metadata and logs. Either the task was
+                    # not started, or a hard external environmental error
+                    # prevented it from writing logs or metadata.
+                    quantum_run.status = "not_attempted"
+
+        # I imagine that the next step is to call `resolve_duplicates` on
+        # the self._xgraph.
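+        # Recap of the (metadata, logs) -> per-run status mapping above:
+        #   metadata and logs produced -> "successful" (includes No Work Found)
+        #   metadata only              -> "logs_missing"
+        #   logs only                  -> "failed"
+        #   neither                    -> "not_attempted"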
+        # Things that could have happened to a quantum over multiple runs
+        # Failed until it succeeded
+        # Never been attempted
+        # Succeeded immediately
+        # Failed and continued to fail
+        # Horrible flip-flopping (doesn't happen with skip-existing-in)
+
-    def resolve_duplicates():
-        pass
+    def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = None, where: str = ""):
+        for dataset_type_name in self._datasets:
+            for ref in butler.registry.queryDatasets(
+                dataset_type_name,
+                collections=collections,
+                findFirst=True,
+                where=where,
+            ):
+                # find the datasets in a larger collection. "who won?"
+                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
+                self._xgraph.nodes[dataset_key]["winner"] = ref.run
+                self._xgraph.nodes[dataset_key]["runs"][ref.run].published = True
+        for task_label, task_quanta in self._quanta.items():
+            for quantum_key in task_quanta:
+                # these are the run collections of the datasets produced by
+                # this quantum that were published in the final collection
+                winners = {
+                    winner
+                    for dataset_key in self.iter_outputs_of(quantum_key)
+                    if (winner := self._xgraph.nodes[dataset_key].get("winner"))
+                }
+                # note: we expect len(winners) = 1
+                for run, quantum_run in self._xgraph.nodes[quantum_key]["runs"].items():
+                    if quantum_run.status != "successful" and run in winners:
+                        for dataset_key in self.iter_outputs_of(quantum_key):
+                            # the outputs of this quantum in this run may
+                            # have been mistakenly published
+                            dataset_run = self._xgraph.nodes[dataset_key]["runs"][run]
+                            if dataset_run.published:
+                                self._published_failures.add(
+                                    ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id)
+                                )
+                        break
+                    if quantum_run.status == "successful" and run not in winners:
+                        if len(winners) == 0:
+                            # the quantum succeeded but no outputs were
+                            # published
+                            for dataset_key in self.iter_outputs_of(quantum_key):
+                                dataset_run: DatasetRun = self._xgraph.nodes[dataset_key]["runs"][run]
+                                if not dataset_run.published and dataset_run.produced:
+                                    self._ignored_successes.add(
+                                        ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id)
+                                    )
+                        else:
+                            for dataset_key in self.iter_outputs_of(quantum_key):
+                                dataset_run = self._xgraph.nodes[dataset_key]["runs"][run]
+                                self._rejected_successes.add(
+                                    ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id)
+                                )
+                if len(winners) > 1:
+                    # some rejected outputs may be in here
+                    print("published outputs for this quantum were from multiple runs")
+                    self._heterogeneous_quanta.add(quantum_key)
+        for dataset_type_name, datasets_for_type in self._datasets.items():
+            for dataset_key in datasets_for_type:
+                for run, dataset_run in self._xgraph.nodes[dataset_key]["runs"].items():
+                    if not dataset_run.produced:
+                        quantum_key = self.get_producer_of(dataset_key)
+                        quantum_run: QuantumRun = self._xgraph.nodes[quantum_key]["runs"][run]
+                        if quantum_run.status == "successful":
+                            # this is a NoWorkFound
+                            self._no_work_datasets.add(
+                                ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id)
+                            )
+        # we can look at datasets that were not produced whose quanta
+        # were successful -> these are no work found
+
+        # probably count things and modify the self._xgraph here.
+        # maybe add dicts of TaskExecutionReport and
+        # DatasetTypeExecutionReport to this object
+        # instead of having all the dataset refs, ids, etc, make them have
+        # sets of dataset keys, quantum keys and take len() to figure out
+        # how many
+
+        # for each dataset, how many got published of each type? how many
+        # were produced and not published? how many were predicted and not
+        # produced (for various reasons)
+
+    def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]:
+        """Summarize the results of the TaskExecutionReport in a dictionary.
+
+        Parameters
+        ----------
+        butler : `lsst.daf.butler.Butler`
+            The Butler used for this report.
+        do_store_logs : `bool`
+            Store the logs in the summary dictionary.
+
+        Returns
+        -------
+        summary_dict : `dict`
+            A dictionary containing:
+
+            - outputs: A dictionary summarizing the
+              DatasetTypeExecutionReport for each DatasetType associated
+              with the task
+            - failed_quanta: A dictionary of quanta which failed and their
+              dataIDs by quantum graph node id
+            - n_quanta_blocked: The number of quanta which failed due to
+              upstream failures.
+            - n_successful: The number of quanta which succeeded.
+        """
+        result = {
+            "tasks": {},
+            "datasets": {},
+            "published_failures": [
+                key.to_summary_dict(self._xgraph) for key in sorted(self._published_failures)
+            ],
+            "rejected_successes": [
+                key.to_summary_dict(self._xgraph) for key in sorted(self._rejected_successes)
+            ],
+            "ignored_successes": [
+                key.to_summary_dict(self._xgraph) for key in sorted(self._ignored_successes)
+            ],
+            "heterogeneous_quanta": [
+                key.to_summary_dict(self._xgraph) for key in sorted(self._heterogeneous_quanta)
+            ],
+        }
+        for task_label, quanta in self._quanta.items():
+            n_blocked = 0
+            n_successful = 0
+            failed_quanta = []
+            # every item in this list will correspond to a data_id and be a
+            # dict keyed by run
+            for quantum_key in quanta:
+                failed_quantum_info = {"data_id": {}, "runs": {}}
+                for run, quantum_run in self._xgraph.nodes[quantum_key]["runs"].items():
+                    if quantum_run.status == "successful":
+                        failed_quantum_info["runs"].clear()
+                        # if any of the quantum runs were successful, we
+                        # don't worry about it
+                        n_successful += 1
+                        break
+                    elif quantum_run.status == "blocked":
+                        n_blocked += 1
+                        continue
+                    else:
+                        log_key: DatasetKey = self._xgraph.nodes[quantum_key]["log"]
+                        quantum_data_id = self._xgraph.nodes[quantum_key]["data_id"]
+                        failed_quantum_info["data_id"].update(quantum_data_id.mapping)
+                        quantum_info = {"id": quantum_run.id, "status": quantum_run.status}
+                        if do_store_logs:
+                            try:
+                                # should probably upgrade this to use a
+                                # dataset ref
+                                log = butler.get(
+                                    log_key.parent_dataset_type_name, quantum_data_id, collections=run
+                                )
+                            except LookupError:
+                                quantum_info["error"] = []
+                            except FileNotFoundError:
+                                quantum_info["error"] = None
+                            else:
+                                quantum_info["error"] = [
+                                    record.message for record in log if record.levelno >= logging.ERROR
+                                ]
+                        failed_quantum_info["runs"][run] = quantum_info
+                if failed_quantum_info["runs"]:
+                    # if the quantum runs continue to fail, report.
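+                    # ("runs" is cleared as soon as one attempt succeeds,
+                    # so a non-empty dict here means no attempt succeeded)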
+ failed_quanta.append(failed_quantum_info) + result["tasks"][task_label] = { + "failed_quanta": failed_quanta, + "n_quanta_blocked": n_blocked, + "n_successful": n_successful, + } + return result + + def iter_outputs_of(self, quantum_key: QuantumKey) -> Iterator[DatasetKey]: + metadata_key = self._xgraph.nodes[quantum_key]["metadata"] + log_key = self._xgraph.nodes[quantum_key]["log"] + for dataset_key in self._xgraph.successors(quantum_key): + if dataset_key != metadata_key and dataset_key != log_key: + yield dataset_key + + def get_producer_of(self, dataset_key: DatasetKey) -> QuantumKey: + (result,) = self._xgraph.predecessors(dataset_key) + return result From 20afb37ac8a60599448f67f8c3963445995bf79f Mon Sep 17 00:00:00 2001 From: Jim Bosch Date: Tue, 20 Feb 2024 17:17:39 -0500 Subject: [PATCH 03/18] Start working on dataset summaries. Successful, no-work, and the weird ones we've already categorized are easy enough. But failed/blocked get tricky when you consider what else they might have been combined with. --- .../pipe/base/quantum_provenance_graph.py | 75 ++++++++++++++----- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index ccceef57b..b22264565 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -48,6 +48,7 @@ import networkx from lsst.daf.butler import Butler, DataIdValue +from lsst.daf.butler.nonempty_mapping import NonemptyMapping from lsst.resources import ResourcePathExpression from lsst.utils.logging import getLogger @@ -203,11 +204,11 @@ def __init__(self): # The nodes representing datasets in `_xgraph` grouped by dataset type # name. self._datasets: dict[str, set[DatasetKey]] = {} - self._published_failures: set[ResolvedDatasetKey] = set() - self._ignored_successes: set[ResolvedDatasetKey] = set() - self._rejected_successes: set[ResolvedDatasetKey] = set() + self._published_failures: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() + self._ignored_successes: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() + self._rejected_successes: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() + self._no_work_datasets: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() self._heterogeneous_quanta: set[QuantumKey] = set() - self._no_work_datasets: set[ResolvedDatasetKey] = set() @property def published_failures(self) -> Set[ResolvedDatasetKey]: @@ -382,7 +383,7 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = # been mistakenly published dataset_run = self._xgraph.nodes[dataset_key]["runs"][run] if dataset_run.published: - self._published_failures.add( + self._published_failures[dataset_key.parent_dataset_type_name].add( ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) ) break @@ -393,13 +394,13 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = for dataset_key in self.iter_outputs_of(quantum_key): dataset_run: DatasetRun = self._xgraph.nodes[dataset_key]["runs"][run] if not dataset_run.published and dataset_run.produced: - self._ignored_successes.add( + self._ignored_successes[dataset_key.parent_dataset_type_name].add( ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) ) else: for dataset_key in self.iter_outputs_of(quantum_key): dataset_run = self._xgraph.nodes[dataset_key]["runs"][run] - self._rejected_successes.add( + 
self._rejected_successes[dataset_key.parent_dataset_type_name].add( ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) ) if len(winners) > 1: @@ -413,18 +414,10 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = quantum_key = self.get_producer_of(dataset_key) quantum_run: QuantumRun = self._xgraph.nodes[quantum_key]["runs"][run] if quantum_run.status == "successful": - self._no_work_datasets.add( + self._no_work_datasets[dataset_key.parent_dataset_type_name].add( ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) ) # this is a NoWorkFound - # we can look at datasets that were not produced whose quanta - # were successful -> these are no work found - - # probably count things and modify the self._xgraph here. - # maybe add dicts of TaskExecutionReport and - # DatasetTypeExecutionReport to this object - # instead of having all the dataset refs, ids, etc, make them have sets - # of dataset keys, quantum keys and take len() to figure out how many # for each dataset, how many got published of each type? how many were # produced and not published? how many were predicted and not produced @@ -458,16 +451,20 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st "tasks": {}, "datasets": {}, "published_failures": [ - key.to_summary_dict(self._xgraph) for key in sorted(self._published_failures) + key.to_summary_dict(self._xgraph) + for key in sorted(itertools.chain.from_iterable(self._published_failures.values())) ], "rejected_successes": [ - key.to_summary_dict(self._xgraph) for key in sorted(self._rejected_successes) + key.to_summary_dict(self._xgraph) + for key in sorted(itertools.chain.from_iterable(self._rejected_successes.values())) ], "ignored_successes": [ - key.to_summary_dict(self._xgraph) for key in sorted(self._ignored_successes) + key.to_summary_dict(self._xgraph) + for key in sorted(itertools.chain.from_iterable(self._ignored_successes.values())) ], "heterogeneous_quanta": [ - key.to_summary_dict(self._xgraph) for key in sorted(self._heterogeneous_quanta) + key.to_summary_dict(self._xgraph) + for key in sorted(itertools.chain.from_iterable(self._heterogeneous_quanta.values())) ], } for task_label, quanta in self._quanta.items(): @@ -517,6 +514,44 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st "n_quanta_blocked": n_blocked, "n_successful": n_successful, } + for dataset_type_name, datasets in self._datasets.items(): + n_successful = 0 + for dataset_key in datasets: + producer_quantum_key = self.get_producer_of(dataset_key) + producer_quantum_run: QuantumRun + if winning_run := self._xgraph.nodes[dataset_key].get("winner"): + # A dataset for this data ID was published. + producer_quantum_run = self._xgraph.nodes[producer_quantum_key]["runs"][winning_run] + if producer_quantum_run.status == "successful": + n_successful += 1 + else: + # A dataset for this data ID was not published. + # THIS BRANCH VERY MUCH TO DO. IT MAY BE GARBAGE. + dataset_run: DatasetRun + final_status: str = "not_attempted" + for run, dataset_run in reversed(self._xgraph.nodes[dataset_key]["runs"].items()): + if dataset_run.produced: + # Published failures handled elsewhere. + break + producer_quantum_run = self._xgraph.nodes[producer_quantum_key]["runs"][run] + match producer_quantum_run.status: + case "successful": + # No work handled elsewhere. + break + + result["datasets"][dataset_type_name] = { + # This is the total number in the original QG. 
+ "predicted": len(datasets), + # These should all add up to 'predicted'... + "successful": n_successful, # (and published) + "no_work": self._no_work_datasets[dataset_type_name], + "ignored_successes": self._ignored_successes[dataset_type_name], + "published_failures": self._published_failures[dataset_type_name], + "failed": ..., + "blocked": ..., + # ... these do not sum nicely to anything: + "rejected_successes": self._rejected_successes[dataset_type_name], + } return result def iter_outputs_of(self, quantum_key: QuantumKey) -> Iterator[DatasetKey]: From ba8b9890f7dbd2d4e1ffc0e05ea2f7e70951b0b9 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Fri, 8 Mar 2024 21:59:27 -0800 Subject: [PATCH 04/18] Formalize overall status with match statements --- .../pipe/base/quantum_provenance_graph.py | 281 +++++++++--------- 1 file changed, 142 insertions(+), 139 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index b22264565..21307107d 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -43,12 +43,11 @@ import itertools import logging import uuid -from collections.abc import Iterator, Sequence, Set -from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple +from collections.abc import Iterator, Sequence +from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, TypedDict import networkx -from lsst.daf.butler import Butler, DataIdValue -from lsst.daf.butler.nonempty_mapping import NonemptyMapping +from lsst.daf.butler import Butler, DataCoordinate, DataIdValue from lsst.resources import ResourcePathExpression from lsst.utils.logging import getLogger @@ -139,11 +138,41 @@ class QuantumRun: """The quantum graph node ID associated with the dataId in a specific run. """ - status: Literal["failed", "not_attempted", "successful", "logs_missing"] = "not_attempted" + status: Literal[ + "failed", "successful", "logs_missing", "blocked", "metadata_missing" + ] = "metadata_missing" """The status of the quantum in that run. """ +class QuantumInfo(TypedDict): + """Information about a quantum across all run collections. + + Used to annotate the networkx node dictionary. + """ + + data_id: DataCoordinate + """The data_id of the quantum. + """ + + runs: dict[str, QuantumRun] + """All run collections associated with the quantum. + """ + + status: Literal["successful", "wonky", "blocked", "not_attempted", "failed"] + """The overall status of the quantum. Note that it is impossible to exit a + wonky state. + """ + + recovered: bool + """The quantum was originally not successful but was ultimately successful. + """ + + messages: list[str] + """Diagnostic messages to help disambiguate wonky states. + """ + + @dataclasses.dataclass class DatasetRun: """Information about a dataset in a given run collection.""" @@ -160,24 +189,36 @@ class DatasetRun: """Whether this dataset was published in the final output collection. """ + def __post_init__(self) -> None: + assert not (self.published and not self.produced) + -@dataclasses.dataclass(frozen=True, order=True) -class ResolvedDatasetKey: - """A combination of a dataset key and a particular dataset run to be used - for recording specific instances of issues. +class DatasetInfo(TypedDict): + """Information about a given dataset across all runs. + + Used to annotate the networkx node dictionary. """ - key: DatasetKey - run: str - id: uuid.UUID + data_id: DataCoordinate + """The data_id of the quantum. 
+ """ - def to_summary_dict(self, xgraph: networkx.DiGraph) -> dict[str, Any]: - return { - "dataset_type": self.key.parent_dataset_type_name, - "data_id": xgraph.nodes[self.key]["data_id"], - "uuid": self.id, - "run": self.run, - } + runs: dict[str, DatasetRun] + """All runs associated with the dataset. + """ + + status: Literal["published", "unpublished", "predicted_only", "unsuccessful", "cursed"] + """Overall status of the dataset. + """ + + messages: list[str] + """Diagnostic messages to help disambiguate cursed states. + """ + + winner: str | None + """The run whose dataset was published, if any. These are retrievable with + butler.get + """ class QuantumProvenanceGraph: @@ -204,38 +245,12 @@ def __init__(self): # The nodes representing datasets in `_xgraph` grouped by dataset type # name. self._datasets: dict[str, set[DatasetKey]] = {} - self._published_failures: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() - self._ignored_successes: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() - self._rejected_successes: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() - self._no_work_datasets: NonemptyMapping[str, set[ResolvedDatasetKey]] = NonemptyMapping() - self._heterogeneous_quanta: set[QuantumKey] = set() - - @property - def published_failures(self) -> Set[ResolvedDatasetKey]: - """Datasets that appeared in the final output collection even though - the quantum that produced them failed. - """ - return self._published_failures - @property - def ignored_successes(self) -> Set[ResolvedDatasetKey]: - """Dataset types and data ids that were produced by one or more - successful quanta but not included in the final output collection. - """ - # Note: we want to make this a set[DatasetKey] instead. - return self._ignored_successes - - @property - def rejected_successes(self) -> Set[ResolvedDatasetKey]: - """Datasets from successful quanta that were not published, where - another dataset with the same data id was published. - """ - return self._rejected_successes + def get_quantum_info(self, key: QuantumKey) -> QuantumInfo: + return self._xgraph.nodes[key] - @property - def heterogeneous_quanta(self) -> Set[QuantumKey]: - """Quanta whose published outputs came from multiple runs.""" - return self._heterogeneous_quanta + def get_dataset_info(self, key: DatasetKey) -> DatasetInfo: + return self._xgraph.nodes[key] def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: """Add a new quantum graph to the `QuantumProvenanceGraph`. @@ -272,13 +287,14 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # nodes. quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.required_values) self._xgraph.add_node(quantum_key) - self._xgraph.nodes[quantum_key]["data_id"] = node.quantum.dataId + quantum_info = self.get_quantum_info(quantum_key) + quantum_info["data_id"] = node.quantum.dataId new_quanta.append(quantum_key) self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key) # associate run collections with specific quanta. this is important # if the same quanta are processed in multiple runs as in recovery # workflows. - quantum_runs = self._xgraph.nodes[quantum_key].setdefault("runs", {}) + quantum_runs = quantum_info.setdefault("runs", {}) # the QuantumRun here is the specific quantum-run collection # combination. 
quantum_runs[output_run] = QuantumRun(node.nodeId) @@ -287,17 +303,18 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # add datasets to the nodes of the mirror graph, with edges on # the quanta. self._xgraph.add_edge(quantum_key, dataset_key) - self._xgraph.nodes[dataset_key]["data_id"] = ref.dataId + dataset_info = self.get_dataset_info(dataset_key) + dataset_info["data_id"] = ref.dataId self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key) - dataset_runs = self._xgraph.nodes[dataset_key].setdefault("runs", {}) + dataset_runs = dataset_info.setdefault("runs", {}) # make a DatasetRun for the specific dataset-run collection # combination. dataset_runs[output_run] = DatasetRun(ref.id) # save metadata and logs for easier status interpretation if dataset_key.parent_dataset_type_name.endswith("_metadata"): - self._xgraph.nodes[quantum_key]["metadata"] = dataset_key + quantum_info["metadata"] = dataset_key if dataset_key.parent_dataset_type_name.endswith("_log"): - self._xgraph.nodes[quantum_key]["log"] = dataset_key + quantum_info[quantum_key]["log"] = dataset_key for ref in itertools.chain.from_iterable(node.quantum.inputs.values()): dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values) if dataset_key in self._xgraph: @@ -308,16 +325,19 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre for ref in butler.registry.queryDatasets(dataset_type_name, collections=output_run): # find the datasets in the butler dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) - dataset_run = self._xgraph.nodes[dataset_key]["runs"][output_run] # dataset run (singular) + dataset_run = dataset_info["runs"][output_run] # dataset run (singular) # if the dataset is in the output run collection, we produced # it! dataset_run.produced = True + # the outputs of failed or blocked quanta in this run. + blocked: set[DatasetKey] = set() for quantum_key in new_quanta: - quantum_run: QuantumRun = self._xgraph.nodes[quantum_key]["runs"][output_run] - metadata_key = self._xgraph.nodes[quantum_key]["metadata"] - log_key = self._xgraph.nodes[quantum_key]["log"] - metadata_dataset_run: DatasetRun = self._xgraph.nodes[metadata_key]["runs"][output_run] - log_dataset_run: DatasetRun = self._xgraph.nodes[log_key]["runs"][output_run] + quantum_info = self.get_quantum_info(quantum_key) + quantum_run = quantum_info["runs"][output_run] + metadata_key = quantum_info["metadata"] + log_key = quantum_info["log"] + metadata_dataset_run = self.get_dataset_info(metadata_key)["runs"][output_run] + log_dataset_run = self.get_dataset_info(log_key)["runs"][output_run] if metadata_dataset_run.produced: # check with Jim about this condition # if we do have metadata: if log_dataset_run.produced: @@ -339,20 +359,46 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # in the task itself. This includes all payload errors and # some other errors. quantum_run.status = "failed" + # if a quantum fails, all its successor datasets are + # blocked. + blocked.update(self._xgraph.successors(quantum_key)) else: # we are missing metadata and logs. Either the task was not # started, or a hard external environmental error prevented # it from writing logs or metadata. - quantum_run.status = "not_attempted" + if blocked.isdisjoint(self._xgraph.predecessors(quantum_key)): + # None of this quantum's inputs were blocked. 
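+                    # The missing metadata is therefore this quantum's own
+                    # problem rather than an upstream failure: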
+ quantum_run.status = "metadata_missing" + else: + quantum_run.status = "blocked" + blocked.update(self._xgraph.successors(quantum_key)) - # I imagine that the next step is to call `resolve_duplicates` on the - # self._xgraph. - # Things that could have happened to a quanta over multiple runs - # Failed until it suceeded - # Never been attempted - # Succeeded immediately - # Failed and continued to fail - # Horrible flip-flopping (doesn't happen with skip-existing-in) + # Now we can start using state transitions to mark overall status. + if len(quantum_info["runs"]) == 1: + last_status = "not_attempted" + else: + last_run = list(quantum_info["runs"].values())[-1] + last_status = last_run.status + match last_status, quantum_run.status: + case ("not_attempted", new_status): + pass + case ("wonky", _): + new_status = "wonky" + case (_, "successful"): + new_status = "successful" + if last_status != "successful": + quantum_info["recovered"] = True + case (_, "logs_missing"): + new_status = "wonky" + case ("successful", _): + new_status = "wonky" + case (_, "blocked"): + pass + case (_, "metadata_missing"): + new_status = "not_attempted" + case ("failed", _): + pass + quantum_info["status"] = new_status def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = None, where: str = ""): for dataset_type_name in self._datasets: @@ -366,62 +412,40 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) self._xgraph.nodes[dataset_key]["winner"] = ref.run self._xgraph.nodes[dataset_key]["runs"][ref.run].published = True + for task_label, task_quanta in self._quanta.items(): for quantum_key in task_quanta: # these are the run collections of the datasets produced by # this quantum that were published in the final collection + dataset_keys = self.iter_outputs_of(quantum_key) winners = { winner - for dataset_key in self.iter_outputs_of(quantum_key) + for dataset_key in dataset_keys if (winner := self._xgraph.nodes[dataset_key].get("winner")) } # note: we expect len(winners) = 1 for run, quantum_run in self._xgraph.nodes[quantum_key]["runs"].items(): - if quantum_run.status != "successful" and run in winners: - for dataset_key in self.iter_outputs_of(quantum_key): - # the outputs of this quantum in this run may have - # been mistakenly published - dataset_run = self._xgraph.nodes[dataset_key]["runs"][run] - if dataset_run.published: - self._published_failures[dataset_key.parent_dataset_type_name].add( - ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) - ) - break - if quantum_run.status == "successful" and run not in winners: - if len(winners) == 0: - # the quantum succeeded but no outputs were - # published - for dataset_key in self.iter_outputs_of(quantum_key): - dataset_run: DatasetRun = self._xgraph.nodes[dataset_key]["runs"][run] - if not dataset_run.published and dataset_run.produced: - self._ignored_successes[dataset_key.parent_dataset_type_name].add( - ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) - ) - else: - for dataset_key in self.iter_outputs_of(quantum_key): - dataset_run = self._xgraph.nodes[dataset_key]["runs"][run] - self._rejected_successes[dataset_key.parent_dataset_type_name].add( - ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) - ) - if len(winners) > 1: - # some rejected outputs may be in here - print("published outputs for this quantum were from multiple runs") - self._heterogeneous_quanta.add(quantum_key) - 
for dataset_type_name, datasets_for_type in self._datasets.items(): - for dataset_key in datasets_for_type: - for run, dataset_run in self._xgraph.nodes[dataset_key]["runs"].items(): - if not dataset_run.produced: - quantum_key = self.get_producer_of(dataset_key) - quantum_run: QuantumRun = self._xgraph.nodes[quantum_key]["runs"][run] - if quantum_run.status == "successful": - self._no_work_datasets[dataset_key.parent_dataset_type_name].add( - ResolvedDatasetKey(key=dataset_key, run=run, id=dataset_run.id) - ) - # this is a NoWorkFound - - # for each dataset, how many got published of each type? how many were - # produced and not published? how many were predicted and not produced - # (for various reasons) + for dataset_key in dataset_keys: + dataset_info = self.get_dataset_info(dataset_key) + match (quantum_run.status, (run in winners)): + case ("successful", True): + dataset_info["status"] = "published" + case ("successful", False): + if len(winners) == 0: + # This is the No Work Found case + dataset_info["status"] = "predicted_only" + else: + dataset_info["status"] = "unpublished" + case (_, True): + # If anything other than a successful quantum + # produces a published dataset, that dataset + # is cursed. + dataset_info["status"] = "cursed" + case _: + if len(winners) > 1: + dataset_info["status"] = "cursed" + else: + dataset_info["status"] = "unsuccessful" def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]: """Summarize the results of the TaskExecutionReport in a dictionary. @@ -450,22 +474,6 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st result = { "tasks": {}, "datasets": {}, - "published_failures": [ - key.to_summary_dict(self._xgraph) - for key in sorted(itertools.chain.from_iterable(self._published_failures.values())) - ], - "rejected_successes": [ - key.to_summary_dict(self._xgraph) - for key in sorted(itertools.chain.from_iterable(self._rejected_successes.values())) - ], - "ignored_successes": [ - key.to_summary_dict(self._xgraph) - for key in sorted(itertools.chain.from_iterable(self._ignored_successes.values())) - ], - "heterogeneous_quanta": [ - key.to_summary_dict(self._xgraph) - for key in sorted(itertools.chain.from_iterable(self._heterogeneous_quanta.values())) - ], } for task_label, quanta in self._quanta.items(): n_blocked = 0 @@ -526,9 +534,9 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st n_successful += 1 else: # A dataset for this data ID was not published. - # THIS BRANCH VERY MUCH TO DO. IT MAY BE GARBAGE. + # THIS BRANCH VERY MUCH TO DO. IT MAY BE GARBAGE. dataset_run: DatasetRun - final_status: str = "not_attempted" + # final_status: str = "not_attempted" for run, dataset_run in reversed(self._xgraph.nodes[dataset_key]["runs"].items()): if dataset_run.produced: # Published failures handled elsewhere. @@ -544,13 +552,8 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st "predicted": len(datasets), # These should all add up to 'predicted'... "successful": n_successful, # (and published) - "no_work": self._no_work_datasets[dataset_type_name], - "ignored_successes": self._ignored_successes[dataset_type_name], - "published_failures": self._published_failures[dataset_type_name], "failed": ..., "blocked": ..., - # ... 
these do not sum nicely to anything: - "rejected_successes": self._rejected_successes[dataset_type_name], } return result From ffeff4ffaa5800de789a699860be30aff2178bfa Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Mon, 11 Mar 2024 17:57:54 -0700 Subject: [PATCH 05/18] Summarize info from graph in a dictionary --- .../pipe/base/quantum_provenance_graph.py | 144 +++++++++--------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 21307107d..13b40e22f 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -430,6 +430,7 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = match (quantum_run.status, (run in winners)): case ("successful", True): dataset_info["status"] = "published" + dataset_info["winner"] = run case ("successful", False): if len(winners) == 0: # This is the No Work Found case @@ -443,12 +444,14 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = dataset_info["status"] = "cursed" case _: if len(winners) > 1: + # This is the heterogeneous quanta case. dataset_info["status"] = "cursed" else: + # This should be a regular failure. dataset_info["status"] = "unsuccessful" def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]: - """Summarize the results of the TaskExecutionReport in a dictionary. + """Summarize the QuantumProvenanceGraph in a dictionary. Parameters ---------- @@ -460,100 +463,105 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st Returns ------- summary_dict : `dict` - A dictionary containing: - - - outputs: A dictionary summarizing the - DatasetTypeExecutionReport for each DatasetType associated with - the task - - failed_quanta: A dictionary of quanta which failed and their - dataIDs by quantum graph node id - - n_quanta_blocked: The number of quanta which failed due to - upstream failures. - - n_successful: The number of quanta which succeeeded. + A dictionary containing counts of quanta and datasets in each of + the overall states defined in `QuantumInfo` and `DatasetInfo`, + as well as diagnostic information and error messages for failed + quanta and strange edge cases, and a list of recovered quanta. 
""" result = { "tasks": {}, "datasets": {}, } for task_label, quanta in self._quanta.items(): - n_blocked = 0 n_successful = 0 - failed_quanta = [] + n_wonky = 0 + n_blocked = 0 + n_failed = 0 + failed_quanta = {"data_id": {}, "runs": {}, "message": {}} + recovered_quanta = [] + wonky_quanta = {"data_id": {}, "runs": {}, "message": {}} # every item in this list will correspond to a data_id and be a # dict keyed by run for quantum_key in quanta: - failed_quantum_info = {"data_id": {}, "runs": {}} - for run, quantum_run in self._xgraph.nodes[quantum_key]["runs"].items(): - if quantum_run.status == "successful": - failed_quantum_info["runs"].clear() - # if any of the quantum runs successful, we don't worry - # about it - n_successful += 1 - break - elif quantum_run.status == "blocked": - n_blocked += 1 - continue - else: - log_key: DatasetKey = self._xgraph.nodes[quantum_key]["log"] - quantum_data_id = self._xgraph.nodes[quantum_key]["data_id"] - failed_quantum_info["data_id"].update(quantum_data_id.mapping) - quantum_info = {"id": quantum_run.id, "status": quantum_run.status} - if do_store_logs: + quantum_info = self.get_quantum_info(quantum_key) + if quantum_info["status"] == "successful": + n_successful += 1 + if quantum_info["recovered"]: + recovered_quanta.append(quantum_info["data_id"]) + elif quantum_info["status"] == "wonky": + n_wonky += 1 + wonky_quanta["data_id"].update(quantum_info["data_id"]) + wonky_quanta["runs"].update(quantum_info["runs"]) + wonky_quanta["message"].update(quantum_info["messages"]) + elif quantum_info["status"] == "blocked": + n_blocked += 1 + elif quantum_info["status"] == "failed": + n_failed += 1 + failed_quanta["data_id"].update(quantum_info["data_id"]) + runs = quantum_info["runs"] + failed_quanta["runs"].update(runs) + log_key: DatasetKey = self._xgraph.nodes[quantum_key]["log"] + if do_store_logs: + for run in runs: try: # should probably upgrade this to use a dataset # ref log = butler.get( - log_key.parent_dataset_type_name, quantum_data_id, collections=run + log_key.parent_dataset_type_name, quantum_info["data_id"], collections=run ) except LookupError: - quantum_info["error"] = [] + failed_quanta["message"] = [] except FileNotFoundError: - quantum_info["error"] = None + failed_quanta["message"] = None else: - quantum_info["error"] = [ - record.message for record in log if record.levelno >= logging.ERROR - ] - failed_quantum_info["runs"][run] = quantum_info - if failed_quantum_info["runs"]: - # if the quantum runs continue to fail, report. - failed_quanta.append(failed_quantum_info) + failed_quanta["message"].update( + [record.message for record in log if record.levelno >= logging.ERROR] + ) result["tasks"][task_label] = { - "failed_quanta": failed_quanta, - "n_quanta_blocked": n_blocked, "n_successful": n_successful, + "n_wonky": n_wonky, + "n_blocked": n_blocked, + "n_failed": n_failed, + "failed_quanta": failed_quanta, + "recovered_quanta": recovered_quanta, + "wonky_quanta": wonky_quanta, } for dataset_type_name, datasets in self._datasets.items(): - n_successful = 0 + n_published = 0 + n_unpublished = 0 + n_predicted_only = 0 + n_unsuccessful = 0 + n_cursed = 0 + unsuccessful_datasets = [] + cursed_datasets = {"parent_data_id": {}, "runs": {}, "message": {}} for dataset_key in datasets: - producer_quantum_key = self.get_producer_of(dataset_key) - producer_quantum_run: QuantumRun - if winning_run := self._xgraph.nodes[dataset_key].get("winner"): - # A dataset for this data ID was published. 
- producer_quantum_run = self._xgraph.nodes[producer_quantum_key]["runs"][winning_run] - if producer_quantum_run.status == "successful": - n_successful += 1 - else: - # A dataset for this data ID was not published. - # THIS BRANCH VERY MUCH TO DO. IT MAY BE GARBAGE. - dataset_run: DatasetRun - # final_status: str = "not_attempted" - for run, dataset_run in reversed(self._xgraph.nodes[dataset_key]["runs"].items()): - if dataset_run.produced: - # Published failures handled elsewhere. - break - producer_quantum_run = self._xgraph.nodes[producer_quantum_key]["runs"][run] - match producer_quantum_run.status: - case "successful": - # No work handled elsewhere. - break + dataset_info = self.get_dataset_info(dataset_key) + if dataset_info["status"] == "published": + n_published += 1 + elif dataset_info["status"] == "unpublished": + n_unpublished += 1 + elif dataset_info["status"] == "predicted_only": + n_predicted_only += 1 + elif dataset_info["status"] == "unsuccessful": + n_unsuccessful += 1 + unsuccessful_datasets.append(dataset_info["data_id"]) + elif dataset_info["status"] == "cursed": + n_cursed += 1 + cursed_datasets["parent_data_id"].update(dataset_info["data_id"]) + cursed_datasets["runs"].update(dataset_info["runs"]) + cursed_datasets["message"].update(dataset_info["message"]) result["datasets"][dataset_type_name] = { # This is the total number in the original QG. - "predicted": len(datasets), + "n_predicted": len(datasets), # These should all add up to 'predicted'... - "successful": n_successful, # (and published) - "failed": ..., - "blocked": ..., + "n_published": n_published, # (and published) + "n_unpublished": n_unpublished, + "n_predicted_only": n_predicted_only, + "n_unsuccessful": n_unsuccessful, + "n_cursed": n_cursed, + "unsuccessful_datasets": unsuccessful_datasets, + "cursed_datasets": cursed_datasets, } return result From b8ddd4f1494475d517d57610d4184bd26c0f9b8c Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Tue, 12 Mar 2024 13:47:58 -0700 Subject: [PATCH 06/18] Add diagnostic messages --- .../pipe/base/quantum_provenance_graph.py | 96 ++++++++++--------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 13b40e22f..c825d9787 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -215,11 +215,6 @@ class DatasetInfo(TypedDict): """Diagnostic messages to help disambiguate cursed states. """ - winner: str | None - """The run whose dataset was published, if any. These are retrievable with - butler.get - """ - class QuantumProvenanceGraph: """A set of already-run, merged quantum graphs with provenance @@ -288,7 +283,11 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.required_values) self._xgraph.add_node(quantum_key) quantum_info = self.get_quantum_info(quantum_key) - quantum_info["data_id"] = node.quantum.dataId + quantum_info.setdefault("messages", []) + quantum_info.setdefault("runs", {}) + quantum_info.setdefault("data_id", node.quantum.dataId) + quantum_info.setdefault("status", "not_attempted") + quantum_info.setdefault("recovered", False) new_quanta.append(quantum_key) self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key) # associate run collections with specific quanta. 
this is important @@ -304,7 +303,9 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # the quanta. self._xgraph.add_edge(quantum_key, dataset_key) dataset_info = self.get_dataset_info(dataset_key) - dataset_info["data_id"] = ref.dataId + dataset_info.setdefault("data_id", ref.dataId) + dataset_info.setdefault("status", "missing") + dataset_info.setdefault("messages", []) self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key) dataset_runs = dataset_info.setdefault("runs", {}) # make a DatasetRun for the specific dataset-run collection @@ -390,8 +391,13 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre quantum_info["recovered"] = True case (_, "logs_missing"): new_status = "wonky" + quantum_info["messages"].append(f"Logs missing for run {output_run!r}.") case ("successful", _): new_status = "wonky" + quantum_info["messages"].append( + f"Status went from successful in run {last_run!r} " + f"to {quantum_run.status!r} in {output_run!r}." + ) case (_, "blocked"): pass case (_, "metadata_missing"): @@ -410,45 +416,49 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = ): # find the datasets in a larger collection. "who won?" dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) - self._xgraph.nodes[dataset_key]["winner"] = ref.run - self._xgraph.nodes[dataset_key]["runs"][ref.run].published = True + dataset_info = self.get_dataset_info(dataset_key) + dataset_info["runs"][ref.run].published = True for task_label, task_quanta in self._quanta.items(): for quantum_key in task_quanta: # these are the run collections of the datasets produced by # this quantum that were published in the final collection - dataset_keys = self.iter_outputs_of(quantum_key) - winners = { - winner - for dataset_key in dataset_keys - if (winner := self._xgraph.nodes[dataset_key].get("winner")) - } - # note: we expect len(winners) = 1 - for run, quantum_run in self._xgraph.nodes[quantum_key]["runs"].items(): - for dataset_key in dataset_keys: - dataset_info = self.get_dataset_info(dataset_key) - match (quantum_run.status, (run in winners)): - case ("successful", True): - dataset_info["status"] = "published" - dataset_info["winner"] = run - case ("successful", False): - if len(winners) == 0: - # This is the No Work Found case - dataset_info["status"] = "predicted_only" - else: - dataset_info["status"] = "unpublished" - case (_, True): - # If anything other than a successful quantum - # produces a published dataset, that dataset - # is cursed. - dataset_info["status"] = "cursed" - case _: - if len(winners) > 1: - # This is the heterogeneous quanta case. - dataset_info["status"] = "cursed" - else: - # This should be a regular failure. 
- dataset_info["status"] = "unsuccessful" + published_runs = set() + quantum_info = self.get_quantum_info(quantum_key) + for dataset_key in self.iter_outputs_of(quantum_key): + dataset_info = self.get_dataset_info(dataset_key) + published_runs.update( + run for run, dataset_run in dataset_info["runs"].items() if dataset_run.published + ) + if any(dataset_run.published for dataset_run in dataset_info["runs"].values()): + publish_state = "published" + elif any(dataset_run.produced for dataset_run in dataset_info["runs"].values()): + publish_state = "unpublished" + else: + publish_state = "missing" + match (quantum_info["status"], publish_state): + case ("successful", "published"): + dataset_info["status"] = "published" + case ("successful", "missing"): + dataset_info["status"] = "predicted_only" + case ("successful", "unpublished"): + dataset_info["status"] = "unpublished" + case (_, "published"): + # If anything other than a successful quantum + # produces a published dataset, that dataset + # is cursed. + dataset_info["status"] = "cursed" + dataset_info["messages"].append( + "Published dataset is from an unsuccessful quantum." + ) + case _: + # This should be a regular failure. + dataset_info["status"] = "unsuccessful" + if len(published_runs) > 1: + quantum_info["status"] = "wonky" + quantum_info["messages"].append( + f"Outputs from different runs of the same quanta were published: {published_runs}." + ) def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]: """Summarize the QuantumProvenanceGraph in a dictionary. @@ -480,8 +490,6 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st failed_quanta = {"data_id": {}, "runs": {}, "message": {}} recovered_quanta = [] wonky_quanta = {"data_id": {}, "runs": {}, "message": {}} - # every item in this list will correspond to a data_id and be a - # dict keyed by run for quantum_key in quanta: quantum_info = self.get_quantum_info(quantum_key) if quantum_info["status"] == "successful": @@ -555,7 +563,7 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st # This is the total number in the original QG. "n_predicted": len(datasets), # These should all add up to 'predicted'... - "n_published": n_published, # (and published) + "n_published": n_published, "n_unpublished": n_unpublished, "n_predicted_only": n_predicted_only, "n_unsuccessful": n_unsuccessful, From e05df225115d2f35d5eb20d6d24bb0cd3d65eb06 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Thu, 21 Mar 2024 12:55:31 -0700 Subject: [PATCH 07/18] Output contents of QuantumProvenanceGraph in a summary dictionary --- .../pipe/base/quantum_provenance_graph.py | 116 +++++++++++++----- 1 file changed, 87 insertions(+), 29 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index c825d9787..7fedad7df 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -138,12 +138,19 @@ class QuantumRun: """The quantum graph node ID associated with the dataId in a specific run. """ - status: Literal[ - "failed", "successful", "logs_missing", "blocked", "metadata_missing" - ] = "metadata_missing" + status: Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"] = ( + "metadata_missing" + ) """The status of the quantum in that run. 
""" + def to_summary_dict( + self, + ) -> dict[str, Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"]]: + return { + "status": self.status, + } + class QuantumInfo(TypedDict): """Information about a quantum across all run collections. @@ -173,6 +180,21 @@ class QuantumInfo(TypedDict): """ +def make_quantum_info_summary_dict(quantum_info: QuantumInfo) -> dict[str, Any]: + return { + "data_id": dict(quantum_info["data_id"].required), + "runs": dict( + zip( + quantum_info["runs"].keys(), + [value.to_summary_dict() for value in quantum_info["runs"].values()], + ) + ), + "status": quantum_info["status"], + "recovered": quantum_info["recovered"], + "messages": quantum_info["messages"], + } + + @dataclasses.dataclass class DatasetRun: """Information about a dataset in a given run collection.""" @@ -192,6 +214,12 @@ class DatasetRun: def __post_init__(self) -> None: assert not (self.published and not self.produced) + def to_summary_dict(self) -> dict[str, bool]: + return { + "produced": self.produced, + "published": self.published, + } + class DatasetInfo(TypedDict): """Information about a given dataset across all runs. @@ -199,6 +227,10 @@ class DatasetInfo(TypedDict): Used to annotate the networkx node dictionary. """ + parent_task: QuantumKey.task_label + """The task_label of the task which produced this dataset. + """ + data_id: DataCoordinate """The data_id of the quantum. """ @@ -216,6 +248,21 @@ class DatasetInfo(TypedDict): """ +def make_dataset_info_summary_dict(dataset_info: DatasetInfo) -> dict[str, Any]: + return { + "parent_task": str(dataset_info["parent_task"]), + "data_id": dict(dataset_info["data_id"].required), + "runs": dict( + zip( + dataset_info["runs"].keys(), + [value.to_summary_dict() for value in dataset_info["runs"].values()], + ) + ), + "status": dataset_info["status"], + "messages": dataset_info["messages"], + } + + class QuantumProvenanceGraph: """A set of already-run, merged quantum graphs with provenance information. @@ -303,6 +350,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # the quanta. self._xgraph.add_edge(quantum_key, dataset_key) dataset_info = self.get_dataset_info(dataset_key) + dataset_info.setdefault("parent_task", quantum_key.task_label) dataset_info.setdefault("data_id", ref.dataId) dataset_info.setdefault("status", "missing") dataset_info.setdefault("messages", []) @@ -315,7 +363,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre if dataset_key.parent_dataset_type_name.endswith("_metadata"): quantum_info["metadata"] = dataset_key if dataset_key.parent_dataset_type_name.endswith("_log"): - quantum_info[quantum_key]["log"] = dataset_key + quantum_info["log"] = dataset_key for ref in itertools.chain.from_iterable(node.quantum.inputs.values()): dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values) if dataset_key in self._xgraph: @@ -399,14 +447,15 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre f"to {quantum_run.status!r} in {output_run!r}." 
) case (_, "blocked"): - pass + new_status = last_status case (_, "metadata_missing"): new_status = "not_attempted" - case ("failed", _): - pass + case (_, "failed"): + new_status = "failed" quantum_info["status"] = new_status def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = None, where: str = ""): + # could also call "resolve runs" for dataset_type_name in self._datasets: for ref in butler.registry.queryDatasets( dataset_type_name, @@ -427,6 +476,7 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = quantum_info = self.get_quantum_info(quantum_key) for dataset_key in self.iter_outputs_of(quantum_key): dataset_info = self.get_dataset_info(dataset_key) + dataset_info["parent_task"] = quantum_key.task_label published_runs.update( run for run, dataset_run in dataset_info["runs"].items() if dataset_run.published ) @@ -487,27 +537,28 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st n_wonky = 0 n_blocked = 0 n_failed = 0 - failed_quanta = {"data_id": {}, "runs": {}, "message": {}} + failed_quanta = {"data_id": {}, "runs": {}, "messages": {}} recovered_quanta = [] - wonky_quanta = {"data_id": {}, "runs": {}, "message": {}} + wonky_quanta = {"data_id": {}, "runs": {}, "messages": {}} for quantum_key in quanta: quantum_info = self.get_quantum_info(quantum_key) - if quantum_info["status"] == "successful": + quantum_summary = make_quantum_info_summary_dict(quantum_info) + if quantum_summary["status"] == "successful": n_successful += 1 - if quantum_info["recovered"]: - recovered_quanta.append(quantum_info["data_id"]) - elif quantum_info["status"] == "wonky": + if quantum_summary["recovered"]: + recovered_quanta.append(quantum_summary["data_id"]) + elif quantum_summary["status"] == "wonky": n_wonky += 1 - wonky_quanta["data_id"].update(quantum_info["data_id"]) - wonky_quanta["runs"].update(quantum_info["runs"]) - wonky_quanta["message"].update(quantum_info["messages"]) - elif quantum_info["status"] == "blocked": + wonky_quanta.update({"data_id": quantum_summary["data_id"]}) + wonky_quanta.update({"runs": quantum_summary["runs"]}) + wonky_quanta.update({"messages": quantum_summary["messages"]}) + elif quantum_summary["status"] == "blocked": n_blocked += 1 - elif quantum_info["status"] == "failed": + elif quantum_summary["status"] == "failed": n_failed += 1 - failed_quanta["data_id"].update(quantum_info["data_id"]) - runs = quantum_info["runs"] - failed_quanta["runs"].update(runs) + failed_quanta.update({"data_id": quantum_summary["data_id"]}) + runs = quantum_summary["runs"] + failed_quanta.update({"runs": runs}) log_key: DatasetKey = self._xgraph.nodes[quantum_key]["log"] if do_store_logs: for run in runs: @@ -518,11 +569,11 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st log_key.parent_dataset_type_name, quantum_info["data_id"], collections=run ) except LookupError: - failed_quanta["message"] = [] + failed_quanta["messages"] = [] except FileNotFoundError: - failed_quanta["message"] = None + failed_quanta["messages"] = None else: - failed_quanta["message"].update( + failed_quanta["messages"].extend( [record.message for record in log if record.levelno >= logging.ERROR] ) result["tasks"][task_label] = { @@ -541,9 +592,15 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st n_unsuccessful = 0 n_cursed = 0 unsuccessful_datasets = [] - cursed_datasets = {"parent_data_id": {}, "runs": {}, "message": {}} + cursed_datasets = { + 
"parent_task_name": "", + "parent_data_id": {}, + "runs": {}, + "messages": [], + } for dataset_key in datasets: dataset_info = self.get_dataset_info(dataset_key) + dataset_summary = make_dataset_info_summary_dict(dataset_info) if dataset_info["status"] == "published": n_published += 1 elif dataset_info["status"] == "unpublished": @@ -552,12 +609,13 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st n_predicted_only += 1 elif dataset_info["status"] == "unsuccessful": n_unsuccessful += 1 - unsuccessful_datasets.append(dataset_info["data_id"]) + unsuccessful_datasets.append(dataset_summary["data_id"]) elif dataset_info["status"] == "cursed": n_cursed += 1 - cursed_datasets["parent_data_id"].update(dataset_info["data_id"]) - cursed_datasets["runs"].update(dataset_info["runs"]) - cursed_datasets["message"].update(dataset_info["message"]) + cursed_datasets.update({"parent_task_name": dataset_summary["parent_task"]}) + cursed_datasets.update({"parent_data_id": dataset_summary["data_id"]}) + cursed_datasets.update({"runs": dataset_summary["runs"]}) + cursed_datasets.update({"messages": dataset_summary["messages"]}) result["datasets"][dataset_type_name] = { # This is the total number in the original QG. From 8c0e74a26a1f662a907f86ed9cd35dfdd1fb19fd Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Tue, 30 Apr 2024 13:41:05 -0700 Subject: [PATCH 08/18] Pydantify Quantum and datasetType summaries, document and resolve some bugs --- doc/changes/DM-41711.feature.md | 3 + .../pipe/base/quantum_provenance_graph.py | 735 ++++++++++++------ 2 files changed, 504 insertions(+), 234 deletions(-) create mode 100644 doc/changes/DM-41711.feature.md diff --git a/doc/changes/DM-41711.feature.md b/doc/changes/DM-41711.feature.md new file mode 100644 index 000000000..91dd23095 --- /dev/null +++ b/doc/changes/DM-41711.feature.md @@ -0,0 +1,3 @@ +Create a QuantumProvenanceGraph, which details the status of every quantum +and dataset over multiple attempts at executing graphs, noting when quanta +have been recovered. diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 7fedad7df..695c69229 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -39,14 +39,14 @@ "PrerequisiteDatasetKey", ) -import dataclasses import itertools import logging import uuid from collections.abc import Iterator, Sequence -from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, TypedDict +from typing import TYPE_CHECKING, ClassVar, Literal, NamedTuple, TypeAlias, TypedDict, cast import networkx +import pydantic from lsst.daf.butler import Butler, DataCoordinate, DataIdValue from lsst.resources import ResourcePathExpression from lsst.utils.logging import getLogger @@ -77,12 +77,6 @@ class QuantumKey(NamedTuple): than a dataset (always `True`). 
""" - def to_summary_dict(self, xgraph: networkx.DiGraph) -> dict[str, Any]: - return { - "task": self.task_label, - "data_id": xgraph.nodes[self]["data_id"], - } - class DatasetKey(NamedTuple): """Identifier type for dataset keys in a `QuantumProvenanceGraph`.""" @@ -130,26 +124,22 @@ class PrerequisiteDatasetKey(NamedTuple): is_prerequisite: ClassVar[Literal[True]] = True -@dataclasses.dataclass -class QuantumRun: +QuantumRunStatus: TypeAlias = Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"] + + +class QuantumRun(pydantic.BaseModel): """Information about a quantum in a given run collection.""" id: uuid.UUID """The quantum graph node ID associated with the dataId in a specific run. """ - status: Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"] = ( - "metadata_missing" - ) + status: QuantumRunStatus = "metadata_missing" """The status of the quantum in that run. """ - def to_summary_dict( - self, - ) -> dict[str, Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"]]: - return { - "status": self.status, - } + +QuantumInfoStatus: TypeAlias = Literal["successful", "wonky", "blocked", "not_attempted", "failed"] class QuantumInfo(TypedDict): @@ -166,7 +156,7 @@ class QuantumInfo(TypedDict): """All run collections associated with the quantum. """ - status: Literal["successful", "wonky", "blocked", "not_attempted", "failed"] + status: QuantumInfoStatus """The overall status of the quantum. Note that it is impossible to exit a wonky state. """ @@ -179,24 +169,18 @@ class QuantumInfo(TypedDict): """Diagnostic messages to help disambiguate wonky states. """ + log: DatasetKey + """The `DatasetKey` which can be used to access the log associated with the + quantum. + """ -def make_quantum_info_summary_dict(quantum_info: QuantumInfo) -> dict[str, Any]: - return { - "data_id": dict(quantum_info["data_id"].required), - "runs": dict( - zip( - quantum_info["runs"].keys(), - [value.to_summary_dict() for value in quantum_info["runs"].values()], - ) - ), - "status": quantum_info["status"], - "recovered": quantum_info["recovered"], - "messages": quantum_info["messages"], - } + metadata: DatasetKey + """The `DatasetKey` which can be used to access the metadata for the + quantum. + """ -@dataclasses.dataclass -class DatasetRun: +class DatasetRun(pydantic.BaseModel): """Information about a dataset in a given run collection.""" id: uuid.UUID @@ -211,14 +195,16 @@ class DatasetRun: """Whether this dataset was published in the final output collection. """ - def __post_init__(self) -> None: + @pydantic.model_validator(mode="after") + def _validate(self) -> DatasetRun: + """Validate the model for `DatasetRun` by asserting that no published + `DatasetRun` is also not produced (this should be impossible). + """ assert not (self.published and not self.produced) + return self + - def to_summary_dict(self) -> dict[str, bool]: - return { - "produced": self.produced, - "published": self.published, - } +DatasetInfoStatus: TypeAlias = Literal["published", "unpublished", "predicted_only", "unsuccessful", "cursed"] class DatasetInfo(TypedDict): @@ -227,10 +213,6 @@ class DatasetInfo(TypedDict): Used to annotate the networkx node dictionary. """ - parent_task: QuantumKey.task_label - """The task_label of the task which produced this dataset. - """ - data_id: DataCoordinate """The data_id of the quantum. """ @@ -239,7 +221,7 @@ class DatasetInfo(TypedDict): """All runs associated with the dataset. 
""" - status: Literal["published", "unpublished", "predicted_only", "unsuccessful", "cursed"] + status: DatasetInfoStatus """Overall status of the dataset. """ @@ -248,19 +230,279 @@ class DatasetInfo(TypedDict): """ -def make_dataset_info_summary_dict(dataset_info: DatasetInfo) -> dict[str, Any]: - return { - "parent_task": str(dataset_info["parent_task"]), - "data_id": dict(dataset_info["data_id"].required), - "runs": dict( - zip( - dataset_info["runs"].keys(), - [value.to_summary_dict() for value in dataset_info["runs"].values()], - ) - ), - "status": dataset_info["status"], - "messages": dataset_info["messages"], - } +class UnsuccessfulQuantumSummary(pydantic.BaseModel): + """A summary of all relevant information on an unsuccessful quantum.""" + + data_id: dict[str, DataIdValue] + """The data_id of the unsuccessful quantum. + """ + runs: dict[str, QuantumRunStatus] + """A dictionary including the `QuantumRunStatus` of each run associated + with an attempt to process the unsuccessful quantum. + """ + messages: list[str] + """Any messages associated with the unsuccessful quantum (any clues as to + why the quantum may be in a `failed` or `wonky` state). + """ + + @classmethod + def from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary: + """Summarize all relevant information from the `QuantumInfo` in an + `UnsuccessfulQuantumSummary`; return an `UnsuccessfulQuantumSummary`. + + Parameters + ---------- + info : `QuantumInfo` + The `QuantumInfo` object for the unsuccessful quantum. + """ + return cls( + data_id=dict(info["data_id"].required), + runs={k: v.status for k, v in info["runs"].items()}, + messages=info["messages"], + ) + + +class TaskSummary(pydantic.BaseModel): + """A summary of the status of all quanta for a single task.""" + + n_successful: int = 0 + """A count of successful quanta. + """ + n_blocked: int = 0 + """A count of blocked quanta. + """ + n_not_attempted: int = 0 + """A count of quanta for which processing was not attempted. + """ + + n_expected: int = 0 + """The number of quanta expected by the graph. + """ + + @pydantic.computed_field # type: ignore[misc] + @property + def n_wonky(self) -> int: + """Return a count of `wonky` quanta.""" + return len(self.wonky_quanta) + + @pydantic.computed_field # type: ignore[misc] + @property + def n_failed(self) -> int: + """Return a count of `failed` quanta.""" + return len(self.failed_quanta) + + failed_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) + """A list of all `UnsuccessfulQuantumSummary` objects associated with the + `failed` quanta. This is a report containing their data_ids, the status + of each run associated with each `failed` quantum, and the error messages + associated with the failures when applicable. + """ + recovered_quanta: list[dict[str, DataIdValue]] = pydantic.Field(default_factory=list) + """A list of the quanta which moved from an unsuccessful to `successful` + state. + """ + wonky_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) + """A list of all `UnsuccessfulQuantumSummary` objects associated with the + `wonky` quanta. This is a report containing their data_ids, the status of + each run associated with each `wonky` quantum, and messages (dictated in + this module) associated with the particular issue identified. + """ + + def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: bool = True) -> None: + """Add a `QuantumInfo` to a `TaskSummary`. 
+
+        Unpack the `QuantumInfo` object, sorting quanta of each status into
+        the correct place in the `TaskSummary`. If looking for error messages
+        in the `Butler` logs is desired, take special care to catch issues
+        with missing logs.
+
+        Parameters
+        ----------
+        info : `QuantumInfo`
+            The `QuantumInfo` object to add to the `TaskSummary`.
+        butler : `lsst.daf.butler.Butler`
+            The butler repo used for the graph being inspected, which can be
+            queried for errors and logs.
+        do_store_logs : `bool`, optional
+            Store error messages from Butler logs associated with failed quanta
+            if `True`.
+        """
+        match info["status"]:
+            case "successful":
+                self.n_successful += 1
+                if info["recovered"]:
+                    self.recovered_quanta.append(dict(info["data_id"].required))
+            case "wonky":
+                self.wonky_quanta.append(UnsuccessfulQuantumSummary.from_info(info))
+            case "blocked":
+                self.n_blocked += 1
+            case "failed":
+                failed_quantum_summary = UnsuccessfulQuantumSummary.from_info(info)
+                log_key = info["log"]
+                if do_store_logs:
+                    for run in info["runs"]:
+                        try:
+                            # should probably upgrade this to use a dataset
+                            # ref
+                            log = butler.get(
+                                log_key.parent_dataset_type_name, info["data_id"], collections=run
+                            )
+                        except LookupError:
+                            failed_quantum_summary.messages.append(f"Logs not ingested for {run!r}")
+                        except FileNotFoundError:
+                            failed_quantum_summary.messages.append(f"Logs missing or corrupt for {run!r}")
+                        else:
+                            failed_quantum_summary.messages.extend(
+                                [record.message for record in log if record.levelno >= logging.ERROR]
+                            )
+                self.failed_quanta.append(failed_quantum_summary)
+            case "not_attempted":
+                self.n_not_attempted += 1
+            case unrecognized_state:
+                raise AssertionError(f"Unrecognized quantum status {unrecognized_state!r}")
+
+
+class CursedDatasetSummary(pydantic.BaseModel):
+    """A summary of all the relevant information on a `cursed` dataset."""
+
+    producer_data_id: dict[str, DataIdValue]
+    """The data_id of the task which produced this dataset. This is mostly
+    useful for people wishing to track down the task which produced this
+    `cursed` dataset quickly.
+    """
+    data_id: dict[str, DataIdValue]
+    """The data_id of the cursed `Dataset`.
+    """
+    runs_produced: dict[str, bool]
+    """A dictionary of all the runs associated with the `cursed` dataset;
+    the `bool` is true if the dataset was produced in the associated run.
+    """
+    run_published: str | None
+    """The run in which the `cursed` dataset was published, if any.
+    """
+    messages: list[str]
+    """Any diagnostic messages (dictated in this module) which might help in
+    understanding why or how the dataset became cursed.
+    """
+
+    @classmethod
+    def from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatasetSummary:
+        """Summarize all relevant information from the `DatasetInfo` in a
+        `CursedDatasetSummary`; return a `CursedDatasetSummary`.
+
+        Parameters
+        ----------
+        info : `DatasetInfo`
+            All relevant information on the dataset.
+        producer_info : `QuantumInfo`
+            All relevant information on the producer task. This is used to
+            report the data_id of the producer task.
+ """ + runs_published = {k for k, v in info["runs"].items() if v.published} + return cls( + producer_data_id=dict(producer_info["data_id"].required), + data_id=dict(info["data_id"].required), + runs_produced={k: v.produced for k, v in info["runs"].items()}, + # this has at most one element + run_published=runs_published.pop() if runs_published else None, + messages=info["messages"], + ) + + +class DatasetTypeSummary(pydantic.BaseModel): + """A summary of the status of all datasets of a particular type.""" + + producer: str + """The name of the task which produced this dataset. + """ + + n_published: int = 0 + """A count of the datasets of this type which were published in the final + collection. + """ + n_unpublished: int = 0 + """A count of the datasets of this type which were produced but not + published. This includes any datasets which do not come up in a butler + query over their associated collection. + """ + n_predicted_only: int = 0 + """A count of the datasets of this type which were predicted but + ultimately not produced. Note that this does not indicate a failure, + which are accounted for differently. This is commonly referred to as + a `NoWorkFound` case. + """ + n_expected: int = 0 + """The number of datasets of this type expected by the graph. + """ + + @pydantic.computed_field # type: ignore[misc] + @property + def n_cursed(self) -> int: + """Return a count of cursed datasets.""" + return len(self.cursed_datasets) + + @pydantic.computed_field # type: ignore[misc] + @property + def n_unsuccessful(self) -> int: + """Return a count of unsuccessful datasets.""" + return len(self.unsuccessful_datasets) + + cursed_datasets: list[CursedDatasetSummary] = pydantic.Field(default_factory=list) + """A list of all `CursedDatasetSummary` objects associated with the + `cursed` datasets. This is a report containing their data_ids and the + data_ids of their producer task, the status of each run associated with + each `cursed` dataset, and messages (dictated in this module) associated + with the particular issue identified. + """ + unsuccessful_datasets: list[dict[str, DataIdValue]] = pydantic.Field(default_factory=list) + """A list of all unsuccessful datasets by their name and data_id. + """ + + def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> None: + """Add a `DatasetInfo` to a `DatasetTypeSummary`. + + Unpack the `DatasetInfo` object, sorting datasets of each status into + the correct place in the `DatasetTypeSummary`. If the status of a + dataset is not valid, raise an `AssertionError`. + + Parameters + ---------- + info : `DatasetInfo` + The `DatasetInfo` object to add to the `DatasetTypeSummary`. + producer_info : `QuantumInfo` + The `QuantumInfo` object associated with the producer of the + dataset. This is used to report the producer task in the + summaries for `cursed` datasets, which may help identify + specific issues. 
+ """ + match info["status"]: + case "published": + self.n_published += 1 + case "unpublished": + self.n_unpublished += 1 + case "unsuccessful": + self.unsuccessful_datasets.append(dict(info["data_id"].mapping)) + case "cursed": + self.cursed_datasets.append(CursedDatasetSummary.from_info(info, producer_info)) + case "predicted_only": + self.n_predicted_only += 1 + case unrecognized_state: + raise AssertionError(f"Unrecognized dataset status {unrecognized_state!r}") + + +class Summary(pydantic.BaseModel): + """A summary of the contents of the QuantumProvenanceGraph, including + all information on the quanta for each `Task` and the datasets of each + `DatasetType`. + """ + + tasks: dict[str, TaskSummary] = pydantic.Field(default_factory=dict) + """Summaries for the tasks and their quanta. + """ + + datasets: dict[str, DatasetTypeSummary] = pydantic.Field(default_factory=dict) + """Summaries for the datasets. + """ class QuantumProvenanceGraph: @@ -269,16 +511,14 @@ class QuantumProvenanceGraph: Step through all the quantum graphs associated with certain tasks or processing steps. For each graph/attempt, the status of each quantum and - dataset is recorded in `QuantumProvenanceGraph.add_new_graph` and duplicate - outcomes of dataIds are resolved in - `QuantumProvenanceGraph.resolve_duplicates`. At the end of this process, we - can combine all attempts into a final summary graph which can be converted - into a report on the production over multiple processing and recovery - attempts in `name functions later`. This serves to answer the question - "What happened to this data ID?" in a wholistic sense. + dataset is recorded in `QuantumProvenanceGraph.add_new_graph` and outcomes + of quanta over multiple runs are resolved in + `QuantumProvenanceGraph.resolve_duplicates`. At the end of this process, + we can combine all attempts into a summary. This serves to answer the + question "What happened to this data ID?" in a wholistic sense. """ - def __init__(self): + def __init__(self) -> None: # The graph we annotate as we step through all the graphs associated # with the processing to create the `QuantumProvenanceGraph`. self._xgraph = networkx.DiGraph() @@ -289,30 +529,53 @@ def __init__(self): self._datasets: dict[str, set[DatasetKey]] = {} def get_quantum_info(self, key: QuantumKey) -> QuantumInfo: + """Get a `QuantumInfo` object from the `QuantumProvenanceGraph` using + a `QuantumKey`. + + Parameters + ---------- + key : `QuantumKey` + The key used to refer to the node on the graph. + """ return self._xgraph.nodes[key] def get_dataset_info(self, key: DatasetKey) -> DatasetInfo: + """Get a `DatasetInfo` object from the `QuantumProvenanceGraph` using + a `DatasetKey`. + + Parameters + ---------- + key : `DatasetKey` + The key used to refer to the node on the graph. + """ return self._xgraph.nodes[key] def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: """Add a new quantum graph to the `QuantumProvenanceGraph`. - Step through the quantum graph. Annotate a mirror networkx.DiGraph - (QuantumProvenanceGraph._xgraph) with all of the relevant information: - quanta, dataset types and their associated run collection (these unique - quanta- and dataset type-run collection combinations are encapsulated - in the dataclasses `DatasetRun` and `QuantumRun`). For each new - quantum, annotate the status of the `QuantumRun` by inspecting the - graph. If a DatasetType was produced, annotate this in the run by - setting `DatasetRun.produced = True`. 
Then, we can resolve newly-
-        successful quanta (failed in previous runs) with
-        `QuantumProvenanceGraph.resolve_duplicates`.
+        Step through the quantum graph. Annotate a `networkx.DiGraph`
+        (`QuantumProvenanceGraph._xgraph`) with all of the relevant
+        information: quanta, dataset types and their associated run
+        collections (these unique quanta- and dataset type-run
+        collection combinations are encapsulated in the classes
+        `DatasetRun` and `QuantumRun`). For each new quantum, annotate
+        the status of the `QuantumRun` by inspecting the graph. If a
+        DatasetType was produced, annotate this in the run by setting
+        `DatasetRun.produced = True`. If a quantum is given `blocked`
+        or `failed` status, annotate all of its successors in the graph
+        as `blocked`. For each new quantum, use the transition between
+        the current and last `QuantumRun.status` to determine the status
+        to assign to the overall `QuantumInfo`. For example, if a
+        previous run associated with a quantum had the status `failed`,
+        and the status from the new graph reads `successful`, we can
+        mark the overall quantum status as `successful` and list the id
+        as `recovered`.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
-            The Butler used for this report. This should match the Butler used
-            for the run associated with the executed quantum graph.
+            The Butler used for this report. This should match the Butler
+            used for the run associated with the executed quantum graph.

        qgraph : `QuantumGraph` | `ResourcePathExpression`
            Either the associated quantum graph object or the uri of the
@@ -325,14 +588,17 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre
        output_run = qgraph.metadata["output_run"]
        new_quanta = []
        for node in qgraph:
-            # make a key to add to the mirror graph with specific quanta for
-            # nodes.
-            quantum_key = QuantumKey(node.taskDef.label, node.quantum.dataId.required_values)
+            # make a key to refer to the quantum and add it to the graph.
+            quantum_key = QuantumKey(
+                node.taskDef.label, cast(DataCoordinate, node.quantum.dataId).required_values
+            )
            self._xgraph.add_node(quantum_key)
+            # use the key to get a `QuantumInfo` object for the quantum
+            # and set defaults for its values.
            quantum_info = self.get_quantum_info(quantum_key)
            quantum_info.setdefault("messages", [])
            quantum_info.setdefault("runs", {})
-            quantum_info.setdefault("data_id", node.quantum.dataId)
+            quantum_info.setdefault("data_id", cast(DataCoordinate, node.quantum.dataId))
            quantum_info.setdefault("status", "not_attempted")
            quantum_info.setdefault("recovered", False)
            new_quanta.append(quantum_key)
@@ -341,25 +607,28 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre
            quantum_runs = quantum_info.setdefault("runs", {})
-            # the QuantumRun here is the specific quantum-run collection
+            # the `QuantumRun` here is the specific quantum-run collection
            # combination.
-            quantum_runs[output_run] = QuantumRun(node.nodeId)
+            quantum_runs[output_run] = QuantumRun(id=node.nodeId)
+            # For each of the outputs of the quanta (datasets) make a key to
+            # refer to the dataset.
            for ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
-                # add datasets to the nodes of the mirror graph, with edges on
-                # the quanta.
+                # add datasets to the nodes of the graph, with edges on the
+                # quanta.
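+                # (the edge direction, quantum -> dataset, is what lets
+                # `get_producer_of` later recover a dataset's producer as
+                # its lone predecessor in the graph.)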
self._xgraph.add_edge(quantum_key, dataset_key) + # use the dataset key to make a `DatasetInfo` object for + # the dataset and set defaults for its values. dataset_info = self.get_dataset_info(dataset_key) - dataset_info.setdefault("parent_task", quantum_key.task_label) dataset_info.setdefault("data_id", ref.dataId) - dataset_info.setdefault("status", "missing") + dataset_info.setdefault("status", "predicted_only") dataset_info.setdefault("messages", []) self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key) dataset_runs = dataset_info.setdefault("runs", {}) - # make a DatasetRun for the specific dataset-run collection - # combination. - dataset_runs[output_run] = DatasetRun(ref.id) - # save metadata and logs for easier status interpretation + # make a `DatasetRun` for the specific dataset-run + # collection combination. + dataset_runs[output_run] = DatasetRun(id=ref.id) + # save metadata and logs for easier status interpretation later if dataset_key.parent_dataset_type_name.endswith("_metadata"): quantum_info["metadata"] = dataset_key if dataset_key.parent_dataset_type_name.endswith("_log"): @@ -374,6 +643,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre for ref in butler.registry.queryDatasets(dataset_type_name, collections=output_run): # find the datasets in the butler dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) + dataset_info = self.get_dataset_info(dataset_key) dataset_run = dataset_info["runs"][output_run] # dataset run (singular) # if the dataset is in the output run collection, we produced # it! @@ -387,122 +657,185 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre log_key = quantum_info["log"] metadata_dataset_run = self.get_dataset_info(metadata_key)["runs"][output_run] log_dataset_run = self.get_dataset_info(log_key)["runs"][output_run] - if metadata_dataset_run.produced: # check with Jim about this condition - # if we do have metadata: + # if we do have metadata, we know that the task finished. + if metadata_dataset_run.produced: + # if we also have logs, this is a success. if log_dataset_run.produced: - # if we also have logs, this is a success - # this includes No Work Found (the only things produced - # were metadata and logs). quantum_run.status = "successful" + # if we have metadata and no logs, this is a very rare + # case. either the task ran successfully and the datastore + # died immediately afterwards, or some supporting + # infrastructure for transferring the logs to the datastore + # failed. else: - # if we have metadata and no logs, this is a very rare - # case. either the task ran successfully and the datastore - # died immediately afterwards, or some supporting - # infrastructure for transferring the logs to the datastore - # failed. quantum_run.status = "logs_missing" + # missing metadata means that the task did not finish. + else: - # missing metadata means that the task did not finish. + # if we have logs and no metadata, the task not finishing is + # a failure in the task itself. This includes all payload + # errors and some other problems. if log_dataset_run.produced: - # if we do have logs, the task not finishing is a failure - # in the task itself. This includes all payload errors and - # some other errors. quantum_run.status = "failed" # if a quantum fails, all its successor datasets are # blocked. 
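                    # ("blocked" accumulates every downstream dataset of a
                    # failed quantum; it is consulted below to decide whether
                    # a quantum with no metadata and no logs was blocked.)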
blocked.update(self._xgraph.successors(quantum_key))
+                # if we are missing metadata and logs, either the task was not
+                # started, or a hard external environmental error prevented
+                # it from writing logs or metadata.
                else:
-                    # we are missing metadata and logs. Either the task was not
-                    # started, or a hard external environmental error prevented
-                    # it from writing logs or metadata.
+                    # if none of this quantum's inputs were blocked, the
+                    # metadata must just be missing.
                    if blocked.isdisjoint(self._xgraph.predecessors(quantum_key)):
                        # None of this quantum's inputs were blocked.
                        quantum_run.status = "metadata_missing"
+                    # otherwise we can assume from no metadata and no logs
+                    # that the task was blocked by an upstream failure.
                    else:
                        quantum_run.status = "blocked"
                        blocked.update(self._xgraph.successors(quantum_key))

            # Now we can start using state transitions to mark overall status.
-            if len(quantum_info["runs"]) == 1:
-                last_status = "not_attempted"
-            else:
-                last_run = list(quantum_info["runs"].values())[-1]
-                last_status = last_run.status
+            last_status = quantum_info["status"]
+            new_status: QuantumInfoStatus
            match last_status, quantum_run.status:
-                case ("not_attempted", new_status):
-                    pass
+                # A quantum can never escape a `wonky` state.
                case ("wonky", _):
                    new_status = "wonky"
+                # Any transition to a success (excluding from `wonky`) is
+                # a success; any transition from a failed state is also a
+                # recovery.
                case (_, "successful"):
                    new_status = "successful"
-                    if last_status != "successful":
+                    if last_status != "successful" and last_status != "not_attempted":
                        quantum_info["recovered"] = True
+                # Missing logs are one of the categories of wonky quanta. They
+                # interfere with our ability to discern quantum status and are
+                # signs of weird things afoot in processing. Add a message
+                # noting why this quantum is being marked as wonky to be stored
+                # in its `UnsuccessfulQuantumInfo`.
                case (_, "logs_missing"):
                    new_status = "wonky"
                    quantum_info["messages"].append(f"Logs missing for run {output_run!r}.")
+                # Leaving a successful state is another category of wonky
+                # quanta. If a previous success fails on a subsequent run,
+                # a human should inspect why. Add a message noting why this
+                # quantum is being marked as wonky to be stored in its
+                # `UnsuccessfulQuantumInfo`.
                case ("successful", _):
                    new_status = "wonky"
                    quantum_info["messages"].append(
-                        f"Status went from successful in run {last_run!r} "
+                        f"Status went from successful in run {list(quantum_info['runs'].values())[-1]!r} "
                        f"to {quantum_run.status!r} in {output_run!r}."
                    )
+                # If a quantum is not attempted and moves to blocked, we know
+                # for sure that it is a blocked quantum.
+                case ("not_attempted", "blocked"):
+                    new_status = "blocked"
+                # A transition into blocked does not change the overall quantum
+                # status for a failure.
                case (_, "blocked"):
                    new_status = last_status
+                # If a quantum transitions from any state into missing
+                # metadata, it was probably not attempted.
                case (_, "metadata_missing"):
                    new_status = "not_attempted"
+                # Any transition into failure is a failed state.
                case (_, "failed"):
                    new_status = "failed"
+            # Update `QuantumInfo.status` for this quantum.
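+            # In summary, the transitions implemented above are:
+            #   wonky -> anything: stays wonky;
+            #   anything -> successful: successful (and a recovery, unless
+            #   the quantum was previously successful or not attempted);
+            #   anything -> logs_missing: wonky;
+            #   successful -> anything else: wonky;
+            #   not_attempted -> blocked: blocked;
+            #   otherwise -> blocked: keeps the previous status;
+            #   anything -> metadata_missing: not_attempted;
+            #   anything -> failed: failed.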
quantum_info["status"] = new_status

-    def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = None, where: str = ""):
-        # could also call "resolve runs"
+    def resolve_duplicates(
+        self, butler: Butler, collections: Sequence[str] | None = None, where: str = ""
+    ) -> None:
+        """After quantum graphs associated with each run have been added
+        to the `QuantumProvenanceGraph`, resolve any discrepancies between
+        them and use all attempts to finalize overall status.
+
+        In particular, use the publish state of each `DatasetRun` in
+        combination with overall quantum status to ascertain the status of
+        each dataset. Additionally, if there are multiple published runs
+        associated with a dataset, mark the producer quantum as `wonky`.
+
+        This method should be called after
+        `QuantumProvenanceGraph.add_new_graph` has been called on every graph
+        associated with the data processing.
+
+        Parameters
+        ----------
+        butler : `lsst.daf.butler.Butler`
+            The Butler used for this report. This should match the Butler used
+            for the run associated with the executed quantum graph.
+
+        collections : `Sequence[str]` | `None`
+            Collections to use in `lsst.daf.butler.registry.queryDatasets` if
+            paring down the query would be useful.
+
+        where : `str`
+            A "where" string to use to constrain the collections, if passed.
+        """
        for dataset_type_name in self._datasets:
+            # find datasets in a larger collection.
            for ref in butler.registry.queryDatasets(
                dataset_type_name,
                collections=collections,
                findFirst=True,
                where=where,
            ):
-                # find the datasets in a larger collection. "who won?"
                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
                dataset_info = self.get_dataset_info(dataset_key)
+                # queryable datasets are `published`.
                dataset_info["runs"][ref.run].published = True
-        for task_label, task_quanta in self._quanta.items():
+        for task_quanta in self._quanta.values():
            for quantum_key in task_quanta:
-                # these are the run collections of the datasets produced by
-                # this quantum that were published in the final collection
-                published_runs = set()
+                # runs associated with published datasets.
+                published_runs: set[str] = set()
                quantum_info = self.get_quantum_info(quantum_key)
+                # Loop over each dataset in the outputs of a single quantum.
                for dataset_key in self.iter_outputs_of(quantum_key):
                    dataset_info = self.get_dataset_info(dataset_key)
-                    dataset_info["parent_task"] = quantum_key.task_label
                    published_runs.update(
                        run for run, dataset_run in dataset_info["runs"].items() if dataset_run.published
                    )
                    if any(dataset_run.published for dataset_run in dataset_info["runs"].values()):
                        publish_state = "published"
+                    # set the publish state to `unpublished` if the dataset was
+                    # produced but not published (i.e., not queryable from the
+                    # final collection(s)).
                    elif any(dataset_run.produced for dataset_run in dataset_info["runs"].values()):
                        publish_state = "unpublished"
+                    # a dataset which was not produced and not published is
+                    # missing.
                    else:
                        publish_state = "missing"
+                    # use the quantum status and publish state to ascertain the
+                    # status of the dataset.
                    match (quantum_info["status"], publish_state):
+                        # published datasets from successful quanta are as
+                        # intended.
                        case ("successful", "published"):
                            dataset_info["status"] = "published"
+                        # missing datasets from successful quanta indicate a
+                        # `NoWorkFound` case.
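+                        # (i.e. the task ran to completion but legitimately
+                        # wrote nothing for this data ID beyond its metadata
+                        # and logs.)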
case ("successful", "missing"): dataset_info["status"] = "predicted_only" case ("successful", "unpublished"): dataset_info["status"] = "unpublished" + # If anything other than a successful quantum produces + # a published dataset, that dataset is cursed. Set the + # status for the dataset to cursed and note the reason + # for labeling the dataset as cursed. case (_, "published"): - # If anything other than a successful quantum - # produces a published dataset, that dataset - # is cursed. dataset_info["status"] = "cursed" dataset_info["messages"].append( "Published dataset is from an unsuccessful quantum." ) + # any other produced dataset (produced but not + # published and not successful) is a regular + # failure. case _: - # This should be a regular failure. dataset_info["status"] = "unsuccessful" if len(published_runs) > 1: quantum_info["status"] = "wonky" @@ -510,8 +843,8 @@ def resolve_duplicates(self, butler: Butler, collections: Sequence[str] | None = f"Outputs from different runs of the same quanta were published: {published_runs}." ) - def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]: - """Summarize the QuantumProvenanceGraph in a dictionary. + def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary: + """Summarize the `QuantumProvenanceGraph`. Parameters ---------- @@ -522,122 +855,56 @@ def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[st Returns ------- - summary_dict : `dict` - A dictionary containing counts of quanta and datasets in each of + summary : `Summary` + A struct containing counts of quanta and datasets in each of the overall states defined in `QuantumInfo` and `DatasetInfo`, as well as diagnostic information and error messages for failed quanta and strange edge cases, and a list of recovered quanta. 
""" - result = { - "tasks": {}, - "datasets": {}, - } + result = Summary() for task_label, quanta in self._quanta.items(): - n_successful = 0 - n_wonky = 0 - n_blocked = 0 - n_failed = 0 - failed_quanta = {"data_id": {}, "runs": {}, "messages": {}} - recovered_quanta = [] - wonky_quanta = {"data_id": {}, "runs": {}, "messages": {}} + task_summary = TaskSummary() + task_summary.n_expected = len(quanta) for quantum_key in quanta: quantum_info = self.get_quantum_info(quantum_key) - quantum_summary = make_quantum_info_summary_dict(quantum_info) - if quantum_summary["status"] == "successful": - n_successful += 1 - if quantum_summary["recovered"]: - recovered_quanta.append(quantum_summary["data_id"]) - elif quantum_summary["status"] == "wonky": - n_wonky += 1 - wonky_quanta.update({"data_id": quantum_summary["data_id"]}) - wonky_quanta.update({"runs": quantum_summary["runs"]}) - wonky_quanta.update({"messages": quantum_summary["messages"]}) - elif quantum_summary["status"] == "blocked": - n_blocked += 1 - elif quantum_summary["status"] == "failed": - n_failed += 1 - failed_quanta.update({"data_id": quantum_summary["data_id"]}) - runs = quantum_summary["runs"] - failed_quanta.update({"runs": runs}) - log_key: DatasetKey = self._xgraph.nodes[quantum_key]["log"] - if do_store_logs: - for run in runs: - try: - # should probably upgrade this to use a dataset - # ref - log = butler.get( - log_key.parent_dataset_type_name, quantum_info["data_id"], collections=run - ) - except LookupError: - failed_quanta["messages"] = [] - except FileNotFoundError: - failed_quanta["messages"] = None - else: - failed_quanta["messages"].extend( - [record.message for record in log if record.levelno >= logging.ERROR] - ) - result["tasks"][task_label] = { - "n_successful": n_successful, - "n_wonky": n_wonky, - "n_blocked": n_blocked, - "n_failed": n_failed, - "failed_quanta": failed_quanta, - "recovered_quanta": recovered_quanta, - "wonky_quanta": wonky_quanta, - } + task_summary.add_quantum_info(quantum_info, butler, do_store_logs) + result.tasks[task_label] = task_summary + for dataset_type_name, datasets in self._datasets.items(): - n_published = 0 - n_unpublished = 0 - n_predicted_only = 0 - n_unsuccessful = 0 - n_cursed = 0 - unsuccessful_datasets = [] - cursed_datasets = { - "parent_task_name": "", - "parent_data_id": {}, - "runs": {}, - "messages": [], - } + dataset_type_summary = DatasetTypeSummary(producer="") + dataset_type_summary.n_expected = len(datasets) for dataset_key in datasets: dataset_info = self.get_dataset_info(dataset_key) - dataset_summary = make_dataset_info_summary_dict(dataset_info) - if dataset_info["status"] == "published": - n_published += 1 - elif dataset_info["status"] == "unpublished": - n_unpublished += 1 - elif dataset_info["status"] == "predicted_only": - n_predicted_only += 1 - elif dataset_info["status"] == "unsuccessful": - n_unsuccessful += 1 - unsuccessful_datasets.append(dataset_summary["data_id"]) - elif dataset_info["status"] == "cursed": - n_cursed += 1 - cursed_datasets.update({"parent_task_name": dataset_summary["parent_task"]}) - cursed_datasets.update({"parent_data_id": dataset_summary["data_id"]}) - cursed_datasets.update({"runs": dataset_summary["runs"]}) - cursed_datasets.update({"messages": dataset_summary["messages"]}) - - result["datasets"][dataset_type_name] = { - # This is the total number in the original QG. - "n_predicted": len(datasets), - # These should all add up to 'predicted'... 
-            "n_published": n_published,
-            "n_unpublished": n_unpublished,
-            "n_predicted_only": n_predicted_only,
-            "n_unsuccessful": n_unsuccessful,
-            "n_cursed": n_cursed,
-            "unsuccessful_datasets": unsuccessful_datasets,
-            "cursed_datasets": cursed_datasets,
-        }
+                producer_key = self.get_producer_of(dataset_key)
+                producer_info = self.get_quantum_info(producer_key)
+                # Not ideal, but hard to get out of the graph at the moment.
+                # Change after DM-40441
+                dataset_type_summary.producer = producer_key.task_label
+                dataset_type_summary.add_dataset_info(dataset_info, producer_info)
+
+            result.datasets[dataset_type_name] = dataset_type_summary
        return result

    def iter_outputs_of(self, quantum_key: QuantumKey) -> Iterator[DatasetKey]:
-        metadata_key = self._xgraph.nodes[quantum_key]["metadata"]
-        log_key = self._xgraph.nodes[quantum_key]["log"]
+        """Iterate through the outputs of a quantum, yielding all the
+        `DatasetKey`s produced by the quantum.
+
+        Parameters
+        ----------
+        quantum_key : `QuantumKey`
+            The key for the quantum whose outputs are needed.
+        """
        for dataset_key in self._xgraph.successors(quantum_key):
-            if dataset_key != metadata_key and dataset_key != log_key:
-                yield dataset_key
+            yield dataset_key

    def get_producer_of(self, dataset_key: DatasetKey) -> QuantumKey:
+        """Unpack the predecessor (producer quantum) of a given dataset key
+        from a graph.
+
+        Parameters
+        ----------
+        dataset_key : `DatasetKey`
+            The key for the dataset whose producer quantum is needed.
+        """
        (result,) = self._xgraph.predecessors(dataset_key)
        return result

From fe1043198fefc800fe54e693a28a31278399c8e9 Mon Sep 17 00:00:00 2001
From: Orion Eiger
Date: Fri, 17 May 2024 16:19:30 -0700
Subject: [PATCH 09/18] Raise RuntimeError if resolve_duplicates is used
 improperly

---
 .../pipe/base/quantum_provenance_graph.py     | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py
index 695c69229..242df3107 100644
--- a/python/lsst/pipe/base/quantum_provenance_graph.py
+++ b/python/lsst/pipe/base/quantum_provenance_graph.py
@@ -527,6 +527,9 @@ def __init__(self) -> None:
        # The nodes representing datasets in `_xgraph` grouped by dataset type
        # name.
        self._datasets: dict[str, set[DatasetKey]] = {}
+        # Bool representing whether the graph has been finalized. This is set
+        # to True when resolve_duplicates is
+        self._finalized: bool = False

    def get_quantum_info(self, key: QuantumKey) -> QuantumInfo:
@@ -775,6 +778,15 @@ def resolve_duplicates(
        where : `str`
            A "where" string to use to constrain the collections, if passed.
        """
+        # First thing: raise an error if resolve_duplicates has been run
+        # before on this qpg.
+        if self._finalized:
+            raise RuntimeError(
+                """resolve_duplicates may only be called on a
+                QuantumProvenanceGraph once. Call only after all graphs have
+                been added, or make a new graph with all constituent
+                attempts."""
+            )
        for dataset_type_name in self._datasets:
            # find datasets in a larger collection.
            for ref in butler.registry.queryDatasets(
                dataset_type_name,
@@ -827,7 +839,7 @@ def resolve_duplicates(
                        # a published dataset, that dataset is cursed. Set the
                        # status for the dataset to cursed and note the reason
                        # for labeling the dataset as cursed.
- case (_, "published"): + case (_, "published") if not dataset_type_name.endswith("_log"): dataset_info["status"] = "cursed" dataset_info["messages"].append( "Published dataset is from an unsuccessful quantum." @@ -842,6 +854,16 @@ def resolve_duplicates( quantum_info["messages"].append( f"Outputs from different runs of the same quanta were published: {published_runs}." ) + for dataset_key in self.iter_outputs_of(quantum_key): + dataset_info = self.get_dataset_info(dataset_key) + quantum_info["messages"].append( + f"{dataset_key.parent_dataset_type_name}" + + f"from {str(dataset_info['runs'])};" + + f"{str(dataset_info['status'])}" + ) + # If we make it all the way through resolve_duplicates, set + # self._finalized = True so that it cannot be run again. + self._finalized = True def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary: """Summarize the `QuantumProvenanceGraph`. @@ -861,6 +883,11 @@ def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary: as well as diagnostic information and error messages for failed quanta and strange edge cases, and a list of recovered quanta. """ + if not self._finalized: + raise RuntimeError( + """resolve_duplicates must be called to finalize the + QuantumProvenanceGraph before making a summary.""" + ) result = Summary() for task_label, quanta in self._quanta.items(): task_summary = TaskSummary() From a20235973d282db4605d21bddcf0331d53042dc6 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Wed, 22 May 2024 15:33:12 -0700 Subject: [PATCH 10/18] Add tests for QuantumProvenanceGraph --- tests/test_quantum_provenance_graph.py | 157 +++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 tests/test_quantum_provenance_graph.py diff --git a/tests/test_quantum_provenance_graph.py b/tests/test_quantum_provenance_graph.py new file mode 100644 index 000000000..369ef4f14 --- /dev/null +++ b/tests/test_quantum_provenance_graph.py @@ -0,0 +1,157 @@ +# # This file is part of pipe_base. +# # +# # Developed for the LSST Data Management System. +# # This product includes software developed by the LSST Project +# # (http://www.lsst.org). +# # See the COPYRIGHT file at the top-level directory of this distribution +# # for details of code ownership. +# # +# # This software is dual licensed under the GNU General Public License and +# also +# # under a 3-clause BSD license. Recipients may choose which of these licenses +# # to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# # respectively. If you choose the GPL option then the following text applies +# # (but note that there is still no warranty even if you opt for BSD instead): +# # +# # This program is free software: you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation, either version 3 of the License, or +# # (at your option) any later version. +# # +# # This program is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with this program. If not, see . + +"""Simple unit test for quantum_provenance_graph. 
+""" + +import unittest + +from lsst.pipe.base.quantum_provenance_graph import DatasetTypeSummary, QuantumProvenanceGraph, TaskSummary +from lsst.pipe.base.tests import simpleQGraph +from lsst.utils.tests import temporaryDirectory + + +class QuantumProvenanceGraphTestCase(unittest.TestCase): + """Test reports from the QuantumProvenanceGraph. + + Verify that the `QuantumProvenanceGraph` is able to extract correct + information from `simpleQgraph`. + + More tests are in lsst/ci_middleware/tests/test_prod_outputs.py + """ + + def test_qpg_reports(self) -> None: + """Test that we can add a new graph to the + `QuantumProvenanceGraph`. + """ + with temporaryDirectory() as root: + # make a simple qgraph to make an execution report on + butler, qgraph = simpleQGraph.makeSimpleQGraph(root=root) + qpg = QuantumProvenanceGraph() + qpg.add_new_graph(butler, qgraph) + qpg.resolve_duplicates(butler) + d = qpg.to_summary(butler) + self.assertIsNotNone(d) + with open("testmodel.json", "w") as buffer: + buffer.write(d.model_dump_json(indent=2)) + summary_dict = d.model_dump() + for task in d.tasks: + self.assertIsInstance(d.tasks[task], TaskSummary) + # We know that we have one expected task that was not run. + # As such, the following dictionary should describe all of + # the mock tasks. + self.assertDictEqual( + summary_dict["tasks"][task], + { + "n_successful": 0, + "n_blocked": 0, + "n_not_attempted": 1, + "n_expected": 1, + "failed_quanta": [], + "recovered_quanta": [], + "wonky_quanta": [], + "n_wonky": 0, + "n_failed": 0, + }, + ) + for dataset in d.datasets: + self.assertIsInstance(d.datasets[dataset], DatasetTypeSummary) + self.assertListEqual( + summary_dict["datasets"][dataset]["unsuccessful_datasets"], + [{"instrument": "INSTR", "detector": 0}], + ) + # Check dataset counts (can't be done all in one because + # datasets have different producers), but all the counts for + # each task should be the same. 
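+            # (In this mock, nothing was run, so every dataset type is
+            # expected exactly once and ends up unsuccessful; all other
+            # counts should be zero.)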
+                self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_unpublished"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_predicted_only"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_expected"], 1)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_cursed"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0)
+                self.assertEqual(summary_dict["datasets"][dataset]["n_unsuccessful"], 1)
+                # Make sure the cursed dataset is an empty list
+                self.assertIsInstance(summary_dict["datasets"][dataset]["cursed_datasets"], list)
+                self.assertFalse(summary_dict["datasets"][dataset]["cursed_datasets"])
+            # Make sure we have the right datasets based on the mock we have
+            for task in [
+                "add_dataset1",
+                "add2_dataset1",
+                "task0_metadata",
+                "task0_log",
+                "add_dataset2",
+                "add2_dataset2",
+                "task1_metadata",
+                "task1_log",
+                "add_dataset3",
+                "add2_dataset3",
+                "task2_metadata",
+                "task2_log",
+                "add_dataset4",
+                "add2_dataset4",
+                "task3_metadata",
+                "task3_log",
+                "add_dataset5",
+                "add2_dataset5",
+                "task4_metadata",
+                "task4_log",
+            ]:
+                self.assertIn(task, list(summary_dict["datasets"].keys()))
+            # Make sure the expected datasets were produced by the expected tasks
+            for dataset in ["add_dataset1", "add2_dataset1", "task0_metadata", "task0_log"]:
+                self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task0")
+            for dataset in [
+                "add_dataset2",
+                "add2_dataset2",
+                "task1_metadata",
+                "task1_log",
+            ]:
+                self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task1")
+            for dataset in [
+                "add_dataset3",
+                "add2_dataset3",
+                "task2_metadata",
+                "task2_log",
+            ]:
+                self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task2")
+            for dataset in [
+                "add_dataset4",
+                "add2_dataset4",
+                "task3_metadata",
+                "task3_log",
+            ]:
+                self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task3")
+            for dataset in [
+                "add_dataset5",
+                "add2_dataset5",
+                "task4_metadata",
+                "task4_log",
+            ]:
+                self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task4")

From ceb253e97aa8370671c0a1af55ddd9f9efcbd594 Mon Sep 17 00:00:00 2001
From: Orion Eiger
Date: Thu, 23 May 2024 11:38:54 -0700
Subject: [PATCH 11/18] Add flag for what to do with failed log datasets

---
 doc/changes/DM-41711.feature.md               |  8 +++
 .../pipe/base/quantum_provenance_graph.py     | 52 ++++++++++++++-----
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/doc/changes/DM-41711.feature.md b/doc/changes/DM-41711.feature.md
index 91dd23095..10c3ef1f1 100644
--- a/doc/changes/DM-41711.feature.md
+++ b/doc/changes/DM-41711.feature.md
@@ -1,3 +1,11 @@
 Create a QuantumProvenanceGraph, which details the status of every quantum
 and dataset over multiple attempts at executing graphs, noting when quanta
 have been recovered.
+
+Step through all the quantum graphs associated with certain tasks or
+processing steps. For each graph/attempt, the status of each quantum and
+dataset is recorded in `QuantumProvenanceGraph.add_new_graph` and outcomes
+of quanta over multiple runs are resolved in
+`QuantumProvenanceGraph.resolve_duplicates`. At the end of this process,
+we can combine all attempts into a summary. This serves to answer the
+question "What happened to this data ID?" in a holistic sense.
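In practice, the workflow this changelog entry describes amounts to a few calls on a `QuantumProvenanceGraph`. The following is a minimal sketch against the API as of this patch, assuming a configured butler repository and saved quantum-graph files; the repository path, graph URIs, and collection name are illustrative placeholders, not values taken from the patch:

    from lsst.daf.butler import Butler
    from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph

    butler = Butler("/repo/main")  # placeholder repository
    qpg = QuantumProvenanceGraph()

    # Record the status of every quantum and dataset, one attempt at a
    # time, in the order the attempts were run.
    for uri in ["attempt1.qgraph", "attempt2.qgraph"]:  # placeholder URIs
        qpg.add_new_graph(butler, uri)

    # Reconcile quanta and datasets that appear in more than one attempt,
    # using the final output collection to decide which datasets are
    # published; failed logs stay merely unsuccessful unless requested.
    qpg.resolve_duplicates(
        butler,
        collections=["u/user/campaign"],  # placeholder collection
        curse_failed_logs=False,
    )

    # Combine all attempts into one report; do_store_logs fetches error
    # messages for failed quanta from the butler.
    summary = qpg.to_summary(butler, do_store_logs=True)
    print(summary.model_dump_json(indent=2))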
diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 242df3107..2b08416ac 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -97,6 +97,9 @@ class DatasetKey(NamedTuple): """ is_prerequisite: ClassVar[Literal[False]] = False + """Whether this node is a prerequisite to another node (also always + `False`). + """ class PrerequisiteDatasetKey(NamedTuple): @@ -122,6 +125,8 @@ class PrerequisiteDatasetKey(NamedTuple): """ is_prerequisite: ClassVar[Literal[True]] = True + """Whether this node is a prerequisite to another node (always `True`). + """ QuantumRunStatus: TypeAlias = Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"] @@ -279,13 +284,13 @@ class TaskSummary(pydantic.BaseModel): """The number of quanta expected by the graph. """ - @pydantic.computed_field # type: ignore[misc] + @pydantic.computed_field # type: ignore[prop-decorator] @property def n_wonky(self) -> int: """Return a count of `wonky` quanta.""" return len(self.wonky_quanta) - @pydantic.computed_field # type: ignore[misc] + @pydantic.computed_field # type: ignore[prop-decorator] @property def n_failed(self) -> int: """Return a count of `failed` quanta.""" @@ -417,8 +422,8 @@ class DatasetTypeSummary(pydantic.BaseModel): """ n_published: int = 0 - """A count of the datasets of this type which were published in the final - collection. + """A count of the datasets of this type which were published in the + finalized collection(s). """ n_unpublished: int = 0 """A count of the datasets of this type which were produced but not @@ -435,13 +440,13 @@ class DatasetTypeSummary(pydantic.BaseModel): """The number of datasets of this type expected by the graph. """ - @pydantic.computed_field # type: ignore[misc] + @pydantic.computed_field # type: ignore[prop-decorator] @property def n_cursed(self) -> int: """Return a count of cursed datasets.""" return len(self.cursed_datasets) - @pydantic.computed_field # type: ignore[misc] + @pydantic.computed_field # type: ignore[prop-decorator] @property def n_unsuccessful(self) -> int: """Return a count of unsuccessful datasets.""" @@ -528,7 +533,7 @@ def __init__(self) -> None: # name. self._datasets: dict[str, set[DatasetKey]] = {} # Bool representing whether the graph has been finalized. This is set - # to True when resolve_duplicates is + # to True when resolve_duplicates completes. self._finalized: bool = False def get_quantum_info(self, key: QuantumKey) -> QuantumInfo: @@ -571,7 +576,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre to assign to the overall `QuantumInfo`. For example, if a previous run associated with a quantum had the status `failed`, and the status from the new graph reads `successful`, we can - mark the overall quantum status as `successful` and list the id + mark the overall quantum status as `successful` and list the data_id as `recovered`. 
Parameters @@ -750,7 +755,11 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre quantum_info["status"] = new_status def resolve_duplicates( - self, butler: Butler, collections: Sequence[str] | None = None, where: str = "" + self, + butler: Butler, + collections: Sequence[str] | None = None, + where: str = "", + curse_failed_logs: bool = False, ) -> None: """After quantum graphs associated with each run have been added to the `QuantumProvenanceGraph, resolve any discrepancies between @@ -777,6 +786,14 @@ def resolve_duplicates( where : `str` A "where" string to use to constrain the collections, if passed. + + curse_failed_logs : `bool` + Mark log datasets as `cursed` if they are published in the final + output collection. Note that a campaign-level collection must be + used here for `collections` if `curse_failed_logs` is `True`; if + `resolve_duplicates` is run on a list of group-level collections + then each will show logs from their own failures as published + the datasets will show as cursed regardless of this flag. """ # First thing: raise an error if resolve_duplicates has been run # before on this qpg. @@ -839,11 +856,18 @@ def resolve_duplicates( # a published dataset, that dataset is cursed. Set the # status for the dataset to cursed and note the reason # for labeling the dataset as cursed. - case (_, "published") if not dataset_type_name.endswith("_log"): - dataset_info["status"] = "cursed" - dataset_info["messages"].append( - "Published dataset is from an unsuccessful quantum." - ) + case (_, "published"): + # Avoiding publishing failed logs is difficult + # without using tagged collections, so flag them as + # merely unsuccessful unless the user requests it. + if dataset_type_name.endswith("_log") and not curse_failed_logs: + dataset_info["status"] = "unsuccessful" + else: + dataset_info["status"] = "cursed" + dataset_info["messages"].append( + f"Unsuccessful dataset {dataset_type_name} published in " + "final output collection." + ) # any other produced dataset (produced but not # published and not successful) is a regular # failure. From 6a173fd39ebcb5069ec545982cc0ba05d613ec98 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Fri, 2 Aug 2024 17:27:47 -0700 Subject: [PATCH 12/18] Use Pydantic models for tests and improve documentation --- .../pipe/base/quantum_provenance_graph.py | 101 ++++++++++++-- tests/test_quantum_provenance_graph.py | 130 +++++++----------- 2 files changed, 142 insertions(+), 89 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 2b08416ac..1e33ef4a2 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -60,7 +60,10 @@ class QuantumKey(NamedTuple): - """Identifier type for quantum keys in a `QuantumProvenanceGraph`.""" + """Identifier type for quantum keys in a `QuantumProvenanceGraph`. These + keys correspond to a task label and data ID, but can refer to this over + multiple runs or datasets. + """ task_label: str """Label of the task in the pipeline.""" @@ -92,8 +95,8 @@ class DatasetKey(NamedTuple): """ is_task: ClassVar[Literal[False]] = False - """Whether this node represents a quantum rather - than a dataset (always `False`). + """Whether this node represents a quantum rather than a dataset (always + `False`). 
""" is_prerequisite: ClassVar[Literal[False]] = False @@ -141,6 +144,19 @@ class QuantumRun(pydantic.BaseModel): status: QuantumRunStatus = "metadata_missing" """The status of the quantum in that run. + + Possible Statuses + ----------------- + `failed`: Attempts to execute the quantum failed in this run. + `successful`: This quantum was executed successfully in this run. + `logs_missing`: Logs are missing for this quantum in this run. It was + attempted, but it is impossible to tell if it succeeded or failed due + to missing logs. + `blocked`: This run does not include an executed version of this quantum + because an upstream task failed. + `metadata_missing`: Metadata is missing for this quantum in this run. It is + impossible to tell whether execution of this quantum was attempted due + to missing metadata. """ @@ -148,7 +164,8 @@ class QuantumRun(pydantic.BaseModel): class QuantumInfo(TypedDict): - """Information about a quantum across all run collections. + """Information about a quantum (i.e., the combination of a task label and + data ID) across all attempted runs. Used to annotate the networkx node dictionary. """ @@ -164,6 +181,35 @@ class QuantumInfo(TypedDict): status: QuantumInfoStatus """The overall status of the quantum. Note that it is impossible to exit a wonky state. + + Possible Statuses + ----------------- + `successful`: Attempts at executing this quantum were successful. + `wonky`: The overall state of this quantum reflects inconsistencies or is + difficult to discern. There are a few specific ways to enter a wonky + state; it is impossible to exit and requires human intervention to + proceed with processing. + Currently, a quantum enters a wonky state for one of three reasons: + - Its `QuantumInfoStatus` exits a successful state. Something that + initially succeeded fails on + - A `QuantumRun` is missing logs. + - There are multiple runs associated with a dataset which comes up in a + findFirst search. This means that a dataset which will be used as an + input data product for further processing has heterogeneous inputs, + which may have had different inputs or a different data-query. + `blocked`: The quantum is not able to execute because its inputs are + missing due to an upstream failure. Blocked quanta are distinguished + from failed quanta by being successors of failed quanta in the graph. + All the successors of blocked quanta are also marked as blocked. + `not_attempted`: These are quanta which do not have any metadata associated + with processing, but for which it is impossible to tell the status due + to an additional absence of logs. Quanta which had not been processed + at all would reflect this state, as would quanta which were + conceptualized in the construction of the quantum graph but later + identified to be unneccesary or erroneous (deemed `NoWorkFound` by the + Science Pipelines). + `failed`: These quanta were attempted and failed. Failed quanta have logs + and no metadata. """ recovered: bool @@ -176,12 +222,12 @@ class QuantumInfo(TypedDict): log: DatasetKey """The `DatasetKey` which can be used to access the log associated with the - quantum. + quantum across runs. """ metadata: DatasetKey """The `DatasetKey` which can be used to access the metadata for the - quantum. + quantum across runs. """ @@ -228,6 +274,26 @@ class DatasetInfo(TypedDict): status: DatasetInfoStatus """Overall status of the dataset. + + Possible Statuses + ----------------- + `published`: The dataset is queryable in a find_first search. 
This means + that it can be used as an input by subsequent tasks and processing. + `unpublished`: The dataset exists but is not queryable in a find_first + search. This could mean that the version of this dataset which is + passed as an input to further processing is not in the collections + given. An `unpublished` dataset will not be used as an input to further + processing. + `predicted_only`: The dataset was predicted, and was not published in any + run, but was the successor of a successful quantum. These datasets are + the result of pipelines `NoWorkFound` cases, in which a dataset is + predicted in the graph but found to not be necessary in processing. + `unsuccessful`: The dataset was not produced. These are the results of + failed or blocked quanta. + `cursed`: The dataset was the result of an unsuccessful quantum and was + published in the output collection anyway. These are flagged as + `cursed` so that they may be caught before they become inputs to + further processing. """ messages: list[str] @@ -236,7 +302,11 @@ class DatasetInfo(TypedDict): class UnsuccessfulQuantumSummary(pydantic.BaseModel): - """A summary of all relevant information on an unsuccessful quantum.""" + """A summary of all relevant information on an unsuccessful quantum. + + This summarizes all information on a task's output for a particular data ID + over all runs. + """ data_id: dict[str, DataIdValue] """The data_id of the unsuccessful quantum. @@ -268,7 +338,9 @@ def from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary: class TaskSummary(pydantic.BaseModel): - """A summary of the status of all quanta for a single task.""" + """A summary of the status of all quanta associated with a single task, + across all runs. + """ n_successful: int = 0 """A count of successful quanta. @@ -298,7 +370,7 @@ def n_failed(self) -> int: failed_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) """A list of all `UnsuccessfulQuantumSummary` objects associated with the - `failed` quanta. This is a report containing their data_ids, the status + `failed` quanta. This is a report containing their data IDs, the status of each run associated with each `failed` quantum, and the error messages associated with the failures when applicable. """ @@ -415,7 +487,9 @@ def from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatas class DatasetTypeSummary(pydantic.BaseModel): - """A summary of the status of all datasets of a particular type.""" + """A summary of the status of all datasets of a particular type across all + runs. + """ producer: str """The name of the task which produced this dataset. @@ -813,7 +887,12 @@ def resolve_duplicates( where=where, ): dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values) - dataset_info = self.get_dataset_info(dataset_key) + try: + dataset_info = self.get_dataset_info(dataset_key) + # Ignore if we don't actually have the dataset in any of the + # graphs given. + except KeyError: + continue # queryable datasets are `published`. 
dataset_info["runs"][ref.run].published = True diff --git a/tests/test_quantum_provenance_graph.py b/tests/test_quantum_provenance_graph.py index 369ef4f14..eca3fceae 100644 --- a/tests/test_quantum_provenance_graph.py +++ b/tests/test_quantum_provenance_graph.py @@ -31,7 +31,7 @@ import unittest -from lsst.pipe.base.quantum_provenance_graph import DatasetTypeSummary, QuantumProvenanceGraph, TaskSummary +from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph, TaskSummary from lsst.pipe.base.tests import simpleQGraph from lsst.utils.tests import temporaryDirectory @@ -42,7 +42,8 @@ class QuantumProvenanceGraphTestCase(unittest.TestCase): Verify that the `QuantumProvenanceGraph` is able to extract correct information from `simpleQgraph`. - More tests are in lsst/ci_middleware/tests/test_prod_outputs.py + More tests are in lsst/ci_middleware/tests/test_prod_outputs.py and + lsst/ci_middleware/tests/test_rc2_outputs.py """ def test_qpg_reports(self) -> None: @@ -55,53 +56,27 @@ def test_qpg_reports(self) -> None: qpg = QuantumProvenanceGraph() qpg.add_new_graph(butler, qgraph) qpg.resolve_duplicates(butler) - d = qpg.to_summary(butler) - self.assertIsNotNone(d) - with open("testmodel.json", "w") as buffer: - buffer.write(d.model_dump_json(indent=2)) - summary_dict = d.model_dump() - for task in d.tasks: - self.assertIsInstance(d.tasks[task], TaskSummary) + summary = qpg.to_summary(butler) + + for task_summary in summary.tasks.values(): # We know that we have one expected task that was not run. # As such, the following dictionary should describe all of # the mock tasks. - self.assertDictEqual( - summary_dict["tasks"][task], - { - "n_successful": 0, - "n_blocked": 0, - "n_not_attempted": 1, - "n_expected": 1, - "failed_quanta": [], - "recovered_quanta": [], - "wonky_quanta": [], - "n_wonky": 0, - "n_failed": 0, - }, - ) - for dataset in d.datasets: - self.assertIsInstance(d.datasets[dataset], DatasetTypeSummary) - self.assertListEqual( - summary_dict["datasets"][dataset]["unsuccessful_datasets"], - [{"instrument": "INSTR", "detector": 0}], + self.assertEqual( + task_summary, + TaskSummary( + n_successful=0, + n_blocked=0, + n_not_attempted=1, + n_expected=1, + failed_quanta=[], + recovered_quanta=[], + wonky_quanta=[], + n_wonky=0, + n_failed=0, + ), ) - # Check dataset counts (can't be done all in one because - # datasets have different producers), but all the counts for - # each task should be the same. 
- self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_unpublished"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_predicted_only"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_expected"], 1) - self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_cursed"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_published"], 0) - self.assertEqual(summary_dict["datasets"][dataset]["n_unsuccessful"], 1) - # Make sure the cursed dataset is an empty list - self.assertIsInstance(summary_dict["datasets"][dataset]["cursed_datasets"], list) - self.assertFalse(summary_dict["datasets"][dataset]["cursed_datasets"]) - # Make sure we have the right datasets based on the mock we have - for task in [ + expected_mock_datasets = [ "add_dataset1", "add2_dataset1", "task0_metadata", @@ -122,36 +97,35 @@ def test_qpg_reports(self) -> None: "add2_dataset5", "task4_metadata", "task4_log", - ]: - self.assertIn(task, list(summary_dict["datasets"].keys())) - # Make sure the expected datasets were produced by the expected tasks - for dataset in ["add_dataset1", "add2_dataset1", "task0_metadata", "task0_log"]: - self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task0") - for dataset in [ - "add_dataset2", - "add2_dataset2", - "task1_metadata", - "task1_log", - ]: - self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task1") - for dataset in [ - "add_dataset3", - "add2_dataset3", - "task2_metadata", - "task2_log", - ]: - self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task2") - for dataset in [ - "add_dataset4", - "add2_dataset4", - "task3_metadata", - "task3_log", - ]: - self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task3") - for dataset in [ - "add_dataset5", - "add2_dataset5", - "task4_metadata", - "task4_log", - ]: - self.assertEqual(summary_dict["datasets"][dataset]["producer"], "task4") + ] + for dataset_type_name, dataset_type_summary in summary.datasets.items(): + self.assertListEqual( + dataset_type_summary.unsuccessful_datasets, + [{"instrument": "INSTR", "detector": 0}], + ) + # Check dataset counts (can't be done all in one because + # datasets have different producers), but all the counts for + # each task should be the same. 
+                self.assertEqual(dataset_type_summary.n_published, 0)
+                self.assertEqual(dataset_type_summary.n_unpublished, 0)
+                self.assertEqual(dataset_type_summary.n_predicted_only, 0)
+                self.assertEqual(dataset_type_summary.n_expected, 1)
+                self.assertEqual(dataset_type_summary.n_cursed, 0)
+                self.assertEqual(dataset_type_summary.n_unsuccessful, 1)
+                # Make sure the cursed dataset is an empty list
+                self.assertListEqual(dataset_type_summary.cursed_datasets, [])
+                # Make sure we have the right datasets based on our mock
+                self.assertIn(dataset_type_name, expected_mock_datasets)
+                # Make sure the expected datasets were produced by the expected
+                # tasks
+                match dataset_type_name:
+                    case name if name in ["add_dataset1", "add2_dataset1", "task0_metadata", "task0_log"]:
+                        self.assertEqual(dataset_type_summary.producer, "task0")
+                    case name if name in ["add_dataset2", "add2_dataset2", "task1_metadata", "task1_log"]:
+                        self.assertEqual(dataset_type_summary.producer, "task1")
+                    case name if name in ["add_dataset3", "add2_dataset3", "task2_metadata", "task2_log"]:
+                        self.assertEqual(dataset_type_summary.producer, "task2")
+                    case name if name in ["add_dataset4", "add2_dataset4", "task3_metadata", "task3_log"]:
+                        self.assertEqual(dataset_type_summary.producer, "task3")
+                    case name if name in ["add_dataset5", "add2_dataset5", "task4_metadata", "task4_log"]:
+                        self.assertEqual(dataset_type_summary.producer, "task4")

From c7e8d8abec5ad3ab59dcb2740afed749c5aea69f Mon Sep 17 00:00:00 2001
From: Orion Eiger
Date: Tue, 10 Sep 2024 15:25:53 -0700
Subject: [PATCH 13/18] Change not_attempted status to unknown.

---
 .../pipe/base/quantum_provenance_graph.py     | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py
index 1e33ef4a2..ebdd1df13 100644
--- a/python/lsst/pipe/base/quantum_provenance_graph.py
+++ b/python/lsst/pipe/base/quantum_provenance_graph.py
@@ -160,7 +160,7 @@ class QuantumRun(pydantic.BaseModel):
     """
 
 
-QuantumInfoStatus: TypeAlias = Literal["successful", "wonky", "blocked", "not_attempted", "failed"]
+QuantumInfoStatus: TypeAlias = Literal["successful", "wonky", "blocked", "unknown", "failed"]
 
 
 class QuantumInfo(TypedDict):
@@ -201,7 +201,7 @@ class QuantumInfo(TypedDict):
     missing due to an upstream failure. Blocked quanta are distinguished
     from failed quanta by being successors of failed quanta in the graph.
     All the successors of blocked quanta are also marked as blocked.
-    `not_attempted`: These are quanta which do not have any metadata associated
+    `unknown`: These are quanta which do not have any metadata associated
     with processing, but for which it is impossible to tell the status due
     to an additional absence of logs. Quanta which had not been processed
     at all would reflect this state, as would quanta which were
     conceptualized in the construction of the quantum graph but later
     identified to be unneccesary or erroneous (deemed `NoWorkFound` by the
     Science Pipelines).
     `failed`: These quanta were attempted and failed. Failed quanta have logs
     and no metadata.
@@ -348,8 +348,8 @@ class TaskSummary(pydantic.BaseModel):
     n_blocked: int = 0
     """A count of blocked quanta.
     """
-    n_not_attempted: int = 0
-    """A count of quanta for which processing was not attempted.
+    n_unknown: int = 0
+    """A count of quanta for which there are no metadata or logs.
""" n_expected: int = 0 @@ -433,8 +433,8 @@ def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: boo [record.message for record in log if record.levelno >= logging.ERROR] ) self.failed_quanta.append(failed_quantum_summary) - case "not_attempted": - self.n_not_attempted += 1 + case "unknown": + self.n_unknown += 1 case unrecognized_state: raise AssertionError(f"Unrecognized quantum status {unrecognized_state!r}") @@ -681,7 +681,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre quantum_info.setdefault("messages", []) quantum_info.setdefault("runs", {}) quantum_info.setdefault("data_id", cast(DataCoordinate, node.quantum.dataId)) - quantum_info.setdefault("status", "not_attempted") + quantum_info.setdefault("status", "unknown") quantum_info.setdefault("recovered", False) new_quanta.append(quantum_key) self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key) @@ -789,7 +789,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # recovery. case (_, "successful"): new_status = "successful" - if last_status != "successful" and last_status != "not_attempted": + if last_status != "successful" and last_status != "unknown": quantum_info["recovered"] = True # Missing logs are one of the categories of wonky quanta. They # interfere with our ability to discern quantum status and are @@ -810,18 +810,19 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre f"Status went from successful in run {list(quantum_info['runs'].values())[-1]!r} " f"to {quantum_run.status!r} in {output_run!r}." ) - # If a quantumm is not attempted and moves to blocked, we know - # for sure that it is a blocked quantum. - case ("not_attempted", "blocked"): + # If a quantum status is unknown and it moves to blocked, we + # know for sure that it is a blocked quantum. + case ("unknown", "blocked"): new_status = "blocked" # A transition into blocked does not change the overall quantum # status for a failure. case (_, "blocked"): new_status = last_status # If a quantum transitions from any state into missing - # metadata, it was probably not attempted. + # metadata, we don't have enough information to diagnose its + # state. case (_, "metadata_missing"): - new_status = "not_attempted" + new_status = "unknown" # Any transition into failure is a failed state. case (_, "failed"): new_status = "failed" From 4b87edca435a4c7968e310055013b0d3ea1950cd Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Tue, 10 Sep 2024 15:49:21 -0700 Subject: [PATCH 14/18] Change published to visible and unpublished to shadowed. --- .../pipe/base/quantum_provenance_graph.py | 98 ++++++++++--------- tests/test_quantum_provenance_graph.py | 6 +- 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index ebdd1df13..4545f7289 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -242,20 +242,22 @@ class DatasetRun(pydantic.BaseModel): """Whether the specific run produced the dataset. """ - published: bool = False - """Whether this dataset was published in the final output collection. + visible: bool = False + """Whether this dataset is visible in the final output collection; in other + words, whether this dataset is queryable in a find-first search. This + determines whether it will be used as an input to further processing. 
""" @pydantic.model_validator(mode="after") def _validate(self) -> DatasetRun: - """Validate the model for `DatasetRun` by asserting that no published + """Validate the model for `DatasetRun` by asserting that no visible `DatasetRun` is also not produced (this should be impossible). """ - assert not (self.published and not self.produced) + assert not (self.visible and not self.produced) return self -DatasetInfoStatus: TypeAlias = Literal["published", "unpublished", "predicted_only", "unsuccessful", "cursed"] +DatasetInfoStatus: TypeAlias = Literal["visible", "shadowed", "predicted_only", "unsuccessful", "cursed"] class DatasetInfo(TypedDict): @@ -277,21 +279,21 @@ class DatasetInfo(TypedDict): Possible Statuses ----------------- - `published`: The dataset is queryable in a find_first search. This means + `visible`: The dataset is queryable in a find_first search. This means that it can be used as an input by subsequent tasks and processing. - `unpublished`: The dataset exists but is not queryable in a find_first + `shadowed`: The dataset exists but is not queryable in a find_first search. This could mean that the version of this dataset which is passed as an input to further processing is not in the collections - given. An `unpublished` dataset will not be used as an input to further + given. A `shadowed` dataset will not be used as an input to further processing. - `predicted_only`: The dataset was predicted, and was not published in any + `predicted_only`: The dataset was predicted, and was not visible in any run, but was the successor of a successful quantum. These datasets are the result of pipelines `NoWorkFound` cases, in which a dataset is predicted in the graph but found to not be necessary in processing. `unsuccessful`: The dataset was not produced. These are the results of failed or blocked quanta. `cursed`: The dataset was the result of an unsuccessful quantum and was - published in the output collection anyway. These are flagged as + visible in the output collection anyway. These are flagged as `cursed` so that they may be caught before they become inputs to further processing. """ @@ -454,8 +456,8 @@ class CursedDatasetSummary(pydantic.BaseModel): """A dictionary of all the runs associated with the `cursed` dataset; the `bool` is true if the dataset was produced in the associated run. """ - run_published: str | None - """A dictionary of all `published` runs containing the `cursed` dataset. + run_visible: str | None + """A dictionary of all `visible` runs containing the `cursed` dataset. """ messages: list[str] """Any diagnostic messages (dictated in this module) which might help in @@ -475,13 +477,13 @@ def from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatas All relevant information on the producer task. This is used to report the data_id of the producer task. """ - runs_published = {k for k, v in info["runs"].items() if v.published} + runs_visible = {k for k, v in info["runs"].items() if v.visible} return cls( producer_data_id=dict(producer_info["data_id"].required), data_id=dict(info["data_id"].required), runs_produced={k: v.produced for k, v in info["runs"].items()}, # this has at most one element - run_published=runs_published.pop() if runs_published else None, + run_visible=runs_visible.pop() if runs_visible else None, messages=info["messages"], ) @@ -495,13 +497,13 @@ class DatasetTypeSummary(pydantic.BaseModel): """The name of the task which produced this dataset. 
""" - n_published: int = 0 - """A count of the datasets of this type which were published in the + n_visible: int = 0 + """A count of the datasets of this type which were visible in the finalized collection(s). """ - n_unpublished: int = 0 + n_shadowed: int = 0 """A count of the datasets of this type which were produced but not - published. This includes any datasets which do not come up in a butler + visible. This includes any datasets which do not come up in a butler query over their associated collection. """ n_predicted_only: int = 0 @@ -555,10 +557,10 @@ def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> Non specific issues. """ match info["status"]: - case "published": - self.n_published += 1 - case "unpublished": - self.n_unpublished += 1 + case "visible": + self.n_visible += 1 + case "shadowed": + self.n_shadowed += 1 case "unsuccessful": self.unsuccessful_datasets.append(dict(info["data_id"].mapping)) case "cursed": @@ -842,7 +844,7 @@ def resolve_duplicates( Particularly, use the publish state of each `DatasetRun` in combination with overall quantum status to ascertain the status of each dataset. - Additionally, if there are multiple published runs associated with a + Additionally, if there are multiple visible runs associated with a dataset, mark the producer quantum as `wonky`. This method should be called after @@ -863,11 +865,11 @@ def resolve_duplicates( A "where" string to use to constrain the collections, if passed. curse_failed_logs : `bool` - Mark log datasets as `cursed` if they are published in the final + Mark log datasets as `cursed` if they are visible in the final output collection. Note that a campaign-level collection must be used here for `collections` if `curse_failed_logs` is `True`; if `resolve_duplicates` is run on a list of group-level collections - then each will show logs from their own failures as published + then each will show logs from their own failures as visible the datasets will show as cursed regardless of this flag. """ # First thing: raise an error if resolve_duplicates has been run @@ -894,49 +896,49 @@ def resolve_duplicates( # graphs given. except KeyError: continue - # queryable datasets are `published`. - dataset_info["runs"][ref.run].published = True + # queryable datasets are `visible`. + dataset_info["runs"][ref.run].visible = True for task_quanta in self._quanta.values(): for quantum_key in task_quanta: - # runs associated with published datasets. - published_runs: set[str] = set() + # runs associated with visible datasets. + visible_runs: set[str] = set() quantum_info = self.get_quantum_info(quantum_key) # Loop over each dataset in the outputs of a single quantum. for dataset_key in self.iter_outputs_of(quantum_key): dataset_info = self.get_dataset_info(dataset_key) - published_runs.update( - run for run, dataset_run in dataset_info["runs"].items() if dataset_run.published + visible_runs.update( + run for run, dataset_run in dataset_info["runs"].items() if dataset_run.visible ) - if any(dataset_run.published for dataset_run in dataset_info["runs"].values()): - publish_state = "published" - # set the publish state to `unpublished` if the dataset was - # produced but not published (i.e., not queryable from the + if any(dataset_run.visible for dataset_run in dataset_info["runs"].values()): + publish_state = "visible" + # set the publish state to `shadowed` if the dataset was + # produced but not visible (i.e., not queryable from the # final collection(s)). 
elif any(dataset_run.produced for dataset_run in dataset_info["runs"].values()): - publish_state = "unpublished" - # a dataset which was not produced and not published is + publish_state = "shadowed" + # a dataset which was not produced and not visible is # missing. else: publish_state = "missing" # use the quantum status and publish state to ascertain the # status of the dataset. match (quantum_info["status"], publish_state): - # published datasets from successful quanta are as + # visible datasets from successful quanta are as # intended. - case ("successful", "published"): - dataset_info["status"] = "published" + case ("successful", "visible"): + dataset_info["status"] = "visible" # missing datasets from successful quanta indicate a # `NoWorkFound` case. case ("successful", "missing"): dataset_info["status"] = "predicted_only" - case ("successful", "unpublished"): - dataset_info["status"] = "unpublished" + case ("successful", "shadowed"): + dataset_info["status"] = "shadowed" # If anything other than a successful quantum produces - # a published dataset, that dataset is cursed. Set the + # a visible dataset, that dataset is cursed. Set the # status for the dataset to cursed and note the reason # for labeling the dataset as cursed. - case (_, "published"): + case (_, "visible"): # Avoiding publishing failed logs is difficult # without using tagged collections, so flag them as # merely unsuccessful unless the user requests it. @@ -945,18 +947,18 @@ def resolve_duplicates( else: dataset_info["status"] = "cursed" dataset_info["messages"].append( - f"Unsuccessful dataset {dataset_type_name} published in " + f"Unsuccessful dataset {dataset_type_name} visible in " "final output collection." ) # any other produced dataset (produced but not - # published and not successful) is a regular + # visible and not successful) is a regular # failure. case _: dataset_info["status"] = "unsuccessful" - if len(published_runs) > 1: + if len(visible_runs) > 1: quantum_info["status"] = "wonky" quantum_info["messages"].append( - f"Outputs from different runs of the same quanta were published: {published_runs}." + f"Outputs from different runs of the same quanta were visible: {visible_runs}." ) for dataset_key in self.iter_outputs_of(quantum_key): dataset_info = self.get_dataset_info(dataset_key) diff --git a/tests/test_quantum_provenance_graph.py b/tests/test_quantum_provenance_graph.py index eca3fceae..eccda2073 100644 --- a/tests/test_quantum_provenance_graph.py +++ b/tests/test_quantum_provenance_graph.py @@ -67,7 +67,7 @@ def test_qpg_reports(self) -> None: TaskSummary( n_successful=0, n_blocked=0, - n_not_attempted=1, + n_unknown=1, n_expected=1, failed_quanta=[], recovered_quanta=[], @@ -106,8 +106,8 @@ def test_qpg_reports(self) -> None: # Check dataset counts (can't be done all in one because # datasets have different producers), but all the counts for # each task should be the same. 
- self.assertEqual(dataset_type_summary.n_published, 0) - self.assertEqual(dataset_type_summary.n_unpublished, 0) + self.assertEqual(dataset_type_summary.n_visible, 0) + self.assertEqual(dataset_type_summary.n_shadowed, 0) self.assertEqual(dataset_type_summary.n_predicted_only, 0) self.assertEqual(dataset_type_summary.n_expected, 1) self.assertEqual(dataset_type_summary.n_cursed, 0) From c3cfa2f698bfa3e9e7891a4cb1a283002ec312df Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Fri, 13 Sep 2024 17:06:57 -0700 Subject: [PATCH 15/18] Make add_new_graph and resolve_duplicates private methods and require user to pass graphs in order --- .../pipe/base/quantum_provenance_graph.py | 39 ++++++++++++++++++- tests/test_quantum_provenance_graph.py | 3 +- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 4545f7289..faa926bba 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -634,7 +634,7 @@ def get_dataset_info(self, key: DatasetKey) -> DatasetInfo: """ return self._xgraph.nodes[key] - def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: + def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: """Add a new quantum graph to the `QuantumProvenanceGraph`. Step through the quantum graph. Annotate a `networkx.DiGraph` @@ -831,7 +831,7 @@ def add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpre # Update `QuantumInfo.status` for this quantum. quantum_info["status"] = new_status - def resolve_duplicates( + def __resolve_duplicates( self, butler: Butler, collections: Sequence[str] | None = None, @@ -971,6 +971,41 @@ def resolve_duplicates( # self._finalized = True so that it cannot be run again. self._finalized = True + def assemble_quantum_provenance_graph( + self, + butler: Butler, + qgraphs: Sequence[QuantumGraph | ResourcePathExpression], + collections: Sequence[str] | None = None, + where: str = "", + curse_failed_logs: bool = False, + ) -> None: + output_runs = [] + for count, graph in enumerate(qgraphs): + qgraph = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph) + assert qgraph.metadata is not None, "Saved QGs always have metadata." + # If the most recent graph's timestamp was earlier than any of the + # previous graphs, raise a RuntimeError. + if len(qgraphs) > 1: + for previous_graph in qgraphs[: count - 1]: + previous_graph = ( + previous_graph + if isinstance(previous_graph, QuantumGraph) + else QuantumGraph.loadUri(previous_graph) + ) + if qgraph.metadata["time"] < previous_graph.metadata["time"]: + raise RuntimeError( + """add_new_graph may only be called on graphs + which are passed in the order they were + created. Please call again, passing your + graphs in order.""" + ) + self.__add_new_graph(butler, qgraph) + output_runs.append(qgraph.metadata["output_run"]) + # If the user has not passed a `collections` variable + if not collections: + collections = list(reversed(output_runs)) + self.__resolve_duplicates(butler, collections, where, curse_failed_logs) + def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary: """Summarize the `QuantumProvenanceGraph`. 
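A minimal sketch of the combined entry point added above, under the same placeholder repository and graph files as the earlier sketch. The graphs must be passed in the order in which they were created, since `assemble_quantum_provenance_graph` raises a `RuntimeError` otherwise, and when `collections` is not given it defaults to the graphs' output runs, newest first:

    from lsst.daf.butler import Butler
    from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph

    butler = Butler("/repo/main")  # placeholder repository
    qpg = QuantumProvenanceGraph()
    qpg.assemble_quantum_provenance_graph(
        butler,
        qgraphs=["attempt1.qgraph", "attempt2.qgraph"],  # oldest first
        collections=None,  # defaults to the graphs' output runs, newest first
        curse_failed_logs=False,
    )
    summary = qpg.to_summary(butler)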
diff --git a/tests/test_quantum_provenance_graph.py b/tests/test_quantum_provenance_graph.py index eccda2073..f237674a7 100644 --- a/tests/test_quantum_provenance_graph.py +++ b/tests/test_quantum_provenance_graph.py @@ -54,8 +54,7 @@ def test_qpg_reports(self) -> None: # make a simple qgraph to make an execution report on butler, qgraph = simpleQGraph.makeSimpleQGraph(root=root) qpg = QuantumProvenanceGraph() - qpg.add_new_graph(butler, qgraph) - qpg.resolve_duplicates(butler) + qpg.assemble_quantum_provenance_graph(butler, [qgraph]) summary = qpg.to_summary(butler) for task_summary in summary.tasks.values(): From 2adfc32196fee0a676f6de12cd907afc7baac6db Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Wed, 18 Sep 2024 12:37:07 -0700 Subject: [PATCH 16/18] Change status Literals to Enums --- .../pipe/base/quantum_provenance_graph.py | 261 ++++++++++-------- 1 file changed, 145 insertions(+), 116 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index faa926bba..86abbbd06 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -43,7 +43,8 @@ import logging import uuid from collections.abc import Iterator, Sequence -from typing import TYPE_CHECKING, ClassVar, Literal, NamedTuple, TypeAlias, TypedDict, cast +from enum import Enum +from typing import TYPE_CHECKING, ClassVar, Literal, NamedTuple, TypedDict, cast import networkx import pydantic @@ -132,7 +133,28 @@ class PrerequisiteDatasetKey(NamedTuple): """ -QuantumRunStatus: TypeAlias = Literal["failed", "successful", "logs_missing", "blocked", "metadata_missing"] +class QuantumRunStatus(Enum): + """Enum describing the status of a quantum-run collection combination. + + Possible Statuses + ----------------- + METADATA_MISSING = -3: Metadata is missing for this quantum in this run. + It is impossible to tell whether execution of this quantum was + attempted due to missing metadata. + LOGS_MISSING = -2: Logs are missing for this quantum in this run. It was + attempted, but it is impossible to tell if it succeeded or failed due + to missing logs. + FAILED = -1: Attempts to execute the quantum failed in this run. + BLOCKED = 0: This run does not include an executed version of this + quantum because an upstream task failed. + SUCCESSFUL = 1: This quantum was executed successfully in this run. + """ + + METADATA_MISSING = -3 + LOGS_MISSING = -2 + FAILED = -1 + BLOCKED = 0 + SUCCESSFUL = 1 class QuantumRun(pydantic.BaseModel): @@ -142,25 +164,50 @@ class QuantumRun(pydantic.BaseModel): """The quantum graph node ID associated with the dataId in a specific run. """ - status: QuantumRunStatus = "metadata_missing" + status: QuantumRunStatus = QuantumRunStatus.METADATA_MISSING """The status of the quantum in that run. + """ + + +class QuantumInfoStatus(Enum): + """The status of a quantum (a particular task run on a particular dataID) + across all runs. Possible Statuses ----------------- - `failed`: Attempts to execute the quantum failed in this run. - `successful`: This quantum was executed successfully in this run. - `logs_missing`: Logs are missing for this quantum in this run. It was - attempted, but it is impossible to tell if it succeeded or failed due - to missing logs. - `blocked`: This run does not include an executed version of this quantum - because an upstream task failed. - `metadata_missing`: Metadata is missing for this quantum in this run. 
It is - impossible to tell whether execution of this quantum was attempted due - to missing metadata. + WONKY = -3: The overall state of this quantum reflects inconsistencies or + is difficult to discern. There are a few specific ways to enter a wonky + state; it is impossible to exit and requires human intervention to + proceed with processing. + Currently, a quantum enters a wonky state for one of three reasons: + - Its `QuantumInfoStatus` exits a successful state. Something that + initially succeeded fails on + - A `QuantumRun` is missing logs. + - There are multiple runs associated with a dataset which comes up in a + findFirst search. This means that a dataset which will be used as an + input data product for further processing has heterogeneous inputs, + which may have had different inputs or a different data-query. + FAILED = -2: These quanta were attempted and failed. Failed quanta have + logs and no metadata. + UNKNOWN = -1: These are quanta which do not have any metadata associated + with processing, but for which it is impossible to tell the status due + to an additional absence of logs. Quanta which had not been processed + at all would reflect this state, as would quanta which were + conceptualized in the construction of the quantum graph but later + identified to be unneccesary or erroneous (deemed NoWorkFound by the + Science Pipelines). + BLOCKED = 0: The quantum is not able to execute because its inputs are + missing due to an upstream failure. Blocked quanta are distinguished + from failed quanta by being successors of failed quanta in the graph. + All the successors of blocked quanta are also marked as blocked. + SUCCESSFUL = 1: Attempts at executing this quantum were successful. """ - -QuantumInfoStatus: TypeAlias = Literal["successful", "wonky", "blocked", "unknown", "failed"] + WONKY = -3 + FAILED = -2 + UNKNOWN = -1 + BLOCKED = 0 + SUCCESSFUL = 1 class QuantumInfo(TypedDict): @@ -181,35 +228,6 @@ class QuantumInfo(TypedDict): status: QuantumInfoStatus """The overall status of the quantum. Note that it is impossible to exit a wonky state. - - Possible Statuses - ----------------- - `successful`: Attempts at executing this quantum were successful. - `wonky`: The overall state of this quantum reflects inconsistencies or is - difficult to discern. There are a few specific ways to enter a wonky - state; it is impossible to exit and requires human intervention to - proceed with processing. - Currently, a quantum enters a wonky state for one of three reasons: - - Its `QuantumInfoStatus` exits a successful state. Something that - initially succeeded fails on - - A `QuantumRun` is missing logs. - - There are multiple runs associated with a dataset which comes up in a - findFirst search. This means that a dataset which will be used as an - input data product for further processing has heterogeneous inputs, - which may have had different inputs or a different data-query. - `blocked`: The quantum is not able to execute because its inputs are - missing due to an upstream failure. Blocked quanta are distinguished - from failed quanta by being successors of failed quanta in the graph. - All the successors of blocked quanta are also marked as blocked. - `unknown`: These are quanta which do not have any metadata associated - with processing, but for which it is impossible to tell the status due - to an additional absence of logs. 
Quanta which had not been processed - at all would reflect this state, as would quanta which were - conceptualized in the construction of the quantum graph but later - identified to be unneccesary or erroneous (deemed `NoWorkFound` by the - Science Pipelines). - `failed`: These quanta were attempted and failed. Failed quanta have logs - and no metadata. """ recovered: bool @@ -257,7 +275,35 @@ def _validate(self) -> DatasetRun: return self -DatasetInfoStatus: TypeAlias = Literal["visible", "shadowed", "predicted_only", "unsuccessful", "cursed"] +class DatasetInfoStatus(Enum): + """Status of the the DatasetType-dataID pair over all runs. + + Possible Statuses + ----------------- + CURSED: The dataset was the result of an unsuccessful quantum and was + visible in the output collection anyway. These are flagged as + cursed so that they may be caught before they become inputs to + further processing. + UNSUCCESSFUL: The dataset was not produced. These are the results of + failed or blocked quanta. + PREDICTED_ONLY: The dataset was predicted, and was not visible in any + run, but was the successor of a successful quantum. These datasets are + the result of pipelines NoWorkFound cases, in which a dataset is + predicted in the graph but found to not be necessary in processing. + SHADOWED: The dataset exists but is not queryable in a find_first + search. This could mean that the version of this dataset which is + passed as an input to further processing is not in the collections + given. A shadowed dataset will not be used as an input to further + processing. + VISIBLE: The dataset is queryable in a find_first search. This means + that it can be used as an input by subsequent tasks and processing. + """ + + CURSED = -2 + UNSUCCESSFUL = -1 + PREDICTED_ONLY = 0 + SHADOWED = 1 + VISIBLE = 2 class DatasetInfo(TypedDict): @@ -276,26 +322,6 @@ class DatasetInfo(TypedDict): status: DatasetInfoStatus """Overall status of the dataset. - - Possible Statuses - ----------------- - `visible`: The dataset is queryable in a find_first search. This means - that it can be used as an input by subsequent tasks and processing. - `shadowed`: The dataset exists but is not queryable in a find_first - search. This could mean that the version of this dataset which is - passed as an input to further processing is not in the collections - given. A `shadowed` dataset will not be used as an input to further - processing. - `predicted_only`: The dataset was predicted, and was not visible in any - run, but was the successor of a successful quantum. These datasets are - the result of pipelines `NoWorkFound` cases, in which a dataset is - predicted in the graph but found to not be necessary in processing. - `unsuccessful`: The dataset was not produced. These are the results of - failed or blocked quanta. - `cursed`: The dataset was the result of an unsuccessful quantum and was - visible in the output collection anyway. These are flagged as - `cursed` so that they may be caught before they become inputs to - further processing. """ messages: list[str] @@ -313,9 +339,9 @@ class UnsuccessfulQuantumSummary(pydantic.BaseModel): data_id: dict[str, DataIdValue] """The data_id of the unsuccessful quantum. """ - runs: dict[str, QuantumRunStatus] - """A dictionary including the `QuantumRunStatus` of each run associated - with an attempt to process the unsuccessful quantum. + runs: dict[str, str] + """A dictionary including the enum name of the `QuantumRunStatus` of each + run associated with an attempt to process the unsuccessful quantum. 
""" messages: list[str] """Any messages associated with the unsuccessful quantum (any clues as to @@ -334,7 +360,7 @@ def from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary: """ return cls( data_id=dict(info["data_id"].required), - runs={k: v.status for k, v in info["runs"].items()}, + runs={k: v.status.name for k, v in info["runs"].items()}, messages=info["messages"], ) @@ -407,15 +433,15 @@ def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: boo if `True`. """ match info["status"]: - case "successful": + case QuantumInfoStatus.SUCCESSFUL: self.n_successful += 1 if info["recovered"]: self.recovered_quanta.append(dict(info["data_id"].required)) - case "wonky": + case QuantumInfoStatus.WONKY: self.wonky_quanta.append(UnsuccessfulQuantumSummary.from_info(info)) - case "blocked": + case QuantumInfoStatus.BLOCKED: self.n_blocked += 1 - case "failed": + case QuantumInfoStatus.FAILED: failed_quantum_summary = UnsuccessfulQuantumSummary.from_info(info) log_key = info["log"] if do_store_logs: @@ -435,7 +461,7 @@ def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: boo [record.message for record in log if record.levelno >= logging.ERROR] ) self.failed_quanta.append(failed_quantum_summary) - case "unknown": + case QuantumInfoStatus.UNKNOWN: self.n_unknown += 1 case unrecognized_state: raise AssertionError(f"Unrecognized quantum status {unrecognized_state!r}") @@ -557,15 +583,15 @@ def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> Non specific issues. """ match info["status"]: - case "visible": + case DatasetInfoStatus.VISIBLE: self.n_visible += 1 - case "shadowed": + case DatasetInfoStatus.SHADOWED: self.n_shadowed += 1 - case "unsuccessful": + case DatasetInfoStatus.UNSUCCESSFUL: self.unsuccessful_datasets.append(dict(info["data_id"].mapping)) - case "cursed": + case DatasetInfoStatus.CURSED: self.cursed_datasets.append(CursedDatasetSummary.from_info(info, producer_info)) - case "predicted_only": + case DatasetInfoStatus.PREDICTED_ONLY: self.n_predicted_only += 1 case unrecognized_state: raise AssertionError(f"Unrecognized dataset status {unrecognized_state!r}") @@ -683,7 +709,7 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp quantum_info.setdefault("messages", []) quantum_info.setdefault("runs", {}) quantum_info.setdefault("data_id", cast(DataCoordinate, node.quantum.dataId)) - quantum_info.setdefault("status", "unknown") + quantum_info.setdefault("status", QuantumInfoStatus.UNKNOWN) quantum_info.setdefault("recovered", False) new_quanta.append(quantum_key) self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key) @@ -705,7 +731,7 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp # the dataset and set defaults for its values. dataset_info = self.get_dataset_info(dataset_key) dataset_info.setdefault("data_id", ref.dataId) - dataset_info.setdefault("status", "predicted_only") + dataset_info.setdefault("status", DatasetInfoStatus.PREDICTED_ONLY) dataset_info.setdefault("messages", []) self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key) dataset_runs = dataset_info.setdefault("runs", {}) @@ -745,14 +771,14 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp if metadata_dataset_run.produced: # if we also have logs, this is a success. 
if log_dataset_run.produced: - quantum_run.status = "successful" + quantum_run.status = QuantumRunStatus.SUCCESSFUL # if we have metadata and no logs, this is a very rare # case. either the task ran successfully and the datastore # died immediately afterwards, or some supporting # infrastructure for transferring the logs to the datastore # failed. else: - quantum_run.status = "logs_missing" + quantum_run.status = QuantumRunStatus.LOGS_MISSING # missing metadata means that the task did not finish. else: @@ -760,7 +786,7 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp # a failure in the task itself. This includes all payload # errors and some other problems. if log_dataset_run.produced: - quantum_run.status = "failed" + quantum_run.status = QuantumRunStatus.FAILED # if a quantum fails, all its successor datasets are # blocked. blocked.update(self._xgraph.successors(quantum_key)) @@ -772,11 +798,11 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp # metadata must just be missing. if blocked.isdisjoint(self._xgraph.predecessors(quantum_key)): # None of this quantum's inputs were blocked. - quantum_run.status = "metadata_missing" + quantum_run.status = QuantumRunStatus.METADATA_MISSING # otherwise we can assume from no metadata and no logs # that the task was blocked by an upstream failure. else: - quantum_run.status = "blocked" + quantum_run.status = QuantumRunStatus.BLOCKED blocked.update(self._xgraph.successors(quantum_key)) # Now we can start using state transitions to mark overall status. @@ -784,50 +810,53 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp new_status: QuantumInfoStatus match last_status, quantum_run.status: # A quantum can never escape a `wonky` state. - case ("wonky", _): - new_status = "wonky" + case (QuantumInfoStatus.WONKY, _): + new_status = QuantumInfoStatus.WONKY # Any transition to a success (excluding from `wonky`) is # a success; any transition from a failed state is also a # recovery. - case (_, "successful"): - new_status = "successful" - if last_status != "successful" and last_status != "unknown": + case (_, QuantumRunStatus.SUCCESSFUL): + new_status = QuantumInfoStatus.SUCCESSFUL + if ( + last_status != QuantumInfoStatus.SUCCESSFUL + and last_status != QuantumInfoStatus.UNKNOWN + ): quantum_info["recovered"] = True # Missing logs are one of the categories of wonky quanta. They # interfere with our ability to discern quantum status and are # signs of weird things afoot in processing. Add a message # noting why this quantum is being marked as wonky to be stored # in its `UnsuccessfulQuantumInfo`. - case (_, "logs_missing"): - new_status = "wonky" + case (_, QuantumRunStatus.LOGS_MISSING): + new_status = QuantumInfoStatus.WONKY quantum_info["messages"].append(f"Logs missing for run {output_run!r}.") # Leaving a successful state is another category of wonky # quanta. If a previous success fails on a subsequent run, # a human should inspect why. Add a message noting why this # quantum is being marked as wonky to be stored in its # `UnsuccessfulQuantumInfo`. - case ("successful", _): - new_status = "wonky" + case (QuantumInfoStatus.SUCCESSFUL, _): + new_status = QuantumInfoStatus.WONKY quantum_info["messages"].append( f"Status went from successful in run {list(quantum_info['runs'].values())[-1]!r} " f"to {quantum_run.status!r} in {output_run!r}." ) # If a quantum status is unknown and it moves to blocked, we # know for sure that it is a blocked quantum. 
- case ("unknown", "blocked"): - new_status = "blocked" + case (QuantumInfoStatus.UNKNOWN, QuantumRunStatus.BLOCKED): + new_status = QuantumInfoStatus.BLOCKED # A transition into blocked does not change the overall quantum # status for a failure. - case (_, "blocked"): + case (_, QuantumRunStatus.BLOCKED): new_status = last_status # If a quantum transitions from any state into missing # metadata, we don't have enough information to diagnose its # state. - case (_, "metadata_missing"): - new_status = "unknown" + case (_, QuantumRunStatus.METADATA_MISSING): + new_status = QuantumInfoStatus.UNKNOWN # Any transition into failure is a failed state. - case (_, "failed"): - new_status = "failed" + case (_, QuantumRunStatus.FAILED): + new_status = QuantumInfoStatus.FAILED # Update `QuantumInfo.status` for this quantum. quantum_info["status"] = new_status @@ -911,29 +940,29 @@ def __resolve_duplicates( run for run, dataset_run in dataset_info["runs"].items() if dataset_run.visible ) if any(dataset_run.visible for dataset_run in dataset_info["runs"].values()): - publish_state = "visible" + query_state = "visible" # set the publish state to `shadowed` if the dataset was # produced but not visible (i.e., not queryable from the # final collection(s)). elif any(dataset_run.produced for dataset_run in dataset_info["runs"].values()): - publish_state = "shadowed" + query_state = "shadowed" # a dataset which was not produced and not visible is # missing. else: - publish_state = "missing" + query_state = "missing" # use the quantum status and publish state to ascertain the # status of the dataset. - match (quantum_info["status"], publish_state): + match (quantum_info["status"], query_state): # visible datasets from successful quanta are as # intended. - case ("successful", "visible"): - dataset_info["status"] = "visible" + case (QuantumInfoStatus.SUCCESSFUL, "visible"): + dataset_info["status"] = DatasetInfoStatus.VISIBLE # missing datasets from successful quanta indicate a # `NoWorkFound` case. - case ("successful", "missing"): - dataset_info["status"] = "predicted_only" - case ("successful", "shadowed"): - dataset_info["status"] = "shadowed" + case (QuantumInfoStatus.SUCCESSFUL, "missing"): + dataset_info["status"] = DatasetInfoStatus.PREDICTED_ONLY + case (QuantumInfoStatus.SUCCESSFUL, "shadowed"): + dataset_info["status"] = DatasetInfoStatus.SHADOWED # If anything other than a successful quantum produces # a visible dataset, that dataset is cursed. Set the # status for the dataset to cursed and note the reason @@ -943,9 +972,9 @@ def __resolve_duplicates( # without using tagged collections, so flag them as # merely unsuccessful unless the user requests it. if dataset_type_name.endswith("_log") and not curse_failed_logs: - dataset_info["status"] = "unsuccessful" + dataset_info["status"] = DatasetInfoStatus.UNSUCCESSFUL else: - dataset_info["status"] = "cursed" + dataset_info["status"] = DatasetInfoStatus.CURSED dataset_info["messages"].append( f"Unsuccessful dataset {dataset_type_name} visible in " "final output collection." @@ -954,9 +983,9 @@ def __resolve_duplicates( # visible and not successful) is a regular # failure. case _: - dataset_info["status"] = "unsuccessful" + dataset_info["status"] = DatasetInfoStatus.UNSUCCESSFUL if len(visible_runs) > 1: - quantum_info["status"] = "wonky" + quantum_info["status"] = QuantumInfoStatus.WONKY quantum_info["messages"].append( f"Outputs from different runs of the same quanta were visible: {visible_runs}." 
) From 853b1dacd5c089ae2e8c237e35b8b4dc0a67b547 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Wed, 18 Sep 2024 16:11:42 -0700 Subject: [PATCH 17/18] Clean up documentation --- .../pipe/base/quantum_provenance_graph.py | 115 ++++++++++++------ 1 file changed, 75 insertions(+), 40 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 86abbbd06..1881df596 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -181,12 +181,13 @@ class QuantumInfoStatus(Enum): proceed with processing. Currently, a quantum enters a wonky state for one of three reasons: - Its `QuantumInfoStatus` exits a successful state. Something that - initially succeeded fails on + initially succeeded fails on subsequent attempts. - A `QuantumRun` is missing logs. - - There are multiple runs associated with a dataset which comes up in a - findFirst search. This means that a dataset which will be used as an - input data product for further processing has heterogeneous inputs, - which may have had different inputs or a different data-query. + - There are multiple runs associated with a dataset, and this comes up + in a findFirst search. This means that a dataset which will be used + as an input data product for further processing has heterogeneous + inputs, which may have had different inputs or a different + data-query. FAILED = -2: These quanta were attempted and failed. Failed quanta have logs and no metadata. UNKNOWN = -1: These are quanta which do not have any metadata associated @@ -345,7 +346,7 @@ class UnsuccessfulQuantumSummary(pydantic.BaseModel): """ messages: list[str] """Any messages associated with the unsuccessful quantum (any clues as to - why the quantum may be in a `failed` or `wonky` state). + why the quantum may be in a `FAILED` or `WONKY` state). """ @classmethod @@ -398,17 +399,17 @@ def n_failed(self) -> int: failed_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) """A list of all `UnsuccessfulQuantumSummary` objects associated with the - `failed` quanta. This is a report containing their data IDs, the status + FAILED quanta. This is a report containing their data IDs, the status of each run associated with each `failed` quantum, and the error messages associated with the failures when applicable. """ recovered_quanta: list[dict[str, DataIdValue]] = pydantic.Field(default_factory=list) - """A list of the quanta which moved from an unsuccessful to `successful` + """A list of the quanta which moved from an unsuccessful to SUCCESSFUL state. """ wonky_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) """A list of all `UnsuccessfulQuantumSummary` objects associated with the - `wonky` quanta. This is a report containing their data_ids, the status of + WONKY quanta. This is a report containing their data_ids, the status of each run associated with each `wonky` quantum, and messages (dictated in this module) associated with the particular issue identified. """ @@ -468,22 +469,22 @@ def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: boo class CursedDatasetSummary(pydantic.BaseModel): - """A summary of all the relevant information on a `cursed` dataset.""" + """A summary of all the relevant information on a cursed dataset.""" producer_data_id: dict[str, DataIdValue] """The data_id of the task which produced this dataset. 
This is mostly useful for people wishing to track down the task which produced this
-    `cursed` dataset quickly.
+    cursed dataset quickly.
     """
     data_id: dict[str, DataIdValue]
-    """The data_id of the cursed `Dataset`.
+    """The data_id of the cursed dataset.
     """
     runs_produced: dict[str, bool]
-    """A dictionary of all the runs associated with the `cursed` dataset;
+    """A dictionary of all the runs associated with the cursed dataset;
     the `bool` is true if the dataset was produced in the associated run.
     """
     run_visible: str | None
-    """A dictionary of all `visible` runs containing the `cursed` dataset.
+    """The name of a run in which the cursed dataset is visible, if any.
     """
     messages: list[str]
     """Any diagnostic messages (dictated in this module) which might help in
@@ -556,7 +557,7 @@ def n_unsuccessful(self) -> int:

     cursed_datasets: list[CursedDatasetSummary] = pydantic.Field(default_factory=list)
     """A list of all `CursedDatasetSummary` objects associated with the
-    `cursed` datasets. This is a report containing their data_ids and the
+    cursed datasets. This is a report containing their data_ids and the
     data_ids of their producer task, the status of each run associated with
     each `cursed` dataset, and messages (dictated in this module) associated
     with the particular issue identified.
@@ -579,7 +580,7 @@ def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> Non
         producer_info : `QuantumInfo`
             The `QuantumInfo` object associated with the producer of the
             dataset. This is used to report the producer task in the
-            summaries for `cursed` datasets, which may help identify
+            summaries for cursed datasets, which may help identify
             specific issues.
         """
         match info["status"]:
@@ -599,7 +600,7 @@ def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> Non

 class Summary(pydantic.BaseModel):
     """A summary of the contents of the QuantumProvenanceGraph, including
-    all information on the quanta for each `Task` and the datasets of each
+    all information on the quanta for each task and the datasets of each
     `DatasetType`.
     """

@@ -618,11 +619,14 @@ class QuantumProvenanceGraph:

     Step through all the quantum graphs associated with certain tasks or
     processing steps. For each graph/attempt, the status of each quantum and
-    dataset is recorded in `QuantumProvenanceGraph.add_new_graph` and outcomes
-    of quanta over multiple runs are resolved in
-    `QuantumProvenanceGraph.resolve_duplicates`. At the end of this process,
-    we can combine all attempts into a summary. This serves to answer the
-    question "What happened to this data ID?" in a wholistic sense.
+    dataset is recorded in `QuantumProvenanceGraph.__add_new_graph` and
+    outcomes of quanta over multiple runs are resolved in
+    `QuantumProvenanceGraph.__resolve_duplicates`. Both are called in the
+    correct order by the public method
+    `QuantumProvenanceGraph.assemble_quantum_provenance_graph`. At the end of
+    this process, we can combine all attempts into a summary using the
+    `QuantumProvenanceGraph.to_summary` method. This serves to answer the
+    question 'What happened to this data ID?' in a holistic sense.
     """

     def __init__(self) -> None:
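[Editorial note: the class docstring above describes the whole workflow, so a usage sketch may help. This is a minimal, illustrative example only; the repository path and graph file names are hypothetical, and `do_store_logs` is disabled just to keep the sketch self-contained.]

```python
from lsst.daf.butler import Butler

from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph

# Hypothetical repository and saved quantum graphs, oldest attempt first.
butler = Butler("/repo/main")
qpg = QuantumProvenanceGraph()
qpg.assemble_quantum_provenance_graph(butler, ["attempt1.qgraph", "attempt2.qgraph"])

# Summarize every task and dataset type across both attempts; `Summary` is a
# pydantic model, so it serializes directly to JSON.
summary = qpg.to_summary(butler, do_store_logs=False)
print(summary.model_dump_json(indent=2))
```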
If a quantum is given `blocked` - or `failed` status, annotate all their successors in the graph - as `blocked`. For each new quantum, use the transition between + `DatasetRun.produced = True`. If a quantum is given BLOCKED + or FAILED status, annotate all their successors in the graph + as BLOCKED. For each new quantum, use the transition between the current and last `QuantumRun.status` to determine the status to assign to the overall `QuantumInfo`. For example, if a - previous run associated with a quantum had the status `failed`, - and the status from the new graph reads `successful`, we can - mark the overall quantum status as `successful` and list the data_id - as `recovered`. + previous run associated with a quantum had the status FAILED, + and the status from the new graph reads SUCCESSFUL, we can + mark the overall quantum status as SUCCESSFUL and list the data_id + as RECOVERED. Parameters ---------- @@ -780,7 +784,6 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp else: quantum_run.status = QuantumRunStatus.LOGS_MISSING # missing metadata means that the task did not finish. - else: # if we have logs and no metadata, the task not finishing is # a failure in the task itself. This includes all payload @@ -809,10 +812,10 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp last_status = quantum_info["status"] new_status: QuantumInfoStatus match last_status, quantum_run.status: - # A quantum can never escape a `wonky` state. + # A quantum can never escape a WONKY state. case (QuantumInfoStatus.WONKY, _): new_status = QuantumInfoStatus.WONKY - # Any transition to a success (excluding from `wonky`) is + # Any transition to a success (excluding from WONKY) is # a success; any transition from a failed state is also a # recovery. case (_, QuantumRunStatus.SUCCESSFUL): @@ -871,13 +874,13 @@ def __resolve_duplicates( to the `QuantumProvenanceGraph, resolve any discrepancies between them and use all attempts to finalize overall status. - Particularly, use the publish state of each `DatasetRun` in combination - with overall quantum status to ascertain the status of each dataset. + Particularly, use the state of each `DatasetRun` in combination with + overall quantum status to ascertain the status of each dataset. Additionally, if there are multiple visible runs associated with a - dataset, mark the producer quantum as `wonky`. + dataset, mark the producer quantum as WONKY. This method should be called after - `QuantumProvenanceGraph.add_new_graph` has been called on every graph + `QuantumProvenanceGraph.__add_new_graph` has been called on every graph associated with the data processing. Parameters @@ -894,12 +897,12 @@ def __resolve_duplicates( A "where" string to use to constrain the collections, if passed. curse_failed_logs : `bool` - Mark log datasets as `cursed` if they are visible in the final + Mark log datasets as CURSED if they are visible in the final output collection. Note that a campaign-level collection must be used here for `collections` if `curse_failed_logs` is `True`; if - `resolve_duplicates` is run on a list of group-level collections - then each will show logs from their own failures as visible - the datasets will show as cursed regardless of this flag. + `__resolve_duplicates` is run on a list of group-level collections + then each will only show log datasets from their own failures as + visible and datasets from others will be marked as cursed. 
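[Editorial note: the "visible" notion in the docstring above boils down to a find-first dataset query of the final collection(s). The standalone sketch below is not the method's actual implementation; it assumes a `butler`, a `dataset_info` mapping obtained from `get_dataset_info`, and hypothetical dataset type and collection names.]

```python
# Collect the dataset IDs that win a find-first search of the final
# campaign-level collection.
visible_ids = {
    ref.id
    for ref in butler.registry.queryDatasets(
        "isr_log", collections=["u/eiger/DM-1234"], findFirst=True
    )
}

# A DatasetRun is visible only if it was produced and its ID won the search.
for dataset_run in dataset_info["runs"].values():
    dataset_run.visible = dataset_run.produced and dataset_run.id in visible_ids
```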
""" # First thing: raise an error if resolve_duplicates has been run # before on this qpg. @@ -1008,6 +1011,35 @@ def assemble_quantum_provenance_graph( where: str = "", curse_failed_logs: bool = False, ) -> None: + """Assemble the quantum provenance graph from a list of all graphs + corresponding to processing attempts. + + This method calls the private method `__add_new_graph` on each of the + constituent graphs, verifying that the graphs have been passed in + order. After `__add_new_graph` has been called on all graphs in the + `Sequence`, the method calls `__resolve_duplicates`. + + Parameters + ---------- + butler : `lsst.daf.butler.Butler` + The Butler used for this report. This should match the Butler used + for the run associated with the executed quantum graph. + qgraphs : `Sequence`[`QuantumGraph` | `ResourcePathExpression`] + A list of either quantum graph objects or their uri's, to be used + to assemble the `QuantumProvenanceGraph`. + collections : `Sequence[str]` | `None` + Collections to use in `lsst.daf.butler.registry.queryDatasets` if + paring down the query would be useful. + where : `str` + A "where" string to use to constrain the collections, if passed. + curse_failed_logs : `bool` + Mark log datasets as CURSED if they are visible in the final + output collection. Note that a campaign-level collection must be + used here for `collections` if `curse_failed_logs` is `True`; if + `__resolve_duplicates` is run on a list of group-level collections + then each will only show log datasets from their own failures as + visible and datasets from others will be marked as cursed. + """ output_runs = [] for count, graph in enumerate(qgraphs): qgraph = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph) @@ -1033,6 +1065,9 @@ def assemble_quantum_provenance_graph( # If the user has not passed a `collections` variable if not collections: collections = list(reversed(output_runs)) + assert ( + not curse_failed_logs + ), "curse_failed_logs option must be used with one campaign-level collection." self.__resolve_duplicates(butler, collections, where, curse_failed_logs) def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary: From e5d3d756fe483daaac26245f9c6f673d22319564 Mon Sep 17 00:00:00 2001 From: Orion Eiger Date: Fri, 20 Sep 2024 20:52:25 -0700 Subject: [PATCH 18/18] Respond to review feedback --- .../pipe/base/quantum_provenance_graph.py | 131 +++++++++++------- tests/test_quantum_provenance_graph.py | 51 +++---- 2 files changed, 106 insertions(+), 76 deletions(-) diff --git a/python/lsst/pipe/base/quantum_provenance_graph.py b/python/lsst/pipe/base/quantum_provenance_graph.py index 1881df596..58f036e5b 100644 --- a/python/lsst/pipe/base/quantum_provenance_graph.py +++ b/python/lsst/pipe/base/quantum_provenance_graph.py @@ -85,7 +85,7 @@ class QuantumKey(NamedTuple): class DatasetKey(NamedTuple): """Identifier type for dataset keys in a `QuantumProvenanceGraph`.""" - parent_dataset_type_name: str + dataset_type_name: str """Name of the dataset type (never a component).""" data_id_values: tuple[DataIdValue, ...] @@ -117,7 +117,7 @@ class PrerequisiteDatasetKey(NamedTuple): dataset ID (UUID) instead. """ - parent_dataset_type_name: str + dataset_type_name: str """Name of the dataset type (never a component).""" dataset_id_bytes: bytes @@ -180,8 +180,9 @@ class QuantumInfoStatus(Enum): state; it is impossible to exit and requires human intervention to proceed with processing. 
@@ -180,8 +180,9 @@ class QuantumInfoStatus(Enum):
     state; it is impossible to exit and requires human intervention to
     proceed with processing.
     Currently, a quantum enters a wonky state for one of three reasons:
-    - Its `QuantumInfoStatus` exits a successful state. Something that
-      initially succeeded fails on subsequent attempts.
+    - Its overall `QuantumInfoStatus` moves from a successful state (as a
+      result of a successful run) to any other state. In other words,
+      something that initially succeeded fails on subsequent attempts.
     - A `QuantumRun` is missing logs.
     - There are multiple runs associated with a dataset, and this comes up
       in a findFirst search. This means that a dataset which will be used
       as an input data product for further processing has heterogeneous
       inputs, which may have had different inputs or a different
       data-query.
@@ -258,7 +259,7 @@ class DatasetRun(pydantic.BaseModel):
     """

     produced: bool = False
-    """Whether the specific run produced the dataset.
+    """Whether the specific run wrote the dataset.
     """

     visible: bool = False
@@ -271,13 +272,20 @@ class DatasetRun(pydantic.BaseModel):
     def _validate(self) -> DatasetRun:
         """Validate the model for `DatasetRun` by asserting that it cannot
         be visible without having been produced (this should be impossible).
+
+        Returns
+        -------
+        self : `DatasetRun`
+            The `DatasetRun` object, validated.
         """
         assert not (self.visible and not self.produced)
         return self


 class DatasetInfoStatus(Enum):
-    """Status of the the DatasetType-dataID pair over all runs.
+    """Status of the DatasetType-dataID pair over all runs. This depends
+    not only on the presence of the dataset itself, but also on metadata, logs
+    and the state of its producer quantum.

     Possible Statuses
     -----------------
@@ -341,12 +349,13 @@ class UnsuccessfulQuantumSummary(pydantic.BaseModel):
     """The data_id of the unsuccessful quantum.
     """
     runs: dict[str, str]
-    """A dictionary including the enum name of the `QuantumRunStatus` of each
-    run associated with an attempt to process the unsuccessful quantum.
+    """A dictionary mapping the name of each output run collection associated
+    with an attempt to process the unsuccessful quantum to the enum name of
+    its `QuantumRunStatus`.
     """
     messages: list[str]
     """Any messages associated with the unsuccessful quantum (any clues as to
-    why the quantum may be in a `FAILED` or `WONKY` state).
+    why the quantum may be in a FAILED or WONKY state).
     """

     @classmethod
@@ -358,6 +367,17 @@ def from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary:
         ----------
         info : `QuantumInfo`
             The `QuantumInfo` object for the unsuccessful quantum.
+
+        Returns
+        -------
+        summary : `UnsuccessfulQuantumSummary`
+            A Pydantic model containing the dataID, run collection names (and
+            each of their `QuantumRunStatus` enum names) as well as messages
+            which may offer clues about the nature of the problem. For
+            failed quanta, these are usually error messages from the butler
+            logs. For wonky quanta, these can be messages generated during the
+            assembly of the `QuantumProvenanceGraph` that describe why it was
+            marked as wonky.
         """
         return cls(
             data_id=dict(info["data_id"].required),
@@ -404,8 +424,8 @@ def n_failed(self) -> int:
         associated with the failures when applicable.
     """
     recovered_quanta: list[dict[str, DataIdValue]] = pydantic.Field(default_factory=list)
-    """A list of the quanta which moved from an unsuccessful to SUCCESSFUL
-    state.
+    """A list of the data IDs (as key-value mappings) of quanta which moved
+    from an unsuccessful to a successful state.
     """
     wonky_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list)
     """A list of all `UnsuccessfulQuantumSummary` objects associated with the
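[Editorial note: the `_validate` hook documented above is pydantic's "after" model validator pattern. A condensed, runnable sketch of the model follows; the field types are assumptions inferred from how `DatasetRun(id=ref.id)` is constructed elsewhere in the patch.]

```python
import uuid

import pydantic


class DatasetRun(pydantic.BaseModel):
    id: uuid.UUID
    produced: bool = False
    visible: bool = False

    @pydantic.model_validator(mode="after")
    def _validate(self) -> "DatasetRun":
        # Visibility implies production: a dataset cannot win a find-first
        # query of the final collections if no run ever wrote it.
        assert not (self.visible and not self.produced)
        return self


DatasetRun(id=uuid.uuid4(), produced=True, visible=True)  # passes
# DatasetRun(id=uuid.uuid4(), visible=True)               # would fail
```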
""" wonky_quanta: list[UnsuccessfulQuantumSummary] = pydantic.Field(default_factory=list) """A list of all `UnsuccessfulQuantumSummary` objects associated with the @@ -450,9 +470,7 @@ def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: boo try: # should probably upgrade this to use a dataset # ref - log = butler.get( - log_key.parent_dataset_type_name, info["data_id"], collections=run - ) + log = butler.get(log_key.dataset_type_name, info["data_id"], collections=run) except LookupError: failed_quantum_summary.messages.append(f"Logs not ingested for {run!r}") except FileNotFoundError: @@ -503,6 +521,17 @@ def from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatas producer_info : `QuantumInfo` All relevant information on the producer task. This is used to report the data_id of the producer task. + + Returns + ------- + summary : `CursedDatasetSummary` + A Pydantic model containing the dataID of the task which produced + this cursed dataset, the dataID associated with the cursed dataset, + run collection names (and their `DatasetRun` information) as well + as any messages which may point to any clues about the nature of + the problem. These are be messages generated during the assembly of + the `QuantumProvenanceGraph` that describe why it was marked as + cursed. """ runs_visible = {k for k, v in info["runs"].items() if v.visible} return cls( @@ -650,6 +679,12 @@ def get_quantum_info(self, key: QuantumKey) -> QuantumInfo: ---------- key : `QuantumKey` The key used to refer to the node on the graph. + + Returns + ------- + quantum_info : `QuantumInfo` + The `TypedDict` with information on the task label-dataID pair + across all runs. """ return self._xgraph.nodes[key] @@ -661,15 +696,23 @@ def get_dataset_info(self, key: DatasetKey) -> DatasetInfo: ---------- key : `DatasetKey` The key used to refer to the node on the graph. + + Returns + ------- + dataset_info : `DatasetInfo` + The `TypedDict` with information about the `DatasetType`-dataID + pair across all runs. """ return self._xgraph.nodes[key] def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExpression) -> None: """Add a new quantum graph to the `QuantumProvenanceGraph`. - Step through the quantum graph. Annotate a `networkx.DiGraph` - (`QuantumProvenanceGraph._xgraph`) with all of the relevant - information: quanta, dataset types and their associated run + Notes + ----- + The algorithm: step through the quantum graph. Annotate a + `networkx.DiGraph` (`QuantumProvenanceGraph._xgraph`) with all of the + relevant information: quanta, dataset types and their associated run collections (these unique quanta- and dataset type-run collection combinations are encapsulated in the classes `DatasetRun` and `QuantumRun`). For each new quantum, annotate @@ -702,7 +745,8 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp output_run = qgraph.metadata["output_run"] new_quanta = [] for node in qgraph: - # make a key to refer to the quantum and add it to the graph. + # make a key to refer to the quantum and add it to the quantum + # provenance graph. 
@@ -737,15 +781,15 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp
             dataset_info.setdefault("data_id", ref.dataId)
             dataset_info.setdefault("status", DatasetInfoStatus.PREDICTED_ONLY)
             dataset_info.setdefault("messages", [])
-            self._datasets.setdefault(dataset_key.parent_dataset_type_name, set()).add(dataset_key)
+            self._datasets.setdefault(dataset_key.dataset_type_name, set()).add(dataset_key)
             dataset_runs = dataset_info.setdefault("runs", {})
             # make a `DatasetRun` for the specific dataset-run
             # collection combination.
             dataset_runs[output_run] = DatasetRun(id=ref.id)
             # save metadata and logs for easier status interpretation later
-            if dataset_key.parent_dataset_type_name.endswith("_metadata"):
+            if dataset_key.dataset_type_name.endswith("_metadata"):
                 quantum_info["metadata"] = dataset_key
-            if dataset_key.parent_dataset_type_name.endswith("_log"):
+            if dataset_key.dataset_type_name.endswith("_log"):
                 quantum_info["log"] = dataset_key
         for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
             dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values)
@@ -776,12 +820,12 @@ def __add_new_graph(self, butler: Butler, qgraph: QuantumGraph | ResourcePathExp
             # if we also have logs, this is a success.
             if log_dataset_run.produced:
                 quantum_run.status = QuantumRunStatus.SUCCESSFUL
-            # if we have metadata and no logs, this is a very rare
-            # case. either the task ran successfully and the datastore
-            # died immediately afterwards, or some supporting
-            # infrastructure for transferring the logs to the datastore
-            # failed.
             else:
+                # if we have metadata and no logs, this is a very rare
+                # case. Either the task ran successfully and the datastore
+                # died immediately afterwards, or some supporting
+                # infrastructure for transferring the logs to the datastore
+                # failed.
                 quantum_run.status = QuantumRunStatus.LOGS_MISSING
             # missing metadata means that the task did not finish.
             else:
@@ -889,7 +933,7 @@ def __resolve_duplicates(
             The Butler used for this report. This should match the Butler used
             for the run associated with the executed quantum graph.

-        collections : `Sequence[str]` | `None`
+        collections : `Sequence` [`str`] | `None`
             Collections to use in `lsst.daf.butler.registry.queryDatasets` if
             paring down the query would be useful.
@@ -995,7 +1039,7 @@ def __resolve_duplicates(
             for dataset_key in self.iter_outputs_of(quantum_key):
                 dataset_info = self.get_dataset_info(dataset_key)
                 quantum_info["messages"].append(
-                    f"{dataset_key.parent_dataset_type_name}"
+                    f"{dataset_key.dataset_type_name} "
                     + f"from {str(dataset_info['runs'])};"
                     + f"{str(dataset_info['status'])}"
                 )
@@ -1024,10 +1068,10 @@ def assemble_quantum_provenance_graph(
         butler : `lsst.daf.butler.Butler`
             The Butler used for this report. This should match the Butler used
             for the run associated with the executed quantum graph.
-        qgraphs : `Sequence`[`QuantumGraph` | `ResourcePathExpression`]
+        qgraphs : `Sequence` [`QuantumGraph` | `ResourcePathExpression`]
             A list of either quantum graph objects or their URIs, to be used
             to assemble the `QuantumProvenanceGraph`.
-        collections : `Sequence[str]` | `None`
+        collections : `Sequence` [`str`] | `None`
             Collections to use in `lsst.daf.butler.registry.queryDatasets` if
             paring down the query would be useful.
         where : `str`
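[Editorial note: the loop rewritten in the next hunk drops the timestamp check but keeps the ordering contract: graphs are passed oldest-first, while the query collections come out newest-first. A toy illustration of why the reversal matters, with hypothetical run collection names.]

```python
# Output runs are appended as graphs are added, so they end up oldest-first.
output_runs = ["u/eiger/step1/run1", "u/eiger/step1/run2", "u/eiger/step1/run3"]

# Find-first queries search collections in order, so the newest attempt must
# come first for its outputs to shadow those of earlier attempts.
collections = list(reversed(output_runs))
assert collections[0] == "u/eiger/step1/run3"  # most recent attempt wins
```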
@@ -1041,29 +1085,15 @@ def assemble_quantum_provenance_graph(
         """
         output_runs = []
-        for count, graph in enumerate(qgraphs):
+        for graph in qgraphs:
             qgraph = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph)
             assert qgraph.metadata is not None, "Saved QGs always have metadata."
-            # If the most recent graph's timestamp was earlier than any of the
-            # previous graphs, raise a RuntimeError.
-            if len(qgraphs) > 1:
-                for previous_graph in qgraphs[: count - 1]:
-                    previous_graph = (
-                        previous_graph
-                        if isinstance(previous_graph, QuantumGraph)
-                        else QuantumGraph.loadUri(previous_graph)
-                    )
-                    if qgraph.metadata["time"] < previous_graph.metadata["time"]:
-                        raise RuntimeError(
-                            """add_new_graph may only be called on graphs
-                            which are passed in the order they were
-                            created. Please call again, passing your
-                            graphs in order."""
-                        )
             self.__add_new_graph(butler, qgraph)
             output_runs.append(qgraph.metadata["output_run"])
         # If the user has not passed a `collections` variable
         if not collections:
+            # We reverse the order of the associated output runs because the
+            # query in __resolve_duplicates must be done most recent-first.
             collections = list(reversed(output_runs))
             assert (
                 not curse_failed_logs
             ), "curse_failed_logs option must be used with one campaign-level collection."
         self.__resolve_duplicates(butler, collections, where, curse_failed_logs)

     def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary:
@@ -1082,7 +1112,7 @@ def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary:

         Returns
         -------
-        summary : `Summary`
+        result : `Summary`
             A struct containing counts of quanta and datasets in each
             of the overall states defined in `QuantumInfo` and `DatasetInfo`,
             as well as diagnostic information and error messages for failed
@@ -1137,6 +1167,11 @@ def get_producer_of(self, dataset_key: DatasetKey) -> QuantumKey:
         ----------
         dataset_key : `DatasetKey`
             The key for the dataset whose producer quantum is needed.
+
+        Returns
+        -------
+        result : `QuantumKey`
+            The key for the quantum which produced the dataset.
         """
         (result,) = self._xgraph.predecessors(dataset_key)
         return result
diff --git a/tests/test_quantum_provenance_graph.py b/tests/test_quantum_provenance_graph.py
index f237674a7..cdb2c2c91 100644
--- a/tests/test_quantum_provenance_graph.py
+++ b/tests/test_quantum_provenance_graph.py
@@ -31,7 +31,7 @@

 import unittest

-from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph, TaskSummary
+from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph
 from lsst.pipe.base.tests import simpleQGraph
 from lsst.utils.tests import temporaryDirectory

@@ -61,20 +61,15 @@ def test_qpg_reports(self) -> None:
         # We know that we have one expected task that was not run.
         # As such, the following checks should describe all of
         # the mock tasks.
- self.assertEqual( - task_summary, - TaskSummary( - n_successful=0, - n_blocked=0, - n_unknown=1, - n_expected=1, - failed_quanta=[], - recovered_quanta=[], - wonky_quanta=[], - n_wonky=0, - n_failed=0, - ), - ) + self.assertEqual(task_summary.n_successful, 0) + self.assertEqual(task_summary.n_blocked, 0) + self.assertEqual(task_summary.n_unknown, 1) + self.assertEqual(task_summary.n_expected, 1) + self.assertListEqual(task_summary.failed_quanta, []) + self.assertListEqual(task_summary.recovered_quanta, []) + self.assertListEqual(task_summary.wonky_quanta, []) + self.assertEqual(task_summary.n_wonky, 0) + self.assertEqual(task_summary.n_failed, 0) expected_mock_datasets = [ "add_dataset1", "add2_dataset1", @@ -115,16 +110,16 @@ def test_qpg_reports(self) -> None: self.assertListEqual(dataset_type_summary.cursed_datasets, []) # Make sure we have the right datasets based on our mock self.assertIn(dataset_type_name, expected_mock_datasets) - # Make sure the expected datasets were produced by the expected - # tasks - match dataset_type_name: - case name if name in ["add_dataset1", "add2_dataset1", "task0_metadata", "task0_log"]: - self.assertEqual(dataset_type_summary.producer, "task0") - case name if name in ["add_dataset2", "add2_dataset2", "task1_metadata", "task1_log"]: - self.assertEqual(dataset_type_summary.producer, "task1") - case name if name in ["add_dataset3", "add2_dataset3", "task2_metadata", "task2_log"]: - self.assertEqual(dataset_type_summary.producer, "task2") - case name if name in ["add_dataset4", "add2_dataset4", "task3_metadata", "task3_log"]: - self.assertEqual(dataset_type_summary.producer, "task3") - case name if name in ["add_dataset5", "add2_dataset5", "task4_metadata", "task4_log"]: - self.assertEqual(dataset_type_summary.producer, "task4") + # Make sure the expected datasets were produced by the expected + # tasks + match dataset_type_name: + case name if name in ["add_dataset1", "add2_dataset1", "task0_metadata", "task0_log"]: + self.assertEqual(dataset_type_summary.producer, "task0") + case name if name in ["add_dataset2", "add2_dataset2", "task1_metadata", "task1_log"]: + self.assertEqual(dataset_type_summary.producer, "task1") + case name if name in ["add_dataset3", "add2_dataset3", "task2_metadata", "task2_log"]: + self.assertEqual(dataset_type_summary.producer, "task2") + case name if name in ["add_dataset4", "add2_dataset4", "task3_metadata", "task3_log"]: + self.assertEqual(dataset_type_summary.producer, "task3") + case name if name in ["add_dataset5", "add2_dataset5", "task4_metadata", "task4_log"]: + self.assertEqual(dataset_type_summary.producer, "task4")
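[Editorial note: one more check would fit naturally in this test. It relies on the five quantum states partitioning the expected quanta, an invariant implied by the enum definitions earlier in the series but not asserted anywhere in the module; the helper name is hypothetical.]

```python
# In the same test module (unittest is already imported above): per-status
# counts for a task should always add up to the number of expected quanta.
def assert_quanta_partitioned(case: unittest.TestCase, task_summary) -> None:
    case.assertEqual(
        task_summary.n_successful
        + task_summary.n_wonky
        + task_summary.n_blocked
        + task_summary.n_failed
        + task_summary.n_unknown,
        task_summary.n_expected,
    )
```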