From 0d686a1b21f9e8395ead4d22f4e2e0a8bcf5b826 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Mon, 12 Feb 2024 14:43:35 -0500 Subject: [PATCH] Add hacky kind that can modify upstream parts of the graph --- taskcluster/kinds/all-pr/kind.yml | 1 + taskcluster/kinds/all/kind.yml | 1 + taskcluster/kinds/modify-graph/kind.yml | 35 +++++++++++++ .../translations_taskgraph/actions/train.py | 3 ++ .../translations_taskgraph/parameters.py | 2 + .../transforms/modify_graph.py | 49 +++++++++++++++++++ 6 files changed, 91 insertions(+) create mode 100644 taskcluster/kinds/modify-graph/kind.yml create mode 100644 taskcluster/translations_taskgraph/transforms/modify_graph.py diff --git a/taskcluster/kinds/all-pr/kind.yml b/taskcluster/kinds/all-pr/kind.yml index bd22887ae..1981d5410 100644 --- a/taskcluster/kinds/all-pr/kind.yml +++ b/taskcluster/kinds/all-pr/kind.yml @@ -17,6 +17,7 @@ kind-dependencies: - evaluate - evaluate-quantized - evaluate-teacher-ensemble + - modify-graph tasks: all: diff --git a/taskcluster/kinds/all/kind.yml b/taskcluster/kinds/all/kind.yml index 1d3b4d5f5..cdf9716cb 100644 --- a/taskcluster/kinds/all/kind.yml +++ b/taskcluster/kinds/all/kind.yml @@ -16,6 +16,7 @@ kind-dependencies: - evaluate - evaluate-quantized - evaluate-teacher-ensemble + - modify-graph tasks: all: diff --git a/taskcluster/kinds/modify-graph/kind.yml b/taskcluster/kinds/modify-graph/kind.yml new file mode 100644 index 000000000..01f9d3ba3 --- /dev/null +++ b/taskcluster/kinds/modify-graph/kind.yml @@ -0,0 +1,35 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- + +loader: taskgraph.loader.transform:loader + +transforms: + - translations_taskgraph.transforms.modify_graph + - taskgraph.transforms.task_context + - taskgraph.transforms.task:transforms + +kind-dependencies: + - clean-corpus + - bicleaner + - merge-corpus + +tasks: + all: + description: Modify upstream parts of the graph + attributes: + stage: all + src_locale: "{src_locale}" + trg_locale: "{trg_locale}" + + task-context: + from-parameters: + src_locale: training_config.experiment.src + trg_locale: training_config.experiment.trg + substitution-fields: + - attributes + + run-on-tasks-for: [] + expires-after: "90 days" + worker-type: succeed diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py index 0d786d3db..e2b697f64 100644 --- a/taskcluster/translations_taskgraph/actions/train.py +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -144,6 +144,9 @@ def validate_pretrained_models(params): }, "bicleaner": { "properties": { + "disable": { + "type": "string", + }, "default-threshold": { "type": "number", "description": "bicleaner threshold", diff --git a/taskcluster/translations_taskgraph/parameters.py b/taskcluster/translations_taskgraph/parameters.py index dfc833141..e0c381440 100644 --- a/taskcluster/translations_taskgraph/parameters.py +++ b/taskcluster/translations_taskgraph/parameters.py @@ -27,6 +27,7 @@ def get_defaults(_): "best-model": "chrf", "use-opuscleaner": "true", "bicleaner": { + "disable": "true", "default-threshold": 0.5, "dataset-thresholds": { "opus_ada83/v1": 0.0, @@ -126,6 +127,7 @@ def get_defaults(_): Required("best-model"): str, Required("use-opuscleaner"): str, Required("bicleaner"): { + Required("disable"): str, Required("default-threshold"): float, Optional("dataset-thresholds"): { str: float, diff --git a/taskcluster/translations_taskgraph/transforms/modify_graph.py b/taskcluster/translations_taskgraph/transforms/modify_graph.py new file mode 100644 index 
000000000..ec08d24f4
--- /dev/null
+++ b/taskcluster/translations_taskgraph/transforms/modify_graph.py
@@ -0,0 +1,72 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import json
+
+from taskgraph.transforms.base import TransformSequence
+
+transforms = TransformSequence()
+
+
+@transforms.add
+def enable_or_disable_bicleaner(config, jobs):
+    """Point the `merge-corpus` task at `bicleaner` or `clean-corpus` output.
+
+    `jobs` is yielded through untouched; this transform exists for its side
+    effects on the `merge-corpus` task found in
+    `config.kind_dependencies_tasks`. That task gains dependencies on the
+    tasks of the selected upstream kind and one extra `MOZ_FETCHES` entry
+    referencing that kind.
+
+    This is a generator, so nothing happens until it is iterated.
+
+    Raises:
+        Exception: if no `merge-corpus` task is among the kind dependencies.
+    """
+    dep_tasks = config.kind_dependencies_tasks
+
+    # If several entries share the prefix, the last one wins.
+    merge_corpus_task = None
+    for name, task in dep_tasks.items():
+        if name.startswith("merge-corpus"):
+            merge_corpus_task = task
+
+    if merge_corpus_task is None:
+        raise Exception("Couldn't find merge corpus task!")
+
+    # `disable` is declared as a string ("true"/"false") in the parameters
+    # schema, so a bare truthiness test would treat "false" as disabled too.
+    # Going through `str()` keeps a real boolean working as well.
+    bicleaner_cfg = config.params["training_config"]["experiment"]["bicleaner"]
+    disabled = str(bicleaner_cfg["disable"]).lower() == "true"
+
+    # If bicleaner is disabled, we pull upstream artifacts from `clean-corpus`
+    pull_from = "clean-corpus" if disabled else "bicleaner"
+
+    # NOTE(review): dependencies are keyed by kind, so if several tasks of the
+    # selected kind match, only the last one is kept -- confirm this is intended.
+    new_deps = {
+        task.kind: task.label
+        for name, task in dep_tasks.items()
+        if name.startswith(pull_from)
+    }
+    merge_corpus_task.dependencies.update(new_deps)
+
+    # Fetches also need extending, via the `MOZ_FETCHES` env var that the `run`
+    # transform sets: https://github.com/taskcluster/taskgraph/blob/1a7dba0db709c84940fcf85a421c1dfc931f9747/src/taskgraph/transforms/run/__init__.py#L207
+    env = merge_corpus_task.task["payload"]["env"]
+    fetches = json.loads(env["MOZ_FETCHES"]["task-reference"])
+    # `MOZ_FETCHES` is a flat list of fetch objects, so append the entry itself
+    # rather than a list wrapping it.
+    # TODO: "something" is a placeholder; consult an existing task definition
+    # for the real artifact name.
+    fetches.append(
+        {"artifact": "something", "extract": True, "task": f"<{pull_from}>"}
+    )
+    env["MOZ_FETCHES"]["task-reference"] = json.dumps(fetches)
+
+    # We don't actually make any adjustments to the new jobs; this transform only
+    # exists to cause the side effects done above.
+    yield from jobs