Skip to content

Commit

Permalink
Add hacky kind that can modify upstream parts of the graph
Browse files Browse the repository at this point in the history
  • Loading branch information
bhearsum committed Feb 12, 2024
1 parent 6277e7b commit 04ee6de
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 0 deletions.
1 change: 1 addition & 0 deletions taskcluster/kinds/all-pr/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ kind-dependencies:
- evaluate
- evaluate-quantized
- evaluate-teacher-ensemble
- modify-graph

tasks:
all:
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/all/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ kind-dependencies:
- evaluate
- evaluate-quantized
- evaluate-teacher-ensemble
- modify-graph

tasks:
all:
Expand Down
35 changes: 35 additions & 0 deletions taskcluster/kinds/modify-graph/kind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---

# Kind definition for `modify-graph`: a single placeholder task whose
# `modify_graph` transform mutates tasks from upstream kinds as a side effect.
loader: taskgraph.loader.transform:loader

# `modify_graph` is the project-local transform that rewires the merge-corpus
# task; the other two entries are stock taskgraph transforms.
transforms:
- translations_taskgraph.transforms.modify_graph
- taskgraph.transforms.task_context
- taskgraph.transforms.task:transforms

# The modify_graph transform looks these kinds' tasks up by name prefix:
# it needs the merge-corpus task itself, plus whichever of bicleaner /
# clean-corpus it should pull upstream artifacts from.
kind-dependencies:
- clean-corpus
- bicleaner
- merge-corpus

tasks:
all:
description: Modify upstream parts of the graph
attributes:
stage: all
src_locale: "{src_locale}"
trg_locale: "{trg_locale}"

# The {src_locale}/{trg_locale} placeholders in `attributes` are filled from
# the training config parameters named below.
task-context:
from-parameters:
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
substitution-fields:
- attributes

run-on-tasks-for: []
expires-after: "90 days"
worker-type: succeed
3 changes: 3 additions & 0 deletions taskcluster/translations_taskgraph/actions/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ def validate_pretrained_models(params):
},
"bicleaner": {
"properties": {
"disable": {
"type": "string",
},
"default-threshold": {
"type": "number",
"description": "bicleaner threshold",
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/translations_taskgraph/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def get_defaults(_):
"best-model": "chrf",
"use-opuscleaner": "true",
"bicleaner": {
"disable": "true",
"default-threshold": 0.5,
"dataset-thresholds": {
"opus_ada83/v1": 0.0,
Expand Down Expand Up @@ -126,6 +127,7 @@ def get_defaults(_):
Required("best-model"): str,
Required("use-opuscleaner"): str,
Required("bicleaner"): {
Required("disable"): str,
Required("default-threshold"): float,
Optional("dataset-thresholds"): {
str: float,
Expand Down
49 changes: 49 additions & 0 deletions taskcluster/translations_taskgraph/transforms/modify_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

import json

from taskgraph.transforms.base import TransformSequence

transforms = TransformSequence()


@transforms.add
def enable_or_disable_bicleaner(config, jobs):
    """Rewire the upstream merge-corpus task based on the bicleaner setting.

    This transform does not change the jobs it is given. It exists for its
    side effects on the already-generated ``merge-corpus`` task found in
    ``config.kind_dependencies_tasks``:

    * every dependency task whose name starts with ``clean-corpus`` (when
      bicleaner is disabled) or ``bicleaner`` (when it is enabled) is added
      to the merge-corpus task's dependencies, and
    * one fetch entry is appended to the JSON list stored in the task's
      ``MOZ_FETCHES`` environment variable.

    Args:
        config: transform config; ``kind_dependencies_tasks`` and
            ``params["training_config"]["experiment"]["bicleaner"]["disable"]``
            are read from it.
        jobs: iterable of job definitions, yielded back unchanged.

    Yields:
        Each job from ``jobs``, unmodified.

    Raises:
        Exception: if no ``merge-corpus`` task is among the kind dependencies.
        KeyError: if the merge-corpus task has no ``MOZ_FETCHES`` env entry.
    """
    merge_corpus_task = None
    for name, task in config.kind_dependencies_tasks.items():
        if name.startswith("merge-corpus"):
            merge_corpus_task = task

    if not merge_corpus_task:
        raise Exception("Couldn't find merge corpus task!")

    # The `disable` parameter is declared as a string ("true"/"false"), so a
    # plain truthiness check would treat "false" as disabled too. Compare the
    # text explicitly; str() keeps a real boolean working as well.
    disable = config.params["training_config"]["experiment"]["bicleaner"]["disable"]
    bicleaner_disabled = str(disable).strip().lower() == "true"

    # If bicleaner is disabled, we pull upstream artifacts from `clean-corpus`
    pull_from = "clean-corpus" if bicleaner_disabled else "bicleaner"

    new_deps = {
        task.label: task.label
        for name, task in config.kind_dependencies_tasks.items()
        if name.startswith(pull_from)
    }
    merge_corpus_task.dependencies.update(new_deps)

    # Also need to set up fetches, which involves setting the `MOZ_FETCHES` env var
    # like the `run` transform does: https://github.com/taskcluster/taskgraph/blob/1a7dba0db709c84940fcf85a421c1dfc931f9747/src/taskgraph/transforms/run/__init__.py#L207
    # The new entry is merged into the existing `MOZ_FETCHES` list already
    # present in merge_corpus_task.task["payload"]["env"].
    env = merge_corpus_task.task["payload"]["env"]
    fetches = json.loads(env["MOZ_FETCHES"]["task-reference"])
    # MOZ_FETCHES is a flat list of fetch dicts, so append the dict itself
    # rather than a list wrapping it.
    # NOTE(review): the artifact name is still a placeholder, and the
    # `<{pull_from}>` reference presumably has to match a key in the task's
    # dependencies (added above under each task's label) -- confirm before
    # relying on this fetch.
    fetches.append(
        {"artifact": "something", "extract": True, "task": f"<{pull_from}>"}
    )
    env["MOZ_FETCHES"]["task-reference"] = json.dumps(fetches)

    # We don't actually make any adjustments to the new jobs; this transform only exists
    # to cause the side effects done above.
    yield from jobs

0 comments on commit 04ee6de

Please sign in to comment.