Fix unstable training (#352)
* Adjust opus trainer settings

* Fix optimizer delay

* Use default learning rate

* Enable back translations

* Report learning rate for teacher

* Remove old link

* Match validation and save frequency

* Roll back learning rate

* Disable snakemake dry run

* Add a note about optimizer delay

* Add a link to opus trainer paper
eu9ene authored Jan 17, 2024
1 parent 3458855 commit 61f5ec2
Showing 7 changed files with 53 additions and 60 deletions.
2 changes: 2 additions & 0 deletions docs/opus-trainer.md
@@ -11,6 +11,8 @@ nav_order: 7
It feeds training data to Marian and provides the ability to do useful manipulations with the data,
such as shuffling, mixing multiple datasets in the specified proportion, splitting training into multiple stages and augmentation.

See [this paper](https://arxiv.org/pdf/2311.14838.pdf) for more details and recommendations on how to set augmentation values.

## Data augmentation

Data augmentation helps make translation models more robust, which is especially useful when translating noisy web pages.
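
To make the moving parts concrete, here is a minimal OpusTrainer config sketch combining dataset mixing, a training stage, and augmentation. The dataset names, paths, and mixing weights are illustrative, not this repository's; only the modifier rates match the values this commit settles on:

```yaml
# Minimal OpusTrainer config (sketch; paths and weights are illustrative)
datasets:
  clean: data/clean.tsv   # hypothetical high-quality corpus
  noisy: data/noisy.tsv   # hypothetical noisier corpus

stages:
  - train

train:
  - clean 0.8             # 80% of each batch from the clean set
  - noisy 0.2             # 20% from the noisy set
  - until clean 5         # stop after 5 epochs of the clean set

modifiers:
  - UpperCase: 0.07       # uppercase 7% of sentences at random
  - TitleCase: 0.05       # title-case 5% of sentences at random

seed: 1111
```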
3 changes: 3 additions & 0 deletions docs/training-guide.md
@@ -157,6 +157,9 @@ marian-args:
early-stopping: 20
```

Make sure to set `optimizer-delay` so that GPU devices * `optimizer-delay` = 8.
This makes training more stable.
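
For instance, on a host with 4 GPUs you would set a delay of 2. A sketch (the `marian-args`/`training-teacher` nesting is assumed from this pipeline's config layout):

```yaml
# Sketch: 4 GPUs * optimizer-delay 2 = 8 accumulated batches per update
marian-args:
  training-teacher:
    optimizer-delay: 2
```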

### Decoding (translation)

`mini-batch-words` can be set depending on available GPU memory and the number of teachers.
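
A hedged example of what that might look like (the value is illustrative, and the `decoding-teacher` subsection is an assumption about this pipeline's config layout):

```yaml
# Sketch: lower mini-batch-words if decoding runs out of GPU memory,
# especially when an ensemble of several teachers is loaded at once
marian-args:
  decoding-teacher:
    mini-batch-words: 2000
```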
4 changes: 2 additions & 2 deletions pipeline/train/configs/opustrainer/backward.yml
@@ -9,8 +9,8 @@ train:
- until original 10 # Train for 10 epochs. Only OpusTrainer can control epochs; to Marian it's all one big epoch

modifiers:
- UpperCase: 0.1 # Apply randomly to 5% of sentences
- TitleCase: 0.1
- UpperCase: 0.07 # Apply randomly to 7% of sentences
- TitleCase: 0.05
#- Typos: 0.05

seed: 1111
4 changes: 2 additions & 2 deletions pipeline/train/configs/opustrainer/student.yml
@@ -11,8 +11,8 @@ train:
# TODO: augment corpus before decoding or reduce augmentation rate
# TODO: https://github.com/mozilla/firefox-translations-training/issues/272
#modifiers:
#- UpperCase: 0.1 # Apply randomly to 5% of sentences
#- TitleCase: 0.1
#- UpperCase: 0.07 # Apply randomly to 7% of sentences
#- TitleCase: 0.05
# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262
#- Typos: 0.05
# TODO: enable tags, currently doesn't work because of the issue with tokenization
26 changes: 6 additions & 20 deletions pipeline/train/configs/opustrainer/teacher.yml
@@ -3,28 +3,14 @@ datasets:
backtranslated: <dataset1> # Back-translated data

stages:
- start
- mid
- end
- pretrain
- finetune

# One epoch of only original high-quality data to warm up the model
start:
- original 1.0
- until original 1

# Gradually add back-translations to the mix
# Back-translated corpus can vary a lot in size, so we can try using original to count epochs
mid:
- original 0.7
- backtranslated 0.3
- until original 1

# Expand back-translations
end:
# Back-translated corpus can vary a lot in size, so we can try using the original one to count epochs
pretrain:
- original 0.6
- backtranslated 0.4
- until original 1
- until original 2

# Fine-tuning only on the original clean corpus until early stopping
finetune:
@@ -33,8 +19,8 @@


modifiers:
- UpperCase: 0.1 # Apply randomly to 10% of sentences
- TitleCase: 0.1
- UpperCase: 0.07 # Apply randomly to 7% of sentences
- TitleCase: 0.05
# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262
#- Typos: 0.05
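
Assembled from the hunks above, the new two-stage teacher schedule reads roughly as follows (a reconstruction for readability; the collapsed `finetune` body is assumed from its comment and may differ from the actual file):

```yaml
stages:
  - pretrain
  - finetune

# Back-translated corpus can vary a lot in size, so epochs are counted on the original one
pretrain:
  - original 0.6
  - backtranslated 0.4
  - until original 2

# Fine-tuning only on the original clean corpus until early stopping
finetune:
  - original 1.0
  - until original inf   # assumed: run until Marian's early stopping ends training

modifiers:
  - UpperCase: 0.07      # apply randomly to 7% of sentences
  - TitleCase: 0.05
```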

9 changes: 5 additions & 4 deletions pipeline/train/configs/training/teacher.train.yml
@@ -1,9 +1,10 @@
# https://discourse.translatelocally.com/t/marian-configuration-to-use/24
disp-freq: 1000
# default learning rate for transformer-big is 0.0002 https://github.com/marian-nmt/marian-dev/blob/master/src/common/aliases.cpp
learn-rate: 0.0003 # Turn this down, e.g. to 0.0001, if the model diverges
optimizer-delay: 1 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer
optimizer-delay: 2 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer
lr-report: True
save-freq: 5000
valid-freq: 3000
valid-freq: 5000
valid-max-length: 300
valid-mini-batch: 8
early-stopping: 20
65 changes: 33 additions & 32 deletions taskcluster/kinds/tests/kind.yml
@@ -22,38 +22,39 @@ task-defaults:
cwd: '{checkout}'

tasks:
snakemake-dry-run:
# Ensure that the snakemake workflow is still executing correctly, even though
# taskcluster is the preferred execution environment.
worker-type: b-cpu
worker:
max-run-time: 3600
docker-image: {in-tree: test}
run-on-tasks-for: ["github-push", "github-pull-request"]
optimization:
skip-unless-changed:
- pipeline/**
- envs/**
- configs/**
run:
command:
- bash
- -c
- >-
echo "Setting environment variables" &&
export CONDA_PATH=/builds/worker/artifacts/mambaforge &&
export SNAKEMAKE_OUTPUT_CACHE=/builds/worker/artifacts/mambaforge &&
export REPORTS=/builds/worker/artifacts/reports &&
export MODELS=/builds/worker/artifacts/models &&
echo "Install necessary dependencies" &&
make conda &&
make snakemake &&
make git-modules &&
echo "Start the dry run" &&
make dry-run &&
make test-dry-run
# See issue: https://github.com/mozilla/firefox-translations-training/issues/363
# snakemake-dry-run:
# # Ensure that the snakemake workflow is still executing correctly, even though
# # taskcluster is the preferred execution environment.
# worker-type: b-cpu
# worker:
# max-run-time: 3600
# docker-image: {in-tree: test}
# run-on-tasks-for: ["github-push", "github-pull-request"]
# optimization:
# skip-unless-changed:
# - pipeline/**
# - envs/**
# - configs/**
# run:
# command:
# - bash
# - -c
# - >-
# echo "Setting environment variables" &&
# export CONDA_PATH=/builds/worker/artifacts/mambaforge &&
# export SNAKEMAKE_OUTPUT_CACHE=/builds/worker/artifacts/mambaforge &&
# export REPORTS=/builds/worker/artifacts/reports &&
# export MODELS=/builds/worker/artifacts/models &&
#
# echo "Install necessary dependencies" &&
# make conda &&
# make snakemake &&
# make git-modules &&
#
# echo "Start the dry run" &&
# make dry-run &&
# make test-dry-run

black:
# Run python's black formatter, which formats python files.
