Expand out marian command arguments (#779)

mozilla · Aug 7, 2024 · 747821a · 747821a
1 parent 9a82925
commit 747821a
Show file tree

Hide file tree

Showing 6 changed files with 39 additions and 32 deletions.
diff --git a/pipeline/cefilter/score.sh b/pipeline/cefilter/score.sh
@@ -33,17 +33,17 @@ dir=$(dirname "${output}")
 mkdir -p "${dir}"
 
 "${MARIAN}/marian-scorer" \
-  -m "${model}" \
-  -v "${vocab}" "${vocab}" \
-  -t "${corpus_prefix}.${TRG}${ARTIFACT_EXT}" "${corpus_prefix}.${SRC}${ARTIFACT_EXT}" \
+  --model "${model}" \
+  --vocabs "${vocab}" "${vocab}" \
+  --train-sets "${corpus_prefix}.${TRG}${ARTIFACT_EXT}" "${corpus_prefix}.${SRC}${ARTIFACT_EXT}" \
   --mini-batch 32 \
   --mini-batch-words 1500 \
   --maxi-batch 1000 \
   --max-length 250 \
   --max-length-crop \
   --normalize \
-  -d ${GPUS} \
-  -w "${WORKSPACE}" \
+  --devices ${GPUS} \
+  --workspace "${WORKSPACE}" \
   --log "${dir}/scores.txt.log" \
   >"${output}"
 

diff --git a/pipeline/quantize/export.sh b/pipeline/quantize/export.sh
@@ -2,6 +2,9 @@
 ##
 # Export the quantized model to bergamot translator format.
 #
+# This script requires the browsermt fork of Marian for the int8shiftAlphaAll mode.
+# https://github.com/browsermt/marian-dev
+# https://github.com/browsermt/students/tree/master/train-student#5-8-bit-quantization
 
 set -x
 set -euo pipefail

diff --git a/pipeline/quantize/quantize.sh b/pipeline/quantize/quantize.sh
@@ -28,11 +28,11 @@ cp "${vocab}" "${output_dir}"
 echo "### Decoding a sample test set in order to get typical quantization values"
 test -s "${output_dir}/quantmults" ||
   "${BMT_MARIAN}"/marian-decoder \
-    -m "${model}" \
-    -v "${vocab}" "${vocab}" \
-    -c "decoder.yml" \
-    -i "${devtest_src}" \
-    -o "${output_dir}/output.${TRG}" \
+    --models "${model}" \
+    --vocabs "${vocab}" "${vocab}" \
+    --config "decoder.yml" \
+    --input "${devtest_src}" \
+    --output "${output_dir}/output.${TRG}" \
     --shortlist "${shortlist}" false \
     --quiet \
     --quiet-translation \
@@ -50,8 +50,8 @@ test -s "${output_dir}/model.alphas.npz" ||
 echo "### Converting"
 test -s "${res_model}" ||
   "${BMT_MARIAN}"/marian-conv \
-    -f "${output_dir}/model.alphas.npz" \
-    -t "${res_model}" \
+    --from "${output_dir}/model.alphas.npz" \
+    --to "${res_model}" \
     --gemm-type intgemm8
 
 echo "### The result models is saved to ${res_model}"

diff --git a/pipeline/train/train.sh b/pipeline/train/train.sh
@@ -135,10 +135,10 @@ opustrainer-train \
   --log-level ERROR \
   "${MARIAN}/marian" \
     --model "${model_dir}/model.npz" \
-    -c "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \
-    -T "${model_dir}/tmp" \
+    --config "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \
+    --tempdir "${model_dir}/tmp" \
     --vocabs "${vocab}" "${vocab}" \
-    -w "${WORKSPACE}" \
+    --workspace "${WORKSPACE}" \
     --devices ${GPUS} \
     --valid-metrics "${best_model_metric}" ${all_model_metrics[@]/$best_model_metric} \
     --valid-sets "${valid_tsv_dataset}" \

diff --git a/pipeline/translate/translate-nbest.sh b/pipeline/translate/translate-nbest.sh
@@ -14,17 +14,20 @@ input=$1
 vocab=$2
 models=( "${@:3}" )
 
+output="${input}.nbest"
+
 cd "$(dirname "${0}")"
 
 "${MARIAN}/marian-decoder" \
-  -c decoder.yml \
-  -m "${models[@]}" \
-  -v "${vocab}" "${vocab}" \
-  -i "${input}" \
-  -o "${input}.nbest" \
+  --config decoder.yml \
+  --models "${models[@]}" \
+  --vocabs "${vocab}" "${vocab}" \
+  --input "${input}" \
+  --output "${output}" \
   --log "${input}.log" \
   --n-best \
-  -d ${GPUS} \
-  -w "${WORKSPACE}"
+  --devices ${GPUS} \
+  --workspace "${WORKSPACE}"
 
-test "$(wc -l <"${input}.nbest")" -eq "$(( $(wc -l <"${input}") * 8 ))"
+# Test that the n-best output has exactly 8 lines for every input line.
+test "$(wc -l <"${output}")" -eq "$(( $(wc -l <"${input}") * 8 ))"
diff --git a/pipeline/translate/translate.sh b/pipeline/translate/translate.sh
@@ -13,18 +13,19 @@ test -v WORKSPACE
 input=$1
 vocab=$2
 models=( "${@:3}" )
-
+output="${input}.out"
 
 cd "$(dirname "${0}")"
 
 "${MARIAN}/marian-decoder" \
-  -c decoder.yml \
-  -m "${models[@]}" \
-  -v "${vocab}" "${vocab}" \
-  -i "${input}" \
-  -o "${input}.out" \
+  --config decoder.yml \
+  --models "${models[@]}" \
+  --vocabs "${vocab}" "${vocab}" \
+  --input "${input}" \
+  --output "${output}" \
   --log "${input}.log" \
-  -d ${GPUS} \
-  -w "${WORKSPACE}"
+  --devices ${GPUS} \
+  --workspace "${WORKSPACE}"
 
-test "$(wc -l <"${input}")" == "$(wc -l <"${input}.out")"
+# Test that the input and output have the same number of sentences.
+test "$(wc -l <"${input}")" == "$(wc -l <"${output}")"