Skip to content

Commit

Permalink
Expand out marian command arguments (#779)
Browse files Browse the repository at this point in the history
  • Loading branch information
gregtatum authored Aug 7, 2024
1 parent 9a82925 commit 747821a
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 32 deletions.
10 changes: 5 additions & 5 deletions pipeline/cefilter/score.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ dir=$(dirname "${output}")
mkdir -p "${dir}"

"${MARIAN}/marian-scorer" \
-m "${model}" \
-v "${vocab}" "${vocab}" \
-t "${corpus_prefix}.${TRG}${ARTIFACT_EXT}" "${corpus_prefix}.${SRC}${ARTIFACT_EXT}" \
--model "${model}" \
--vocabs "${vocab}" "${vocab}" \
--train-sets "${corpus_prefix}.${TRG}${ARTIFACT_EXT}" "${corpus_prefix}.${SRC}${ARTIFACT_EXT}" \
--mini-batch 32 \
--mini-batch-words 1500 \
--maxi-batch 1000 \
--max-length 250 \
--max-length-crop \
--normalize \
-d ${GPUS} \
-w "${WORKSPACE}" \
--devices ${GPUS} \
--workspace "${WORKSPACE}" \
--log "${dir}/scores.txt.log" \
>"${output}"

Expand Down
3 changes: 3 additions & 0 deletions pipeline/quantize/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
##
# Export the quantized model to bergamot translator format.
#
# This script requires the browsermt fork of Marian for the int8shiftAlphaAll mode.
# https://github.com/browsermt/marian-dev
# https://github.com/browsermt/students/tree/master/train-student#5-8-bit-quantization

set -x
set -euo pipefail
Expand Down
14 changes: 7 additions & 7 deletions pipeline/quantize/quantize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ cp "${vocab}" "${output_dir}"
echo "### Decoding a sample test set in order to get typical quantization values"
test -s "${output_dir}/quantmults" ||
"${BMT_MARIAN}"/marian-decoder \
-m "${model}" \
-v "${vocab}" "${vocab}" \
-c "decoder.yml" \
-i "${devtest_src}" \
-o "${output_dir}/output.${TRG}" \
--models "${model}" \
--vocabs "${vocab}" "${vocab}" \
--config "decoder.yml" \
--input "${devtest_src}" \
--output "${output_dir}/output.${TRG}" \
--shortlist "${shortlist}" false \
--quiet \
--quiet-translation \
Expand All @@ -50,8 +50,8 @@ test -s "${output_dir}/model.alphas.npz" ||
echo "### Converting"
test -s "${res_model}" ||
"${BMT_MARIAN}"/marian-conv \
-f "${output_dir}/model.alphas.npz" \
-t "${res_model}" \
--from "${output_dir}/model.alphas.npz" \
--to "${res_model}" \
--gemm-type intgemm8

echo "### The result models is saved to ${res_model}"
Expand Down
6 changes: 3 additions & 3 deletions pipeline/train/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ opustrainer-train \
--log-level ERROR \
"${MARIAN}/marian" \
--model "${model_dir}/model.npz" \
-c "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \
-T "${model_dir}/tmp" \
--config "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \
--tempdir "${model_dir}/tmp" \
--vocabs "${vocab}" "${vocab}" \
-w "${WORKSPACE}" \
--workspace "${WORKSPACE}" \
--devices ${GPUS} \
--valid-metrics "${best_model_metric}" ${all_model_metrics[@]/$best_model_metric} \
--valid-sets "${valid_tsv_dataset}" \
Expand Down
19 changes: 11 additions & 8 deletions pipeline/translate/translate-nbest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,20 @@ input=$1
vocab=$2
models=( "${@:3}" )

output="${input}.nbest"

cd "$(dirname "${0}")"

"${MARIAN}/marian-decoder" \
-c decoder.yml \
-m "${models[@]}" \
-v "${vocab}" "${vocab}" \
-i "${input}" \
-o "${input}.nbest" \
--config decoder.yml \
--models "${models[@]}" \
--vocabs "${vocab}" "${vocab}" \
--input "${input}" \
--output "${output}" \
--log "${input}.log" \
--n-best \
-d ${GPUS} \
-w "${WORKSPACE}"
--devices ${GPUS} \
--workspace "${WORKSPACE}"

test "$(wc -l <"${input}.nbest")" -eq "$(( $(wc -l <"${input}") * 8 ))"
# Test that the n-best output has exactly 8 lines for every line of input.
test "$(wc -l <"${output}")" -eq "$(( $(wc -l <"${input}") * 8 ))"
19 changes: 10 additions & 9 deletions pipeline/translate/translate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@ test -v WORKSPACE
input=$1
vocab=$2
models=( "${@:3}" )

output="${input}.out"

cd "$(dirname "${0}")"

"${MARIAN}/marian-decoder" \
-c decoder.yml \
-m "${models[@]}" \
-v "${vocab}" "${vocab}" \
-i "${input}" \
-o "${input}.out" \
--config decoder.yml \
--models "${models[@]}" \
--vocabs "${vocab}" "${vocab}" \
--input "${input}" \
--output "${output}" \
--log "${input}.log" \
-d ${GPUS} \
-w "${WORKSPACE}"
--devices ${GPUS} \
--workspace "${WORKSPACE}"

test "$(wc -l <"${input}")" == "$(wc -l <"${input}.out")"
# Test that the input and output have the same number of sentences.
test "$(wc -l <"${input}")" == "$(wc -l <"${output}")"

0 comments on commit 747821a

Please sign in to comment.