Skip to content

Commit

Permalink
Update Python packages for CJK support (#990)
Browse files Browse the repository at this point in the history
* Update OpusCleaner

* Update OpusTrainer

* Update OpusCleaner

* Add Japanese and Korean to sacrebleu

* Add opusfilter package
  • Loading branch information
eu9ene authored Jan 15, 2025
1 parent 17a3f41 commit 99f3e96
Show file tree
Hide file tree
Showing 15 changed files with 3,354 additions and 457 deletions.
7 changes: 2 additions & 5 deletions pipeline/clean/requirements/clean.in
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
-# Add support for bs, id, sr, tr, vi.
-# TODO - Remove this once upstream supports the languages.
-# See: https://github.com/mozilla/translations/issues/694
-# See: https://github.com/hplt-project/OpusCleaner/pull/157
-git+https://github.com/mozilla/OpusCleaner.git@spring2024
+opuscleaner==0.4.3
 fasttext==0.9.2
 sacremoses==0.0.53
 more_itertools==10.1.0
 requests==2.31.0
+opusfilter==3.2.0
1,282 changes: 1,241 additions & 41 deletions pipeline/clean/requirements/clean.txt

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions pipeline/data/requirements/data.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
-# TODO: ICU tokenizer commit, replace to PyPi version when released. Issue: https://github.com/mozilla/translations/issues/967
-git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670
+opustrainer==0.3
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0
Expand Down
1,096 changes: 1,042 additions & 54 deletions pipeline/data/requirements/data.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pipeline/eval/requirements/eval.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
-sacrebleu==2.4.2
+sacrebleu[ja,ko]==2.4.2
 unbabel-comet==2.2.2
60 changes: 59 additions & 1 deletion pipeline/eval/requirements/eval.txt
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,9 @@ idna==3.7 \
# via
# requests
# yarl
+ipadic==1.0.0 \
+--hash=sha256:f5923d31eca6131acaaf18ed28d8998665b1347b640d3a6476f64650e9a71c07
+# via sacrebleu
jinja2==3.1.4 \
--hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
--hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
Expand Down Expand Up @@ -459,6 +462,61 @@ markupsafe==2.1.5 \
--hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \
--hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68
# via jinja2
mecab-ko==1.0.1 \
--hash=sha256:016ffa232cd40c82bebeeaaae74050f7901f586e52b44c056ca2928168022f04 \
--hash=sha256:07fb4b1a95135f2d12156719af712f98405976862f5b98c164a368f512b395dc \
--hash=sha256:0ef9588fcd7ad9c28bffc4520194718bf193a5e7d017cc0ebb75332c0eebf7db \
--hash=sha256:16a762171d41afa0d937fad13428b07d8d6f677ea52c185d0a73f02e6dba5988 \
--hash=sha256:1bfa1394b65c19a0a8a61655e806a73b6421aba500e9b57cc2cc7bd2dccc97aa \
--hash=sha256:1f4172ed0cee3615ef6f93312e344e39e4424622191253bc9483b7cc7dff98e9 \
--hash=sha256:39363e184bb76806a9781b25a233846e4ba7e877cba0f3b4aa2f15a6057564e5 \
--hash=sha256:453aa072bb42e982391fa8c7b07d262346d09aa3552dc2d1c50986cdd341a12a \
--hash=sha256:4cdcf1568f29814d1c68af7f38f9d6bb566ea65497136ec0e207eb063d46cc67 \
--hash=sha256:4d0675359d82a26e381441376a1400e87144f1ca1180046df91a48261077e248 \
--hash=sha256:91e037e2be3b4d61496d0546f68fa7eef745bc7ca645c41d0a4c0bb88d6ac836 \
--hash=sha256:a6c55cf3079e0578aec0e29a72aa1d21125a119dbf7cd57059b01b2523fe26cf \
--hash=sha256:ab92f6a4601d76500d3aff34bfd19f181dcab493ae481eaca42265e9aa5b74ad \
--hash=sha256:b380495c4f43306b9f6fa8e030d2d86286ce75c364c84393e7d5043f7093a5b2 \
--hash=sha256:bbc6f0fa30dcfded65e65c6295a6cbd1a6a7f2ba33f710ac3e4921361c3623f7 \
--hash=sha256:c345ecf7005bc7bbd1f57cc17cb7a047e3d1e59269625d4c6fc60120000233da \
--hash=sha256:cdc9d64e92bfb415f8c1e695a6dd1287b4c4a9e8a455b0d15dadffd33ce7d115 \
--hash=sha256:dd19106d5f9ea1b5ee847ed56d0bddd261685e10b15db852848bfd233c9ae579 \
--hash=sha256:e19367e157c9cd912cbf9bbbb93fddb07f38a636d1c1364eda878ebe7f52d94d \
--hash=sha256:eb10a7e1da1e66a16798d3d96d7c4b52c1290e57c8d38922b0f6f72ddc3915bc \
--hash=sha256:ec5029cafeaf684c8d16a37038eadc1fdbfa8daf96c49583801ec7b9859ed27d \
--hash=sha256:ee71ad251c57526af54cf005b2803e1df3e0ffe8b350464d8f75f936e2470087 \
--hash=sha256:f03885c4c306f21e840627fbfe0bbab7b3d5607ff9243daa8cd725ee8320615f
# via sacrebleu
mecab-ko-dic==1.0.0 \
--hash=sha256:3ba22858736e02e8a0e92f2a7f099528c733ae47701b29d12c75e982a85d1f11
# via
# mecab-ko
# sacrebleu
mecab-python3==1.0.6 \
--hash=sha256:023cfd4efa26fc61563ee185e8ac3d5fcdffcaba01931bbae34aa0ec06814875 \
--hash=sha256:0fb9f5a06f9cd9c5724608956128b91f5330248a78e9de322a0abd5beaba24a3 \
--hash=sha256:16f38ace484020bd3446a10054829678c1e7b97f1740b58b00a31d595cff025e \
--hash=sha256:2958a94ee7ed521e95c333a7b66adcead9973db3d4f4873789c425d41726e52c \
--hash=sha256:30177272efa6ba26909b540e3816659c9f93ee49a5cc2b8ea47c17eff05c686b \
--hash=sha256:3eb8655d4eb7e17dbe7f409b8dda327539de9ee8cf06ebe338aa595bb597b53d \
--hash=sha256:68334cbc6e0442ccd57770137ece585de9a512f01405bd2ad074fd28578180f8 \
--hash=sha256:6c99ec5f5581e4b56c8b68b4eeac4c6b59c6de12e66a2cfe4b1c6f8e87d38bd7 \
--hash=sha256:849aa7cfb828f0fb65340666fb2ae281855f5319699291121e116f43f478401c \
--hash=sha256:8f0c5fe224c28ebd6fad2199be129755246091310de71eb9a43d88473d5b8950 \
--hash=sha256:9c3eeb4adf864a74ced9a1c5443b174f1e851323143ca6f24391fb6db8d44110 \
--hash=sha256:a455f0f1830392548366aae56819a22fa0f5786c39c4cf6cf84fe438a5fc7cbc \
--hash=sha256:b2a3b6243d709ccf6672d54c9c17cb6281a17737626caaafb0219944835d6b9c \
--hash=sha256:b56692cef89cd8ad0179bd2f989e4b3209686eb59124e0191167d9a3e64cd1a0 \
--hash=sha256:b8c995629aeb6bdddf37eee1c4049549ff1993e3888e28eb6cc59c8dce79de41 \
--hash=sha256:c80e58dcc90da095b64799950046dc4b9db3fb6bf30a8f817d1639cdaf601ba1 \
--hash=sha256:d1061bcd3d5567d6522e8794773689e6f9dedadcd3d3b1652893142f6334ec95 \
--hash=sha256:e00f0c329c7708b6cf5e5b2b1772535de3d6d69f99e6105a1acf40b94c62afe5 \
--hash=sha256:e049805f8342020b559357f4ea2b69b7d4faf8681c979ffd868d4d25d3a89021 \
--hash=sha256:e28997040b5ea9df3cf8969c5d28e7ba730ef4f924badac0548cd14383e56c66 \
--hash=sha256:ef43e4ab82396d1a59f4d8e2237c4136e09391e8e8ba623af4b52e6b52377fdd \
--hash=sha256:fd10ba05f94dea7fe5e281336b7a4d07f79062c25139077d935833d0db303450 \
--hash=sha256:fde76ebc2fcf4f375b520a706af316f88be304302f5e5d36105992a1ecd3ea1b
# via sacrebleu
mpmath==1.3.0 \
--hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
--hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
Expand Down Expand Up @@ -876,7 +934,7 @@ requests==2.31.0 \
# via
# huggingface-hub
# transformers
-sacrebleu==2.4.2 \
+sacrebleu[ja,ko]==2.4.2 \
--hash=sha256:611a581d205828912f0b05f806b110180087184d3be2dc650fda7a729d6ecb89
# via
# -r pipeline/eval/requirements/eval.in
Expand Down
3 changes: 1 addition & 2 deletions pipeline/train/requirements/train.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
-# TODO: ICU tokenizer commit, replace to PyPi version when released. Issue: https://github.com/mozilla/translations/issues/967
-git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670
+opustrainer==0.3
 gpustat==1.1.1
Loading

0 comments on commit 99f3e96

Please sign in to comment.