forked from Helsinki-NLP/Tatoeba-Challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
opus-2021-02-18.yml
44 lines (44 loc) · 1.16 KB
/
opus-2021-02-18.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
---
# Metadata for the release archive named in `release`.
release: msa-ita/opus-2021-02-18.zip
release-date: 2021-02-18
dataset-name: opus
modeltype: transformer
pre-processing: normalization + SentencePiece (spm32k,spm32k)

# Subword setup: one single-key entry each for the source and target side.
subwords:
  - source: spm32k
  - target: spm32k
subword-models:
  - source: source.spm
  - target: target.spm

# Language codes listed for each side of the model.
source-languages:
  - ind
  - jak
  - msa
  - zlm
  - zsm
target-languages:
  - ita

# One entry per language pair: corpus name followed by a number in
# parentheses (presumably the training-set size; confirm the unit).
training-data:
  - ind-ita: Tatoeba-train (6076279)
  - jak_Latn-ita: Tatoeba-train (6536)
  - msa_Latn-ita: Tatoeba-train (1414818)
  - zlm-ita: Tatoeba-train (171)
  - zlm_Latn-ita: Tatoeba-train (73963)

# Per-pair entries are "corpus, count". The last two items describe the
# combined dev set; the final one is a plain string, not a key/value pair.
# NOTE(review): the four per-pair counts sum to 1000 while the recorded
# total is 994; confirm whether the difference is expected.
validation-data:
  - ind-ita: Tatoeba-dev, 817
  - ita-jak_Latn: Tatoeba-dev, 2
  - ita-msa_Latn: Tatoeba-dev, 172
  - ita-zlm_Latn: Tatoeba-dev, 9
  - total size of shuffled dev data: 994
  - devset = top 994 lines of Tatoeba-dev.src.shuffled!

# Each value is two numbers separated by "/" (presumably sentence and
# word counts of the test set; confirm against the producing tool).
test-data:
  - Tatoeba-test.ind-ita: 368/2334
  - Tatoeba-test.msa-ita: 426/2758
  - Tatoeba-test.zsm_Latn-ita: 58/424

# Scores keyed by the same test-set names used under `test-data`.
BLEU-scores:
  - Tatoeba-test.ind-ita: 39.6
  - Tatoeba-test.msa-ita: 39.7
  - Tatoeba-test.zsm_Latn-ita: 36.0
chr-F-scores:
  - Tatoeba-test.ind-ita: 0.629
  - Tatoeba-test.msa-ita: 0.628
  - Tatoeba-test.zsm_Latn-ita: 0.603