forked from Helsinki-NLP/Tatoeba-Challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
opus-2021-02-18.yml
175 lines (175 loc) · 5.66 KB
/
opus-2021-02-18.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Release metadata: archive path, release date, dataset name, model type and
# the pre-processing pipeline applied to the data.
release: sit-sit/opus-2021-02-18.zip
# NOTE(review): an unquoted ISO date is loaded as a date/timestamp object by
# YAML 1.1 loaders, not as a string — confirm that is what consumers expect.
release-date: 2021-02-18
dataset-name: opus
modeltype: transformer
pre-processing: normalization + SentencePiece (spm32k,spm32k)
# Subword segmentation scheme per side; both source and target use spm32k.
# Written as a sequence of single-key mappings, not as one mapping.
subwords:
- source: spm32k
- target: spm32k
# File names of the subword models for the source and target side
# (presumably shipped inside the release archive — TODO confirm).
subword-models:
- source: source.spm
- target: target.spm
# Target-language label tokens known to the model (presumably prefixed to the
# input to select the output language — confirm against the model docs).
# The labels must be quoted: a scalar starting with '>' is read as a folded
# block-scalar header, so an unquoted `>>cmn<<` makes the file unparseable.
# Single quotes keep the text literal (no escape processing).
use-target-labels:
- '>>cmn<<'
- '>>cmn_Hans<<'
- '>>cmn_Hant<<'
- '>>mya<<'
- '>>nan<<'
- '>>yue_Hans<<'
- '>>yue_Hant<<'
# Language codes accepted on the source side
# (three-letter codes, presumably ISO 639-3 — TODO confirm).
source-languages:
- cmn
- mya
- nan
- yue
# Language codes available on the target side; same four codes as
# source-languages.
target-languages:
- cmn
- mya
- nan
- yue
# Training corpora, one single-key mapping per language pair:
#   <src>-<trg>: <corpus name> (<count>)
# The count in parentheses is presumably the number of training examples for
# that pair — TODO confirm the unit. Each value is a plain string; the count
# is not parsed as a number.
# The list also contains pairs with codes that are not in source-languages /
# target-languages (eng, bod, lzh, wuu).
training-data:
- bod-eng: Tatoeba-train (20131)
- cmn-eng: Tatoeba-train (405192)
- cmn-mya: Tatoeba-train (58)
- cmn_Hans-eng: Tatoeba-train (1000000)
- cmn_Hans-mya: Tatoeba-train (1032)
- cmn_Hant-eng: Tatoeba-train (1000000)
- cmn_Hant-mya: Tatoeba-train (27507)
- eng-bod: Tatoeba-train (20131)
- eng-cmn: Tatoeba-train (405192)
- eng-cmn_Hans: Tatoeba-train (1000000)
- eng-cmn_Hant: Tatoeba-train (1000000)
- eng-lzh: Tatoeba-train (40)
- eng-lzh_Hans: Tatoeba-train (11)
- eng-mya: Tatoeba-train (677056)
- eng-nan: Tatoeba-train (374)
- eng-wuu: Tatoeba-train (174)
- eng-yue_Hans: Tatoeba-train (450)
- eng-yue_Hant: Tatoeba-train (23249)
- lzh-eng: Tatoeba-train (40)
- lzh_Hans-eng: Tatoeba-train (11)
- mya-cmn: Tatoeba-train (58)
- mya-cmn_Hans: Tatoeba-train (1032)
- mya-cmn_Hant: Tatoeba-train (27507)
- mya-eng: Tatoeba-train (677056)
- mya-nan: Tatoeba-train (52)
- mya-yue_Hans: Tatoeba-train (1)
- mya-yue_Hant: Tatoeba-train (601)
- nan-eng: Tatoeba-train (374)
- nan-mya: Tatoeba-train (52)
- wuu-eng: Tatoeba-train (174)
- yue_Hans-eng: Tatoeba-train (450)
- yue_Hans-mya: Tatoeba-train (1)
- yue_Hant-eng: Tatoeba-train (23249)
- yue_Hant-mya: Tatoeba-train (601)
# Validation (dev) data, one single-key mapping per language pair:
#   <src>-<trg>: <corpus name>, <count>
# The last two items are summary notes: the total size of the shuffled dev
# data, and a statement that the dev set is the top 5000 lines of the
# shuffled file.
# NOTE(review): every language pair below is listed twice with an identical
# count (e.g. bod-eng, eng-mya, mya-nan). These are separate sequence items,
# so this is not a duplicate-key error, but it looks like a generation
# artefact — confirm whether the repetition is intentional before relying on
# the per-pair counts.
validation-data:
- bod-eng: Tatoeba-dev, 999
- brx-eng: Tatoeba-dev, 135
- brx_Latn-eng: Tatoeba-dev, 6
- cjy_Hans-eng: Tatoeba-dev, 4
- cjy_Hant-eng: Tatoeba-dev, 3
- cmn-eng: Tatoeba-dev, 52
- cmn-mya: Tatoeba-dev, 1
- cmn_Hans-eng: Tatoeba-dev, 17917
- cmn_Hans-mya: Tatoeba-dev, 37
- cmn_Hant-eng: Tatoeba-dev, 19430
- cmn_Hant-mya: Tatoeba-dev, 934
- bod-eng: Tatoeba-dev, 999
- brx-eng: Tatoeba-dev, 135
- brx_Latn-eng: Tatoeba-dev, 6
- cjy_Hans-eng: Tatoeba-dev, 4
- cjy_Hant-eng: Tatoeba-dev, 3
- cmn-eng: Tatoeba-dev, 52
- cmn_Hans-eng: Tatoeba-dev, 17917
- cmn_Hant-eng: Tatoeba-dev, 19430
- eng-gan: Tatoeba-dev, 3
- eng-hak: Tatoeba-dev, 3
- eng-hak_Hani: Tatoeba-dev, 1
- eng-hsn_Hani: Tatoeba-dev, 3
- eng-lzh: Tatoeba-dev, 419
- eng-lzh_Hans: Tatoeba-dev, 20
- eng-mya: Tatoeba-dev, 995
- eng-nan: Tatoeba-dev, 13
- eng-wuu: Tatoeba-dev, 877
- eng-yue_Hans: Tatoeba-dev, 2717
- eng-yue_Hant: Tatoeba-dev, 1512
- eng-gan: Tatoeba-dev, 3
- eng-hak: Tatoeba-dev, 3
- eng-hak_Hani: Tatoeba-dev, 1
- eng-hsn_Hani: Tatoeba-dev, 3
- eng-lzh: Tatoeba-dev, 419
- eng-lzh_Hans: Tatoeba-dev, 20
- cmn-mya: Tatoeba-dev, 1
- cmn_Hans-mya: Tatoeba-dev, 37
- cmn_Hant-mya: Tatoeba-dev, 934
- eng-mya: Tatoeba-dev, 995
- mya-nan: Tatoeba-dev, 3
- mya-yue_Hant: Tatoeba-dev, 22
- eng-nan: Tatoeba-dev, 13
- mya-nan: Tatoeba-dev, 3
- eng-wuu: Tatoeba-dev, 877
- eng-yue_Hans: Tatoeba-dev, 2717
- eng-yue_Hant: Tatoeba-dev, 1512
- mya-yue_Hant: Tatoeba-dev, 22
- total size of shuffled dev data: 91448
- devset = top 5000 lines of Tatoeba-dev.src.shuffled!
# Test sets, one single-key mapping per set:
#   <testset>.<src>-<trg>: <a>/<b>
# The two slash-separated numbers are presumably sizes of the test set in two
# units (e.g. sentences/words) — TODO confirm. The value contains '/', so it
# loads as a string, not a number.
test-data:
- Tatoeba-test.cmn_Hans-mya: 10/40
- Tatoeba-test.cmn_Hans-yue_Hans: 59/678
- Tatoeba-test.cmn_Hans-yue_Hant: 26/234
- Tatoeba-test.cmn_Hant-mya: 7/30
- Tatoeba-test.cmn_Hant-yue_Hans: 50/546
- Tatoeba-test.cmn_Hant-yue_Hant: 31/245
- Tatoeba-test.mya-cmn_Hans: 10/150
- Tatoeba-test.mya-cmn_Hant: 7/90
- Tatoeba-test.mya-zho: 17/240
- Tatoeba-test.nan-cmn_Hans: 1/4
- Tatoeba-test.nan-cmn_Hant: 1/10
- Tatoeba-test.multi-multi: 10000/97851
- Tatoeba-test.yue_Hans-cmn_Hans: 56/602
- Tatoeba-test.yue_Hans-cmn_Hant: 66/793
- Tatoeba-test.yue_Hant-cmn_Hans: 35/320
- Tatoeba-test.yue_Hant-cmn_Hant: 37/326
- Tatoeba-test.zho-mya: 17/70
- Tatoeba-test.zho-zho: 2500/29103
- tico19-test.eng-mya: 2100/32295
# BLEU score per test set; one entry for each test set listed under
# test-data, in the same order. Values load as floats.
BLEU-scores:
- Tatoeba-test.cmn_Hans-mya: 0.6
- Tatoeba-test.cmn_Hans-yue_Hans: 7.0
- Tatoeba-test.cmn_Hans-yue_Hant: 7.2
- Tatoeba-test.cmn_Hant-mya: 0.9
- Tatoeba-test.cmn_Hant-yue_Hans: 5.7
- Tatoeba-test.cmn_Hant-yue_Hant: 14.6
- Tatoeba-test.mya-cmn_Hans: 2.1
- Tatoeba-test.mya-cmn_Hant: 6.8
- Tatoeba-test.mya-zho: 3.6
- Tatoeba-test.nan-cmn_Hans: 0.8
- Tatoeba-test.nan-cmn_Hant: 1.9
- Tatoeba-test.multi-multi: 26.3
- Tatoeba-test.yue_Hans-cmn_Hans: 12.4
- Tatoeba-test.yue_Hans-cmn_Hant: 8.2
- Tatoeba-test.yue_Hant-cmn_Hans: 15.6
- Tatoeba-test.yue_Hant-cmn_Hant: 17.7
- Tatoeba-test.zho-mya: 0.4
- Tatoeba-test.zho-zho: 13.1
- tico19-test.eng-mya: 1.9
# chr-F score per test set; one entry for each test set listed under
# test-data, in the same order. All values here lie between 0 and 1.
# Values load as floats, so trailing zeros (e.g. 0.070) are not preserved
# after parsing.
chr-F-scores:
- Tatoeba-test.cmn_Hans-mya: 0.124
- Tatoeba-test.cmn_Hans-yue_Hans: 0.072
- Tatoeba-test.cmn_Hans-yue_Hant: 0.070
- Tatoeba-test.cmn_Hant-mya: 0.185
- Tatoeba-test.cmn_Hant-yue_Hans: 0.060
- Tatoeba-test.cmn_Hant-yue_Hant: 0.119
- Tatoeba-test.mya-cmn_Hans: 0.039
- Tatoeba-test.mya-cmn_Hant: 0.083
- Tatoeba-test.mya-zho: 0.056
- Tatoeba-test.nan-cmn_Hans: 0.000
- Tatoeba-test.nan-cmn_Hant: 0.000
- Tatoeba-test.multi-multi: 0.400
- Tatoeba-test.yue_Hans-cmn_Hans: 0.112
- Tatoeba-test.yue_Hans-cmn_Hant: 0.057
- Tatoeba-test.yue_Hant-cmn_Hans: 0.128
- Tatoeba-test.yue_Hant-cmn_Hant: 0.141
- Tatoeba-test.zho-mya: 0.146
- Tatoeba-test.zho-zho: 0.099
- tico19-test.eng-mya: 0.174