-
Notifications
You must be signed in to change notification settings - Fork 0
/
dvc.lock
79 lines (79 loc) · 2.79 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
schema: '2.0'
stages:
debug:
cmd: echo "Masked Language Modeling pipeline is working"
train_tokenizers_ven@0:
cmd: /bin/bash scripts/train_sp_tokenizer.sh --input-texts-path data/interim/train/bantu.ven.txt --sampled-texts-path
data/temp/tshivenda-xlmr-30k/0/ --seed 47 --alpha 0.5 --tokenizer-model-type
unigram --vocab-size 30000 --tokenizer-output-dir data/tokenizers/tshivenda-xlmr-30k
deps:
- path: data/interim/train/bantu.ven.txt
hash: md5
md5: 464acc68763232527d9fc739eab1bae3
size: 11400638
- path: scripts/train_sp_tokenizer.sh
hash: md5
md5: 6fcd0030bb4e94e0530887499f973359
size: 4340
params:
params.yaml:
global.seed: 47
paths.tokenizer_root: data/tokenizers
paths.train_data: data/interim/train
tokenization.alpha: 0.5
outs:
- path: data/tokenizers/tshivenda-xlmr-30k
hash: md5
md5: 925bb1963c2d0417ba4d3428e102cf6e.dir
size: 1281740
nfiles: 2
train_tokenizers_ven@1:
cmd: /bin/bash scripts/train_sp_tokenizer.sh --input-texts-path data/interim/train/bantu.ven.txt --sampled-texts-path
data/temp/tshivenda-xlmr-bpe-30k/0/ --seed 47 --alpha 0.5 --tokenizer-model-type
bpe --vocab-size 30000 --tokenizer-output-dir data/tokenizers/tshivenda-xlmr-bpe-30k
deps:
- path: data/interim/train/bantu.ven.txt
hash: md5
md5: 464acc68763232527d9fc739eab1bae3
size: 11400638
- path: scripts/train_sp_tokenizer.sh
hash: md5
md5: 6fcd0030bb4e94e0530887499f973359
size: 4340
params:
params.yaml:
global.seed: 47
paths.tokenizer_root: data/tokenizers
paths.train_data: data/interim/train
tokenization.alpha: 0.5
outs:
- path: data/tokenizers/tshivenda-xlmr-bpe-30k
hash: md5
md5: 974f843034126a96e469c95a3bc162e9.dir
size: 1237388
nfiles: 2
train_tokenizers_ven@2:
cmd: /bin/bash scripts/train_sp_tokenizer.sh --input-texts-path data/interim/train/bantu.ven.txt --sampled-texts-path
data/temp/tshivenda-xlmr-bpe-50k/0/ --seed 47 --alpha 0.5 --tokenizer-model-type
bpe --vocab-size 50000 --tokenizer-output-dir data/tokenizers/tshivenda-xlmr-bpe-50k
deps:
- path: data/interim/train/bantu.ven.txt
hash: md5
md5: 464acc68763232527d9fc739eab1bae3
size: 11400638
- path: scripts/train_sp_tokenizer.sh
hash: md5
md5: 6fcd0030bb4e94e0530887499f973359
size: 4340
params:
params.yaml:
global.seed: 47
paths.tokenizer_root: data/tokenizers
paths.train_data: data/interim/train
tokenization.alpha: 0.5
outs:
- path: data/tokenizers/tshivenda-xlmr-bpe-50k
hash: md5
md5: 35e196efb76ee4ae68563d28240fd630.dir
size: 1908730
nfiles: 2