# Makefile to install CC-Net and train the LMs.
# `make` or `make help` to get some help.
# Arguments:
lang?=en
process?=8
servers?=0
# List of languages for LM.
langs?=af,ar,az,be,bg,bn,ca,cs,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,hy,id,\
is,it,ja,ka,kk,km,kn,ko,lt,lv,mk,ml,mn,mr,my,ne,nl,no,pl,pt,ro,ru,uk,zh
# Experiment config
NDOC_FOR_LM=1_000_000
NDOC_FOR_SENTPIECE=400000
VOCAB_SIZE=65536
# Static resources, scripts, ...
KENLM=./bin/lmplz
KENLM_BUILD_BINARY=./bin/build_binary
SPM_TRAIN=./bin/spm_train
SPM_ENCODE=./bin/spm_encode
# DISTRIBUTE will run locally, or on slurm if "servers" is set.
DISTRIBUTE=xargs -L1 -P $(process)
ifneq ($(servers), 0)
DISTRIBUTE=xargs -L1 -P $(servers) srun -t 240 --mem 5000
endif
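
# For illustration, with the defaults above (process=8, servers=0) the all_lms
# pipeline below should expand to roughly:
#   echo "af,ar,..." | tr -s ', ' '\n' | awk '...' | xargs -L1 -P 8 make
# i.e. one `make data/lm_sp/<lang>.arpa.bin` invocation per language, 8 in parallel.
# With servers=N, each invocation is instead wrapped in `srun -t 240 --mem 5000`.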
help:
	# Show help
	grep -i -A1 '^[a-z0-9_]*:' Makefile
# Deletes output files on error (useful when piping output)
SHELL=/bin/bash
.SHELLFLAGS = -o pipefail -c
.DELETE_ON_ERROR:
install: bin/lid.bin $(KENLM) $(SPM_TRAIN)
	# Installs dependencies.
	@if [ ! -d "data" ]; then\
		echo "Please create/symlink a 'data' directory.";\
	fi
	@if ! python -c "from cc_net import __main__" 2> /dev/null; then\
		pip install . ;\
	fi
	echo " --> All dependencies look good!"
dl_lm:
	# Download a pretrained language model
	mkdir -p data/lm_sp
	wget -c -P data/lm_sp http://dl.fbaipublicfiles.com/cc_net/lm/$(lang).arpa.bin
	wget -c -P data/lm_sp http://dl.fbaipublicfiles.com/cc_net/lm/$(lang).sp.model
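
# Example: `make lang=de dl_lm` should fetch data/lm_sp/de.arpa.bin and
# data/lm_sp/de.sp.model (any language listed in `langs` can be substituted).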
lm: data/lm_sp/$(lang).sp.model data/lm_sp/$(lang).arpa.bin
	# Computes a 5-gram LM for the given language -> make lang=it lm
	# Restricted to the first NDOC_FOR_LM documents

sp: data/lm_sp/$(lang).sp.model
	# Train a SentencePiece model on Wikipedia -> make lang=it sp
get_lang = $(firstword $(subst ., ,$1))
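# e.g. $(call get_lang,fr.json.gz) should expand to `fr`
# (it keeps everything before the first dot of the file name).
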
all_lms:
	# Train a list of language models -> make process=10 langs=en,it,fr all_lms
	# Defaults to the LMs trained in the paper.
	echo "$(langs)" \
		| tr -s ', ' '\n' | awk '{print "data/lm_sp/" $$0 ".arpa.bin"}' \
		| $(DISTRIBUTE) make
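
# For example, `make process=2 langs=en,it all_lms` should end up running
# `make data/lm_sp/en.arpa.bin` and `make data/lm_sp/it.arpa.bin`, two at a time,
# each of which pulls in the Wikipedia download, SentencePiece and KenLM rules below.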
dl_all_lms:
	# Download all pretrained language models
	echo "$(langs)" \
		| tr -s ', ' '\n' | awk '{print "lang=" $$0 " dl_lm"}' \
		| $(DISTRIBUTE) make
%.arpa.bin: %.arpa
	# Compress a learned LM to a binary format.
	$(KENLM_BUILD_BINARY) $< $@
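
# For instance `make data/lm_sp/en.arpa.bin` should run something like
#   ./bin/build_binary data/lm_sp/en.arpa data/lm_sp/en.arpa.bin
# ($< is the .arpa prerequisite, $@ the binary target).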
%.vocab.txt: %.txt
	# Extracts the vocabulary of a corpus.
	# Restricted to the first NDOC_FOR_LM documents and VOCAB_SIZE top words.
	cat $< | tr ' ' '\n' | sort | uniq -c | sort -rn > $@.tmp_sort
	head -$(VOCAB_SIZE) $@.tmp_sort | sed "s/^ *//" | cut -d' ' -f2 > $@
	echo Extracted `wc -l $@` words
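
# The intermediate $@.tmp_sort is expected to hold `uniq -c`-style lines such as
#   1204 the
# sorted by decreasing count; only the top VOCAB_SIZE words are kept in $@.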
data/lm_sp/%.arpa: data/cirrus/sp/%.opening.txt
	mkdir -p $(@D)
	$(KENLM) -o 5 -S 8G -T /tmp --vocab_estimate $(VOCAB_SIZE) --discount_fallback \
		< $< > $@
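
# Concretely, for lang=en this should run roughly
#   ./bin/lmplz -o 5 -S 8G -T /tmp ... < data/cirrus/sp/en.opening.txt > data/lm_sp/en.arpa
# i.e. a 5-gram LM estimated on the SentencePiece-encoded Wikipedia openings.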
data/lm_sp/%.sp.model: data/cirrus/txt/%.opening.txt
	mkdir -p $(@D)
	$(SPM_TRAIN) --input=$< \
		--vocab_size=$(VOCAB_SIZE) --hard_vocab_limit \
		--character_coverage=0.9995 \
		--model_type=unigram \
		--model_prefix=$(basename $@) \
	|| (echo "WARNING: Corpus is too small, will train a smaller model" && \
		$(SPM_TRAIN) --input=$< \
		--vocab_size=40000 \
		--character_coverage=0.9995 \
		--model_type=unigram \
		--model_prefix=$(basename $@))
	echo "Trained SentencePiece model with `wc -l $(basename $@).vocab` pieces"
data/cirrus/sp/%.opening.txt: data/cirrus/gz/%.json.gz data/lm_sp/%.sp.model
	$(SPM_ENCODE) \
		--model=$(word 2,$^) \
		--output_format=piece \
		< <(python get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
		> $@
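
# The `< <(...)` process substitution above requires bash, which is why SHELL is
# set to /bin/bash earlier in this Makefile.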
data/cirrus/txt/%.opening.txt: data/cirrus/gz/%.json.gz
	python get_wiki_cirrus.py opening \
		--n_docs $(NDOC_FOR_LM) \
		--file $< --output $@
data/cirrus/gz/%.json.gz:
	mkdir -p $(@D)
	python get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
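
# e.g. `make data/cirrus/gz/fr.json.gz` should call
#   python get_wiki_cirrus.py dl --lang fr --output_dir data/cirrus/gz
# using get_lang to strip the .json.gz suffix from the target name.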
clean:
	# Remove intermediate files, dataset, third_party sources
	# We don't need the vocab files nor the text version of the LM.
	rm -r data/cirrus
	rm -r data/lm_sp/*.arpa data/lm_sp/*.vocab
	rm -r third_party
# Installation
bin/lid.bin:
	# Download the language identification model from fastText releases.
	mkdir -p $(@D)
	wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $@
third_party/kenlm:
	# Download kenlm sources: https://kheafield.com/code/kenlm/
	mkdir -p $(@D)
	wget -O - https://kheafield.com/code/kenlm.tar.gz | tar -xz -C $(@D)
bin/lmplz: third_party/kenlm
	# Compiles kenlm binaries
	mkdir -p $(@D)
	mkdir -p $</build
	(cd $</build && cmake ..)
	make -C $</build -j2
	mv $</build/bin/lmplz $</build/bin/build_binary $(@D)
third_party/sentencepiece:
	# Download sentencepiece sources: https://github.com/google/sentencepiece
	mkdir -p $(@D)
	wget -c -O $(@D)/sentencepiece.zip https://github.com/google/sentencepiece/archive/v0.1.83.zip
	unzip -o -d $(@D) $(@D)/sentencepiece.zip
	rm $(@D)/sentencepiece.zip
	# remove the version id from the folder name
	mv $(@D)/sentencepiece-* $@
bin/spm_train: third_party/sentencepiece
	# Compiles sentencepiece binaries
	mkdir -p $(@D)
	mkdir -p $</build
	(cd $</build && cmake ..)
	make -C $</build -j2
	mv $</build/src/spm_train $</build/src/spm_encode $(@D)
	# SentencePiece is installed locally; to install it globally, do:
	# $ cd $</build
	# $ sudo make install
	# $ sudo ldconfig -v
test:
	python -m cc_net mine --config test
	mkdir -p test_data/mini
	python -m cc_net.minify minify -f test_data/regroup/2019-09 -o test_data/mini/2019-09
	mkdir -p test_data/reproduce
	python -m cc_net.minify unminify -f test_data/mini/2019-09 -o test_data/reproduce/2019-09
	diff \
		<(zcat test_data/regroup/2019-09/de_head_0000.json.gz | head | jq -r .raw_content) \
		<(zcat test_data/reproduce/2019-09/de_head_0000.json.gz | head | jq -r .raw_content)
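
# Running `make test` assumes `jq` and `zcat` are available: it mines the test
# config, minifies/unminifies one shard, and diffs the reconstructed raw_content
# against the original regrouped output (an empty diff means the round-trip worked).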