-
Notifications
You must be signed in to change notification settings - Fork 4
/
dvc.lock
448 lines (448 loc) · 16.5 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
schema: '2.0'
stages:
preprocess_wellcome_science:
cmd: grants_tagger preprocess wellcome-science data/raw/science_tags_full_version.xlsx
data/processed/science_grants_tagged_title_synopsis.jsonl models/label_binarizer.pkl
deps:
- path: data/raw/science_tags_full_version.xlsx
md5: 74da2bf7a507e52b8b677ddce19156a9
size: 2638299
- path: grants_tagger/preprocess_wellcome.py
md5: 738ccb78c7ee261c7e934cd4196e9b46
size: 6654
params:
params.yaml:
preprocess_wellcome_science.meta_cols: Grant_ID,Title
preprocess_wellcome_science.text_cols: Title,Synopsis
outs:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
train:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/tfidf-svm-2020.05.2.pkl --approach tfidf-svm
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 162e8650a0a7e970420f48eb5c253f82
size: 3970
params:
params.yaml:
train.class_weight: balanced
train.min_df: 5
train.ngram_range:
- 1
- 2
outs:
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm-2020.05.2.pkl
md5: eceaf3846999e47380ca670c096b810a
size: 17768856
train_tfidf_svm:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/tfidf-svm.pkl --approach tfidf-svm --train-info
results/tfidf_svm_train_info.json
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 7ad0632959accc481d8d5300f3c9fd84
size: 5996
params:
params.yaml:
train.tfidf-svm.svm__estimator.class_weight: balanced
train.tfidf-svm.tfidf.min_df: 5
train.tfidf-svm.tfidf.ngram_range:
- 1
- 2
outs:
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
- path: results/tfidf_svm_train_info.json
md5: 6f98a7bec4325ece260581465a1d9847
size: 62
evaluate:
cmd: grants_tagger evaluate model tfidf-svm models/tfidf-svm-2020.05.2.pkl data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl
deps:
- path: grants_tagger/evaluate_model.py
md5: cd583e8d10e0c889647834cab217ce2f
size: 1872
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm-2020.05.2.pkl
md5: f24b224be6a867f96400b3df2ad26ac9
size: 17768856
outs:
- path: results.json
md5: 1d0d4fb63ae1d1b911373cc558147737
size: 89
evaluate_tfidf_svm:
cmd: grants_tagger evaluate model tfidf-svm models/tfidf-svm.pkl data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl --results-path results/tfidf_svm.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
outs:
- path: results/tfidf_svm.json
md5: b2f118de649f2da71cb40b5f8694fcf1
size: 120
train_scibert:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/scibert --approach scibert --train-info results/scibert_train_info.json
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 7ad0632959accc481d8d5300f3c9fd84
size: 5996
params:
params.yaml:
train.scibert.epochs: 10
train.scibert.learning_rate: 2e-05
train.scibert.validation_split: 0.1
outs:
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
evaluate_scibert:
cmd: grants_tagger evaluate model scibert models/scibert data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl --results-path results/scibert.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
outs:
- path: results/scibert.json
md5: 936e3229c02ea1a562eeb90298d1ecca
size: 120
preprocess_bioasq_mesh:
cmd: grants_tagger preprocess bioasq-mesh data/raw/allMeSH_2021.json data/processed/train_mesh2021.jsonl
models/xlinear/label_binarizer.pkl --test-split 0.01 --test-output-path data/processed/test_mesh2021.jsonl
--mesh-tags-path data/processed/descriptors_to_use.csv
deps:
- path: data/raw/allMeSH_2021.json
md5: e827a6b8062d1312664dcf075c12d89f
size: 27547042745
- path: grants_tagger/preprocess_mesh.py
md5: 4a6539c093e465852206e4455e67cda3
size: 5675
outs:
- path: data/processed/test_mesh2021.jsonl
md5: 01c96f3be37e5329d72e6dfd52668787
size: 255517412
- path: data/processed/train_mesh2021.jsonl
md5: 55e2cb37296169499491b58ee1e8fa67
size: 25271198680
- path: models/xlinear/label_binarizer.pkl
md5: 17de879cfa59aa3b0da81cf29bcb4584
size: 622349
evaluate_science_ensemble:
cmd: grants_tagger evaluate model science-ensemble models/tfidf-svm.pkl,models/scibert
data/processed/science_grants_tagged_title_synopsis.jsonl models/label_binarizer.pkl
--results-path results/science_ensemble.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
outs:
- path: results/science_ensemble.json
md5: a9fb045af166aa9901e7cd89678fed25
size: 120
train_mesh_cnn:
cmd: grants_tagger train data/processed/disease_mesh.jsonl models/disease_mesh_label_binarizer-2021.06.0.pkl
models/disease_mesh_cnn-2021.06.0 --approach mesh-cnn --sparse-labels
deps:
- path: data/processed/disease_mesh.jsonl
md5: f4463f861869e7516caef78ff75600ac
size: 12179582610
- path: grants_tagger/models.py
md5: 234124a679b6f761241c4ff7ba7b2fd7
size: 25324
- path: grants_tagger/train.py
md5: 78683e5785fe5f403b4d1f58211b5dc8
size: 3971
params:
params.yaml:
train.mesh-cnn.cnn.attention: true
train.mesh-cnn.cnn.batch_size: 256
train.mesh-cnn.cnn.dense_size: 10000
train.mesh-cnn.cnn.dropout: 0.1
train.mesh-cnn.cnn.hidden_size: 400
train.mesh-cnn.cnn.l2: 7e-07
train.mesh-cnn.cnn.learning_rate: 0.0001
train.mesh-cnn.cnn.learning_rate_decay: 0.8
train.mesh-cnn.cnn.multilabel: true
train.mesh-cnn.cnn.nb_epochs: 10
train.mesh-cnn.vec.sequence_length: 400
train.mesh-cnn.vec.tokenizer_library: transformers
train.mesh-cnn.vec.vocab_size: 30000
outs:
- path: models/disease_mesh_cnn-2021.06.0
md5: 08ec776180c1d45ff0f35024833f843d.dir
size: 808278043
nfiles: 5
- path: models/disease_mesh_label_binarizer-2021.06.0.pkl
md5: 85be50a39457aa83f1dd4fbb2b1f26b6
size: 147371
filter_mesh_tags:
cmd: python grants_tagger/filter_mesh_tags.py data/raw/desc2021.xml data/processed/mesh_disease_tags.csv
deps:
- path: data/raw/desc2021.xml
md5: 8663a7dd8e1895dd22525d42b80cd2df
size: 300410104
outs:
- path: data/processed/mesh_disease_tags.csv
md5: 4311b12fb4f381ffab1d76f55069683d
size: 260080
train_mesh_xlinear:
cmd: grants_tagger train data/processed/train_mesh2021.jsonl models/xlinear/label_binarizer.pkl
models/xlinear/model --approach mesh-xlinear --sparse-labels --train-info results/mesh_xlinear_train_info.json
--slim
deps:
- path: data/processed/train_mesh2021.jsonl
md5: 55e2cb37296169499491b58ee1e8fa67
size: 25271198680
- path: grants_tagger/slim/mesh_xlinear.py
md5: 56a24b32c2a49032be5009ffab01f3f7
size: 5735
- path: grants_tagger/train.py
md5: c94acba1bef2ecf4a904975070bd5ae0
size: 5702
params:
params.yaml:
train.mesh-xlinear.config: configs/mesh/2022.9.0.ini
outs:
- path: models/xlinear/model
md5: 7d889f5b6ce7af8f18997d7c72991d0b.dir
size: 3711702599
nfiles: 34
evaluate_mesh_xlinear_on_grants:
cmd: grants_tagger evaluate grants mesh-xlinear models/xlinear/model data/raw/disease_tags_validation_grants.xlsx
models/xlinear/label_binarizer.pkl --results-path results/mesh_xlinear_on_grants.json
--mesh-tags-path data/processed/mesh_disease_tags.csv
deps:
- path: data/processed/mesh_disease_tags.csv
md5: 4311b12fb4f381ffab1d76f55069683d
size: 260080
- path: data/raw/disease_tags_validation_grants.xlsx
md5: 71554cf90758773fb996351000384d4f
size: 615751
- path: grants_tagger/evaluate_mesh_on_grants.py
md5: f19ad62ab308aa3534d7eb7718b8b701
size: 4218
- path: models/xlinear/label_binarizer.pkl
md5: 17de879cfa59aa3b0da81cf29bcb4584
size: 622349
- path: models/xlinear/model
md5: 7d889f5b6ce7af8f18997d7c72991d0b.dir
size: 3711702599
nfiles: 34
outs:
- path: results/mesh_xlinear_on_grants.json
md5: 0bba58131d082341832bbe5d97a5bfaf
size: 26
evaluate_mesh_xlinear:
cmd: grants_tagger evaluate model mesh-xlinear models/xlinear/model data/processed/test_mesh2021.jsonl
models/xlinear/label_binarizer.pkl --results-path results/mesh_xlinear.json
--full-report-path results/mesh_xlinear_full_report.json --no-split-data
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/xlinear/label_binarizer.pkl
md5: 17de879cfa59aa3b0da81cf29bcb4584
size: 622349
- path: models/xlinear/model
md5: 7d889f5b6ce7af8f18997d7c72991d0b.dir
size: 3711702599
nfiles: 34
outs:
- path: results/mesh_xlinear.json
md5: 411316e6830d76c0db5839b9cba445cb
size: 120
generate_validation_data_xlinear:
cmd: python scripts/generate_validation_data_xlinear.py
deps:
- path: data/interim/mesh_pipeline_result.csv
md5: 10639bbe244b986919efc8f7866b98b4
size: 138862818
- path: data/raw/grants.csv
md5: 9732c21dd1954cce8baaf3746f301ead
size: 152523849
- path: scripts/generate_validation_data_xlinear.py
md5: e3eb14350c3ea5f7692cce5a48fed5fe
size: 2734
outs:
- path: data/processed/merged_mesh_predictions_mesh_xlinear_for_validation.xlsx
md5: f174b02bf804885f769816cc27540b89
size: 40795153
get_grants:
cmd: python scripts/get_grants.py
deps:
- path: scripts/get_grants.py
md5: 50c0cf255eb0252fd4f3412920430d19
size: 1010
outs:
- path: data/raw/grants.csv
md5: 9732c21dd1954cce8baaf3746f301ead
size: 152523849
preprocess_bioasq_mesh_toy:
cmd: grants_tagger preprocess bioasq-mesh data/raw/allMeSH_2021.json data/processed/train_mesh2021_toy.jsonl
models/xlinear-toy/label_binarizer_toy.pkl --test-split 0.01 --test-output-path
data/processed/test_mesh2021_toy.jsonl --n-max 1000
deps:
- path: data/processed/wt_tags_used.csv
md5: a2b7c13fc9d40732e1777b4e6e7a6454
size: 638161
- path: data/raw/allMeSH_2021.json
md5: e827a6b8062d1312664dcf075c12d89f
size: 27547042745
- path: grants_tagger/preprocess_mesh.py
md5: 4a6539c093e465852206e4455e67cda3
size: 5675
outs:
- path: data/processed/test_mesh2021_toy.jsonl
md5: 4854f52928e0f018e9c062ccf7479d75
size: 17777
- path: data/processed/train_mesh2021_toy.jsonl
md5: 76794bc0196dc76873a7b201c3ded238
size: 1926754
- path: models/xlinear-toy/label_binarizer_toy.pkl
md5: a37d8263441d2ed3bb9ac8ccac84cef5
size: 66404
preprocess_bioasq_mesh_wt_only:
cmd: grants_tagger preprocess bioasq-mesh data/raw/allMeSH_2021.json data/processed/train_mesh2021_wt.jsonl
models/xlinear-wt/label_binarizer_wt.pkl --test-split 0.01 --test-output-path
data/processed/test_mesh2021_wt.jsonl --mesh-tags-path data/processed/wt_tags_used.csv
deps:
- path: data/processed/wt_tags_used.csv
md5: a2b7c13fc9d40732e1777b4e6e7a6454
size: 638161
- path: data/raw/allMeSH_2021.json
md5: e827a6b8062d1312664dcf075c12d89f
size: 27547042745
- path: grants_tagger/preprocess_mesh.py
md5: 4a6539c093e465852206e4455e67cda3
size: 5675
outs:
- path: data/processed/test_mesh2021_wt.jsonl
md5: f71372a0bfc08aa800261ead73f4ec99
size: 255911103
- path: data/processed/train_mesh2021_wt.jsonl
md5: f56ce75266204248252e1355ca160276
size: 25357487172
- path: models/xlinear-wt/label_binarizer_wt.pkl
md5: 4e03c4c649c22b3cc2fb2eb4707b7862
size: 541135
create_inclusion_list:
cmd: python grants_tagger/create_inclusion_list.py data/raw/desc2021.xml data/processed/descriptors_not_to_use_manual.csv
data/processed/descriptors_to_use.csv
deps:
- path: data/raw/desc2021.xml
md5: 8663a7dd8e1895dd22525d42b80cd2df
size: 300410104
outs:
- path: data/processed/descriptors_to_use.csv
md5: 01114e287def70117deee0aca5abb61c
size: 746295
train_mesh_xlinear_toy:
cmd: grants_tagger train data/processed/train_mesh2021_toy.jsonl models/xlinear-toy/label_binarizer_toy.pkl
models/xlinear-toy/model --approach mesh-xlinear --sparse-labels --train-info
results/mesh_xlinear_train_info_toy.json --slim
deps:
- path: data/processed/train_mesh2021_toy.jsonl
md5: 76794bc0196dc76873a7b201c3ded238
size: 1926754
- path: grants_tagger/slim/mesh_xlinear.py
md5: 56a24b32c2a49032be5009ffab01f3f7
size: 5735
- path: grants_tagger/train.py
md5: c94acba1bef2ecf4a904975070bd5ae0
size: 5702
params:
params.yaml:
train.mesh-xlinear.config: configs/mesh/2022.9.0.ini
outs:
- path: models/xlinear-toy/model
md5: 4358025265067e229f8d82629429b937.dir
size: 2645142
nfiles: 25
train_mesh_xlinear_wt_only:
cmd: grants_tagger train data/processed/train_mesh2021_wt.jsonl models/xlinear-wt/label_binarizer_wt.pkl
models/xlinear-wt/model-2022.12.0 --approach mesh-xlinear --sparse-labels --train-info
results/mesh_xlinear_train_info_wt.json --slim
deps:
- path: data/processed/train_mesh2021_wt.jsonl
md5: 52b0358e553f4b373f06c5f1c13672f8
size: 25357487172
- path: grants_tagger/slim/mesh_xlinear.py
md5: 56a24b32c2a49032be5009ffab01f3f7
size: 5735
- path: grants_tagger/train.py
md5: c94acba1bef2ecf4a904975070bd5ae0
size: 5702
params:
params.yaml:
train.mesh-xlinear.config: configs/mesh/2022.9.0.ini
outs:
- path: models/xlinear-wt/model-2022.12.0
md5: 0ed7ca9710837ebeccaee9634e81c433.dir
size: 3470257704
nfiles: 31
evaluate_mesh_xlinear_wt_only:
cmd: grants_tagger evaluate model mesh-xlinear models/xlinear-wt/model-2022.12.0
data/processed/test_mesh2021_wt.jsonl models/xlinear-wt/label_binarizer_wt.pkl
--results-path results/mesh_xlinear_wt_only.json --full-report-path results/mesh_xlinear_full_report_wt_only.json
--no-split-data
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/xlinear-wt/label_binarizer_wt.pkl
md5: 75224e7ef280c0c0437e0001b5a5ef77
size: 541138
- path: models/xlinear-wt/model-2022.12.0
md5: 0ed7ca9710837ebeccaee9634e81c433.dir
size: 3470257704
nfiles: 31
outs:
- path: results/mesh_xlinear_wt_only.json
md5: cef90bdba2ef2cf30ea69714f51ed1f0
size: 120