-
Notifications
You must be signed in to change notification settings - Fork 2
/
dvc.lock
172 lines (172 loc) · 5.05 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
schema: '2.0'
stages:
fetch-metadata:
cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 0
deps:
- path: scripts/fetch_eidc_metadata.py
hash: md5
md5: 82907434d9521996e30014df01bbba8e
size: 964
outs:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
prepare:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: 423dc3a61ede72e1d5c818d74277c0b4
size: 12140491
- path: scripts/extract_metadata.py
hash: md5
md5: c2fa7d2c4b8f28a6e24536ce0df244fd
size: 1296
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 7d2ae8d6a41a960592f30496eb498af7
size: 4578493
extract-metadata:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
- path: scripts/extract_metadata.py
hash: md5
md5: e66f21369c5106eaaad4476612c6fb5e
size: 1313
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc
size: 4616342
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json data/supporting-docs.json -m
0
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc
size: 4616342
- path: data/supporting-docs.json
hash: md5
md5: 0b14da8f2e73dc8e15747f693c0f70ce
size: 72383140
- path: scripts/chunk_data.py
hash: md5
md5: 3ad449140b03e1c2904b22a5b401a12e
size: 2705
outs:
- path: data/chunked_data.json
hash: md5
md5: b107dfb052c12ea47b04a5176e8bab4a
size: 176342129
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
-m all-MiniLM-L6-v2
deps:
- path: data/chunked_data.json
hash: md5
md5: b107dfb052c12ea47b04a5176e8bab4a
size: 176342129
- path: scripts/create_embeddings.py
hash: md5
md5: 87bd2ed6373552bea229c9f3465fd3db
size: 1594
outs:
- path: data/embeddings.json
hash: md5
md5: 68a9de7fcf765be8ae2f4d3ff6537228
size: 3739724900
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em
all-MiniLM-L6-v2 -c eidc-data
deps:
- path: data/embeddings.json
hash: md5
md5: 68a9de7fcf765be8ae2f4d3ff6537228
size: 3739724900
- path: scripts/upload_to_docstore.py
hash: md5
md5: 930456cedd43723c1d643ad90c146952
size: 2793
outs:
- path: data/chroma-data
hash: md5
md5: 486d560a81dc951bdd85772996e62f00.dir
size: 1815042692
nfiles: 6
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py -i data/eidc_rag_testset.csv -o data/evaluation_data.csv -ds
data/chroma-data -c eidc-data -m llama3.1 -p data/pipeline.yml
deps:
- path: data/chroma-data
hash: md5
md5: 486d560a81dc951bdd85772996e62f00.dir
size: 1815042692
nfiles: 6
- path: data/eidc_rag_testset.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 2d6dc886728d4bd46676ecd1882f1fd1
size: 5838
outs:
- path: data/evaluation_data.csv
hash: md5
md5: a473732be874c8256f7178ef3f4dc7a9
size: 9576
- path: data/pipeline.yml
hash: md5
md5: 8e3c4e49d4d97f613e83468d010a96e9
size: 3440
generate-testset:
cmd: head -n 101 data/synthetic-datasets/eidc_rag_test_sample.csv > data/eidc_rag_testset.csv
outs:
- path: data/eidc_rag_testset.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
fetch-supporting-docs:
cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
- path: scripts/fetch_supporting_docs.py
hash: md5
md5: 02b94a2cc7bff711784cbdec3650b618
size: 1718
outs:
- path: data/supporting-docs.json
hash: md5
md5: 0b14da8f2e73dc8e15747f693c0f70ce
size: 72383140
evaluate:
cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
-img data/eval.png
deps:
- path: data/evaluation_data.csv
hash: md5
md5: a473732be874c8256f7178ef3f4dc7a9
size: 9576
- path: scripts/evaluate.py
hash: md5
md5: 4154acf8e74c1d8bcd0b0da72af038e0
size: 2728
outs:
- path: data/eval.png
hash: md5
md5: 7bfd424fa4c9a3550d6e9605bb2f6af2
size: 89143
- path: data/metrics.json
hash: md5
md5: f768092fe2696328ff4da565e763e743
size: 270