Skip to content

Commit

Permalink
Caught exception when supporting docs not available
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 18, 2024
1 parent 68e17ac commit be38526
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 38 deletions.
56 changes: 28 additions & 28 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -54,60 +54,60 @@ stages:
size: 4584498
- path: data/supporting-docs.json
hash: md5
md5: b0941cc9a7ca7df456157380bcc28f39
size: 75646
md5: f3ea9980226e5408497c96a10cc77b80
size: 72013526
- path: scripts/chunk_data.py
hash: md5
md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
size: 2509
outs:
- path: data/chunked_data.json
hash: md5
md5: 97f06c3b76ff05d62ccdecd9d5742712
size: 137681
md5: f6426396e1a3564b53649ef5fc0571fd
size: 993814
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
deps:
- path: data/chunked_data.json
hash: md5
md5: 97f06c3b76ff05d62ccdecd9d5742712
size: 137681
md5: f6426396e1a3564b53649ef5fc0571fd
size: 993814
- path: scripts/create_embeddings.py
hash: md5
md5: 4649c700dfae922b43b3608ee4f00c1a
size: 808
outs:
- path: data/embeddings.json
hash: md5
md5: 8d80ef225c59ede34d026f6f2930bae3
size: 1894126
md5: 8fd682131a282736f6a81a6c53040b1e
size: 13422675
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
-em all-MiniLM-L6-v2 -c eidc-data
deps:
- path: data/embeddings.json
hash: md5
md5: 8d80ef225c59ede34d026f6f2930bae3
size: 1894126
md5: 8fd682131a282736f6a81a6c53040b1e
size: 13422675
- path: scripts/upload_to_docstore.py
hash: md5
md5: 41da88e3bb6d2592bee938ce347f6983
size: 1905
outs:
- path: data/chroma-data
hash: md5
md5: cc85398c596d4c5839714e93e33468bb.dir
size: 3580644
nfiles: 5
md5: 5c99644f30def03f87b37c98341c6f25.dir
size: 13758136
nfiles: 6
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
data/chroma-data -c eidc-data
deps:
- path: data/chroma-data
hash: md5
md5: cc85398c596d4c5839714e93e33468bb.dir
size: 3580644
nfiles: 5
md5: 5c99644f30def03f87b37c98341c6f25.dir
size: 13758136
nfiles: 6
- path: data/eidc_rag_test_sample.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
Expand All @@ -119,8 +119,8 @@ stages:
outs:
- path: data/evaluation_data.csv
hash: md5
md5: 9825cf7e7a89ca17634b44e9256eefc9
size: 9695
md5: 8ea0a3f240478e9db41855922ac534a6
size: 9894
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
outs:
Expand All @@ -137,31 +137,31 @@ stages:
size: 12157676
- path: scripts/fetch_supporting_docs.py
hash: md5
md5: de0c11e81bf10e040bef67e43466b789
size: 1472
md5: 923af3b6ce1447d388b08fab0e3ab77d
size: 1660
outs:
- path: data/supporting-docs.json
hash: md5
md5: b0941cc9a7ca7df456157380bcc28f39
size: 75646
md5: f3ea9980226e5408497c96a10cc77b80
size: 72013526
evaluate:
cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
-img data/eval.png
deps:
- path: data/evaluation_data.csv
hash: md5
md5: 9825cf7e7a89ca17634b44e9256eefc9
size: 9695
md5: 8ea0a3f240478e9db41855922ac534a6
size: 9894
- path: scripts/evaluate.py
hash: md5
md5: 10f76511eafc8a1a9b90e9ae92a76bc5
size: 2633
outs:
- path: data/eval.png
hash: md5
md5: 1279778c7e509e972d1f366157d24966
size: 58228
md5: bae77b1b721bf283a30a64f67af45fea
size: 74438
- path: data/metrics.json
hash: md5
md5: 2b93334ba0e8226c916d0964237cb72c
size: 225
md5: 0145280f36071a6df551ef57d3f8393e
size: 229
24 changes: 14 additions & 10 deletions scripts/fetch_supporting_docs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from argparse import ArgumentParser
import logging
import json
from tqdm import tqdm
import requests
import os
from typing import Dict, List
from dotenv import load_dotenv

logger = logging.getLogger(__name__)

def extract_ids(metadata_file: str):
with open(metadata_file) as f:
Expand All @@ -15,14 +17,18 @@ def extract_ids(metadata_file: str):


def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
    """Fetch the supporting documents for one EIDC dataset from the legilo service.

    Args:
        eidc_id: Identifier of the EIDC dataset.
        user: Username for HTTP basic auth against the legilo endpoint.
        password: Password for HTTP basic auth.

    Returns:
        A list of ``{"id": ..., "field": ..., "value": ...}`` dicts, one per
        supporting document, or an empty list when the documents could not be
        retrieved (the fetch is best-effort and must not abort the pipeline).
    """
    try:
        res = requests.get(
            f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
            auth=(user, password),
            timeout=30,  # without a timeout a stuck request hangs the whole run
        )
        # Surface HTTP errors (401/404/5xx) explicitly instead of failing
        # later with an opaque JSON decode error.
        res.raise_for_status()
        json_data = res.json()
        return [
            {"id": eidc_id, "field": key, "value": val}
            for key, val in json_data["success"].items()
        ]
    except Exception as e:
        # Deliberately broad: any failure (network, auth, missing "success"
        # key) is logged and skipped so the remaining datasets still process.
        logger.error(
            f"Failed to download supporting docs for dataset {eidc_id}",
            exc_info=e,
        )
        return []


def main(metadata_file: str, supporting_docs_file: str):
Expand All @@ -33,8 +39,6 @@ def main(metadata_file: str, supporting_docs_file: str):
docs = []
for id in tqdm(ids):
docs.extend(get_supporting_docs(id, user, password))
if len(docs) > 0:
break
with open(supporting_docs_file, "w") as f:
json.dump(docs, f, indent=4)

Expand Down

0 comments on commit be38526

Please sign in to comment.