From be3852603eb3558d900717f7e4f5e8a8611f2fec Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 18:45:52 +0100 Subject: [PATCH] Caught exception when supporting docs not available --- dvc.lock | 56 ++++++++++++++++---------------- scripts/fetch_supporting_docs.py | 24 ++++++++------ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/dvc.lock b/dvc.lock index dd7f7b1..1d52d2e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ outs: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ outs: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,18 +96,18 @@ outs: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 
13758136 + nfiles: 6 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 - path: data/eidc_rag_test_sample.csv hash: md5 md5: a371d83c5822d256286e80d64d58c3fe @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -137,21 +137,21 @@ stages: size: 12157676 - path: scripts/fetch_supporting_docs.py hash: md5 - md5: de0c11e81bf10e040bef67e43466b789 - size: 1472 + md5: 923af3b6ce1447d388b08fab0e3ab77d + size: 1660 outs: - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -159,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: 1279778c7e509e972d1f366157d24966 - size: 58228 + md5: bae77b1b721bf283a30a64f67af45fea + size: 74438 - path: data/metrics.json hash: md5 - md5: 2b93334ba0e8226c916d0964237cb72c - size: 225 + md5: 0145280f36071a6df551ef57d3f8393e + size: 229 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 36354e7..66e77ac 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import logging import json from tqdm 
import tqdm import requests @@ -6,6 +7,7 @@ from typing import Dict, List from dotenv import load_dotenv +logger = logging.getLogger(__name__) def extract_ids(metadata_file: str): with open(metadata_file) as f: @@ -15,14 +17,18 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: - res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) - ) - json_data = res.json() - docs = [] - for key, val in json_data["success"].items(): - docs.append({"id": eidc_id, "field": key, "value": val}) - return docs + try: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + except Exception as e: + logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + return [] def main(metadata_file: str, supporting_docs_file: str): @@ -33,8 +39,6 @@ def main(metadata_file: str, supporting_docs_file: str): docs = [] for id in tqdm(ids): docs.extend(get_supporting_docs(id, user, password)) - if len(docs) > 0: - break with open(supporting_docs_file, "w") as f: json.dump(docs, f, indent=4)