From be3852603eb3558d900717f7e4f5e8a8611f2fec Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 18:45:52 +0100 Subject: [PATCH] Caught exception when supporting docs not available --- dvc.lock | 56 ++++++++++++++++---------------- scripts/fetch_supporting_docs.py | 24 ++++++++------ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/dvc.lock b/dvc.lock index dd7f7b1..1d52d2e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ outs: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ outs: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,18 +96,18 @@ outs: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 
13758136 + nfiles: 6 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 - path: data/eidc_rag_test_sample.csv hash: md5 md5: a371d83c5822d256286e80d64d58c3fe @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -137,21 +137,21 @@ stages: size: 12157676 - path: scripts/fetch_supporting_docs.py hash: md5 - md5: de0c11e81bf10e040bef67e43466b789 - size: 1472 + md5: 923af3b6ce1447d388b08fab0e3ab77d + size: 1660 outs: - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -159,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: 1279778c7e509e972d1f366157d24966 - size: 58228 + md5: bae77b1b721bf283a30a64f67af45fea + size: 74438 - path: data/metrics.json hash: md5 - md5: 2b93334ba0e8226c916d0964237cb72c - size: 225 + md5: 0145280f36071a6df551ef57d3f8393e + size: 229 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 36354e7..66e77ac 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import logging import json from tqdm 
import tqdm import requests @@ -6,6 +7,7 @@ from typing import Dict, List from dotenv import load_dotenv +logger = logging.getLogger(__name__) def extract_ids(metadata_file: str): with open(metadata_file) as f: @@ -15,14 +17,18 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: - res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) - ) - json_data = res.json() - docs = [] - for key, val in json_data["success"].items(): - docs.append({"id": eidc_id, "field": key, "value": val}) - return docs + try: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + except Exception as e: + logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + return [] def main(metadata_file: str, supporting_docs_file: str): @@ -33,8 +39,6 @@ def main(metadata_file: str, supporting_docs_file: str): docs = [] for id in tqdm(ids): docs.extend(get_supporting_docs(id, user, password)) - if len(docs) > 0: - break with open(supporting_docs_file, "w") as f: json.dump(docs, f, indent=4)