From 9e9434780c7346e6d6749a2e2b9fe37cdd3a00f6 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 9 Oct 2024 09:23:23 +0100
Subject: [PATCH 01/28] Set up basic dvc pipeline

---
 .gitignore                       |  1 -
 data.dvc                         |  6 ------
 data/.gitignore                  |  4 ++++
 data/evaluation-sets.dvc         |  6 ++++++
 data/synthetic-datasets.dvc      |  6 ++++++
 dvc.lock                         | 30 ++++++++++++++++++++++++++++++
 dvc.yaml                         | 14 ++++++++++++++
 pyproject.toml                   |  1 +
 src/llm_eval/evaluate.py         |  1 +
 src/llm_eval/fetch_data.py       | 16 ++++++++++++++++
 src/llm_eval/prepare_data.py     | 18 ++++++++++++++++++
 src/llm_eval/run_rag_pipeline.py |  1 +
 12 files changed, 97 insertions(+), 7 deletions(-)
 delete mode 100644 data.dvc
 create mode 100644 data/.gitignore
 create mode 100644 data/evaluation-sets.dvc
 create mode 100644 data/synthetic-datasets.dvc
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 src/llm_eval/evaluate.py
 create mode 100644 src/llm_eval/fetch_data.py
 create mode 100644 src/llm_eval/prepare_data.py
 create mode 100644 src/llm_eval/run_rag_pipeline.py

diff --git a/.gitignore b/.gitignore
index c92c62d..bf560c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,5 +163,4 @@ cython_debug/
 
 metrics.txt
 metrics.png
-/data
 gdrive-oauth.txt
diff --git a/data.dvc b/data.dvc
deleted file mode 100644
index d79d822..0000000
--- a/data.dvc
+++ /dev/null
@@ -1,6 +0,0 @@
-outs:
-- md5: 9f50d9dbc781216d5aac93d599e190d7.dir
-  size: 376640
-  nfiles: 3
-  hash: md5
-  path: data
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..d752729
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,4 @@
+/synthetic-datasets
+/evaluation-sets
+/eidc_metadata.json
+/prepared_data.json
diff --git a/data/evaluation-sets.dvc b/data/evaluation-sets.dvc
new file mode 100644
index 0000000..bf21ef3
--- /dev/null
+++ b/data/evaluation-sets.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: c3b5aefd8b8ab17f3087a49eb8265689.dir
+  size: 232043
+  nfiles: 2
+  hash: md5
+  path: evaluation-sets
diff --git a/data/synthetic-datasets.dvc b/data/synthetic-datasets.dvc
new file mode 100644
index 0000000..dc27bb2
--- /dev/null
+++ b/data/synthetic-datasets.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 61b4177259b03a7227784b5b7560726d.dir
+  size: 144597
+  nfiles: 1
+  hash: md5
+  path: synthetic-datasets
diff --git a/dvc.lock b/dvc.lock
new file mode 100644
index 0000000..e13d895
--- /dev/null
+++ b/dvc.lock
@@ -0,0 +1,30 @@
+schema: '2.0'
+stages:
+  fetch:
+    cmd: python src/llm_eval/fetch_data.py
+    deps:
+    - path: src/llm_eval/fetch_data.py
+      hash: md5
+      md5: 10194a16edb7620ed4342e00104f5f95
+      size: 307
+    outs:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 5db14ae6031ed3bb3a99588a0a313bda
+      size: 101
+  prepare:
+    cmd: python src/llm_eval/prepare_data.py
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 5db14ae6031ed3bb3a99588a0a313bda
+      size: 101
+    - path: src/llm_eval/prepare_data.py
+      hash: md5
+      md5: d285150e5a1f7c252c0a4562bf24ce0e
+      size: 519
+    outs:
+    - path: data/prepared_data.json
+      hash: md5
+      md5: a6ed512685f3c5f2073517183fbad9fa
+      size: 17005
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..3b96094
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,14 @@
+stages:
+  fetch:
+    cmd: python src/llm_eval/fetch_data.py
+    deps:
+    - src/llm_eval/fetch_data.py
+    outs:
+    - data/eidc_metadata.json
+  prepare:
+    cmd: python src/llm_eval/prepare_data.py
+    deps:
+    - data/eidc_metadata.json
+    - src/llm_eval/prepare_data.py
+    outs:
+    - data/prepared_data.json
diff --git a/pyproject.toml b/pyproject.toml
index f62ea0a..8fe55c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "bitsandbytes",
     "haystack-ai",
     "accelerate",
+    "sentence-transformers",
 ]
 
 [project.optional-dependencies]
diff --git a/src/llm_eval/evaluate.py b/src/llm_eval/evaluate.py
new file mode 100644
index 0000000..7fcf1a7
--- /dev/null
+++ b/src/llm_eval/evaluate.py
@@ -0,0 +1 @@
+# Run RAGAS to evaluate
\ No newline at end of file
diff --git a/src/llm_eval/fetch_data.py b/src/llm_eval/fetch_data.py
new file mode 100644
index 0000000..a591b65
--- /dev/null
+++ b/src/llm_eval/fetch_data.py
@@ -0,0 +1,16 @@
+import json
+
+
+def main():
+    data = {
+        "datasets": [
+            {"name": "dsone", "desc": "some description"},
+            {"name": "dstwo", "desc": "some text"},
+        ]
+    }
+    with open("data/eidc_metadata.json", "w") as f:
+        json.dump(data, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/llm_eval/prepare_data.py b/src/llm_eval/prepare_data.py
new file mode 100644
index 0000000..68d43f6
--- /dev/null
+++ b/src/llm_eval/prepare_data.py
@@ -0,0 +1,18 @@
+import json
+from sentence_transformers import SentenceTransformer
+
+def create_embedding(text):
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    return model.encode(text)
+
+
+def main():
+    with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
+        data = json.load(input)
+        for dataset in data["datasets"]:
+            dataset["desc_embedding"] = create_embedding(dataset["desc"]).tolist()
+        json.dump(data, output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/llm_eval/run_rag_pipeline.py b/src/llm_eval/run_rag_pipeline.py
new file mode 100644
index 0000000..c7f9258
--- /dev/null
+++ b/src/llm_eval/run_rag_pipeline.py
@@ -0,0 +1 @@
+# Generate RAG responses
\ No newline at end of file

From 3bfa70a7e419f205fe8956427ccd7bed2c3e9476 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 9 Oct 2024 15:13:42 +0100
Subject: [PATCH 02/28] Removed hf token

---
 notebooks/vllm_test.ipynb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/notebooks/vllm_test.ipynb b/notebooks/vllm_test.ipynb
index e8b19df..755c34c 100644
--- a/notebooks/vllm_test.ipynb
+++ b/notebooks/vllm_test.ipynb
@@ -27,8 +27,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "os.environ[\"HF_TOKEN\"] = \"hf_********************************\""
+    "import os"
    ]
   },
  {

From 090770016204e6906b6bc4c27b86709fac7db572 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 10 Oct 2024 11:34:11 +0100
Subject: [PATCH 03/28] Added dvc lock file

---
 dvc.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index e13d895..3b748c3 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -21,10 +21,10 @@ stages:
       size: 101
     - path: src/llm_eval/prepare_data.py
       hash: md5
-      md5: d285150e5a1f7c252c0a4562bf24ce0e
-      size: 519
+      md5: 91e15fb87f6a4d0188cf9ed011194411
+      size: 513
     outs:
     - path: data/prepared_data.json
       hash: md5
-      md5: a6ed512685f3c5f2073517183fbad9fa
-      size: 17005
+      md5: 6eac511808d32275195826bdce66a2d2
+      size: 16962

From 7d89512da499b22269fe83ee42dd3f25b6e0d4e6 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 10 Oct 2024 11:35:41 +0100
Subject: [PATCH 04/28] Changed embedding json name

---
 src/llm_eval/prepare_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llm_eval/prepare_data.py b/src/llm_eval/prepare_data.py
index 68d43f6..ccaae20 100644
--- a/src/llm_eval/prepare_data.py
+++ b/src/llm_eval/prepare_data.py
@@ -10,7 +10,7 @@ def main():
     with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
        data = json.load(input)
         for dataset in data["datasets"]:
-            dataset["desc_embedding"] = create_embedding(dataset["desc"]).tolist()
+            dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist()
         json.dump(data, output)
 
 

From 2e99516dbd18d4259663b7609d158954e08f3c34 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:28:41 +0100
Subject: [PATCH 05/28] Cleaned pipeline

---
 .dvc/config                                   |  1 +
 data/.gitignore                               |  1 +
 dvc.lock                                      | 32 +++++++--------
 dvc.yaml                                      | 12 +++---
 .../create_embeddings.py                      |  0
 {src/llm_eval => scripts}/evaluate.py         |  0
 scripts/fetch_eidc_metadata.py                | 25 ++++++++++++
 scripts/fetch_eidc_supporting_docs.py         |  0
 scripts/prepare_data.py                       | 40 +++++++++++++++++++
 {src/llm_eval => scripts}/run_rag_pipeline.py |  0
 src/llm_eval/fetch_data.py                    | 16 --------
 11 files changed, 89 insertions(+), 38 deletions(-)
 rename src/llm_eval/prepare_data.py => scripts/create_embeddings.py (100%)
 rename {src/llm_eval => scripts}/evaluate.py (100%)
 create mode 100644 scripts/fetch_eidc_metadata.py
 create mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/prepare_data.py
 rename {src/llm_eval => scripts}/run_rag_pipeline.py (100%)
 delete mode 100644 src/llm_eval/fetch_data.py

diff --git a/.dvc/config b/.dvc/config
index 14b6315..8a3434e 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -1,5 +1,6 @@
 [core]
     remote = jasmin
+    autostage = true
 ['remote "jasmin"']
     url = s3://dvc-test
     endpointurl = https://llm-eval-o.s3-ext.jc.rl.ac.uk
diff --git a/data/.gitignore b/data/.gitignore
index d752729..d703ed1 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -2,3 +2,4 @@ /synthetic-datasets
 /evaluation-sets
 /eidc_metadata.json
 /prepared_data.json
+/prepared_eidc_metadata.json
diff --git a/dvc.lock b/dvc.lock
index e13d895..79ba668 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -1,30 +1,30 @@
 schema: '2.0'
 stages:
-  fetch:
-    cmd: python src/llm_eval/fetch_data.py
+  fetch-metadata:
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
     deps:
-    - path: src/llm_eval/fetch_data.py
+    - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 10194a16edb7620ed4342e00104f5f95
-      size: 307
+      md5: ff336062c921e5e8f95bd569cd064e22
+      size: 664
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 5db14ae6031ed3bb3a99588a0a313bda
-      size: 101
+      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
+      size: 8968593
   prepare:
-    cmd: python src/llm_eval/prepare_data.py
+    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 5db14ae6031ed3bb3a99588a0a313bda
-      size: 101
-    - path: src/llm_eval/prepare_data.py
+      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
+      size: 8968593
+    - path: scripts/prepare_data.py
       hash: md5
-      md5: d285150e5a1f7c252c0a4562bf24ce0e
-      size: 519
+      md5: bdab13adab508052f1d16ab0967b428b
+      size: 1215
     outs:
-    - path: data/prepared_data.json
+    - path: data/prepared_eidc_metadata.json
       hash: md5
-      md5: a6ed512685f3c5f2073517183fbad9fa
-      size: 17005
+      md5: ffd8914e46ffba8c47cdca6eb6ae0140
+      size: 2120475
diff --git a/dvc.yaml b/dvc.yaml
index 3b96094..dc7a55c 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,14 +1,14 @@
 stages:
-  fetch:
-    cmd: python src/llm_eval/fetch_data.py
+  fetch-metadata:
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
     deps:
-    - src/llm_eval/fetch_data.py
+    - scripts/fetch_eidc_metadata.py
     outs:
     - data/eidc_metadata.json
   prepare:
-    cmd: python src/llm_eval/prepare_data.py
+    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - data/eidc_metadata.json
-    - src/llm_eval/prepare_data.py
+    - scripts/prepare_data.py
     outs:
-    - data/prepared_data.json
+    - data/prepared_eidc_metadata.json
diff --git a/src/llm_eval/prepare_data.py b/scripts/create_embeddings.py
similarity index 100%
rename from src/llm_eval/prepare_data.py
rename to scripts/create_embeddings.py
diff --git a/src/llm_eval/evaluate.py b/scripts/evaluate.py
similarity index 100%
rename from src/llm_eval/evaluate.py
rename to scripts/evaluate.py
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
new file mode 100644
index 0000000..561f935
--- /dev/null
+++ b/scripts/fetch_eidc_metadata.py
@@ -0,0 +1,25 @@
+import requests
+import json
+from argparse import ArgumentParser
+
+URL = "https://catalogue.ceh.ac.uk/eidc/documents"
+
+def main(output_file: str) -> None:
+    res = requests.get(
+        URL,
+        headers={"content-type": "application/json"},
+        params={
+            "page": 1,
+            "rows": 2000,
+            "term": "recordType:Dataset",
+        },
+    )
+    with open(output_file, "w") as f:
+        json.dump(res.json(), f)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("fetch_eidc_metadata.py")
+    parser.add_argument("output", help="The file path to save the downloaded data to.")
+    args = parser.parse_args()
+    main(args.output)
diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
new file mode 100644
index 0000000..89a744c
--- /dev/null
+++ b/scripts/prepare_data.py
@@ -0,0 +1,40 @@
+from typing import List, Dict
+import json
+from argparse import ArgumentParser
+
+
+METADATA_FIELDS = ["title", "description", "lineage"]
+
+
+def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:
+    metadata = {}
+    metadata["id"] = json_data["identifier"]
+    for field in fields:
+        if json_data[field]:
+            metadata["field"] = field
+            metadata["value"] = json_data[field]
+    return metadata
+
+
+def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
+    data = []
+    with open(file_path) as f:
+        json_data = json.load(f)
+        for dataset in json_data["results"]:
+            dataset_metadata = extact_eidc_metadata_fields(dataset)
+            data.append(dataset_metadata)
+    return data
+
+
+def main(input, output) -> None:
+    data = parse_eidc_metadata(input)
+    with open(output, "w") as f:
+        json.dump(data, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input", help="The file to be used as input.")
+    parser.add_argument("output", help="The path to save the processed result.")
+    args = parser.parse_args()
+    main(args.input, args.output)
diff --git a/src/llm_eval/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
similarity index 100%
rename from src/llm_eval/run_rag_pipeline.py
rename to scripts/run_rag_pipeline.py
diff --git a/src/llm_eval/fetch_data.py b/src/llm_eval/fetch_data.py
deleted file mode 100644
index a591b65..0000000
--- a/src/llm_eval/fetch_data.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import json
-
-
-def main():
-    data = {
-        "datasets": [
-            {"name": "dsone", "desc": "some description"},
-            {"name": "dstwo", "desc": "some text"},
-        ]
-    }
-    with open("data/eidc_metadata.json", "w") as f:
-        json.dump(data, f)
-
-
-if __name__ == "__main__":
-    main()

From 739f8fc563099cb4112debb4dbdb423dfdf3f0f3 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:33:43 +0100
Subject: [PATCH 06/28] Cleaned scripts and added additional metadata field

---
 dvc.lock                       | 20 ++++++++++----------
 scripts/fetch_eidc_metadata.py |  2 +-
 scripts/prepare_data.py        |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 79ba668..81cd8e2 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,26 +5,26 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
      hash: md5
-      md5: ff336062c921e5e8f95bd569cd064e22
-      size: 664
+      md5: 43a63d91a3d66caa03738a000c841406
+      size: 674
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
-      size: 8968593
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
   prepare:
     cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
-      size: 8968593
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
     - path: scripts/prepare_data.py
       hash: md5
-      md5: bdab13adab508052f1d16ab0967b428b
-      size: 1215
+      md5: bcbf4413aeee83928054d9c6c6c2bacc
+      size: 1224
     outs:
     - path: data/prepared_eidc_metadata.json
       hash: md5
-      md5: ffd8914e46ffba8c47cdca6eb6ae0140
-      size: 2120475
+      md5: 0b4ca8c49da450bc8fec0e92d577466c
+      size: 411936
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 561f935..5ea1064 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -15,7 +15,7 @@ def main(output_file: str) -> None:
         },
     )
     with open(output_file, "w") as f:
-        json.dump(res.json(), f)
+        json.dump(res.json(), f, indent=4)
 
 
 if __name__ == "__main__":
diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
index 89a744c..4adffdc 100644
--- a/scripts/prepare_data.py
+++ b/scripts/prepare_data.py
@@ -3,7 +3,7 @@
 from argparse import ArgumentParser
 
 
-METADATA_FIELDS = ["title", "description", "lineage"]
+METADATA_FIELDS = ["title", "description", "lineage", "title"]
 
 
 def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:

From 8148da1083c70aa5f94ad885702524adb913208e Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:52:29 +0100
Subject: [PATCH 07/28] Removed modules from project file

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 8fe55c9..eb48198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,3 +21,6 @@ jupyter = [
     "ipykernel",
     "ipywidgets",
 ]
+
+[tool.setuptools]
+py-modules = []

From 2ec86cd49d44e8aa8816f0ecf94c6ab295fff7fc Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 14:04:23 +0100
Subject: [PATCH 08/28] Fixed bug so all metadata fields are extracted

---
 data/.gitignore                                  |  1 +
 dvc.lock                                         | 14 +++++++------
 dvc.yaml                                         |  6 +++---
 scripts/{prepare_data.py => extract_metadata.py} | 14 ++++++++------
 4 files changed, 19 insertions(+), 16 deletions(-)
 rename scripts/{prepare_data.py => extract_metadata.py} (76%)

diff --git a/data/.gitignore b/data/.gitignore
index d703ed1..88737f3 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -3,3 +3,4 @@
 /eidc_metadata.json
 /prepared_data.json
 /prepared_eidc_metadata.json
+/extracted_metadata.json
diff --git a/dvc.lock b/dvc.lock
index 81cd8e2..bf2dea4 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -13,18 +13,18 @@ stages:
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
-    - path: scripts/prepare_data.py
+    - path: scripts/extract_metadata.py
       hash: md5
-      md5: bcbf4413aeee83928054d9c6c6c2bacc
-      size: 1224
+      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
+      size: 1296
     outs:
-    - path: data/prepared_eidc_metadata.json
+    - path: data/extracted_metadata.json
       hash: md5
-      md5: 0b4ca8c49da450bc8fec0e92d577466c
-      size: 411936
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
diff --git a/dvc.yaml b/dvc.yaml
index dc7a55c..517a69d 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -6,9 +6,9 @@ stages:
     outs:
     - data/eidc_metadata.json
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - data/eidc_metadata.json
-    - scripts/prepare_data.py
+    - scripts/extract_metadata.py
     outs:
-    - data/prepared_eidc_metadata.json
+    - data/extracted_metadata.json
diff --git a/scripts/prepare_data.py b/scripts/extract_metadata.py
similarity index 76%
rename from scripts/prepare_data.py
rename to scripts/extract_metadata.py
index 4adffdc..241bc1a 100644
--- a/scripts/prepare_data.py
+++ b/scripts/extract_metadata.py
@@ -3,17 +3,19 @@
 from argparse import ArgumentParser
 
 
-METADATA_FIELDS = ["title", "description", "lineage", "title"]
+METADATA_FIELDS = ["title", "description", "lineage"]
 
 
-def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:
-    metadata = {}
-    metadata["id"] = json_data["identifier"]
+def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]:
+    metadatas = []
     for field in fields:
         if json_data[field]:
+            metadata = {}
+            metadata["id"] = json_data["identifier"]
             metadata["field"] = field
             metadata["value"] = json_data[field]
-    return metadata
+            metadatas.append(metadata)
+    return metadatas
 
 
 def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
@@ -22,7 +24,7 @@ def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
         json_data = json.load(f)
         for dataset in json_data["results"]:
             dataset_metadata = extact_eidc_metadata_fields(dataset)
-            data.append(dataset_metadata)
+            data.extend(dataset_metadata)
     return data

From 24cad77d5539b31a1b374b3695f5531a95d19793 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 14:52:22 +0100
Subject: [PATCH 09/28] Added chunking to dvc pipeline

---
 data/.gitignore       |  1 +
 dvc.lock              | 32 ++++++++++++++++++++++++
 dvc.yaml              |  9 ++++++-
 scripts/chunk_data.py | 57 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 scripts/chunk_data.py

diff --git a/data/.gitignore b/data/.gitignore
index 88737f3..e675951 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -4,3 +4,4 @@
 /prepared_data.json
 /prepared_eidc_metadata.json
 /extracted_metadata.json
+/chunked_data.json
diff --git a/dvc.lock b/dvc.lock
index bf2dea4..a69544a 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -28,3 +28,35 @@ stages:
       hash: md5
       md5: 7d2ae8d6a41a960592f30496eb498af7
       size: 4578493
+  extract-metadata:
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
+    - path: scripts/extract_metadata.py
+      hash: md5
+      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
+      size: 1296
+    outs:
+    - path: data/extracted_metadata.json
+      hash: md5
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
+  chunk-data:
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    deps:
+    - path: data/extracted_metadata.json
+      hash: md5
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
+    - path: scripts/chunk_data.py
+      hash: md5
+      md5: b89a3ae9f6f9a0142149e70dc6fc5735
+      size: 1903
+    outs:
+    - path: data/chunked_data.json
+      hash: md5
+      md5: 7ba3d3785db066283e35d654e11cf28b
+      size: 6373503
diff --git a/dvc.yaml b/dvc.yaml
index 517a69d..bf00465 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -5,10 +5,17 @@ stages:
     - scripts/fetch_eidc_metadata.py
     outs:
     - data/eidc_metadata.json
-  prepare:
+  extract-metadata:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - data/eidc_metadata.json
     - scripts/extract_metadata.py
     outs:
     - data/extracted_metadata.json
+  chunk-data:
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    deps:
+    - data/extracted_metadata.json
+    - scripts/chunk_data.py
+    outs:
+    - data/chunked_data.json
\ No newline at end of file
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
new file mode 100644
index 0000000..cdec705
--- /dev/null
+++ b/scripts/chunk_data.py
@@ -0,0 +1,57 @@
+from typing import List, Dict
+import json
+from argparse import ArgumentParser
+
+
+def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]:
+    chunks = []
+    start = 0
+    while start < len(value):
+        chunks.append(value[start : (start + chunk_size)])
+        start += chunk_size - overlap
+    return chunks
+
+
+def chunk_metadata_value(metada_value, chunk_size, overlap):
+    chunks = chunk_value(metada_value["value"], chunk_size, overlap)
+    return [
+        {
+            "chunk": chunks[i],
+            "field": metada_value["field"],
+            "id": metada_value["id"],
+            "index": i,
+        }
+        for i in range(len(chunks))
+    ]
+
+
+def chunk_metadata_file(file: str, chunk_size: int, overlap: int) -> List[Dict[str, str]]:
+    chunked_metadata = []
+    with open(file) as f:
+        json_data = json.load(f)
+        for metadata in json_data:
+            chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+    return chunked_metadata
+
+
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+    all_chunked_metadata = []
+    for file in files:
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+    with open(ouput_file, "w") as f:
+        json.dump(all_chunked_metadata, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input_files", nargs="+", help="List of files to chunk.")
+    parser.add_argument("-o", "--output", help="The file to write the output to.")
+    parser.add_argument(
+        "-c", "--chunk", help="Desired chunk size in characters.", type=int
+    )
+    parser.add_argument(
+        "-ol", "--overlap", help="Chunk overlap in characters.", type=int
+    )
+    args = parser.parse_args()
+    assert args.chunk > args.overlap
+    main(args.input_files, args.output, args.chunk, args.overlap)

From 40710525cec556eb4cc644e0389eb6d4d0d9bee8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Tue, 15 Oct 2024 15:35:55 +0100
Subject: [PATCH 10/28] Added pipeline step for embeddings and parameter
 options

---
 data/.gitignore                |  2 ++
 dvc.lock                       | 45 +++++++++++++++++++++++----------
 dvc.yaml                       | 23 ++++++++++-------
 params.yaml                    |  9 +++++++
 scripts/chunk_data.py          | 49 +++++++++++++++++++++++++++++-----
 scripts/create_embeddings.py   | 16 +++++++-----
 scripts/fetch_eidc_metadata.py |  2 +-
 scripts/upload_to_docstore.py  |  6 +++++
 8 files changed, 117 insertions(+), 35 deletions(-)
 create mode 100644 params.yaml
 create mode 100644 scripts/upload_to_docstore.py

diff --git a/data/.gitignore b/data/.gitignore
index e675951..133b354 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -5,3 +5,5 @@
 /prepared_eidc_metadata.json
 /extracted_metadata.json
 /chunked_data.json
+/chunked_embeddings.json
+/embeddings.json
diff --git a/dvc.lock b/dvc.lock
index a69544a..e537716 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,13 +5,13 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 43a63d91a3d66caa03738a000c841406
+      md5: ba838a284da239217d0464f08e0a45ce
       size: 674
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 423dc3a61ede72e1d5c818d74277c0b4
-      size: 12140491
+      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
+      size: 12146216
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -26,15 +26,15 @@ stages:
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
   extract-metadata:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 423dc3a61ede72e1d5c818d74277c0b4
-      size: 12140491
+      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
+      size: 12146216
@@ -42,21 +42,38 @@ stages:
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
+      10 data/extracted_metadata.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
     - path: scripts/chunk_data.py
       hash: md5
-      md5: b89a3ae9f6f9a0142149e70dc6fc5735
-      size: 1903
+      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
+      size: 2509
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: 7ba3d3785db066283e35d654e11cf28b
-      size: 6373503
+      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
+      size: 14947
+  create-embeddings:
+    cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
+    deps:
+    - path: data/chunked_data.json
+      hash: md5
+      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
+      size: 14947
+    - path: scripts/create_embeddings.py
+      hash: md5
+      md5: 3dc6ef284730398375a13df4bff41846
+      size: 808
+    outs:
+    - path: data/embeddings.json
+      hash: md5
+      md5: b08299369d1f243eb8d8ffa2cdb9a90f
+      size: 351126
diff --git a/dvc.yaml b/dvc.yaml
index bf00465..f1f20cc 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,21 +1,28 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
-    - data/eidc_metadata.json
+    - ${files.metadata}
   extract-metadata:
-    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
+    cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
     deps:
-    - data/eidc_metadata.json
+    - ${files.metadata}
     - scripts/extract_metadata.py
     outs:
-    - data/extracted_metadata.json
+    - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
     deps:
-    - data/extracted_metadata.json
+    - ${files.extracted}
     - scripts/chunk_data.py
     outs:
-    - data/chunked_data.json
\ No newline at end of file
+    - ${files.chunked}
+  create-embeddings:
+    cmd: python scripts/create_embeddings.py ${files.chunked} ${files.embeddings}
+    deps:
+    - ${files.chunked}
+    - scripts/create_embeddings.py
+    outs:
+    - ${files.embeddings}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 0000000..d079be0
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,9 @@
+hp:
+  chunk-size: 300
+  overlap: 100
+files:
+  metadata: "data/eidc_metadata.json"
+  extracted: "data/extracted_metadata.json"
+  chunked: "data/chunked_data.json"
+  embeddings: "data/embeddings.json"
+sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index cdec705..ace111d 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -25,19 +25,29 @@ def chunk_metadata_value(metada_value, chunk_size, overlap):
     ]
 
 
-def chunk_metadata_file(file: str, chunk_size: int, overlap: int) -> List[Dict[str, str]]:
+def chunk_metadata_file(
+    file: str, chunk_size: int, overlap: int, sample_size: int
+) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
+        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+            count += 1
+            if count == sample_size:
+                break
     return chunked_metadata
 
 
-def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+def main(
+    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
+) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+        all_chunked_metadata.extend(
+            chunk_metadata_file(file, chunk_size, overlap, sample_size)
+        )
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
@@ -45,13 +55,38 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
 if __name__ == "__main__":
     parser = ArgumentParser("prepare_data.py")
     parser.add_argument("input_files", nargs="+", help="List of files to chunk.")
-    parser.add_argument("-o", "--output", help="The file to write the output to.")
     parser.add_argument(
-        "-c", "--chunk", help="Desired chunk size in characters.", type=int
+        "-o",
+        "--output",
+        help="The json file to write the output to.",
+        type=str,
+        nargs="?",
+        const="chunk_data_output.json",
     )
     parser.add_argument(
-        "-ol", "--overlap", help="Chunk overlap in characters.", type=int
+        "-c",
+        "--chunk",
+        help="Desired chunk size in characters.",
+        type=int,
+        nargs="?",
+        const=300,
+    )
+    parser.add_argument(
+        "-ol",
+        "--overlap",
+        help="Chunk overlap in characters.",
+        type=int,
+        nargs="?",
+        const=100,
+    )
+    parser.add_argument(
+        "-s",
+        "--sample",
+        help="Only generate chunks for n datasets",
+        type=int,
+        nargs="?",
+        const=0,
     )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap)
+    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index ccaae20..ce1c37b 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -1,18 +1,24 @@
 import json
 from sentence_transformers import SentenceTransformer
+from argparse import ArgumentParser
+from tqdm import tqdm
 
 def create_embedding(text):
     model = SentenceTransformer("all-MiniLM-L6-v2")
     return model.encode(text)
 
 
-def main():
-    with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
+def main(input_file, output_file):
+    with open(input_file) as input, open(output_file, "w") as output:
         data = json.load(input)
-        for dataset in data["datasets"]:
-            dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist()
+        for chunk in tqdm(data):
+            chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
         json.dump(data, output)
 
 
 if __name__ == "__main__":
-    main()
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input", help="The file to be used as input.")
+    parser.add_argument("output", help="The path to save the processed result.")
+    args = parser.parse_args()
+    main(args.input, args.output)
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 5ea1064..cd56b4e 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -10,7 +10,7 @@ def main(output_file: str) -> None:
         headers={"content-type": "application/json"},
         params={
             "page": 1,
-            "rows": 2000,
+            "rows": 2500,
             "term": "recordType:Dataset",
         },
     )
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
new file mode 100644
index 0000000..6e4fb85
--- /dev/null
+++ b/scripts/upload_to_docstore.py
@@ -0,0 +1,6 @@
+from argparse import ArgumentParser
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input_file", nargs="+", help="File containing chunks and embeddings to upload to document store")
+    parser.add_argument("-o", "--output", help="The file to write the output to.")
\ No newline at end of file

From ec183d3dacdf4f0fc08fb77b0ef233c0595787d3 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 16 Oct 2024 11:28:34 +0100
Subject: [PATCH 11/28] Added chroma upload to pipeline

---
 data/.gitignore               |  1 +
 dvc.lock                      | 18 ++++++++++
 dvc.yaml                      |  9 +++++-
 params.yaml                   |  2 ++
 pyproject.toml                |  1 +
 scripts/upload_to_docstore.py | 56 +++++++++++++++++++++++++++++++++--
 6 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index 133b354..c3f2331 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -7,3 +7,4 @@
 /chunked_data.json
 /chunked_embeddings.json
 /embeddings.json
+/chroma-data
diff --git a/dvc.lock b/dvc.lock
index e537716..27e749b 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -77,3 +77,21 @@ stages:
       hash: md5
       md5: b08299369d1f243eb8d8ffa2cdb9a90f
       size: 351126
+  upload-to-docstore:
+    cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
+      -em all-MiniLM-L6-v2
+    deps:
+    - path: data/embeddings.json
+      hash: md5
+      md5: b08299369d1f243eb8d8ffa2cdb9a90f
+      size: 351126
+    - path: scripts/upload_to_docstore.py
+      hash: md5
+      md5: ae8755770166dd3d6c1efb9f15723116
+      size: 1836
+    outs:
+    - path: data/chroma-data
+      hash: md5
+      md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
+      size: 2069220
+      nfiles: 5
diff --git a/dvc.yaml b/dvc.yaml
index f1f20cc..5446540 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -25,4 +25,11 @@ stages:
     - ${files.chunked}
     - scripts/create_embeddings.py
     outs:
-    - ${files.embeddings}
\ No newline at end of file
+    - ${files.embeddings}
+  upload-to-docstore:
+    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model}
+    deps:
+    - ${files.embeddings}
+    - scripts/upload_to_docstore.py
+    outs:
+    - ${files.doc-store}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index d079be0..812a62e 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,9 +1,11 @@
 hp:
   chunk-size: 300
   overlap: 100
+  embeddings-model: "all-MiniLM-L6-v2"
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
+  doc-store: "data/chroma-data"
 sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index eb48198..a8e8384 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "haystack-ai",
     "accelerate",
     "sentence-transformers",
+    "chromadb",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
index 6e4fb85..4f2e8af 100644
--- a/scripts/upload_to_docstore.py
+++ b/scripts/upload_to_docstore.py
@@ -1,6 +1,58 @@
 from argparse import ArgumentParser
+import json
+import uuid
+
+import chromadb
+from chromadb.utils import embedding_functions
+
+
+def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
+    print(collection_name)
+    with open(input_file) as f:
+        json_data = json.load(f)
+
+    docs = [chunk["chunk"] for chunk in json_data]
+    metas = [
+        {field: chunk[field] for field in ["field", "id", "index"]}
+        for chunk in json_data
+    ]
+    embs = [chunk["embedding"] for chunk in json_data]
+    ids = [uuid.uuid4().hex for _ in json_data]
+
+    func = embedding_functions.SentenceTransformerEmbeddingFunction(
+        model_name=embedding_model
+    )
+
+    client = chromadb.PersistentClient(output_path)
+    collection = client.create_collection(
+        name=collection_name, embedding_function=func
+    )
+    collection.add(documents=docs, metadatas=metas, embeddings=embs, ids=ids)
+
 
 if __name__ == "__main__":
     parser = ArgumentParser("prepare_data.py")
-    parser.add_argument("input_file", nargs="+", help="File containing chunks and embeddings to upload to document store")
-    parser.add_argument("-o", "--output", help="The file to write the output to.")
\ No newline at end of file
+    parser.add_argument(
+        "input_file",
+        help="File containing chunks and embeddings to upload to document store",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="The file to write the output to.",
+        default="data/chroma-data",
+    )
+    parser.add_argument(
+        "-c",
+        "--collection",
+        help="Collection name to use in doc store.",
+        default="eidc-data",
+    )
+    parser.add_argument(
+        "-em",
+        "--embedding_model",
+        help="Embedding model to use in the doc store (must be the same as the function used to create embeddings.)",
+        default="all-MiniLM-L6-v2",
+    )
+    args = parser.parse_args()
+    main(args.input_file, args.output, args.collection, args.embedding_model)

From aaa3a93ef06c13590c6b3c9f6425a5eeafd7f012 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 16 Oct 2024 15:25:52 +0100
Subject: [PATCH 12/28] Added script for running rag pipeline on eval datasets

---
 dvc.yaml                    |  5 +-
 params.yaml                 | 14 +++++-
 pyproject.toml              |  2 +
 scripts/run_rag_pipeline.py | 99 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/dvc.yaml b/dvc.yaml
index 5446540..ab5a76f 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -32,4 +32,7 @@ stages:
     - ${files.embeddings}
     - scripts/upload_to_docstore.py
     outs:
-    - ${files.doc-store}
\ No newline at end of file
+    - ${files.doc-store}
+  run-rag-pipeline:
+    cmd python scripts/run_rag_pipeline.py ${files.test-set}
+    
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index 812a62e..a4792b9 100644
--- a/params.yaml
+++ b/params.yaml
@@ -8,4 +8,16 @@ files:
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
   doc-store: "data/chroma-data"
-sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
+  test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
+sample-size: 10 # sample size of 0 will process all data
+rag:
+  model: llama3.1
+  prompt: >
+    You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n
+    Do not use your own knowledge to answer the question.\n
+    The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n
+    Do not refer to "context" in your answer, instead refer to the context as available information.
+    If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n
+    Question: {{query}}\n
+    Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %}
+    Answer:
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a8e8384..f86faac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,8 @@ dependencies = [
     "accelerate",
     "sentence-transformers",
     "chromadb",
+    "ollama-haystack == 0.0.7",
+    "chroma-haystack",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index c7f9258..16fab98 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -1 +1,98 @@
-# Generate RAG responses
\ No newline at end of file
+from argparse import ArgumentParser
+from haystack import Pipeline
+from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
+from haystack.components.builders import PromptBuilder
+from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
+from haystack.components.builders.answer_builder import AnswerBuilder
+import pandas as pd
+
+
+def build_rag_pipeline(model_name: str) -> Pipeline:
+    document_store = ChromaDocumentStore(
+        collection_name="eidc-data", persist_path="data/chroma-data"
+    )
+    retriever = ChromaQueryTextRetriever(document_store, top_k=3)
+    print("Creating prompt template...")
+
+    template = """
+    Given the following information, answer the question.
+
+    Question: {{query}}
+
+    Context:
+    {% for document in documents %}
+    {{ document.content }}
+    {% endfor %}
+
+    Answer:
+    """
+
+    prompt_builder = PromptBuilder(template=template)
+
+    model_name = "llama3.1"
+
+    print(f"Setting up model ({model_name})...")
+    llm = OllamaGenerator(
+        model=model_name,
+        generation_kwargs={"num_ctx": 16384},
+        url="http://localhost:11434/api/generate",
+    )
+
+    answer_builder = AnswerBuilder()
+
+    rag_pipe = Pipeline()
+
+    rag_pipe.add_component("retriever", retriever)
+    rag_pipe.add_component("prompt_builder", prompt_builder)
+    rag_pipe.add_component("llm", llm)
+    rag_pipe.add_component("answer_builder", answer_builder)
+
+    rag_pipe.connect("retriever.documents", "prompt_builder.documents")
+    rag_pipe.connect("retriever.documents", "answer_builder.documents")
+
+    rag_pipe.connect("prompt_builder", "llm")
+
+    rag_pipe.connect("llm.replies", "answer_builder.replies")
+    return rag_pipe
+
+
+def query_pipeline(query: str, pipeline: Pipeline):
+    return pipeline.run(
+        {
+            "retriever": {"query": query},
+            "prompt_builder": {"query": query},
+            "answer_builder": {"query": query},
+        }
+    )
+
+
+def main(test_data_file: str):
+    rag_pipe = build_rag_pipeline("llama3.1")
+
+    df = pd.read_csv(test_data_file)
+    responses = []
+    for q in df["question"]:
+        responses.append(query_pipeline(q, rag_pipe))
+    df["rag_response"] = responses
+    df.to_csv("data/rag_response.csv")
+
+    query = "Who collected the land cover map data?"
+    result = rag_pipe.run(
+        {
+            "retriever": {"query": query},
+            "prompt_builder": {"query": query},
+            "answer_builder": {"query": query},
+        }
+    )
+    print(result)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("run_rag_pipeline.py")
+    parser.add_argument(
+        "test_data_file",
+        help="File containing test queries to generate response from the RAG pipeline.",
+    )
+    args = parser.parse_args()
+    main(args.test_data_file)

From f9b9b3eb80d2abd44d1e4778242e51ff03c7b5d8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 17 Oct 2024 10:31:04 +0100
Subject: [PATCH 13/28] Added dummy steps to pipeline

---
 data/.gitignore             |  3 +++
 data/synthetic-datasets.dvc |  6 ++---
 dvc.lock                    | 46 +++++++++++++++++++++++++++++++++++++
 dvc.yaml                    | 22 ++++++++++++++++--
 params.yaml                 |  4 +++-
 pyproject.toml              |  2 ++
 scripts/run_rag_pipeline.py | 42 ++++++++++++++++++---------------
 7 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index c3f2331..b90999a 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -8,3 +8,6 @@
 /chunked_embeddings.json
 /embeddings.json
 /chroma-data
+/evaluation_data.csv
+/eidc_rag_test_sample.csv
+/supporting-docs.json
diff --git a/data/synthetic-datasets.dvc b/data/synthetic-datasets.dvc
index dc27bb2..cd53100 100644
--- a/data/synthetic-datasets.dvc
+++ b/data/synthetic-datasets.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 61b4177259b03a7227784b5b7560726d.dir
-  size: 144597
-  nfiles: 1
+- md5: 9d87c638c5cc518ea360c474c4e1e9ef.dir
+  size: 152121
+  nfiles: 2
   hash: md5
   path: synthetic-datasets
diff --git a/dvc.lock b/dvc.lock
index 27e749b..3fb8862 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -95,3 +95,49 @@ stages:
       md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
       size: 2069220
      nfiles: 5
+  run-rag-pipeline:
+    cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
+    deps:
+    - path: data/chroma-data
+      hash: md5
+      md5: 1d7c499f71791267391ff4108632988c.dir
+      size: 2069220
+      nfiles: 5
+    - path: data/eidc_rag_test_sample.csv
+      hash: md5
+      md5: a371d83c5822d256286e80d64d58c3fe
+      size: 7524
+    - path: scripts/run_rag_pipeline.py
+      hash: md5
+      md5: 6d1f49fa8b22288ecd50ed0e3898fd60
+      size: 3153
+    outs:
+    - path: data/evaluation_data.csv
+      hash: md5
+      md5: e313cb899c10a2b5ad670b8bc84d059f
+      size: 8407
+  generate-testset:
+    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
+    outs:
+    - path: data/eidc_rag_test_sample.csv
+      hash: md5
+      md5: a371d83c5822d256286e80d64d58c3fe
+      size: 7524
+  fetch-supporting-docs:
+    cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
+    outs:
+    - path: data/supporting-docs.json
+      hash: md5
+      md5: 0febface6f1d23fda46c11bef65284f4
+      size: 34
+  evaluate:
+    cmd: echo "Evaluate responses"
+    deps:
+    - path: data/evaluation_data.csv
+      hash: md5
+      md5: e313cb899c10a2b5ad670b8bc84d059f
+      size: 8407
diff --git a/dvc.yaml b/dvc.yaml
index ab5a76f..2028fa4 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -5,6 +5,10 @@ stages:
     - scripts/fetch_eidc_metadata.py
     outs:
     - ${files.metadata}
+  fetch-supporting-docs:
+    cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
+    outs:
+    - ${files.supporting-docs}
   extract-metadata:
     cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
     deps:
@@ -16,6 +20,7 @@ stages:
     cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
     deps:
     - ${files.extracted}
+    - ${files.supporting-docs}
     - scripts/chunk_data.py
     outs:
     - ${files.chunked}
@@ -33,6 +38,19 @@ stages:
     - scripts/upload_to_docstore.py
     outs:
     - ${files.doc-store}
+  generate-testset:
+    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
+    outs:
+    - ${files.test-set}
   run-rag-pipeline:
-    cmd python scripts/run_rag_pipeline.py ${files.test-set}
-    
+    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
+    deps:
+    - ${files.test-set}
+    - ${files.doc-store}
+    - scripts/run_rag_pipeline.py
+    outs:
+    - ${files.eval-set}
+  evaluate:
+    cmd: echo "Evaluate responses"
+    deps:
+    - ${files.eval-set}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index a4792b9..2f5354f 100644
--- a/params.yaml
+++ b/params.yaml
@@ -5,10 +5,12 @@ hp:
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
+  supporting-docs: "data/supporting-docs.json"
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
   doc-store: "data/chroma-data"
-  test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
+  test-set: "data/eidc_rag_test_sample.csv"
+  eval-set: "data/evaluation_data.csv"
 sample-size: 10 # sample size of 0 will process all data
 rag:
   model: llama3.1
diff --git a/pyproject.toml b/pyproject.toml
index f86faac..454973a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,8 @@ dependencies = [
     "chromadb",
     "ollama-haystack == 0.0.7",
     "chroma-haystack",
+    "ragas == 0.1.10",
+    "nltk"
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index 16fab98..830a052 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -57,7 +57,7 @@ def build_rag_pipeline(model_name: str) -> Pipeline:
     return rag_pipe
 
 
-def query_pipeline(query: str, pipeline: Pipeline):
+def run_query(query: str, pipeline: Pipeline):
     return pipeline.run(
         {
             "retriever": {"query": query},
@@ -67,25 +67,27 @@ def query_pipeline(query: str, pipeline: Pipeline):
     )
 
 
-def main(test_data_file: str):
+def query_pipeline(questions, rag_pipe):
+    answers = []
+    contexts = []
+    for q in questions:
+        response = run_query(q, rag_pipe)
+        answers.append(response["answer_builder"]["answers"][0].data)
+        contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+    return answers, contexts
+
+
+def main(test_data_file: str, ouput_file: str):
     rag_pipe = build_rag_pipeline("llama3.1")
 
     df = pd.read_csv(test_data_file)
-    responses = []
-    for q in df["question"]:
-        responses.append(query_pipeline(q, rag_pipe))
-    df["rag_response"] = responses
-    df.to_csv("data/rag_response.csv")
-
-    query = "Who collected the land cover map data?"
-    result = rag_pipe.run(
-        {
-            "retriever": {"query": query},
-            "prompt_builder": {"query": query},
-            "answer_builder": {"query": query},
-        }
-    )
-    print(result)
+    df.drop(columns=["rating", "contexts"], inplace=True)
+
+    answers, contexts = query_pipeline(df["question"], rag_pipe)
+
+    df["answer"] = answers
+    df["contexts"] = contexts
+    df.to_csv(ouput_file, index=False)
 
 
 if __name__ == "__main__":
@@ -94,5 +96,9 @@ def main(test_data_file: str):
         "test_data_file",
         help="File containing test queries to generate response from the RAG pipeline.",
     )
+    parser.add_argument(
+        "output_file",
+        help="File to output results to.",
+    )
     args = parser.parse_args()
-    main(args.test_data_file)
+    main(args.test_data_file, args.output_file)

From 419e83f790e2f36164185ce9ce6d441d4800004a Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 17 Oct 2024 14:44:57 +0100
Subject: [PATCH 14/28] Added evaluation script to pipeline

---
 data/.gitignore            |   2 +
 dvc.lock                   |  26 +++++--
 dvc.yaml                   |   8 ++-
 notebooks/ragas_eval.ipynb | 143 ++++++++++++++++++++++++++++---------
 params.yaml                |   2 +
 pyproject.toml             |   3 +-
 scripts/evaluate.py        |  78 +++++++++++++++++++-
 7 files changed, 220 insertions(+), 42 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index b90999a..09fbf7e 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -11,3 +11,5 @@
 /evaluation_data.csv
 /eidc_rag_test_sample.csv
 /supporting-docs.json
+/metrics.json
+/eval.png
diff --git a/dvc.lock b/dvc.lock
index 3fb8862..8b454b4 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -104,7 +104,7 @@ stages:
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 1d7c499f71791267391ff4108632988c.dir
+      md5: 0254e85bb660da611cfa14e5221dae92.dir
       size: 2069220
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
@@ -118,8 +118,8 @@ stages:
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,9 +133,23 @@ stages:
       md5: 0febface6f1d23fda46c11bef65284f4
       size: 34
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
+      -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
+    - path: scripts/evaluate.py
+      hash: md5
+      md5: 51f036b805f23dd3ebfd5d819bc9d457
+      size: 2489
+    outs:
+    - path: data/eval.png
+      hash: md5
+      md5: 8c11f987449f8718b6f6011078b6c259
+      size: 49498
+    - path: data/metrics.json
+      hash: md5
+      md5: 53fba29cb236fedd3c6446ea94fea3cc
+      size: 215
diff --git a/dvc.yaml b/dvc.yaml
index 2028fa4..fe6f0ea 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -51,6 +51,10 @@ stages:
     outs:
     - ${files.eval-set}
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot}
     deps:
-    - ${files.eval-set}
\ No newline at end of file
+    - ${files.eval-set}
+    - scripts/evaluate.py
+    outs:
+    - ${files.metrics}
+    - ${files.eval-plot}
\ No newline at end of file
diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb
index b1e39b7..53d862b 100644
--- a/notebooks/ragas_eval.ipynb
+++ b/notebooks/ragas_eval.ipynb
@@ -18,7 +18,21 @@ "output_type": "stream",
     "text": [
      "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-     " from .autonotebook import tqdm as notebook_tqdm\n"
+     " from .autonotebook import tqdm as notebook_tqdm\n",
+     "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/metrics/__init__.py:1: LangChainDeprecationWarning: As of langchain-core 0.3.0, LangChain uses pydantic v2 internally. The langchain_core.pydantic_v1 module was a compatibility shim for pydantic v1, and should no longer be used. Please update the code to import from Pydantic directly.\n",
+     "\n",
+     "For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`\n",
+     "with: `from pydantic import BaseModel`\n",
+     "or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. \tfrom pydantic.v1 import BaseModel\n",
+     "\n",
+     " from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness\n",
+     "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/metrics/__init__.py:4: LangChainDeprecationWarning: As of langchain-core 0.3.0, LangChain uses pydantic v2 internally. The langchain.pydantic_v1 module was a compatibility shim for pydantic v1, and should no longer be used. Please update the code to import from Pydantic directly.\n",
+     "\n",
+     "For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`\n",
+     "with: `from pydantic import BaseModel`\n",
+     "or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. \tfrom pydantic.v1 import BaseModel\n",
+     "\n",
+     " from ragas.metrics._context_entities_recall import (\n"
     ]
    }
   ],
@@ -52,17 +66,61 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 31,
     "metadata": {},
     "outputs": [],
     "source": [
-    "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\")\n",
+    "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval})\n",
     "eval_dataset = Dataset.from_pandas(df)"
    ]
   },
   {
    "cell_type": "code",
-    "execution_count": 7,
+    "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['The dataset entitled \"Snow Survey of Great Britain: transcribed data for Scotland, 1945 to 2007\" contains the following information in it\'s \"description\" metadata field: This dataset comprises observations of snowline from the Snow Survey of Great Britain (SSGB) at 140 sites across Scotland . Daily observations were made between 1945 and 2007. Observations were made by a ground observer who looked out from a given location at 0900 GMT each day and noted the elevation at which snow cover was greater than 50%. \n\nThe initial aim was to \'secure representative data relating to the occurrence of snow cover at different altitudes in the various upland districts over the period October to June\'. \n\nThe data were collated by the British Glaciological Society until 1954 and thereafter by the Met Office. It has been transcribed from paper records held in the Met Office archives in Edinburgh.',\n",
+       " 'The dataset entitled \"Global Navigation Satellite System (GNSS) survey of Ciste Mhearad snow patch perimeter, Cairngorm, Scotland, 2023\" contains the following information in it\'s \"description\" metadata field: This dataset contains geographic locations, including the horizontal and vertical position, of the perimeter of the Ciste Mhearad snow patch on Cairngorm for three dates in the summer of 2023. Points on the perimeter were located using two Global Navigation Satellite System (GNSS) receivers as base and roving stations during visits on 19 June, 27 July and 28 July 2023.',\n",
+       " 'The dataset entitled \"Snow water equivalent estimates using cosmic-ray neutron sensors in the United Kingdom (2014-2019)\" contains the following information in it\'s \"description\" metadata field: This dataset provides daily estimates of the Snow Water Equivalent (SWE) using data from 46 COSMOS-UK sites across the United Kingdom. One set of estimates is derived from the cosmic ray neutron sensor and provides an estimate of the average SWE within the sensor’s large (>100m) footprint. Other SWE estimates are based on either a snowmelt model, or, for certain sites, either a snow depth sensor or a buried \'SnowFox\' neutron sensor. Additionally, daily neutron counts, the albedo, and a collection of figures for each snow event are provided.',\n",
+       " 'The dataset entitled \"Net ecosystem carbon dioxide (CO2) exchange and meteorological observations from an eroding high altitude blanket bog, Scotland, 2018-2020\" contains the following information in it\'s \"description\" metadata field: This record contains time series observations of land surface-atmosphere exchanges of net ecosystem carbon dioxide exchange (NEE), sensible heat (H) and latent heat (LE), and meteorological observations measured at an eroded upland blanket bog peatland (UK-BAL) in the Eastern Cairngorms in Scotland, UK (56.93° N, -3.16° E, 642 m asl). The dataset comprises eddy covariance CO2, water and energy fluxes, originally collected at 20Hz and processed to 30-minute data, as well as accompanying meteorological observations, originally collected at 15 min and processed to 30-minute data. Time period covered in this dataset is 04/07/2018 until 04/11/2020.',\n",
+       " 'The dataset entitled \"Loch Leven Waterfowl 1968-2007\" contains the following information in it\'s \"description\" metadata field: The dataset comprises counts of ten waterfowl species collected from Loch Leven from 1968-2007 by staff at Scottish Natural Heritage (SNH) and its predecessor bodies (Nature Conservancy), as part of their long-term monitoring programme of the lake. Counts are for the whole loch and represent an annual peak count based on the monthly counts recorded from September (of the year indicated) through to March (of the following year). '],\n",
+       " ['The dataset entitled \"Diet, timing of egg laying and breeding success data for Isle of May European shag population 1985-2015\" contains the following information in it\'s \"description\" metadata field: Data on timing of breeding, breeding success and diet of the European shag, sampled from the Isle of May population. The data were collected between 1985 and 2015 by visually checking nests and collecting regurgitated diet samples. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study).',\n",
+       " 'The dataset entitled \"Behaviour, diet, condition and demography data for common guillemots from the Isle of May, 1982-2019\" contains the following information in it\'s \"description\" metadata field: This dataset contains information on the parental behaviour, diet, condition and demography of common guillemots on the Isle of May, south-east Scotland. Annual data are available for 1982 to 2019 inclusive. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study).',\n",
+       " 'The dataset entitled \"Breeding success, population size, and site quality data for a population of common guillemots (Uria aalge) on the Isle of May, Scotland, 1981-2018\" contains the following information in it\'s \"description\" metadata field: This dataset contains information on the breeding outcome, breeding site occupancy, and breeding site quality for a sample of common guillemots breeding on the Isle of May, Scotland. Data is available for all attributes from 1981-2018. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study). ',\n",
+       " 'The dataset entitled \"The Isle of May long-term study (IMLOTS) seabird annual breeding success 1982-2016\" contains the following information in it\'s \"description\" metadata field: This dataset contains calculated breeding success rates for six seabird species from representative colonies on the Isle of May, off the East coast of Scotland. Annual breeding success has been measured as the number of chicks fledged per active nest for the Atlantic puffin (Fratercula arctica, since 1982), common guillemot (Uria aalge, since 1982), razorbill (Alca torda, since 1982), European shag (Phalacrocorax aristotelis, since 1987), black-legged kittiwake (Rissa tridactyla, since 1987) and northern fulmar (Fulmarus glacialis, since 1987). The number of active nests recorded are also provided. Data were collected as part of the Isle of May long-term study (IMLOTS), which aims to identify the impact of environmental change on seabirds and their associated ecosystems. This monitoring has been ongoing since 1974, by essentially the same team of scientists, using the same well-documented methods throughout this time.',\n",
+       " 'The dataset entitled \"The Isle of May long-term study (IMLOTS) seabird annual breeding success 1982-2012\" contains the following information in it\'s \"description\" metadata field: This dataset contains calculated breeding success rates for six seabird species from representative colonies on the Isle of May, off the East coast of Scotland. Annual breeding success has been measured as the number of chicks fledged per active nest for the Atlantic puffin (Fratercula arctica, since 1982), common guillemot (Uria aalge, since 1982), razorbill (Alca torda, since 1982), European shag (Phalacrocorax aristotelis, since 1987), black-legged kittiwake (Rissa tridactyla, since 1987) and northern fulmar (Fulmarus glacialis, since 1987). The number of active nests recorded are also provided. Data were collected as part of the Isle of May long-term study (IMLOTS), which aims to identify the impact of environmental change on seabirds and their associated ecosystems. This monitoring has been ongoing since 1974, by essentially the same team of scientists, using the same well-documented methods throughout this time.'],\n",
+       " ['The dataset entitled \"Land Cover Map 2020 (land parcels, N. Ireland)\" contains the following information in it\'s \"description\" metadata field: This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland\'s land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\n\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020. These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\n\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. ',\n",
+       " 'The dataset entitled \"Land Cover Map 2019 (land parcels, N. Ireland)\" contains the following information in it\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2019 (LCM2019) representing Northern Ireland. It describes Northern Ireland\'s land cover in 2019 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2019 20m classified pixels dataset. All further LCM2019 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\n\nLCM2019 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2019. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2019. LCM2019 was simultaneously released with LCM2017 and LCM2018.
These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2017 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2017 20m classified pixels dataset. All further LCM2017 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2018 20m classified pixels dataset. All further LCM2018 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. 
These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (25m rasterised land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the 25m rasterised land parcels dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived by rasterising the corresponding LCM2018 land parcels dataset into 25m pixels. It is provided as a 3-band, 8-bit integer raster. The first band is the UKCEH Land Cover Class identifier. Bands 2 and 3 are indicators of classification confidence. For a fuller description please refer to the product documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Northern Ireland (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.'],\n", + " ['The dataset entitled \"Land Cover Map 2017 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2017 20m classified pixels dataset. All further LCM2017 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (land parcels, N. 
Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2018 20m classified pixels dataset. All further LCM2018 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2019 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2019 (LCM2019) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2019 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2019 20m classified pixels dataset. All further LCM2019 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2019 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2019. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2019. LCM2019 was simultaneously released with LCM2017 and LCM2018. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2017 (25m rasterised land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the 25m rasterised land parcels dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. 
This dataset was derived by rasterising the corresponding LCM2017 land parcels dataset into 25m pixels. It is provided as a 3-band, 8-bit integer raster. The first band is the UKCEH Land Cover Class identifier. Bands 2 and 3 are indicators of classification confidence. For a fuller description please refer to the product documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Northern Ireland (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2020 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\\n\\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020. These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. '],\n", + " ['The dataset entitled \"Ammonia measurements from passive samplers at Fenn\\'s, Whixall, Wem & Cadney Mosses SAC (2018)\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at three sites on Fenn’s, Whixall, Bettisfield, Wem and Cadney Mosses SSSI on the border of Wrexham County Borough (North Wales) and Shropshire (West Midlands). The ammonia measurements are taken from a set of ALPHA (R) (Adapted Low-cost Passive High Absorption) samplers from July to December\\xa0in the year 2018. The sites were established in order to monitor ammonia during implementation of Site Nitrogen Action Plan (SNAP), as part of the Marches Mosses BogLIFE project. 
This project aims to restore Britain\\'s third largest lowland raised bog within the Fenn’s, Whixall & Bettisfield Mosses and Wem Moss National Nature Reserves near Whitchurch, Shropshire and Wrexham in Wales.',\n", + " 'The dataset entitled \" Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2021\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area. The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2021 to December 2021. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \"Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire (2017-2018)\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside in the hall and the other is outside in the garden area . The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2017 to November 2018. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \"Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2019-2020\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area . The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from November 2018 to January 2021. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \" Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2022\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area. The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2022 to December 2022. 
Samplers are exposed in monthly cycles at the beginning of each month.']]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_dataset[\"contexts\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -98,35 +156,35 @@ " What was the frequency of snowline observation...\n", " The frequency of snowline observations made da...\n", " The available information does not provide a c...\n", - " ['The dataset entitled \"Snow Survey of Great B...\n", + " [The dataset entitled \"Snow Survey of Great Br...\n", " \n", " \n", " 1\n", " What was the primary focus of studying the Eur...\n", " The primary focus of studying the European sha...\n", " The available information does not clearly sta...\n", - " ['The dataset entitled \"Diet, timing of egg la...\n", + " [The dataset entitled \"Diet, timing of egg lay...\n", " \n", " \n", " 2\n", " What are the UKCEH Land Cover Classes used to ...\n", " The UKCEH Land Cover Classes used to describe ...\n", " The UKCEH Land Cover Classes used to describe ...\n", - " ['The dataset entitled \"Land Cover Map 2020 (l...\n", + " [The dataset entitled \"Land Cover Map 2020 (la...\n", " \n", " \n", " 3\n", " What method was used to classify the pixels in...\n", " The Random Forest classification method was us...\n", " Based on the available information, it appears...\n", - " ['The dataset entitled \"Land Cover Map 2017 (l...\n", + " [The dataset entitled \"Land Cover Map 2017 (la...\n", " \n", " \n", " 4\n", " What were the specific locations where the exp...\n", " The answer to given question is not present in...\n", " Based on the available information, it does no...\n", - " ['The dataset entitled \"Ammonia measurements f...\n", + " [The dataset entitled \"Ammonia measurements fr...\n", " \n", " \n", "\n", @@ -155,14 +213,14 @@ "4 Based on the available information, it does no... \n", "\n", " contexts \n", - "0 ['The dataset entitled \"Snow Survey of Great B... \n", - "1 ['The dataset entitled \"Diet, timing of egg la... \n", - "2 ['The dataset entitled \"Land Cover Map 2020 (l... \n", - "3 ['The dataset entitled \"Land Cover Map 2017 (l... \n", - "4 ['The dataset entitled \"Ammonia measurements f... " + "0 [The dataset entitled \"Snow Survey of Great Br... \n", + "1 [The dataset entitled \"Diet, timing of egg lay... \n", + "2 [The dataset entitled \"Land Cover Map 2020 (la... \n", + "3 [The dataset entitled \"Land Cover Map 2017 (la... \n", + "4 [The dataset entitled \"Ammonia measurements fr... 
" ] }, - "execution_count": 7, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -173,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -183,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -191,7 +249,6 @@ " faithfulness,\n", " answer_relevancy,\n", " context_precision,\n", - " context_utilization,\n", " context_recall,\n", " context_entity_recall,\n", " answer_similarity,\n", @@ -201,21 +258,26 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "Dataset feature \"contexts\" should be of type Sequence[string], got ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43meval_dataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mfaithfulness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_relevancy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_precision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_utilization\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_recall\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_entity_recall\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_similarity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_correctness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43mllm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mllm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43membeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_async\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mraise_exceptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mRunConfig\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m result\n", - "File \u001b[0;32m~/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/evaluation.py:157\u001b[0m, in \u001b[0;36mevaluate\u001b[0;34m(dataset, metrics, 
llm, embeddings, callbacks, in_ci, is_async, run_config, raise_exceptions, column_map)\u001b[0m\n\u001b[1;32m 155\u001b[0m dataset \u001b[38;5;241m=\u001b[39m handle_deprecated_ground_truths(dataset)\n\u001b[1;32m 156\u001b[0m validate_evaluation_modes(dataset, metrics)\n\u001b[0;32m--> 157\u001b[0m \u001b[43mvalidate_column_dtypes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# set the llm and embeddings\u001b[39;00m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(llm, LangchainLLM):\n", - "File \u001b[0;32m~/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/validation.py:56\u001b[0m, in \u001b[0;36mvalidate_column_dtypes\u001b[0;34m(ds)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column_names \u001b[38;5;129;01min\u001b[39;00m ds\u001b[38;5;241m.\u001b[39mfeatures:\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28misinstance\u001b[39m(ds\u001b[38;5;241m.\u001b[39mfeatures[column_names], Sequence)\n\u001b[1;32m 54\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m ds\u001b[38;5;241m.\u001b[39mfeatures[column_names]\u001b[38;5;241m.\u001b[39mfeature\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstring\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 55\u001b[0m ):\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDataset feature \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcolumn_names\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m should be of type\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Sequence[string], got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(ds\u001b[38;5;241m.\u001b[39mfeatures[column_names])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 59\u001b[0m )\n", - "\u001b[0;31mValueError\u001b[0m: Dataset feature \"contexts\" should be of type Sequence[string], got " + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating: 14%|█▍ | 5/35 [00:35<03:02, 6.07s/it]Failed to parse output. 
Returning None.\n", + "Evaluating: 100%|██████████| 35/35 [03:40<00:00, 6.31s/it]\n" ] + }, + { + "data": { + "text/plain": [ + "{'faithfulness': 0.6956, 'answer_relevancy': 0.1845, 'context_precision': 0.3775, 'context_recall': 0.8000, 'context_entity_recall': 0.3667, 'answer_similarity': 0.2146, 'answer_correctness': 0.0534}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -225,7 +287,6 @@ " faithfulness,\n", " answer_relevancy,\n", " context_precision,\n", - " context_utilization,\n", " context_recall,\n", " context_entity_recall,\n", " answer_similarity,\n", @@ -233,7 +294,6 @@ " ],\n", " llm=llm,\n", " embeddings=embeddings,\n", - " is_async=False,\n", " raise_exceptions=False,\n", " run_config=RunConfig(max_workers=1),\n", ")\n", @@ -245,6 +305,13 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], "source": [ "result_df = result.to_pandas()\n", "pio.templates.default = \"gridon\"\n", @@ -253,7 +320,19 @@ "for metric in metrics:\n", " fig.add_trace(go.Violin(y=result_df[metric], name=metric, points=\"all\", box_visible=True, meanline_visible=True))\n", "fig.update_yaxes(range=[-0.02,1.02])\n", - "fig.show()" + "with open(\"eval.png\", \"wb\") as f:\n", + " f.write(fig.to_image(format=\"png\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open(\"metrics.json\", \"w\") as f:\n", + " json.dump(result, f)" ] } ], diff --git a/params.yaml b/params.yaml index 2f5354f..900e48f 100644 --- a/params.yaml +++ b/params.yaml @@ -11,6 +11,8 @@ files: doc-store: "data/chroma-data" test-set: "data/eidc_rag_test_sample.csv" eval-set: "data/evaluation_data.csv" + metrics: "data/metrics.json" + eval-plot: "data/eval.png" sample-size: 10 # sample size of 0 will process all data rag: model: llama3.1 diff --git a/pyproject.toml b/pyproject.toml index 454973a..4844faf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,8 @@ dependencies = [ "ollama-haystack == 0.0.7", "chroma-haystack", "ragas == 0.1.10", - "nltk" + "nltk", + "nbformat>=4.2.0", ] [project.optional-dependencies] diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 7fcf1a7..10b3a61 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -1 +1,77 @@ -# Run RAGAS to evaluate \ No newline at end of file +from argparse import ArgumentParser +import pandas as pd +from datasets import Dataset +from ragas import evaluate +from ragas.run_config import RunConfig +from langchain_community.embeddings import OllamaEmbeddings +from langchain_community.chat_models import ChatOllama +import plotly.graph_objects as go +import plotly.io as pio +import nest_asyncio +from ragas.metrics import ( + faithfulness, + answer_relevancy, + context_precision, + context_recall, + context_entity_recall, + answer_similarity, + answer_correctness, +) +import json + +def main(eval_dataset: str, metric_output: str, image_output: str) -> None: + nest_asyncio.apply() # apply the event loop async fix + df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval}) + eval_dataset = Dataset.from_pandas(df) + llm = ChatOllama(model='mistral-nemo', num_ctx=16384) + embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384) + result = evaluate( + eval_dataset, + metrics=[ + faithfulness, + answer_relevancy, + context_precision, + context_recall, + context_entity_recall, + 
answer_similarity, + answer_correctness, + ], + llm=llm, + embeddings=embeddings, + raise_exceptions=False, + run_config=RunConfig(max_workers=1), + ) + result_df = result.to_pandas() + pio.templates.default = "gridon" + fig = go.Figure() + + + with open(metric_output, "w") as f: + json.dump(result, f) + metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] + + for metric in metrics: + fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) + fig.update_yaxes(range=[-0.02,1.02]) + with open(image_output, "wb") as f: + f.write(fig.to_image(format="png")) + + + +if __name__ == "__main__": + parser = ArgumentParser("evaluate.py") + parser.add_argument("eval_dataset", help="File containing the evaluation data.") + parser.add_argument( + "-m", + "--metrics_output", + help="File to save evaluation metrics to.", + default="data/metrics.json", + ) + parser.add_argument( + "-img", + "--image_output", + help="File to save image plot to.", + default="data/evaluation.png", + ) + args = parser.parse_args() + main(args.eval_dataset, args.metrics_output, args.image_output) From d119b007645f930c97edd191c032b286346b1aff Mon Sep 17 00:00:00 2001 From: mpc Date: Thu, 17 Oct 2024 14:59:25 +0100 Subject: [PATCH 15/28] Updated readme --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b34c1ce..588b68f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,82 @@ # llm-eval -Scripts and data for LLM evaluation. +This repository contains a reproducible workflow setup using [DVC](https://dvc.org/) backed by a [JASMIN object store](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/). Before working with the repository please contact [Matt Coole](mailto:matcoo@ceh.ac.uk) to request access to the Jasmin object store `llm-eval-o`. Then follow the instructions below. -This repository is setup to work with [DVC](https://dvc.org/) backed by a [JASMIN object store](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/). Please follow the instruction in [`dvc.md`](dvc.md) to get up and running. +## Requirements +- [Ollama](https://ollama.com/download) ([`llama3.1`](https://ollama.com/library/llama3.1) and [`mistral-nemo`](https://ollama.com/library/mistral-nemo) models) -## DVC and CML +## Getting started +First create a new virtual environment and install the required dependencies: +```shell +python -m venv .venv +source .venv/bin/activate +pip install . 
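+# Optional: the notebooks in this repo need extra dependencies; assuming you
+# want to run them too, install the `jupyter` extra defined in pyproject.toml:
+pip install '.[jupyter]'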
+``` +Next setup your local DVC configuration with your [Jasmin object store access key](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/#creating-an-access-key-and-secret): +```shell +dvc remote modify --local jasmin access_key_id '' +dvc remote modify --local jasmin secret_access_key '' +``` +Pull the data from the object store using DVC: +```shell +dvc pull +``` +You should now be ready to re-run the pipeline: +```shell +dvc repro +``` +This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the command: +```shell +dvc dag +``` +``` + +----------------+ + | fetch-metadata | + +----------------+ + * + * + * + +------------------+ +-----------------------+ + | extract-metadata | | fetch-supporting-docs | + +------------------+ +-----------------------+ + ** ** + *** *** + ** ** + +------------+ + | chunk-data | + +------------+ + * + * + * + +-------------------+ + | create-embeddings | + +-------------------+ + * + * + * ++------------------+ +--------------------+ +| generate-testset | | upload-to-docstore | ++------------------+ +--------------------+ + ** ** + *** *** + ** ** + +------------------+ + | run-rag-pipeline | + +------------------+ + * + * + * + +----------+ + | evaluate | + +----------+ +``` + +## Notes + +### DVC and CML Notes on the use of Data Version Control and Continuous Machine Learning: - [DVC](dvc.md) - [CML](cml.md) -## vLLM +### vLLM Notes on running models with vLLM: - [vLLM](vllm.md) \ No newline at end of file From ccd4e3ce832e99c0f7930eedd51793214bddbba9 Mon Sep 17 00:00:00 2001 From: mpc Date: Thu, 17 Oct 2024 15:18:30 +0100 Subject: [PATCH 16/28] Added metrics file to dvc config --- dvc.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dvc.yaml b/dvc.yaml index fe6f0ea..59a6ccc 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -57,4 +57,6 @@ stages: - scripts/evaluate.py outs: - ${files.metrics} - - ${files.eval-plot} \ No newline at end of file + - ${files.eval-plot} +metrics: +- ${files.metrics} \ No newline at end of file From 4f7ab43272ae4dd80d27fbeafa05f1e8ba415844 Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 09:51:39 +0100 Subject: [PATCH 17/28] Added ruff, mypy and cleaned scripts --- dvc.lock | 61 +++++++++++++++++----------------- dvc.yaml | 4 +-- params.yaml | 3 ++ pyproject.toml | 8 +++++ scripts/create_embeddings.py | 3 +- scripts/evaluate.py | 27 ++++++++++----- scripts/extract_metadata.py | 6 ++-- scripts/fetch_eidc_metadata.py | 1 + scripts/run_rag_pipeline.py | 36 ++++++++++++++++---- scripts/upload_to_docstore.py | 6 +++- 10 files changed, 104 insertions(+), 51 deletions(-) diff --git a/dvc.lock b/dvc.lock index 8b454b4..d143f87 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,13 +5,13 @@ stages: deps: - path: scripts/fetch_eidc_metadata.py hash: md5 - md5: ba838a284da239217d0464f08e0a45ce - size: 674 + md5: 53d620665448ef91f2deedb517e2f502 + size: 675 outs: - path: data/eidc_metadata.json hash: md5 - md5: fc2f9ebe92cbd07eb06ff6e39366fdac - size: 12146216 + md5: b4f3774a2921debb4d7740165ac604d4 + size: 12157676 prepare: cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json deps: @@ -33,25 +33,25 @@ stages: deps: - path: data/eidc_metadata.json hash: md5 - md5: fc2f9ebe92cbd07eb06ff6e39366fdac - size: 12146216 + md5: b4f3774a2921debb4d7740165ac604d4 + size: 12157676 - path: scripts/extract_metadata.py hash: md5 - md5: c2fa7d2c4b8f28a6e24536ce0df244fd - size: 1296 + md5: 3f0269a6413845f4425af55e7cea7bf8 + size: 1304 outs: - 
path: data/extracted_metadata.json hash: md5 - md5: fce18ce3c43175af1cea5d84dac9baf9 - size: 4579965 + md5: 789fda7a14f9a85c6ee0e10af8170a95 + size: 4584498 chunk-data: cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s 10 data/extracted_metadata.json deps: - path: data/extracted_metadata.json hash: md5 - md5: fce18ce3c43175af1cea5d84dac9baf9 - size: 4579965 + md5: 789fda7a14f9a85c6ee0e10af8170a95 + size: 4584498 - path: data/supporting-docs.json hash: md5 md5: 0febface6f1d23fda46c11bef65284f4 @@ -74,7 +74,7 @@ stages: size: 14947 - path: scripts/create_embeddings.py hash: md5 - md5: 3dc6ef284730398375a13df4bff41846 + md5: 4649c700dfae922b43b3608ee4f00c1a size: 808 outs: - path: data/embeddings.json @@ -83,7 +83,7 @@ stages: size: 351126 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data - -em all-MiniLM-L6-v2 + -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 @@ -91,20 +91,21 @@ stages: size: 351126 - path: scripts/upload_to_docstore.py hash: md5 - md5: ae8755770166dd3d6c1efb9f15723116 - size: 1836 + md5: 41da88e3bb6d2592bee938ce347f6983 + size: 1905 outs: - path: data/chroma-data hash: md5 - md5: 2f2ba629bf078284bb6d6be73c6166a7.dir + md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir size: 2069220 nfiles: 5 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv + data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: 0254e85bb660da611cfa14e5221dae92.dir + md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir size: 2069220 nfiles: 5 - path: data/eidc_rag_test_sample.csv @@ -113,13 +114,13 @@ stages: size: 7524 - path: scripts/run_rag_pipeline.py hash: md5 - md5: 6d1f49fa8b22288ecd50ed0e3898fd60 - size: 3153 + md5: 8d5fc0669771146562c773186f4f44f6 + size: 3667 outs: - path: data/evaluation_data.csv hash: md5 - md5: 47a0adeb2ee1cb67202048684064d30f - size: 7293 + md5: f6bce3f5c551e84da224d36201858839 + size: 6638 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -140,18 +141,18 @@ stages: deps: - path: data/evaluation_data.csv hash: md5 - md5: 47a0adeb2ee1cb67202048684064d30f - size: 7293 + md5: f6bce3f5c551e84da224d36201858839 + size: 6638 - path: scripts/evaluate.py hash: md5 - md5: 51f036b805f23dd3ebfd5d819bc9d457 - size: 2489 + md5: 10f76511eafc8a1a9b90e9ae92a76bc5 + size: 2633 outs: - path: data/eval.png hash: md5 - md5: 8c11f987449f8718b6f6011078b6c259 - size: 49498 + md5: fd66aa842f93e8f370399dae5b68e2fe + size: 50525 - path: data/metrics.json hash: md5 - md5: 53fba29cb236fedd3c6446ea94fea3cc - size: 215 + md5: 55266ae1bd64a3499508d07651a5aa13 + size: 214 diff --git a/dvc.yaml b/dvc.yaml index 59a6ccc..fa419ff 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -32,7 +32,7 @@ stages: outs: - ${files.embeddings} upload-to-docstore: - cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model} + cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${doc-store.files} -em ${hp.embeddings-model} -c ${doc-store.collection} deps: - ${files.embeddings} - scripts/upload_to_docstore.py @@ -43,7 +43,7 @@ stages: outs: - ${files.test-set} run-rag-pipeline: - cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} + cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection} deps: - ${files.test-set} - ${files.doc-store} diff --git a/params.yaml b/params.yaml 
index 900e48f..988dbdb 100644 --- a/params.yaml +++ b/params.yaml @@ -2,6 +2,9 @@ hp: chunk-size: 300 overlap: 100 embeddings-model: "all-MiniLM-L6-v2" +doc-store: + collection: "eidc-data" + files: "data/chroma-data" files: metadata: "data/eidc_metadata.json" extracted: "data/extracted_metadata.json" diff --git a/pyproject.toml b/pyproject.toml index 4844faf..5abe51d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,11 @@ dependencies = [ "ragas == 0.1.10", "nltk", "nbformat>=4.2.0", + "ruff", + "mypy", + "types-requests", + "types-tqdm", + "pandas-stubs", ] [project.optional-dependencies] @@ -30,3 +35,6 @@ jupyter = [ [tool.setuptools] py-modules = [] + +[tool.mypy] +files = ["scripts"] \ No newline at end of file diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py index ce1c37b..2ad9cc9 100644 --- a/scripts/create_embeddings.py +++ b/scripts/create_embeddings.py @@ -3,9 +3,10 @@ from argparse import ArgumentParser from tqdm import tqdm + def create_embedding(text): model = SentenceTransformer("all-MiniLM-L6-v2") - return model.encode(text) + return model.encode(text) def main(input_file, output_file): diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 10b3a61..d7ac98f 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -19,12 +19,13 @@ ) import json + def main(eval_dataset: str, metric_output: str, image_output: str) -> None: - nest_asyncio.apply() # apply the event loop async fix + nest_asyncio.apply() # apply the event loop async fix df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval}) eval_dataset = Dataset.from_pandas(df) - llm = ChatOllama(model='mistral-nemo', num_ctx=16384) - embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384) + llm = ChatOllama(model="mistral-nemo", num_ctx=16384) + embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384) result = evaluate( eval_dataset, metrics=[ @@ -45,19 +46,29 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None: pio.templates.default = "gridon" fig = go.Figure() - with open(metric_output, "w") as f: json.dump(result, f) - metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] + metrics = [ + metric + for metric in result_df.columns.to_list() + if metric not in ["question", "ground_truth", "answer", "contexts"] + ] for metric in metrics: - fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) - fig.update_yaxes(range=[-0.02,1.02]) + fig.add_trace( + go.Violin( + y=result_df[metric], + name=metric, + points="all", + box_visible=True, + meanline_visible=True, + ) + ) + fig.update_yaxes(range=[-0.02, 1.02]) with open(image_output, "wb") as f: f.write(fig.to_image(format="png")) - if __name__ == "__main__": parser = ArgumentParser("evaluate.py") parser.add_argument("eval_dataset", help="File containing the evaluation data.") diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py index 241bc1a..8007d09 100644 --- a/scripts/extract_metadata.py +++ b/scripts/extract_metadata.py @@ -6,7 +6,9 @@ METADATA_FIELDS = ["title", "description", "lineage"] -def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]: +def extact_eidc_metadata_fields( + json_data: Dict, fields: List[str] = METADATA_FIELDS +) -> List[Dict[str, str]]: metadatas = [] for field in fields: if json_data[field]: @@ -18,7 +20,7 @@ def extact_eidc_metadata_fields(json_data: 
Dict, fields: List[str] = METADATA_FI return metadatas -def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]: +def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]: data = [] with open(file_path) as f: json_data = json.load(f) diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py index cd56b4e..f411c16 100644 --- a/scripts/fetch_eidc_metadata.py +++ b/scripts/fetch_eidc_metadata.py @@ -4,6 +4,7 @@ URL = "https://catalogue.ceh.ac.uk/eidc/documents" + def main(output_file: str) -> None: res = requests.get( URL, diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index 830a052..91408ea 100644 --- a/scripts/run_rag_pipeline.py +++ b/scripts/run_rag_pipeline.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import shutil from haystack import Pipeline from haystack_integrations.document_stores.chroma import ChromaDocumentStore from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever @@ -8,9 +9,12 @@ import pandas as pd -def build_rag_pipeline(model_name: str) -> Pipeline: +TMP_DOC_PATH = ".tmp/doc-store" + + +def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline: document_store = ChromaDocumentStore( - collection_name="eidc-data", persist_path="data/chroma-data" + collection_name=collection_name, persist_path=TMP_DOC_PATH ) retriever = ChromaQueryTextRetriever(document_store, top_k=3) print("Creating prompt template...") @@ -73,22 +77,30 @@ def query_pipeline(questions, rag_pipe): for q in questions: response = run_query(q, rag_pipe) answers.append(response["answer_builder"]["answers"][0].data) - contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents]) + contexts.append( + [doc.content for doc in response["answer_builder"]["answers"][0].documents] + ) return answers, contexts -def main(test_data_file: str, ouput_file: str): - rag_pipe = build_rag_pipeline("llama3.1") +def main( + test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str +): + shutil.copytree(doc_store_path, TMP_DOC_PATH) + + rag_pipe = build_rag_pipeline("llama3.1", collection_name) df = pd.read_csv(test_data_file) df.drop(columns=["rating", "contexts"], inplace=True) answers, contexts = query_pipeline(df["question"], rag_pipe) - + df["answer"] = answers df["contexts"] = contexts df.to_csv(ouput_file, index=False) + shutil.rmtree(TMP_DOC_PATH) + if __name__ == "__main__": parser = ArgumentParser("run_rag_pipeline.py") @@ -100,5 +112,15 @@ def main(test_data_file: str, ouput_file: str): "output_file", help="File to output results to.", ) + parser.add_argument( + "doc_store_path", + help="Path to the doc store.", + ) + parser.add_argument( + "-c", + "--collection", + help="Collection name in doc store.", + default="eidc-data", + ) args = parser.parse_args() - main(args.test_data_file, args.output_file) + main(args.test_data_file, args.output_file, args.doc_store_path, args.collection) diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py index 4f2e8af..7b547d7 100644 --- a/scripts/upload_to_docstore.py +++ b/scripts/upload_to_docstore.py @@ -1,13 +1,17 @@ from argparse import ArgumentParser import json import uuid +import shutil +import os import chromadb from chromadb.utils import embedding_functions def main(input_file: str, output_path: str, collection_name: str, embedding_model: str): - print(collection_name) + if os.path.exists(output_path): + shutil.rmtree(output_path) + with open(input_file) as f: json_data = json.load(f) From 
9705b616b2ddd44f7e632bcd2bec3e606b544ed8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 18 Oct 2024 12:06:21 +0100
Subject: [PATCH 18/28] Added script to download supporting docs

---
 README.md                             | 39 ++++++++++++++---
 dvc.lock                              | 63 +++++++++++++++------------
 dvc.yaml                              |  7 ++-
 params.yaml                           | 44 ++++++++++---------
 scripts/fetch_eidc_supporting_docs.py |  0
 scripts/fetch_supporting_docs.py      | 47 ++++++++++++++++++++
 6 files changed, 144 insertions(+), 56 deletions(-)
 delete mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/fetch_supporting_docs.py

diff --git a/README.md b/README.md
index 588b68f..20d03ac 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co
 dvc dag
 ```
 ```
-  +----------------+
-  | fetch-metadata |
-  +----------------+
-          *
-          *
-          *
+       +----------------+
+       | fetch-metadata |
+       +----------------+
+        **           **
+      ***               ***
+    **                     **
 +------------------+      +-----------------------+
 | extract-metadata |      | fetch-supporting-docs |
 +------------------+      +-----------------------+
@@ -67,9 +67,34 @@ dvc dag
           *
      +----------+
      | evaluate |
-     +----------+ 
+     +----------+
+```
+
+> Note: To re-run the `fetch-supporting-docs` stage of the pipeline, you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file.
+
+## Running Experiments
+By default the pipeline will run using the parameters defined in [`params.yaml`](params.yaml). To experiment with varying these parameters you can change them directly, or use [DVC experiments](https://dvc.org/doc/user-guide/experiment-management).
+
+To run an experiment varying a particular parameter:
+```shell
+dvc exp run -S hp.chunk-size=1000
+```
+This will re-run the pipeline but override the value of the `hp.chunk-size` parameter in [`params.yaml`](params.yaml) and set it to `1000`. Only the necessary stages of the pipeline should be re-run, and the result should appear in your workspace. 
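+
+DVC can also queue several experiments and run them as a batch. The exact
+commands vary slightly between DVC versions; with a recent 3.x release
+something like the following should work (a sketch, not a verified recipe):
+```shell
+dvc exp run --queue -S hp.chunk-size=500
+dvc exp run --queue -S hp.chunk-size=1000
+dvc queue start
+dvc exp show   # tabulate params and metrics across the queued runs
+```
+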
+You can compare the results of your experiment to the results of the baseline run of the pipeline using: +```shell +dvc exp diff +``` +```shell +Path Metric HEAD workspace Change +data/metrics.json answer_correctness 0.049482 0.043685 -0.0057974 +data/metrics.json answer_similarity 0.19793 0.17474 -0.02319 +data/metrics.json context_recall 0.125 0 -0.125 +data/metrics.json faithfulness 0.75 0.69375 -0.05625 + +Path Param HEAD workspace Change +params.yaml hp.chunk-size 300 1000 700 +``` ## Notes ### DVC and CML diff --git a/dvc.lock b/dvc.lock index d143f87..dd7f7b1 100644 --- a/dvc.lock +++ b/dvc.lock @@ -45,8 +45,8 @@ stages: md5: 789fda7a14f9a85c6ee0e10af8170a95 size: 4584498 chunk-data: - cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s - 10 data/extracted_metadata.json + cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s + 10 data/extracted_metadata.json data/supporting-docs.json deps: - path: data/extracted_metadata.json hash: md5 @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: 0febface6f1d23fda46c11bef65284f4 - size: 34 + md5: b0941cc9a7ca7df456157380bcc28f39 + size: 75646 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ stages: outs: - path: data/chunked_data.json hash: md5 - md5: e9160d8c6c0fa7f647c5baa03bd1b5dd - size: 14947 + md5: 97f06c3b76ff05d62ccdecd9d5742712 + size: 137681 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: e9160d8c6c0fa7f647c5baa03bd1b5dd - size: 14947 + md5: 97f06c3b76ff05d62ccdecd9d5742712 + size: 137681 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ stages: outs: - path: data/embeddings.json hash: md5 - md5: b08299369d1f243eb8d8ffa2cdb9a90f - size: 351126 + md5: 8d80ef225c59ede34d026f6f2930bae3 + size: 1894126 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: b08299369d1f243eb8d8ffa2cdb9a90f - size: 351126 + md5: 8d80ef225c59ede34d026f6f2930bae3 + size: 1894126 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,8 +96,8 @@ stages: outs: - path: data/chroma-data hash: md5 - md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir - size: 2069220 + md5: cc85398c596d4c5839714e93e33468bb.dir + size: 3580644 nfiles: 5 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv @@ -105,8 +105,8 @@ stages: deps: - path: data/chroma-data hash: md5 - md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir - size: 2069220 + md5: cc85398c596d4c5839714e93e33468bb.dir + size: 3580644 nfiles: 5 - path: data/eidc_rag_test_sample.csv hash: md5 @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: f6bce3f5c551e84da224d36201858839 - size: 6638 + md5: 9825cf7e7a89ca17634b44e9256eefc9 + size: 9695 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -129,20 +129,29 @@ stages: md5: a371d83c5822d256286e80d64d58c3fe size: 7524 fetch-supporting-docs: - cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json + cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json + deps: + - path: data/eidc_metadata.json + hash: md5 + md5: 
b4f3774a2921debb4d7740165ac604d4 + size: 12157676 + - path: scripts/fetch_supporting_docs.py + hash: md5 + md5: de0c11e81bf10e040bef67e43466b789 + size: 1472 outs: - path: data/supporting-docs.json hash: md5 - md5: 0febface6f1d23fda46c11bef65284f4 - size: 34 + md5: b0941cc9a7ca7df456157380bcc28f39 + size: 75646 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: f6bce3f5c551e84da224d36201858839 - size: 6638 + md5: 9825cf7e7a89ca17634b44e9256eefc9 + size: 9695 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -150,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: fd66aa842f93e8f370399dae5b68e2fe - size: 50525 + md5: 1279778c7e509e972d1f366157d24966 + size: 58228 - path: data/metrics.json hash: md5 - md5: 55266ae1bd64a3499508d07651a5aa13 - size: 214 + md5: 2b93334ba0e8226c916d0964237cb72c + size: 225 diff --git a/dvc.yaml b/dvc.yaml index fa419ff..0e9f154 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -6,7 +6,10 @@ stages: outs: - ${files.metadata} fetch-supporting-docs: - cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs} + cmd: python scripts/fetch_supporting_docs.py ${files.metadata} ${files.supporting-docs} + deps: + - ${files.metadata} + - scripts/fetch_supporting_docs.py outs: - ${files.supporting-docs} extract-metadata: @@ -17,7 +20,7 @@ stages: outs: - ${files.extracted} chunk-data: - cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} + cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs} deps: - ${files.extracted} - ${files.supporting-docs} diff --git a/params.yaml b/params.yaml index 988dbdb..85c3119 100644 --- a/params.yaml +++ b/params.yaml @@ -1,30 +1,34 @@ hp: - chunk-size: 300 + chunk-size: 500 overlap: 100 - embeddings-model: "all-MiniLM-L6-v2" + embeddings-model: all-MiniLM-L6-v2 doc-store: - collection: "eidc-data" - files: "data/chroma-data" + collection: eidc-data + files: data/chroma-data files: - metadata: "data/eidc_metadata.json" - extracted: "data/extracted_metadata.json" - supporting-docs: "data/supporting-docs.json" - chunked: "data/chunked_data.json" - embeddings: "data/embeddings.json" - doc-store: "data/chroma-data" - test-set: "data/eidc_rag_test_sample.csv" - eval-set: "data/evaluation_data.csv" - metrics: "data/metrics.json" - eval-plot: "data/eval.png" + metadata: data/eidc_metadata.json + extracted: data/extracted_metadata.json + supporting-docs: data/supporting-docs.json + chunked: data/chunked_data.json + embeddings: data/embeddings.json + doc-store: data/chroma-data + test-set: data/eidc_rag_test_sample.csv + eval-set: data/evaluation_data.csv + metrics: data/metrics.json + eval-plot: data/eval.png sample-size: 10 # sample size of 0 will process all data rag: model: llama3.1 - prompt: > - You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n + prompt: >- + You are part of a retrieval augmented pipeline. 
You will be given a question and + a context on which to base your answer.\n Do not use your own knowledge to answer the question.\n - The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n - Do not refer to "context" in your answer, instead refer to the context as available information. - If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n + The context provided will be metadata from datasets contained in the Environmental + Information Data Centre (EIDC).\n + Do not refer to "context" in your answer, instead refer to the context as available + information. + If the answer to the question is not clear from the context, suggest which dataset + or datasets might be helpful in answering the question.\n Question: {{query}}\n Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %} - Answer: \ No newline at end of file + Answer: diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py new file mode 100644 index 0000000..36354e7 --- /dev/null +++ b/scripts/fetch_supporting_docs.py @@ -0,0 +1,47 @@ +from argparse import ArgumentParser +import json +from tqdm import tqdm +import requests +import os +from typing import Dict, List +from dotenv import load_dotenv + + +def extract_ids(metadata_file: str): + with open(metadata_file) as f: + json_data = json.load(f) + ids = [dataset["identifier"] for dataset in json_data["results"]] + return ids + + +def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + + +def main(metadata_file: str, supporting_docs_file: str): + load_dotenv() + user = os.getenv("username") + password = os.getenv("password") + ids = extract_ids(metadata_file) + docs = [] + for id in tqdm(ids): + docs.extend(get_supporting_docs(id, user, password)) + if len(docs) > 0: + break + with open(supporting_docs_file, "w") as f: + json.dump(docs, f, indent=4) + + +if __name__ == "__main__": + parser = ArgumentParser("fetch_supporting_docs.py") + parser.add_argument("metadata", help="File containing EIDC metadata.") + parser.add_argument("supporting_docs", help="File to save supporting docs to.") + args = parser.parse_args() + main(args.metadata, args.supporting_docs) From 68e17ac4ce10df582ffa0b941d5c787178f4eab8 Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 14:02:56 +0100 Subject: [PATCH 19/28] Reformatted code with ruff --- dummy-evaluation.py | 25 +++++++++++++++++++++---- notebooks/ragas_eval.ipynb | 29 ++++++++++++++++++++++------- notebooks/ragas_synth.ipynb | 15 +++++++-------- notebooks/vllm_test.ipynb | 15 ++++++++------- 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/dummy-evaluation.py b/dummy-evaluation.py index 318b80d..53ec812 100644 --- a/dummy-evaluation.py +++ b/dummy-evaluation.py @@ -3,7 +3,12 @@ import plotly.graph_objects as go import plotly.io as pio -metrics = {"answer_relevancy", "answer_correctness", "context_precision", "context_recall"} +metrics = { + "answer_relevancy", + "answer_correctness", + 
"context_precision", + "context_recall", +} dummy_data = {metric: np.random.rand(100) for metric in metrics} df = pd.DataFrame(dummy_data) @@ -13,8 +18,20 @@ pio.templates.default = "gridon" fig = go.Figure() -metrics = [metric for metric in df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] +metrics = [ + metric + for metric in df.columns.to_list() + if metric not in ["question", "ground_truth", "answer", "contexts"] +] for metric in metrics: - fig.add_trace(go.Violin(y=df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) -fig.update_yaxes(range=[-0.02,1.02]) + fig.add_trace( + go.Violin( + y=df[metric], + name=metric, + points="all", + box_visible=True, + meanline_visible=True, + ) + ) +fig.update_yaxes(range=[-0.02, 1.02]) fig.write_image("metrics.png") diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb index 53d862b..56bcb43 100644 --- a/notebooks/ragas_eval.ipynb +++ b/notebooks/ragas_eval.ipynb @@ -54,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "nest_asyncio.apply() # apply the event loop async fix" + "nest_asyncio.apply() # apply the event loop async fix" ] }, { @@ -70,7 +70,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval})\n", + "df = pd.read_csv(\n", + " \"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval}\n", + ")\n", "eval_dataset = Dataset.from_pandas(df)" ] }, @@ -235,8 +237,8 @@ "metadata": {}, "outputs": [], "source": [ - "llm = ChatOllama(model='mistral-nemo', num_ctx=16384)\n", - "embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)" + "llm = ChatOllama(model=\"mistral-nemo\", num_ctx=16384)\n", + "embeddings = OllamaEmbeddings(model=\"mistral-nemo\", num_ctx=16384)" ] }, { @@ -316,10 +318,22 @@ "result_df = result.to_pandas()\n", "pio.templates.default = \"gridon\"\n", "fig = go.Figure()\n", - "metrics = [metric for metric in result_df.columns.to_list() if metric not in [\"question\", \"ground_truth\", \"answer\", \"contexts\"]]\n", + "metrics = [\n", + " metric\n", + " for metric in result_df.columns.to_list()\n", + " if metric not in [\"question\", \"ground_truth\", \"answer\", \"contexts\"]\n", + "]\n", "for metric in metrics:\n", - " fig.add_trace(go.Violin(y=result_df[metric], name=metric, points=\"all\", box_visible=True, meanline_visible=True))\n", - "fig.update_yaxes(range=[-0.02,1.02])\n", + " fig.add_trace(\n", + " go.Violin(\n", + " y=result_df[metric],\n", + " name=metric,\n", + " points=\"all\",\n", + " box_visible=True,\n", + " meanline_visible=True,\n", + " )\n", + " )\n", + "fig.update_yaxes(range=[-0.02, 1.02])\n", "with open(\"eval.png\", \"wb\") as f:\n", " f.write(fig.to_image(format=\"png\"))" ] @@ -331,6 +345,7 @@ "outputs": [], "source": [ "import json\n", + "\n", "with open(\"metrics.json\", \"w\") as f:\n", " json.dump(result, f)" ] diff --git a/notebooks/ragas_synth.ipynb b/notebooks/ragas_synth.ipynb index b0c4371..cf39b04 100644 --- a/notebooks/ragas_synth.ipynb +++ b/notebooks/ragas_synth.ipynb @@ -19,8 +19,6 @@ "from ragas.testset.generator import TestsetGenerator\n", "from ragas.testset.evolutions import simple, reasoning, multi_context\n", "from ragas.run_config import RunConfig\n", - "from langchain.docstore.document import Document\n", - "import pandas as pd\n", "import nest_asyncio" ] }, @@ -30,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "nest_asyncio.apply() # apply the event loop 
async fix" + "nest_asyncio.apply() # apply the event loop async fix" ] }, { @@ -46,9 +44,11 @@ "metadata": {}, "outputs": [], "source": [ - "llm = ChatOllama(model='mistral-nemo', num_ctx=16384)\n", - "embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)\n", - "gen = TestsetGenerator.from_langchain(llm, llm, embeddings, run_config=RunConfig(max_workers=1, max_retries=1))\n", + "llm = ChatOllama(model=\"mistral-nemo\", num_ctx=16384)\n", + "embeddings = OllamaEmbeddings(model=\"mistral-nemo\", num_ctx=16384)\n", + "gen = TestsetGenerator.from_langchain(\n", + " llm, llm, embeddings, run_config=RunConfig(max_workers=1, max_retries=1)\n", + ")\n", "dist = {simple: 0.6, multi_context: 0.2, reasoning: 0.2}" ] }, @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "docs = [] # load a set of langchain documents to base the synthetic test set generation on" + "docs = [] # load a set of langchain documents to base the synthetic test set generation on" ] }, { @@ -81,7 +81,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "testset = gen.generate_with_langchain_docs(docs, 5, dist, is_async=False)" ] }, diff --git a/notebooks/vllm_test.ipynb b/notebooks/vllm_test.ipynb index 755c34c..86701c7 100644 --- a/notebooks/vllm_test.ipynb +++ b/notebooks/vllm_test.ipynb @@ -15,9 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "prompts = [\n", - " \"Tell me a joke.\"\n", - "]\n", + "prompts = [\"Tell me a joke.\"]\n", "params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)" ] }, @@ -26,9 +24,7 @@ "execution_count": 3, "metadata": {}, "outputs": [], - "source": [ - "import os" - ] + "source": [] }, { "cell_type": "code", @@ -87,7 +83,12 @@ } ], "source": [ - "llm = LLM(model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\", quantization=\"bitsandbytes\", load_format=\"bitsandbytes\", max_model_len=4096)" + "llm = LLM(\n", + " model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\",\n", + " quantization=\"bitsandbytes\",\n", + " load_format=\"bitsandbytes\",\n", + " max_model_len=4096,\n", + ")" ] }, { From be3852603eb3558d900717f7e4f5e8a8611f2fec Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 18:45:52 +0100 Subject: [PATCH 20/28] Caught exception when supporitng docs not available --- dvc.lock | 56 ++++++++++++++++---------------- scripts/fetch_supporting_docs.py | 24 ++++++++------ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/dvc.lock b/dvc.lock index dd7f7b1..1d52d2e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ stages: outs: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ stages: outs: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 upload-to-docstore: cmd: python 
scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,18 +96,18 @@ stages: outs: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 - path: data/eidc_rag_test_sample.csv hash: md5 md5: a371d83c5822d256286e80d64d58c3fe @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -137,21 +137,21 @@ stages: size: 12157676 - path: scripts/fetch_supporting_docs.py hash: md5 - md5: de0c11e81bf10e040bef67e43466b789 - size: 1472 + md5: 923af3b6ce1447d388b08fab0e3ab77d + size: 1660 outs: - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -159,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: 1279778c7e509e972d1f366157d24966 - size: 58228 + md5: bae77b1b721bf283a30a64f67af45fea + size: 74438 - path: data/metrics.json hash: md5 - md5: 2b93334ba0e8226c916d0964237cb72c - size: 225 + md5: 0145280f36071a6df551ef57d3f8393e + size: 229 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 36354e7..66e77ac 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import logging import json from tqdm import tqdm import requests @@ -6,6 +7,7 @@ from typing import Dict, List from dotenv import load_dotenv +logger = logging.getLogger(__name__) def extract_ids(metadata_file: str): with open(metadata_file) as f: @@ -15,14 +17,18 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: - res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) - ) - json_data = res.json() - docs = [] - for key, val in json_data["success"].items(): - docs.append({"id": eidc_id, "field": key, "value": val}) - return docs + try: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + except Exception as e: + logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + 
return [] def main(metadata_file: str, supporting_docs_file: str): @@ -33,8 +39,6 @@ def main(metadata_file: str, supporting_docs_file: str): docs = [] for id in tqdm(ids): docs.extend(get_supporting_docs(id, user, password)) - if len(docs) > 0: - break with open(supporting_docs_file, "w") as f: json.dump(docs, f, indent=4) From e9b504f66f6253c103c196d20176970e436de632 Mon Sep 17 00:00:00 2001 From: mpc Date: Mon, 21 Oct 2024 13:42:33 +0100 Subject: [PATCH 21/28] Testing dag diagram in mermaid format --- dag.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 dag.md diff --git a/dag.md b/dag.md new file mode 100644 index 0000000..91e16fe --- /dev/null +++ b/dag.md @@ -0,0 +1,23 @@ +```mermaid +flowchart TD + node1["chunk-data"] + node2["create-embeddings"] + node3["evaluate"] + node4["extract-metadata"] + node5["fetch-metadata"] + node6["fetch-supporting-docs"] + node7["generate-testset"] + node8["run-rag-pipeline"] + node9["upload-to-docstore"] + node1-->node2 + node2-->node9 + node4-->node1 + node5-->node4 + node5-->node6 + node6-->node1 + node7-->node8 + node8-->node3 + node9-->node8 + node10["data/evaluation-sets.dvc"] + node11["data/synthetic-datasets.dvc"] +``` From dbfb2bcfc610eab6b8c60b4cf350034e0b58f397 Mon Sep 17 00:00:00 2001 From: mpc Date: Mon, 21 Oct 2024 13:47:32 +0100 Subject: [PATCH 22/28] Updated readme /w mermaid format dag --- README.md | 67 +++++++++++++++++++++---------------------------------- dag.md | 23 ------------------- 2 files changed, 26 insertions(+), 64 deletions(-) delete mode 100644 dag.md diff --git a/README.md b/README.md index 20d03ac..3ccf9c1 100644 --- a/README.md +++ b/README.md @@ -28,48 +28,33 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co ```shell dvc dag ``` +or it can be output to mermaid format to display in markdown: +```shell +dvc dag -md ``` - +----------------+ - | fetch-metadata | - +----------------+ - ** ** - *** *** - ** ** - +------------------+ +-----------------------+ - | extract-metadata | | fetch-supporting-docs | - +------------------+ +-----------------------+ - ** ** - *** *** - ** ** - +------------+ - | chunk-data | - +------------+ - * - * - * - +-------------------+ - | create-embeddings | - +-------------------+ - * - * - * -+------------------+ +--------------------+ -| generate-testset | | upload-to-docstore | -+------------------+ +--------------------+ - ** ** - *** *** - ** ** - +------------------+ - | run-rag-pipeline | - +------------------+ - * - * - * - +----------+ - | evaluate | - +----------+ +```mermaid +flowchart TD + node1["chunk-data"] + node2["create-embeddings"] + node3["evaluate"] + node4["extract-metadata"] + node5["fetch-metadata"] + node6["fetch-supporting-docs"] + node7["generate-testset"] + node8["run-rag-pipeline"] + node9["upload-to-docstore"] + node1-->node2 + node2-->node9 + node4-->node1 + node5-->node4 + node5-->node6 + node6-->node1 + node7-->node8 + node8-->node3 + node9-->node8 + node10["data/evaluation-sets.dvc"] + node11["data/synthetic-datasets.dvc"] ``` - > Note: To re-run the `fetch-supporting-docs` stage of the pipeline you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file. 
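+
+A minimal `.env` file has the following shape (the `username` and `password` keys match what `scripts/fetch_supporting_docs.py` reads via `python-dotenv`; the values are placeholders):
+```shell
+username=<your-username>
+password=<your-password>
+```
+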
 ## Running Experiments
@@ -104,4 +89,4 @@ Notes on the use of Data Version Control and Continuous Machine Learning:
 ### vLLM
 Notes on running models with vLLM:
-- [vLLM](vllm.md)
\ No newline at end of file
+- [vLLM](vllm.md)
diff --git a/dag.md b/dag.md
deleted file mode 100644
index 91e16fe..0000000
--- a/dag.md
+++ /dev/null
@@ -1,23 +0,0 @@
-```mermaid
-flowchart TD
-    node1["chunk-data"]
-    node2["create-embeddings"]
-    node3["evaluate"]
-    node4["extract-metadata"]
-    node5["fetch-metadata"]
-    node6["fetch-supporting-docs"]
-    node7["generate-testset"]
-    node8["run-rag-pipeline"]
-    node9["upload-to-docstore"]
-    node1-->node2
-    node2-->node9
-    node4-->node1
-    node5-->node4
-    node5-->node6
-    node6-->node1
-    node7-->node8
-    node8-->node3
-    node9-->node8
-    node10["data/evaluation-sets.dvc"]
-    node11["data/synthetic-datasets.dvc"]
-```

From 501df1a1e46fc87dde4fd88be7a532b179bdcf62 Mon Sep 17 00:00:00 2001
From: mpc
Date: Mon, 21 Oct 2024 13:51:32 +0100
Subject: [PATCH 23/28] Removed vllm dependency and updated readme

---
 README.md      | 5 +++++
 pyproject.toml | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3ccf9c1..414f241 100644
--- a/README.md
+++ b/README.md
@@ -3,23 +3,28 @@ This repository contains a reproducible workflow setup using [DVC](https://dvc.o
 ## Requirements
 - [Ollama](https://ollama.com/download) ([`llama3.1`](https://ollama.com/library/llama3.1) and [`mistral-nemo`](https://ollama.com/library/mistral-nemo) models)
+- [Python 3.9+](https://www.python.org/downloads/)

 ## Getting started
+### Setup
 First, create a new virtual environment and install the required dependencies:
 ```shell
 python -m venv .venv
 source .venv/bin/activate
 pip install .
 ```
+### Configuration
 Next, set up your local DVC configuration with your [Jasmin object store access key](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/#creating-an-access-key-and-secret):
 ```shell
 dvc remote modify --local jasmin access_key_id ''
 dvc remote modify --local jasmin secret_access_key ''
 ```
+### Getting the data
 Pull the data from the object store using DVC:
 ```shell
 dvc pull
 ```
+### Working with the pipeline
 You should now be ready to re-run the pipeline:
 ```shell
 dvc repro
diff --git a/pyproject.toml b/pyproject.toml
index 5abe51d..16077cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ dependencies = [
     "kaleido",
     "dvc",
     "dvc[s3]",
-    "vllm",
     "bitsandbytes",
     "haystack-ai",
     "accelerate",
@@ -37,4 +36,4 @@ jupyter = [
 py-modules = []

 [tool.mypy]
-files = ["scripts"]
\ No newline at end of file
+files = ["scripts"]

From 731544ac35bf41db87dc75bd1de963156ee8a643 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 09:25:21 +0100
Subject: [PATCH 24/28] Froze dependencies to try and fix gh action

---
 .github/workflows/cml.yaml |  2 +-
 pyproject.toml             | 37 ++++++++++++++-----------------------
 2 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
index e48e94c..147f0b3 100644
--- a/.github/workflows/cml.yaml
+++ b/.github/workflows/cml.yaml
@@ -6,7 +6,7 @@ jobs:
     container: docker://ghcr.io/iterative/cml:0-dvc2-base1
     steps:
       - uses: actions/checkout@v3
-      - name: Train model
+      - name: Run pipeline
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
diff --git a/pyproject.toml b/pyproject.toml
index 5abe51d..7ecdec5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,28 +3,22 @@
 name = "llm-eval"
 dynamic = ["version"]

 dependencies = [
-    "plotly",
-    "pandas",
-
"numpy", - "kaleido", - "dvc", - "dvc[s3]", - "vllm", - "bitsandbytes", - "haystack-ai", - "accelerate", - "sentence-transformers", - "chromadb", + "plotly == 5.24.1", + "pandas == 2.2.3", + "numpy == 1.26.4", + "kaleido == 0.2.1", + "dvc[s3] == 3.2.0 ", + "bitsandbytes == 0.44.1", + "haystack-ai == 2.6.0", + "accelerate == 1.0.0", + "sentence-transformers == 3.1.1", + "chromadb == 0.5.15", "ollama-haystack == 0.0.7", - "chroma-haystack", + "chroma-haystack == 0.22.1", "ragas == 0.1.10", - "nltk", - "nbformat>=4.2.0", - "ruff", - "mypy", - "types-requests", - "types-tqdm", - "pandas-stubs", + "nltk == 3.9.1", + "nbformat == 4.2.0", + "ruff == 0.7.0", ] [project.optional-dependencies] @@ -35,6 +29,3 @@ jupyter = [ [tool.setuptools] py-modules = [] - -[tool.mypy] -files = ["scripts"] \ No newline at end of file From 68469993c89783d54b0bf50fa9c1756ad0eced6b Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 25 Oct 2024 10:18:21 +0100 Subject: [PATCH 25/28] Fixed gh actions pipeline --- .github/workflows/cml.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml index 147f0b3..02ca014 100644 --- a/.github/workflows/cml.yaml +++ b/.github/workflows/cml.yaml @@ -3,9 +3,12 @@ on: [push] jobs: train-and-report: runs-on: ubuntu-latest - container: docker://ghcr.io/iterative/cml:0-dvc2-base1 steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - uses: iterative/setup-cml@v2 - name: Run pipeline env: REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 6a2f0b4881afa6c0a97e347335e3b118b12b717a Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 25 Oct 2024 11:13:50 +0100 Subject: [PATCH 26/28] re-added ruff --- notebooks/ragas_eval.ipynb | 20 ++++++++++---------- notebooks/ragas_synth.ipynb | 10 +++++----- pyproject.toml | 27 ++++++++++++++++++++++----- scripts/chunk_data.py | 6 ++++-- scripts/create_embeddings.py | 8 +++++--- scripts/evaluate.py | 23 ++++++++++++----------- scripts/extract_metadata.py | 5 ++--- scripts/fetch_eidc_metadata.py | 3 ++- scripts/fetch_supporting_docs.py | 21 +++++++++++++-------- scripts/run_rag_pipeline.py | 19 ++++++++++--------- scripts/upload_to_docstore.py | 13 ++++++++----- 11 files changed, 93 insertions(+), 62 deletions(-) diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb index 56bcb43..395269f 100644 --- a/notebooks/ragas_eval.ipynb +++ b/notebooks/ragas_eval.ipynb @@ -37,15 +37,15 @@ } ], "source": [ + "import nest_asyncio\n", "import pandas as pd\n", - "from datasets import Dataset\n", - "from ragas import evaluate\n", - "from ragas.run_config import RunConfig\n", - "from langchain_community.embeddings import OllamaEmbeddings\n", - "from langchain_community.chat_models import ChatOllama\n", "import plotly.graph_objects as go\n", "import plotly.io as pio\n", - "import nest_asyncio" + "from datasets import Dataset\n", + "from langchain_community.chat_models import ChatOllama\n", + "from langchain_community.embeddings import OllamaEmbeddings\n", + "from ragas import evaluate\n", + "from ragas.run_config import RunConfig" ] }, { @@ -248,13 +248,13 @@ "outputs": [], "source": [ "from ragas.metrics import (\n", - " faithfulness,\n", + " answer_correctness,\n", " answer_relevancy,\n", + " answer_similarity,\n", + " context_entity_recall,\n", " context_precision,\n", " context_recall,\n", - " context_entity_recall,\n", - " answer_similarity,\n", - " answer_correctness,\n", + " faithfulness,\n", ")" ] }, diff --git a/notebooks/ragas_synth.ipynb 
b/notebooks/ragas_synth.ipynb index cf39b04..f8057b0 100644 --- a/notebooks/ragas_synth.ipynb +++ b/notebooks/ragas_synth.ipynb @@ -14,12 +14,12 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.embeddings import OllamaEmbeddings\n", + "import nest_asyncio\n", "from langchain_community.chat_models import ChatOllama\n", - "from ragas.testset.generator import TestsetGenerator\n", - "from ragas.testset.evolutions import simple, reasoning, multi_context\n", + "from langchain_community.embeddings import OllamaEmbeddings\n", "from ragas.run_config import RunConfig\n", - "import nest_asyncio" + "from ragas.testset.evolutions import multi_context, reasoning, simple\n", + "from ragas.testset.generator import TestsetGenerator" ] }, { @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "docs = [] # load a set of langchain documents to base the synthetic test set generation on" + "docs = [] # load a set of langchain docs to base the synthetic test set generation on" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 7ecdec5..bb37d7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [project] name = "llm-eval" -dynamic = ["version"] - +version = "0.1.0" dependencies = [ "plotly == 5.24.1", "pandas == 2.2.3", @@ -18,7 +17,6 @@ dependencies = [ "ragas == 0.1.10", "nltk == 3.9.1", "nbformat == 4.2.0", - "ruff == 0.7.0", ] [project.optional-dependencies] @@ -26,6 +24,25 @@ jupyter = [ "ipykernel", "ipywidgets", ] +lint = [ + "ruff == 0.7.1", + "mypy == 1.13.0", +] +dev = [ + "llm-eval[jupyter,lint]" +] + +[tool.ruff.lint] +select = [ + "I", + "E", + "F", + "ANN" +] +fixable = ["ALL"] + +[tool.ruff] +line-length = 88 -[tool.setuptools] -py-modules = [] +[tool.ruff.lint.pydocstyle] +convention = "google" diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py index ace111d..28707ed 100644 --- a/scripts/chunk_data.py +++ b/scripts/chunk_data.py @@ -1,6 +1,6 @@ -from typing import List, Dict import json from argparse import ArgumentParser +from typing import Any, Dict, List def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]: @@ -12,7 +12,9 @@ def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]: return chunks -def chunk_metadata_value(metada_value, chunk_size, overlap): +def chunk_metadata_value( + metada_value: str, chunk_size: int, overlap: int +) -> List[Dict[str, Any]]: chunks = chunk_value(metada_value["value"], chunk_size, overlap) return [ { diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py index 2ad9cc9..7aa507c 100644 --- a/scripts/create_embeddings.py +++ b/scripts/create_embeddings.py @@ -1,15 +1,17 @@ import json -from sentence_transformers import SentenceTransformer from argparse import ArgumentParser + +from sentence_transformers import SentenceTransformer +from torch import Tensor from tqdm import tqdm -def create_embedding(text): +def create_embedding(text: str) -> Tensor: model = SentenceTransformer("all-MiniLM-L6-v2") return model.encode(text) -def main(input_file, output_file): +def main(input_file: str, output_file: str) -> None: with open(input_file) as input, open(output_file, "w") as output: data = json.load(input) for chunk in tqdm(data): diff --git a/scripts/evaluate.py b/scripts/evaluate.py index d7ac98f..c130e96 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -1,23 +1,24 @@ +import json from argparse import ArgumentParser + +import nest_asyncio import pandas as pd -from datasets import Dataset -from ragas import evaluate -from ragas.run_config 
import RunConfig -from langchain_community.embeddings import OllamaEmbeddings -from langchain_community.chat_models import ChatOllama import plotly.graph_objects as go import plotly.io as pio -import nest_asyncio +from datasets import Dataset +from langchain_community.chat_models import ChatOllama +from langchain_community.embeddings import OllamaEmbeddings +from ragas import evaluate from ragas.metrics import ( - faithfulness, + answer_correctness, answer_relevancy, + answer_similarity, + context_entity_recall, context_precision, context_recall, - context_entity_recall, - answer_similarity, - answer_correctness, + faithfulness, ) -import json +from ragas.run_config import RunConfig def main(eval_dataset: str, metric_output: str, image_output: str) -> None: diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py index 8007d09..9bd4c3c 100644 --- a/scripts/extract_metadata.py +++ b/scripts/extract_metadata.py @@ -1,7 +1,6 @@ -from typing import List, Dict import json from argparse import ArgumentParser - +from typing import Dict, List METADATA_FIELDS = ["title", "description", "lineage"] @@ -30,7 +29,7 @@ def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]: return data -def main(input, output) -> None: +def main(input: str, output: str) -> None: data = parse_eidc_metadata(input) with open(output, "w") as f: json.dump(data, f, indent=4) diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py index f411c16..0ab6297 100644 --- a/scripts/fetch_eidc_metadata.py +++ b/scripts/fetch_eidc_metadata.py @@ -1,7 +1,8 @@ -import requests import json from argparse import ArgumentParser +import requests + URL = "https://catalogue.ceh.ac.uk/eidc/documents" diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 66e77ac..d95493b 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,15 +1,17 @@ -from argparse import ArgumentParser -import logging import json -from tqdm import tqdm -import requests +import logging import os +from argparse import ArgumentParser from typing import Dict, List + +import requests from dotenv import load_dotenv +from tqdm import tqdm logger = logging.getLogger(__name__) -def extract_ids(metadata_file: str): + +def extract_ids(metadata_file: str) -> List[str]: with open(metadata_file) as f: json_data = json.load(f) ids = [dataset["identifier"] for dataset in json_data["results"]] @@ -19,7 +21,8 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: try: res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", + auth=(user, password), ) json_data = res.json() docs = [] @@ -27,11 +30,13 @@ def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str docs.append({"id": eidc_id, "field": key, "value": val}) return docs except Exception as e: - logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + logger.error( + f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e + ) return [] -def main(metadata_file: str, supporting_docs_file: str): +def main(metadata_file: str, supporting_docs_file: str) -> None: load_dotenv() user = os.getenv("username") password = os.getenv("password") diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index 91408ea..2c620e5 100644 --- a/scripts/run_rag_pipeline.py +++ 
b/scripts/run_rag_pipeline.py
@@ -1,13 +1,14 @@
-from argparse import ArgumentParser
 import shutil
+from argparse import ArgumentParser
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
 from haystack import Pipeline
-from haystack_integrations.document_stores.chroma import ChromaDocumentStore
-from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
 from haystack.components.builders import PromptBuilder
-from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
 from haystack.components.builders.answer_builder import AnswerBuilder
-import pandas as pd
-
+from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
+from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
+from haystack_integrations.document_stores.chroma import ChromaDocumentStore

 TMP_DOC_PATH = ".tmp/doc-store"
@@ -61,7 +62,7 @@ def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline:
     return rag_pipe


-def run_query(query: str, pipeline: Pipeline):
+def run_query(query: str, pipeline: Pipeline) -> Dict[str, Any]:
     return pipeline.run(
         {
             "retriever": {"query": query},
@@ -71,7 +72,7 @@
     )


-def query_pipeline(questions, rag_pipe):
+def query_pipeline(questions: List[str], rag_pipe: Pipeline) -> Tuple[str, List[str]]:
     answers = []
     contexts = []
     for q in questions:
@@ -85,7 +86,7 @@
 def main(
     test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str
-):
+) -> None:
     shutil.copytree(doc_store_path, TMP_DOC_PATH)

     rag_pipe = build_rag_pipeline("llama3.1", collection_name)
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
index 7b547d7..9f1a880 100644
--- a/scripts/upload_to_docstore.py
+++ b/scripts/upload_to_docstore.py
@@ -1,14 +1,16 @@
-from argparse import ArgumentParser
 import json
-import uuid
-import shutil
 import os
+import shutil
+import uuid
+from argparse import ArgumentParser

 import chromadb
 from chromadb.utils import embedding_functions


-def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
+def main(
+    input_file: str, output_path: str, collection_name: str, embedding_model: str
+) -> None:
     if os.path.exists(output_path):
         shutil.rmtree(output_path)
@@ -55,7 +57,8 @@ def main(input_file: str, output_path: str, collection_name: str, embedding_mode
     parser.add_argument(
         "-em",
         "--embedding_model",
-        help="Embedding model to use in the doc store (must be the same as the function used to create embeddings.)",
+        help="""Embedding model to use in the doc store (must be the same as the
+        function used to create embeddings.)""",
         default="all-MiniLM-L6-v2",
     )
     args = parser.parse_args()

From aa292247c15489b050a473e9acb711d6c3393703 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 11:53:29 +0100
Subject: [PATCH 27/28] Re-added py-modules setup

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index bb37d7f..fdd2586 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,3 +46,6 @@ line-length = 88

 [tool.ruff.lint.pydocstyle]
 convention = "google"
+
+- [tool.setuptools]
+- py-modules = []
\ No newline at end of file

From 5d0430116d93c40306bb565a53995e2d1bc8fba5 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 11:47:22 +0000
Subject: [PATCH 28/28] Update pyproject.toml

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml index fdd2586..3dda280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,5 +47,5 @@ line-length = 88 [tool.ruff.lint.pydocstyle] convention = "google" -- [tool.setuptools] -- py-modules = [] \ No newline at end of file +[tool.setuptools] +py-modules = []
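
With the stray `- ` prefixes removed by this final patch, `pyproject.toml` parses as valid TOML again. A quick way to sanity-check the file after an edit like this (a minimal sketch; `tomllib` is in the Python standard library from 3.11 onwards):
```shell
python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['setuptools'])"
```
If the file is well-formed this prints the setuptools table (`{'py-modules': []}`); otherwise it fails with a `TOMLDecodeError` pointing at the offending line.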