
Pipelines #9

Merged · 31 commits · Oct 29, 2024

Changes from all commits

31 commits
9e94347
Set up basic dvc pipeline
matthewcoole Oct 9, 2024
3bfa70a
Removed hf token
matthewcoole Oct 9, 2024
0907700
Added dvc lock file
matthewcoole Oct 10, 2024
7d89512
Changed embedding json name
matthewcoole Oct 10, 2024
2e99516
Cleaned pipeline
matthewcoole Oct 11, 2024
4e11bd5
Resolved merge conflicts
matthewcoole Oct 11, 2024
739f8fc
Cleaned scripts and added additional metadata field
matthewcoole Oct 11, 2024
8148da1
Removed modules from project file
matthewcoole Oct 11, 2024
2ec86cd
Fixed bug so all metadata fields are extracted
matthewcoole Oct 11, 2024
24cad77
Added chunking to dvc pipeline
matthewcoole Oct 11, 2024
4071052
Added pipeline step for embeddings and parameter options
matthewcoole Oct 15, 2024
ec183d3
Added chroma upload to pipeline
matthewcoole Oct 16, 2024
aaa3a93
Added script for running rag pipeline on eval datasets
matthewcoole Oct 16, 2024
f9b9b3e
Added dummy steps to pipeline
matthewcoole Oct 17, 2024
419e83f
Added evaluation script to pipeline
matthewcoole Oct 17, 2024
d119b00
Updated readme
matthewcoole Oct 17, 2024
ccd4e3c
Added metrics file to dvc config
matthewcoole Oct 17, 2024
4f7ab43
Added ruff, mypy and cleaned scripts
matthewcoole Oct 18, 2024
9705b61
Added script to download supporting docs
matthewcoole Oct 18, 2024
68e17ac
Reformatted code with ruff
matthewcoole Oct 18, 2024
be38526
Caught exception when supporting docs not available
matthewcoole Oct 18, 2024
e9b504f
Testing dag diagram in mermaid format
matthewcoole Oct 21, 2024
dbfb2bc
Updated readme /w mermaid format dag
matthewcoole Oct 21, 2024
501df1a
Removed vllm dependency and updated readme
matthewcoole Oct 21, 2024
731544a
Froze dependencies to try and fix gh action
matthewcoole Oct 25, 2024
9944b4f
Merged readme changes
matthewcoole Oct 25, 2024
6846999
Fixed gh actions pipeline
matthewcoole Oct 25, 2024
6a2f0b4
re-added ruff
matthewcoole Oct 25, 2024
aa29224
re-added pymodule setup
matthewcoole Oct 25, 2024
5d04301
Update pyproject.toml
matthewcoole Oct 25, 2024
25962c6
Merge from main
matthewcoole Oct 29, 2024
1 change: 1 addition & 0 deletions .dvc/config
@@ -1,5 +1,6 @@
 [core]
     remote = jasmin
+    autostage = true
 ['remote "jasmin"']
     url = s3://dvc-test
     endpointurl = https://llm-eval-o.s3-ext.jc.rl.ac.uk
7 changes: 5 additions & 2 deletions .github/workflows/cml.yaml
@@ -3,10 +3,13 @@ on: [push]
 jobs:
   train-and-report:
     runs-on: ubuntu-latest
-    container: docker://ghcr.io/iterative/cml:0-dvc2-base1
     steps:
       - uses: actions/checkout@v3
-      - name: Train model
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - uses: iterative/setup-cml@v2
+      - name: Run pipeline
         env:
           REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
1 change: 0 additions & 1 deletion .gitignore
@@ -163,5 +163,4 @@ cython_debug/
 
 metrics.txt
 metrics.png
-/data
 gdrive-oauth.txt
94 changes: 89 additions & 5 deletions README.md
@@ -1,13 +1,97 @@
# llm-eval
Scripts and data for LLM evaluation.

This repository contains a reproducible workflow set up using [DVC](https://dvc.org/) backed by a [JASMIN object store](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/). Before working with the repository, please contact [Matt Coole](mailto:[email protected]) to request access to the JASMIN object store `llm-eval-o`, then follow the instructions below.
## Requirements
- [Ollama](https://ollama.com/download) ([`llama3.1`](https://ollama.com/library/llama3.1) and [`mistral-nemo`](https://ollama.com/library/mistral-nemo) models)
- [Python 3.9+](https://www.python.org/downloads/)

## Getting started
### Setup
First create a new virtual environment and install the required dependencies:
```shell
python -m venv .venv
source .venv/bin/activate
pip install .
```
### Configuration
Next, set up your local DVC configuration with your [JASMIN object store access key](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/#creating-an-access-key-and-secret):
```shell
dvc remote modify --local jasmin access_key_id '<ACCESS_KEY_ID>'
dvc remote modify --local jasmin secret_access_key '<KEY_SECRET>'
```
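The `--local` flag keeps these credentials out of version control: DVC writes them to `.dvc/config.local`, which is ignored by git, rather than to the shared `.dvc/config`. The resulting file should look roughly like this (using the same placeholders as the commands above):
```ini
['remote "jasmin"']
    access_key_id = <ACCESS_KEY_ID>
    secret_access_key = <KEY_SECRET>
```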
### Getting the data
Pull the data from the object store using DVC:
```shell
dvc pull
```
### Working with the pipeline
You should now be ready to re-run the pipeline:
```shell
dvc repro
```
This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the command:
```shell
dvc dag
```
or it can be output in Mermaid format for embedding in Markdown:
```shell
dvc dag --md
```
```mermaid
flowchart TD
node1["chunk-data"]
node2["create-embeddings"]
node3["evaluate"]
node4["extract-metadata"]
node5["fetch-metadata"]
node6["fetch-supporting-docs"]
node7["generate-testset"]
node8["run-rag-pipeline"]
node9["upload-to-docstore"]
node1-->node2
node2-->node9
node4-->node1
node5-->node4
node5-->node6
node6-->node1
node7-->node8
node8-->node3
node9-->node8
node10["data/evaluation-sets.dvc"]
node11["data/synthetic-datasets.dvc"]
```
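The stages above are declared in [`dvc.yaml`](dvc.yaml) using the standard DVC stage schema. As a rough illustration, the `chunk-data` stage might look something like the sketch below; it is reconstructed from the command recorded in [`dvc.lock`](dvc.lock) rather than copied from the real file, and the `${hp.chunk-size}` templating is an assumption inferred from the experiments section further down:
```yaml
stages:
  chunk-data:
    cmd: >-
      python scripts/chunk_data.py -o data/chunked_data.json
      -c ${hp.chunk-size} -ol 100 -s 10
      data/extracted_metadata.json data/supporting-docs.json
    deps:
      - scripts/chunk_data.py
      - data/extracted_metadata.json
      - data/supporting-docs.json
    outs:
      - data/chunked_data.json
```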
> Note: To re-run the `fetch-supporting-docs` stage of the pipeline, you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file (a sketch follows below).
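
A minimal sketch of that `.env` file (the exact keys expected by `fetch_supporting_docs.py` are an assumption here):
```shell
# Assumed variable names; check scripts/fetch_supporting_docs.py for the exact keys it reads.
username=<your-legilo-username>
password=<your-legilo-password>
```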

## Running Experiments
By default, the pipeline runs using the parameters defined in [`params.yaml`](params.yaml). To experiment with varying these parameters you can change them directly, or use [DVC experiments](https://dvc.org/doc/user-guide/experiment-management/experiments-overview).

To run an experiment varying a particular parameter:
```shell
dvc exp run -S hp.chunk-size=1000
```
This will re-run the pipeline, overriding the value of the `hp.chunk-size` parameter in [`params.yaml`](params.yaml) and setting it to `1000`. Only the stages affected by the changed parameter should be re-run, and the results should appear in your workspace.
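
For reference, the relevant fragment of [`params.yaml`](params.yaml) presumably looks something like this (the baseline value of `300` is taken from the comparison below; any other parameters are omitted):
```yaml
hp:
  chunk-size: 300
```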

You can compare the results of your experiment to the results of the baseline run of the pipeline using:
```shell
dvc exp diff
```
```shell
Path Metric HEAD workspace Change
data/metrics.json answer_correctness 0.049482 0.043685 -0.0057974
data/metrics.json answer_similarity 0.19793 0.17474 -0.02319
data/metrics.json context_recall 0.125 0 -0.125
data/metrics.json faithfulness 0.75 0.69375 -0.05625

Path Param HEAD workspace Change
params.yaml hp.chunk-size 300 1000 700
```
## Notes

### DVC and CML
Notes on the use of Data Version Control and Continuous Machine Learning:
- [DVC](dvc.md)
- [CML](cml.md)

### vLLM
Notes on running models with vLLM:
- [vLLM](vllm.md)
6 changes: 0 additions & 6 deletions data.dvc

This file was deleted.

15 changes: 15 additions & 0 deletions data/.gitignore
@@ -0,0 +1,15 @@
/synthetic-datasets
/evaluation-sets
/eidc_metadata.json
/prepared_data.json
/prepared_eidc_metadata.json
/extracted_metadata.json
/chunked_data.json
/chunked_embeddings.json
/embeddings.json
/chroma-data
/evaluation_data.csv
/eidc_rag_test_sample.csv
/supporting-docs.json
/metrics.json
/eval.png
6 changes: 6 additions & 0 deletions data/evaluation-sets.dvc
@@ -0,0 +1,6 @@
outs:
- md5: c3b5aefd8b8ab17f3087a49eb8265689.dir
  size: 232043
  nfiles: 2
  hash: md5
  path: evaluation-sets
6 changes: 6 additions & 0 deletions data/synthetic-datasets.dvc
@@ -0,0 +1,6 @@
outs:
- md5: 9d87c638c5cc518ea360c474c4e1e9ef.dir
  size: 152121
  nfiles: 2
  hash: md5
  path: synthetic-datasets
25 changes: 21 additions & 4 deletions dummy-evaluation.py
@@ -3,7 +3,12 @@
 import plotly.graph_objects as go
 import plotly.io as pio
 
-metrics = {"answer_relevancy", "answer_correctness", "context_precision", "context_recall"}
+metrics = {
+    "answer_relevancy",
+    "answer_correctness",
+    "context_precision",
+    "context_recall",
+}
 dummy_data = {metric: np.random.rand(100) for metric in metrics}
 df = pd.DataFrame(dummy_data)
 
@@ -13,8 +18,20 @@
 
 pio.templates.default = "gridon"
 fig = go.Figure()
-metrics = [metric for metric in df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]]
+metrics = [
+    metric
+    for metric in df.columns.to_list()
+    if metric not in ["question", "ground_truth", "answer", "contexts"]
+]
 for metric in metrics:
-    fig.add_trace(go.Violin(y=df[metric], name=metric, points="all", box_visible=True, meanline_visible=True))
-fig.update_yaxes(range=[-0.02,1.02])
+    fig.add_trace(
+        go.Violin(
+            y=df[metric],
+            name=metric,
+            points="all",
+            box_visible=True,
+            meanline_visible=True,
+        )
+    )
+fig.update_yaxes(range=[-0.02, 1.02])
 fig.write_image("metrics.png")
167 changes: 167 additions & 0 deletions dvc.lock
@@ -0,0 +1,167 @@
schema: '2.0'
stages:
  fetch-metadata:
    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
    deps:
    - path: scripts/fetch_eidc_metadata.py
      hash: md5
      md5: 53d620665448ef91f2deedb517e2f502
      size: 675
    outs:
    - path: data/eidc_metadata.json
      hash: md5
      md5: b4f3774a2921debb4d7740165ac604d4
      size: 12157676
  prepare:
    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
    deps:
    - path: data/eidc_metadata.json
      hash: md5
      md5: 423dc3a61ede72e1d5c818d74277c0b4
      size: 12140491
    - path: scripts/extract_metadata.py
      hash: md5
      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
      size: 1296
    outs:
    - path: data/extracted_metadata.json
      hash: md5
      md5: 7d2ae8d6a41a960592f30496eb498af7
      size: 4578493
  extract-metadata:
    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
    deps:
    - path: data/eidc_metadata.json
      hash: md5
      md5: b4f3774a2921debb4d7740165ac604d4
      size: 12157676
    - path: scripts/extract_metadata.py
      hash: md5
      md5: 3f0269a6413845f4425af55e7cea7bf8
      size: 1304
    outs:
    - path: data/extracted_metadata.json
      hash: md5
      md5: 789fda7a14f9a85c6ee0e10af8170a95
      size: 4584498
  chunk-data:
    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
      10 data/extracted_metadata.json data/supporting-docs.json
    deps:
    - path: data/extracted_metadata.json
      hash: md5
      md5: 789fda7a14f9a85c6ee0e10af8170a95
      size: 4584498
    - path: data/supporting-docs.json
      hash: md5
      md5: f3ea9980226e5408497c96a10cc77b80
      size: 72013526
    - path: scripts/chunk_data.py
      hash: md5
      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
      size: 2509
    outs:
    - path: data/chunked_data.json
      hash: md5
      md5: f6426396e1a3564b53649ef5fc0571fd
      size: 993814
  create-embeddings:
    cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
    deps:
    - path: data/chunked_data.json
      hash: md5
      md5: f6426396e1a3564b53649ef5fc0571fd
      size: 993814
    - path: scripts/create_embeddings.py
      hash: md5
      md5: 4649c700dfae922b43b3608ee4f00c1a
      size: 808
    outs:
    - path: data/embeddings.json
      hash: md5
      md5: 8fd682131a282736f6a81a6c53040b1e
      size: 13422675
  upload-to-docstore:
    cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
      -em all-MiniLM-L6-v2 -c eidc-data
    deps:
    - path: data/embeddings.json
      hash: md5
      md5: 8fd682131a282736f6a81a6c53040b1e
      size: 13422675
    - path: scripts/upload_to_docstore.py
      hash: md5
      md5: 41da88e3bb6d2592bee938ce347f6983
      size: 1905
    outs:
    - path: data/chroma-data
      hash: md5
      md5: 5c99644f30def03f87b37c98341c6f25.dir
      size: 13758136
      nfiles: 6
  run-rag-pipeline:
    cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
      data/chroma-data -c eidc-data
    deps:
    - path: data/chroma-data
      hash: md5
      md5: 5c99644f30def03f87b37c98341c6f25.dir
      size: 13758136
      nfiles: 6
    - path: data/eidc_rag_test_sample.csv
      hash: md5
      md5: a371d83c5822d256286e80d64d58c3fe
      size: 7524
    - path: scripts/run_rag_pipeline.py
      hash: md5
      md5: 8d5fc0669771146562c773186f4f44f6
      size: 3667
    outs:
    - path: data/evaluation_data.csv
      hash: md5
      md5: 8ea0a3f240478e9db41855922ac534a6
      size: 9894
  generate-testset:
    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
    outs:
    - path: data/eidc_rag_test_sample.csv
      hash: md5
      md5: a371d83c5822d256286e80d64d58c3fe
      size: 7524
  fetch-supporting-docs:
    cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
    deps:
    - path: data/eidc_metadata.json
      hash: md5
      md5: b4f3774a2921debb4d7740165ac604d4
      size: 12157676
    - path: scripts/fetch_supporting_docs.py
      hash: md5
      md5: 923af3b6ce1447d388b08fab0e3ab77d
      size: 1660
    outs:
    - path: data/supporting-docs.json
      hash: md5
      md5: f3ea9980226e5408497c96a10cc77b80
      size: 72013526
  evaluate:
    cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
      -img data/eval.png
    deps:
    - path: data/evaluation_data.csv
      hash: md5
      md5: 8ea0a3f240478e9db41855922ac534a6
      size: 9894
    - path: scripts/evaluate.py
      hash: md5
      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
      size: 2633
    outs:
    - path: data/eval.png
      hash: md5
      md5: bae77b1b721bf283a30a64f67af45fea
      size: 74438
    - path: data/metrics.json
      hash: md5
      md5: 0145280f36071a6df551ef57d3f8393e
      size: 229