Adds script to run experiments #13

Merged
merged 3 commits on Nov 21, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -164,3 +164,4 @@ cython_debug/
metrics.txt
metrics.png
gdrive-oauth.txt
/eval
26 changes: 25 additions & 1 deletion README.md
@@ -85,7 +85,31 @@ data/metrics.json faithfulness 0.75 0.69375 -0.05625
Path Param HEAD workspace Change
params.yaml hp.chunk-size 300 1000 700
```
## Notes

It is also possible to compare the results of all experiments:
```shell
dvc exp show --only-changed
```
Experiments can be removed using the command below (the `-A` flag removes all experiments; individual experiments can instead be removed by their name or ID):
```shell
dvc exp remove -A
```
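For example, to remove a single experiment by the name DVC assigned to it (the name below is a hypothetical placeholder; use a name from `dvc exp show`):
```shell
dvc exp remove exotic-awls
```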
### Experiment Runner
The repository includes a simple shell script that can be used as an experiment runner to test a number of different models:
```shell
./run-experiments.sh
```
This will run the DVC pipeline with several different LLM models (check the shell script for details) and save the results as experiments.
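A single ad-hoc experiment can also be queued without editing the script by overriding the model parameter directly, using the same command the script runs (the model tag here is a hypothetical example):
```shell
dvc exp run --queue -S rag.model=gemma2
dvc queue start
```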

An experiment will be queued for each model defined in the script and run in the background. To check the status of the experiments:
```shell
dvc queue status
```
To check the output of an experiment that is currently running, use:
```shell
dvc queue logs $EXPERIMENT_NAME
```
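The log output of a running experiment can also be streamed as it executes by adding the follow flag (`-f`):
```shell
dvc queue logs -f $EXPERIMENT_NAME
```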
## Other Notes

### DVC and CML
Notes on the use of Data Version Control and Continuous Machine Learning:
1 change: 1 addition & 0 deletions data/.gitignore
@@ -13,4 +13,5 @@
/supporting-docs.json
/metrics.json
/eval.png
/eidc_rag_testset.csv
/eidc_rag_test_set.csv
20 changes: 10 additions & 10 deletions dvc.lock
@@ -5,8 +5,8 @@ stages:
deps:
- path: scripts/fetch_eidc_metadata.py
hash: md5
md5: a564cb0804b482ef09658f0cb4a0a705
size: 941
md5: 82907434d9521996e30014df01bbba8e
size: 964
outs:
- path: data/eidc_metadata.json
hash: md5
@@ -45,8 +45,8 @@ stages:
md5: f6123510b2b337bc8a2b6a7180e54b36
size: 4606527
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
data/supporting-docs.json
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json
data/supporting-docs.json -m 250
deps:
- path: data/extracted_metadata.json
hash: md5
@@ -58,8 +58,8 @@
size: 72280322
- path: scripts/chunk_data.py
hash: md5
md5: e8de02d6b14c8fc22533d0becfb7d35d
size: 2198
md5: 3ad449140b03e1c2904b22a5b401a12e
size: 2705
outs:
- path: data/chunked_data.json
hash: md5
@@ -74,8 +74,8 @@
size: 124484286
- path: scripts/create_embeddings.py
hash: md5
md5: d9282fc92ed400855c4fc2a290289f14
size: 867
md5: fa4627c83a65af2e3ea9b2b749f1b29d
size: 952
outs:
- path: data/embeddings.json
hash: md5
@@ -154,8 +154,8 @@
size: 203253
- path: scripts/evaluate.py
hash: md5
md5: a9c4c04157007c12c068aacdf5e099a9
size: 2634
md5: 4154acf8e74c1d8bcd0b0da72af038e0
size: 2728
outs:
- path: data/eval.png
hash: md5
10 changes: 5 additions & 5 deletions dvc.yaml
@@ -1,3 +1,5 @@
metrics:
- data/metrics.json
stages:
fetch-metadata:
cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
@@ -20,7 +22,7 @@ stages:
outs:
- ${files.extracted}
chunk-data:
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs} -m ${max-length}
deps:
- ${files.extracted}
- ${files.supporting-docs}
@@ -42,11 +44,11 @@ stages:
outs:
- ${files.doc-store}
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_set.csv data/
cmd: head -n ${test-set-size} data/synthetic-datasets/eidc_rag_test_sample.csv > ${files.test-set}
outs:
- ${files.test-set}
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection}
cmd: python scripts/run_rag_pipeline.py -i ${files.test-set} -o ${files.eval-set} -ds ${files.doc-store} -c ${doc-store.collection} -m ${rag.model}
deps:
- ${files.test-set}
- ${files.doc-store}
@@ -61,5 +63,3 @@ stages:
outs:
- ${files.metrics}
- ${files.eval-plot}
metrics:
- ${files.metrics}
10 changes: 6 additions & 4 deletions params.yaml
@@ -1,6 +1,6 @@
hp:
chunk-size: 500
overlap: 100
chunk-size: 250
overlap: 75
embeddings-model: all-MiniLM-L6-v2
doc-store:
collection: eidc-data
@@ -12,11 +12,13 @@ files:
chunked: data/chunked_data.json
embeddings: data/embeddings.json
doc-store: data/chroma-data
test-set: data/eidc_rag_test_sample.csv
test-set: data/eidc_rag_testset.csv
eval-set: data/evaluation_data.csv
metrics: data/metrics.json
eval-plot: data/eval.png
sub-sample: 3 # sample size of 0 will process all data
sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
max-length: 0 # truncate longer texts for testing (0 will use all data)
test-set-size: 101 # lines of the test set CSV kept by `head` (header + 100 rows) for faster testing
rag:
model: llama3.1
prompt: >-
1 change: 1 addition & 0 deletions pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
"ragas == 0.1.10",
"nltk == 3.9.1",
"nbformat == 4.2.0",
"pygit2 == 1.14.1",
]

[project.optional-dependencies]
12 changes: 12 additions & 0 deletions run-experiments.sh
@@ -0,0 +1,12 @@
#!/bin/bash
NC='\033[0m'
GREEN='\033[0;32m'
# Clear out any previously queued experiments
dvc queue remove --all
# Queue one experiment per model
models=("llama3" "llama3.1" "mistral-nemo")
for model in "${models[@]}"
do
  dvc exp run --queue -S rag.model="$model"
done
dvc queue start
dvc queue status
echo -e "Run ${GREEN}dvc queue status${NC} to check the state of the experiments"
36 changes: 27 additions & 9 deletions scripts/chunk_data.py
@@ -3,19 +3,22 @@
from typing import Any, Dict, List


def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]:
def chunk_value(
value: str, chunk_size: int, overlap: int, max_length: int
) -> List[str]:
chunks = []
start = 0
while start < len(value):
end = max_length if len(value) > max_length > 0 else len(value)
while start < end:
chunks.append(value[start : (start + chunk_size)])
start += chunk_size - overlap
return chunks


def chunk_metadata_value(
metada_value: str, chunk_size: int, overlap: int
metada_value: str, chunk_size: int, overlap: int, max_length: int
) -> List[Dict[str, Any]]:
chunks = chunk_value(metada_value["value"], chunk_size, overlap)
chunks = chunk_value(metada_value["value"], chunk_size, overlap, max_length)
return [
{
"chunk": chunks[i],
@@ -28,20 +31,26 @@ def chunk_metadata_value(


def chunk_metadata_file(
file: str, chunk_size: int, overlap: int
file: str, chunk_size: int, overlap: int, max_length: int
) -> List[Dict[str, str]]:
chunked_metadata = []
with open(file) as f:
json_data = json.load(f)
for metadata in json_data:
chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
chunked_metadata.extend(
chunk_metadata_value(metadata, chunk_size, overlap, max_length)
)
return chunked_metadata


def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
def main(
files: List[str], ouput_file: str, chunk_size: int, overlap: int, max_length: int
) -> None:
all_chunked_metadata = []
for file in files:
all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
all_chunked_metadata.extend(
chunk_metadata_file(file, chunk_size, overlap, max_length)
)
with open(ouput_file, "w") as f:
json.dump(all_chunked_metadata, f, indent=4)

@@ -73,6 +82,15 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
nargs="?",
const=100,
)
parser.add_argument(
"-m",
"--max_length",
help="""Maximum length of data in characters - meant for truncating large
strings in testing. 0 defaults to all data""",
type=int,
nargs="?",
const=0,
default=0,  # omitting -m behaves like -m 0 (no truncation) instead of passing None
)
args = parser.parse_args()
assert args.chunk > args.overlap
main(args.input_files, args.output, args.chunk, args.overlap)
main(args.input_files, args.output, args.chunk, args.overlap, args.max_length)
4 changes: 4 additions & 0 deletions scripts/create_embeddings.py
@@ -1,6 +1,8 @@
import gc
import json
from argparse import ArgumentParser

import torch
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm import tqdm
@@ -16,6 +18,8 @@ def main(input_file: str, output_file: str) -> None:
data = json.load(input)
for chunk in tqdm(data):
chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
gc.collect()
torch.cuda.empty_cache()
json.dump(data, output)


11 changes: 7 additions & 4 deletions scripts/evaluate.py
@@ -1,5 +1,6 @@
import json
from argparse import ArgumentParser
from pathlib import Path

import nest_asyncio
import pandas as pd
@@ -44,17 +45,19 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
run_config=RunConfig(max_workers=1),
)
result_df = result.to_pandas()
pio.templates.default = "gridon"
fig = go.Figure()

with open(metric_output, "w") as f:
Path(metric_output).parent.mkdir(parents=True, exist_ok=True)
with open(metric_output, "w+") as f:
json.dump(result, f)
metrics = [
metric
for metric in result_df.columns.to_list()
if metric not in ["question", "ground_truth", "answer", "contexts"]
]

pio.templates.default = "gridon"
fig = go.Figure()

for metric in metrics:
fig.add_trace(
go.Violin(
@@ -66,7 +69,7 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
)
)
fig.update_yaxes(range=[-0.02, 1.02])
with open(image_output, "wb") as f:
with open(image_output, "wb+") as f:
f.write(fig.to_image(format="png"))


3 changes: 2 additions & 1 deletion scripts/fetch_eidc_metadata.py
@@ -17,7 +17,8 @@ def main(output_file: str, sample: int) -> None:
},
)
json_data = res.json()
json_data["results"] = json_data["results"][:sample]
if sample > 0:
json_data["results"] = json_data["results"][:sample]
with open(output_file, "w") as f:
json.dump(json_data, f, indent=4)

27 changes: 19 additions & 8 deletions scripts/run_rag_pipeline.py
@@ -37,8 +37,6 @@ def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline:

prompt_builder = PromptBuilder(template=template)

model_name = "llama3.1"

print(f"Setting up model ({model_name})...")
llm = OllamaGenerator(
model=model_name,
@@ -87,11 +85,15 @@ def query_pipeline(questions: List[str], rag_pipe: Pipeline) -> Tuple[str, List[


def main(
test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str
test_data_file: str,
ouput_file: str,
doc_store_path: str,
collection_name: str,
model: str,
) -> None:
shutil.copytree(doc_store_path, TMP_DOC_PATH)

rag_pipe = build_rag_pipeline("llama3.1", collection_name)
rag_pipe = build_rag_pipeline(model, collection_name)

df = pd.read_csv(test_data_file)
df.drop(columns=["rating", "contexts"], inplace=True)
@@ -108,15 +110,18 @@ def main(
if __name__ == "__main__":
parser = ArgumentParser("run_rag_pipeline.py")
parser.add_argument(
"test_data_file",
"-i",
"--input",
help="File containing test queries to generate response from the RAG pipeline.",
)
parser.add_argument(
"output_file",
"-o",
"--output",
help="File to output results to.",
)
parser.add_argument(
"doc_store_path",
"-ds",
"--doc_store",
help="Path to the doc store.",
)
parser.add_argument(
@@ -125,5 +130,11 @@
help="Collection name in doc store.",
default="eidc-data",
)
parser.add_argument(
"-m",
"--model",
help="Model to use in RAG pipeline.",
default="llama3.1",
)
args = parser.parse_args()
main(args.test_data_file, args.output_file, args.doc_store_path, args.collection)
main(args.input, args.output, args.doc_store, args.collection, args.model)