diff --git a/.gitignore b/.gitignore
index bf560c6..03bcc6a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,4 @@ cython_debug/
 metrics.txt
 metrics.png
 gdrive-oauth.txt
+/eval
diff --git a/README.md b/README.md
index 414f241..5168631 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,31 @@ data/metrics.json faithfulness 0.75 0.69375 -0.05625
 Path         Param          HEAD  workspace  Change
 params.yaml  hp.chunk-size  300   1000       700
 ```
-## Notes
+
+It is also possible to compare the results of all experiments:
+```shell
+dvc exp show --only-changed
+```
+Experiments can be removed using the following command (the `-A` flag removes all experiments; individual experiments can be removed by name or ID):
+```shell
+dvc exp remove -A
+```
+### Experiment Runner
+The repository includes a simple shell script that can be used as an experiment runner to test a number of different models:
+```shell
+./run-experiments.sh
+```
+This will run the DVC pipeline with several different LLM models (check the shell script for details) and save the results as experiments.
+
+An experiment will be queued for each model defined in the script and run in the background. To check the status of the experiments:
+```shell
+dvc queue status
+```
+To check the output of a currently running experiment, use:
+```shell
+dvc queue log $EXPERIMENT_NAME
+```
+## Other Notes
 ### DVC and CML
 Notes on the use of Data Version Control and Continuous Machine Learning:
 
diff --git a/data/.gitignore b/data/.gitignore
index 2db9c05..14b46ba 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -13,4 +13,5 @@
 /supporting-docs.json
 /metrics.json
 /eval.png
+/eidc_rag_testset.csv
 /eidc_rag_test_set.csv
diff --git a/dvc.lock b/dvc.lock
index afedaab..a343f7c 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,8 +5,8 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: a564cb0804b482ef09658f0cb4a0a705
-      size: 941
+      md5: 82907434d9521996e30014df01bbba8e
+      size: 964
     outs:
     - path: data/eidc_metadata.json
       hash: md5
@@ -45,8 +45,8 @@ stages:
       md5: f6123510b2b337bc8a2b6a7180e54b36
       size: 4606527
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
-      data/supporting-docs.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json
+      data/supporting-docs.json -m 250
     deps:
     - path: data/extracted_metadata.json
       hash: md5
@@ -58,8 +58,8 @@ stages:
       size: 72280322
     - path: scripts/chunk_data.py
       hash: md5
-      md5: e8de02d6b14c8fc22533d0becfb7d35d
-      size: 2198
+      md5: 3ad449140b03e1c2904b22a5b401a12e
+      size: 2705
     outs:
     - path: data/chunked_data.json
       hash: md5
@@ -74,8 +74,8 @@ stages:
       size: 124484286
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: d9282fc92ed400855c4fc2a290289f14
-      size: 867
+      md5: fa4627c83a65af2e3ea9b2b749f1b29d
+      size: 952
     outs:
     - path: data/embeddings.json
       hash: md5
@@ -154,8 +154,8 @@ stages:
       size: 203253
     - path: scripts/evaluate.py
       hash: md5
-      md5: a9c4c04157007c12c068aacdf5e099a9
-      size: 2634
+      md5: 4154acf8e74c1d8bcd0b0da72af038e0
+      size: 2728
     outs:
     - path: data/eval.png
       hash: md5
diff --git a/dvc.yaml b/dvc.yaml
index 3bb11da..db54d68 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,3 +1,5 @@
+metrics:
+- data/metrics.json
 stages:
   fetch-metadata:
     cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
@@ -20,7 +22,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs} -m ${max-length}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
@@ -42,11 +44,11 @@ stages:
     outs:
     - ${files.doc-store}
   generate-testset:
-    cmd: cp data/synthetic-datasets/eidc_rag_test_set.csv data/
+    cmd: head -n ${test-set-size} data/synthetic-datasets/eidc_rag_test_sample.csv > ${files.test-set}
     outs:
     - ${files.test-set}
   run-rag-pipeline:
-    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection}
+    cmd: python scripts/run_rag_pipeline.py -i ${files.test-set} -o ${files.eval-set} -ds ${files.doc-store} -c ${doc-store.collection} -m ${rag.model}
     deps:
     - ${files.test-set}
     - ${files.doc-store}
@@ -61,5 +63,3 @@ stages:
     outs:
     - ${files.metrics}
     - ${files.eval-plot}
-metrics:
-- ${files.metrics}
diff --git a/params.yaml b/params.yaml
index edf0085..e679c5f 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,6 +1,6 @@
 hp:
-  chunk-size: 500
-  overlap: 100
+  chunk-size: 250
+  overlap: 75
   embeddings-model: all-MiniLM-L6-v2
 doc-store:
   collection: eidc-data
@@ -12,11 +12,13 @@ files:
   chunked: data/chunked_data.json
   embeddings: data/embeddings.json
   doc-store: data/chroma-data
-  test-set: data/eidc_rag_test_sample.csv
+  test-set: data/eidc_rag_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sub-sample: 3 # sample size of 0 will process all data
+sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
+max-length: 0 # truncate longer texts for testing (0 will use all data)
+test-set-size: 101 # reduce the size of the test set for faster testing
 rag:
   model: llama3.1
   prompt: >-
diff --git a/pyproject.toml b/pyproject.toml
index 3dda280..a56e3f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "ragas == 0.1.10",
     "nltk == 3.9.1",
     "nbformat == 4.2.0",
+    "pygit2 == 1.14.1",
 ]
 
 [project.optional-dependencies]
diff --git a/run-experiments.sh b/run-experiments.sh
new file mode 100755
index 0000000..83849a2
--- /dev/null
+++ b/run-experiments.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+NC='\033[0m'
+GREEN='\033[0;32m'
+dvc queue remove --all
+models=("llama3" "llama3.1" "mistral-nemo")
+for model in "${models[@]}"
+do
+    dvc exp run --queue -S rag.model="$model"
+done
+dvc queue start
+dvc queue status
+echo -e "Run ${GREEN}dvc queue status${NC} to check the state of the experiments"
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index 7fe672b..d2e70d6 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -3,19 +3,22 @@
 from typing import Any, Dict, List
 
 
-def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]:
+def chunk_value(
+    value: str, chunk_size: int, overlap: int, max_length: int
+) -> List[str]:
     chunks = []
     start = 0
-    while start < len(value):
+    end = max_length if len(value) > max_length > 0 else len(value)
+    while start < end:
         chunks.append(value[start : (start + chunk_size)])
         start += chunk_size - overlap
     return chunks
 
 
 def chunk_metadata_value(
-    metada_value: str, chunk_size: int, overlap: int
+    metada_value: str, chunk_size: int, overlap: int, max_length: int
 ) -> List[Dict[str, Any]]:
-    chunks = chunk_value(metada_value["value"], chunk_size, overlap)
+    chunks = chunk_value(metada_value["value"], chunk_size, overlap, max_length)
     return [
         {
             "chunk": chunks[i],
@@ -28,20 +31,26 @@
 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int
+    file: str, chunk_size: int, overlap: int, max_length: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
     for metadata in json_data:
-        chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+        chunked_metadata.extend(
+            chunk_metadata_value(metadata, chunk_size, overlap, max_length)
+        )
     return chunked_metadata
 
 
-def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+def main(
+    files: List[str], ouput_file: str, chunk_size: int, overlap: int, max_length: int
+) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+        all_chunked_metadata.extend(
+            chunk_metadata_file(file, chunk_size, overlap, max_length)
+        )
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
 
@@ -73,6 +82,16 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
         nargs="?",
         const=100,
     )
+    parser.add_argument(
+        "-m",
+        "--max_length",
+        help="""Maximum length of data in characters - meant for truncating large
+        strings in testing. 0 defaults to all data""",
+        type=int,
+        nargs="?",
+        const=0,
+        default=0,
+    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap)
+    main(args.input_files, args.output, args.chunk, args.overlap, args.max_length)
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index 7aa507c..1ae255e 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -1,6 +1,8 @@
+import gc
 import json
 from argparse import ArgumentParser
 
+import torch
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 from tqdm import tqdm
@@ -16,6 +18,8 @@ def main(input_file: str, output_file: str) -> None:
         data = json.load(input)
         for chunk in tqdm(data):
             chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
+            gc.collect()
+            torch.cuda.empty_cache()
         json.dump(data, output)
 
 
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index c130e96..fbe348f 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -1,5 +1,6 @@
 import json
 from argparse import ArgumentParser
+from pathlib import Path
 
 import nest_asyncio
 import pandas as pd
@@ -44,10 +45,9 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         run_config=RunConfig(max_workers=1),
     )
     result_df = result.to_pandas()
 
-    pio.templates.default = "gridon"
-    fig = go.Figure()
-    with open(metric_output, "w") as f:
+    Path(metric_output).parent.mkdir(parents=True, exist_ok=True)
+    with open(metric_output, "w+") as f:
         json.dump(result, f)
     metrics = [
         metric
@@ -55,6 +55,9 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         if metric not in ["question", "ground_truth", "answer", "contexts"]
     ]
 
+    pio.templates.default = "gridon"
+    fig = go.Figure()
+
     for metric in metrics:
         fig.add_trace(
             go.Violin(
@@ -66,7 +69,7 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
             )
         )
     fig.update_yaxes(range=[-0.02, 1.02])
-    with open(image_output, "wb") as f:
+    with open(image_output, "wb+") as f:
         f.write(fig.to_image(format="png"))
 
 
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 5e883d9..c53220c 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -17,7 +17,8 @@ def main(output_file: str, sample: int) -> None:
         },
     )
     json_data = res.json()
-    json_data["results"] = json_data["results"][:sample]
+    if sample > 0:
+        json_data["results"] = json_data["results"][:sample]
json_data["results"] = json_data["results"][:sample] with open(output_file, "w") as f: json.dump(json_data, f, indent=4) diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index 97d0fb2..f4f9ff4 100644 --- a/scripts/run_rag_pipeline.py +++ b/scripts/run_rag_pipeline.py @@ -37,8 +37,6 @@ def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline: prompt_builder = PromptBuilder(template=template) - model_name = "llama3.1" - print(f"Setting up model ({model_name})...") llm = OllamaGenerator( model=model_name, @@ -87,11 +85,15 @@ def query_pipeline(questions: List[str], rag_pipe: Pipeline) -> Tuple[str, List[ def main( - test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str + test_data_file: str, + ouput_file: str, + doc_store_path: str, + collection_name: str, + model: str, ) -> None: shutil.copytree(doc_store_path, TMP_DOC_PATH) - rag_pipe = build_rag_pipeline("llama3.1", collection_name) + rag_pipe = build_rag_pipeline(model, collection_name) df = pd.read_csv(test_data_file) df.drop(columns=["rating", "contexts"], inplace=True) @@ -108,15 +110,18 @@ def main( if __name__ == "__main__": parser = ArgumentParser("run_rag_pipeline.py") parser.add_argument( - "test_data_file", + "-i", + "--input", help="File containing test queries to generate response from the RAG pipeline.", ) parser.add_argument( - "output_file", + "-o", + "--output", help="File to output results to.", ) parser.add_argument( - "doc_store_path", + "-ds", + "--doc_store", help="Path to the doc store.", ) parser.add_argument( @@ -125,5 +130,11 @@ def main( help="Collection name in doc store.", default="eidc-data", ) + parser.add_argument( + "-m", + "--model", + help="Model to use in RAG pipeline.", + default="llama3.1", + ) args = parser.parse_args() - main(args.test_data_file, args.output_file, args.doc_store_path, args.collection) + main(args.input, args.output, args.doc_store, args.collection, args.model)