From 9e9434780c7346e6d6749a2e2b9fe37cdd3a00f6 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 9 Oct 2024 09:23:23 +0100
Subject: [PATCH 01/28] Set up basic dvc pipeline

---
 .gitignore                       |  1 -
 data.dvc                         |  6 ------
 data/.gitignore                  |  4 ++++
 data/evaluation-sets.dvc         |  6 ++++++
 data/synthetic-datasets.dvc      |  6 ++++++
 dvc.lock                         | 30 ++++++++++++++++++++++++++++++
 dvc.yaml                         | 14 ++++++++++++++
 pyproject.toml                   |  1 +
 src/llm_eval/evaluate.py         |  1 +
 src/llm_eval/fetch_data.py       | 16 ++++++++++++++++
 src/llm_eval/prepare_data.py     | 18 ++++++++++++++++++
 src/llm_eval/run_rag_pipeline.py |  1 +
 12 files changed, 97 insertions(+), 7 deletions(-)
 delete mode 100644 data.dvc
 create mode 100644 data/.gitignore
 create mode 100644 data/evaluation-sets.dvc
 create mode 100644 data/synthetic-datasets.dvc
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 src/llm_eval/evaluate.py
 create mode 100644 src/llm_eval/fetch_data.py
 create mode 100644 src/llm_eval/prepare_data.py
 create mode 100644 src/llm_eval/run_rag_pipeline.py

diff --git a/.gitignore b/.gitignore
index c92c62d..bf560c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,5 +163,4 @@ cython_debug/
 
 metrics.txt
 metrics.png
-/data
 gdrive-oauth.txt
diff --git a/data.dvc b/data.dvc
deleted file mode 100644
index d79d822..0000000
--- a/data.dvc
+++ /dev/null
@@ -1,6 +0,0 @@
-outs:
-- md5: 9f50d9dbc781216d5aac93d599e190d7.dir
-  size: 376640
-  nfiles: 3
-  hash: md5
-  path: data
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..d752729
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,4 @@
+/synthetic-datasets
+/evaluation-sets
+/eidc_metadata.json
+/prepared_data.json
diff --git a/data/evaluation-sets.dvc b/data/evaluation-sets.dvc
new file mode 100644
index 0000000..bf21ef3
--- /dev/null
+++ b/data/evaluation-sets.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: c3b5aefd8b8ab17f3087a49eb8265689.dir
+  size: 232043
+  nfiles: 2
+  hash: md5
+  path: evaluation-sets
diff --git a/data/synthetic-datasets.dvc b/data/synthetic-datasets.dvc
new file mode 100644
index 0000000..dc27bb2
--- /dev/null
+++ b/data/synthetic-datasets.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 61b4177259b03a7227784b5b7560726d.dir
+  size: 144597
+  nfiles: 1
+  hash: md5
+  path: synthetic-datasets
diff --git a/dvc.lock b/dvc.lock
new file mode 100644
index 0000000..e13d895
--- /dev/null
+++ b/dvc.lock
@@ -0,0 +1,30 @@
+schema: '2.0'
+stages:
+  fetch:
+    cmd: python src/llm_eval/fetch_data.py
+    deps:
+    - path: src/llm_eval/fetch_data.py
+      hash: md5
+      md5: 10194a16edb7620ed4342e00104f5f95
+      size: 307
+    outs:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 5db14ae6031ed3bb3a99588a0a313bda
+      size: 101
+  prepare:
+    cmd: python src/llm_eval/prepare_data.py
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 5db14ae6031ed3bb3a99588a0a313bda
+      size: 101
+    - path: src/llm_eval/prepare_data.py
+      hash: md5
+      md5: d285150e5a1f7c252c0a4562bf24ce0e
+      size: 519
+    outs:
+    - path: data/prepared_data.json
+      hash: md5
+      md5: a6ed512685f3c5f2073517183fbad9fa
+      size: 17005
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..3b96094
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,14 @@
+stages:
+  fetch:
+    cmd: python src/llm_eval/fetch_data.py
+    deps:
+    - src/llm_eval/fetch_data.py
+    outs:
+    - data/eidc_metadata.json
+  prepare:
+    cmd: python src/llm_eval/prepare_data.py
+    deps:
+    - data/eidc_metadata.json
+    - src/llm_eval/prepare_data.py
+    outs:
+    - data/prepared_data.json
diff --git a/pyproject.toml b/pyproject.toml
index f62ea0a..8fe55c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "bitsandbytes",
     "haystack-ai",
     "accelerate",
+    "sentence-transformers",
 ]
 
 [project.optional-dependencies]
diff --git a/src/llm_eval/evaluate.py b/src/llm_eval/evaluate.py
new file mode 100644
index 0000000..7fcf1a7
--- /dev/null
+++ b/src/llm_eval/evaluate.py
@@ -0,0 +1 @@
+# Run RAGAS to evaluate
\ No newline at end of file
diff --git a/src/llm_eval/fetch_data.py b/src/llm_eval/fetch_data.py
new file mode 100644
index 0000000..a591b65
--- /dev/null
+++ b/src/llm_eval/fetch_data.py
@@ -0,0 +1,16 @@
+import json
+
+
+def main():
+    data = {
+        "datasets": [
+            {"name": "dsone", "desc": "some description"},
+            {"name": "dstwo", "desc": "some text"},
+        ]
+    }
+    with open("data/eidc_metadata.json", "w") as f:
+        json.dump(data, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/llm_eval/prepare_data.py b/src/llm_eval/prepare_data.py
new file mode 100644
index 0000000..68d43f6
--- /dev/null
+++ b/src/llm_eval/prepare_data.py
@@ -0,0 +1,18 @@
+import json
+from sentence_transformers import SentenceTransformer
+
+def create_embedding(text):
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    return model.encode(text)
+
+
+def main():
+    with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
+        data = json.load(input)
+        for dataset in data["datasets"]:
+            dataset["desc_embedding"] = create_embedding(dataset["desc"]).tolist()
+        json.dump(data, output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/llm_eval/run_rag_pipeline.py b/src/llm_eval/run_rag_pipeline.py
new file mode 100644
index 0000000..c7f9258
--- /dev/null
+++ b/src/llm_eval/run_rag_pipeline.py
@@ -0,0 +1 @@
+# Generate RAG responses
\ No newline at end of file

From 3bfa70a7e419f205fe8956427ccd7bed2c3e9476 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 9 Oct 2024 15:13:42 +0100
Subject: [PATCH 02/28] Removed hf token

---
 notebooks/vllm_test.ipynb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/notebooks/vllm_test.ipynb b/notebooks/vllm_test.ipynb
index e8b19df..755c34c 100644
--- a/notebooks/vllm_test.ipynb
+++ b/notebooks/vllm_test.ipynb
@@ -27,8 +27,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "os.environ[\"HF_TOKEN\"] = \"hf_********************************\""
+    "import os"
    ]
   },
  {

From 090770016204e6906b6bc4c27b86709fac7db572 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 10 Oct 2024 11:34:11 +0100
Subject: [PATCH 03/28] Added dvc lock file

---
 dvc.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index e13d895..3b748c3 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -21,10 +21,10 @@ stages:
       size: 101
     - path: src/llm_eval/prepare_data.py
       hash: md5
-      md5: d285150e5a1f7c252c0a4562bf24ce0e
-      size: 519
+      md5: 91e15fb87f6a4d0188cf9ed011194411
+      size: 513
     outs:
     - path: data/prepared_data.json
       hash: md5
-      md5: a6ed512685f3c5f2073517183fbad9fa
-      size: 17005
+      md5: 6eac511808d32275195826bdce66a2d2
+      size: 16962

From 7d89512da499b22269fe83ee42dd3f25b6e0d4e6 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 10 Oct 2024 11:35:41 +0100
Subject: [PATCH 04/28] Changed embedding json name

---
 src/llm_eval/prepare_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llm_eval/prepare_data.py b/src/llm_eval/prepare_data.py
index 68d43f6..ccaae20 100644
--- a/src/llm_eval/prepare_data.py
+++ b/src/llm_eval/prepare_data.py
@@ -10,7 +10,7 @@ def main():
     with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
        data = json.load(input)
         for dataset in data["datasets"]:
-            dataset["desc_embedding"] = create_embedding(dataset["desc"]).tolist()
+            dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist()
         json.dump(data, output)
 
 

From 2e99516dbd18d4259663b7609d158954e08f3c34 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:28:41 +0100
Subject: [PATCH 05/28] Cleaned pipeline

---
 .dvc/config                                   |  1 +
 data/.gitignore                               |  1 +
 dvc.lock                                      | 32 +++++++--------
 dvc.yaml                                      | 12 +++---
 .../create_embeddings.py                      |  0
 {src/llm_eval => scripts}/evaluate.py         |  0
 scripts/fetch_eidc_metadata.py                | 25 ++++++++++++
 scripts/fetch_eidc_supporting_docs.py         |  0
 scripts/prepare_data.py                       | 40 +++++++++++++++++++
 {src/llm_eval => scripts}/run_rag_pipeline.py |  0
 src/llm_eval/fetch_data.py                    | 16 --------
 11 files changed, 89 insertions(+), 38 deletions(-)
 rename src/llm_eval/prepare_data.py => scripts/create_embeddings.py (100%)
 rename {src/llm_eval => scripts}/evaluate.py (100%)
 create mode 100644 scripts/fetch_eidc_metadata.py
 create mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/prepare_data.py
 rename {src/llm_eval => scripts}/run_rag_pipeline.py (100%)
 delete mode 100644 src/llm_eval/fetch_data.py

diff --git a/.dvc/config b/.dvc/config
index 14b6315..8a3434e 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -1,5 +1,6 @@
 [core]
     remote = jasmin
+    autostage = true
 ['remote "jasmin"']
     url = s3://dvc-test
     endpointurl = https://llm-eval-o.s3-ext.jc.rl.ac.uk
diff --git a/data/.gitignore b/data/.gitignore
index d752729..d703ed1 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -2,3 +2,4 @@ /synthetic-datasets
 /evaluation-sets
 /eidc_metadata.json
 /prepared_data.json
+/prepared_eidc_metadata.json
diff --git a/dvc.lock b/dvc.lock
index e13d895..79ba668 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -1,30 +1,30 @@
 schema: '2.0'
 stages:
-  fetch:
-    cmd: python src/llm_eval/fetch_data.py
+  fetch-metadata:
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
     deps:
-    - path: src/llm_eval/fetch_data.py
+    - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 10194a16edb7620ed4342e00104f5f95
-      size: 307
+      md5: ff336062c921e5e8f95bd569cd064e22
+      size: 664
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 5db14ae6031ed3bb3a99588a0a313bda
-      size: 101
+      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
+      size: 8968593
   prepare:
-    cmd: python src/llm_eval/prepare_data.py
+    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 5db14ae6031ed3bb3a99588a0a313bda
-      size: 101
-    - path: src/llm_eval/prepare_data.py
+      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
+      size: 8968593
+    - path: scripts/prepare_data.py
       hash: md5
-      md5: d285150e5a1f7c252c0a4562bf24ce0e
-      size: 519
+      md5: bdab13adab508052f1d16ab0967b428b
+      size: 1215
     outs:
-    - path: data/prepared_data.json
+    - path: data/prepared_eidc_metadata.json
       hash: md5
-      md5: a6ed512685f3c5f2073517183fbad9fa
-      size: 17005
+      md5: ffd8914e46ffba8c47cdca6eb6ae0140
+      size: 2120475
diff --git a/dvc.yaml b/dvc.yaml
index 3b96094..dc7a55c 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,14 +1,14 @@
 stages:
-  fetch:
-    cmd: python src/llm_eval/fetch_data.py
+  fetch-metadata:
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
     deps:
-    - src/llm_eval/fetch_data.py
+    - scripts/fetch_eidc_metadata.py
     outs:
     - data/eidc_metadata.json
   prepare:
-    cmd: python src/llm_eval/prepare_data.py
+    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - data/eidc_metadata.json
-    - src/llm_eval/prepare_data.py
+    - scripts/prepare_data.py
     outs:
-    - data/prepared_data.json
+    - data/prepared_eidc_metadata.json
diff --git a/src/llm_eval/prepare_data.py b/scripts/create_embeddings.py
similarity index 100%
rename from src/llm_eval/prepare_data.py
rename to scripts/create_embeddings.py
diff --git a/src/llm_eval/evaluate.py b/scripts/evaluate.py
similarity index 100%
rename from src/llm_eval/evaluate.py
rename to scripts/evaluate.py
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
new file mode 100644
index 0000000..561f935
--- /dev/null
+++ b/scripts/fetch_eidc_metadata.py
@@ -0,0 +1,25 @@
+import requests
+import json
+from argparse import ArgumentParser
+
+URL = "https://catalogue.ceh.ac.uk/eidc/documents"
+
+def main(output_file: str) -> None:
+    res = requests.get(
+        URL,
+        headers={"content-type": "application/json"},
+        params={
+            "page": 1,
+            "rows": 2000,
+            "term": "recordType:Dataset",
+        },
+    )
+    with open(output_file, "w") as f:
+        json.dump(res.json(), f)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("fetch_eidc_metadata.py")
+    parser.add_argument("output", help="The file path to save the downloaded data to.")
+    args = parser.parse_args()
+    main(args.output)
diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
new file mode 100644
index 0000000..89a744c
--- /dev/null
+++ b/scripts/prepare_data.py
@@ -0,0 +1,40 @@
+from typing import List, Dict
+import json
+from argparse import ArgumentParser
+
+
+METADATA_FIELDS = ["title", "description", "lineage"]
+
+
+def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:
+    metadata = {}
+    metadata["id"] = json_data["identifier"]
+    for field in fields:
+        if json_data[field]:
+            metadata["field"] = field
+            metadata["value"] = json_data[field]
+    return metadata
+
+
+def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
+    data = []
+    with open(file_path) as f:
+        json_data = json.load(f)
+        for dataset in json_data["results"]:
+            dataset_metadata = extact_eidc_metadata_fields(dataset)
+            data.append(dataset_metadata)
+    return data
+
+
+def main(input, output) -> None:
+    data = parse_eidc_metadata(input)
+    with open(output, "w") as f:
+        json.dump(data, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input", help="The file to be used as input.")
+    parser.add_argument("output", help="The path to save the processed result.")
+    args = parser.parse_args()
+    main(args.input, args.output)
diff --git a/src/llm_eval/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
similarity index 100%
rename from src/llm_eval/run_rag_pipeline.py
rename to scripts/run_rag_pipeline.py
diff --git a/src/llm_eval/fetch_data.py b/src/llm_eval/fetch_data.py
deleted file mode 100644
index a591b65..0000000
--- a/src/llm_eval/fetch_data.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import json
-
-
-def main():
-    data = {
-        "datasets": [
-            {"name": "dsone", "desc": "some description"},
-            {"name": "dstwo", "desc": "some text"},
-        ]
-    }
-    with open("data/eidc_metadata.json", "w") as f:
-        json.dump(data, f)
-
-
-if __name__ == "__main__":
-    main()

From 739f8fc563099cb4112debb4dbdb423dfdf3f0f3 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:33:43 +0100
Subject: [PATCH 06/28] Cleaned scripts and added additional metadata field

---
 dvc.lock                       | 20 ++++++++++----------
 scripts/fetch_eidc_metadata.py |  2 +-
 scripts/prepare_data.py        |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 79ba668..81cd8e2 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,26 +5,26 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
      hash: md5
-      md5: ff336062c921e5e8f95bd569cd064e22
-      size: 664
+      md5: 43a63d91a3d66caa03738a000c841406
+      size: 674
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
-      size: 8968593
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
   prepare:
     cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 3c4cd23bfc699358c955d4e7c68e8c9d
-      size: 8968593
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
     - path: scripts/prepare_data.py
       hash: md5
-      md5: bdab13adab508052f1d16ab0967b428b
-      size: 1215
+      md5: bcbf4413aeee83928054d9c6c6c2bacc
+      size: 1224
     outs:
     - path: data/prepared_eidc_metadata.json
       hash: md5
-      md5: ffd8914e46ffba8c47cdca6eb6ae0140
-      size: 2120475
+      md5: 0b4ca8c49da450bc8fec0e92d577466c
+      size: 411936
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 561f935..5ea1064 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -15,7 +15,7 @@ def main(output_file: str) -> None:
         },
     )
     with open(output_file, "w") as f:
-        json.dump(res.json(), f)
+        json.dump(res.json(), f, indent=4)
 
 
 if __name__ == "__main__":
diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
index 89a744c..4adffdc 100644
--- a/scripts/prepare_data.py
+++ b/scripts/prepare_data.py
@@ -3,7 +3,7 @@
 from argparse import ArgumentParser
 
 
-METADATA_FIELDS = ["title", "description", "lineage"]
+METADATA_FIELDS = ["title", "description", "lineage", "title"]
 
 
 def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:

From 8148da1083c70aa5f94ad885702524adb913208e Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 11:52:29 +0100
Subject: [PATCH 07/28] Removed modules from project file

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 8fe55c9..eb48198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,3 +21,6 @@ jupyter = [
     "ipykernel",
     "ipywidgets",
 ]
+
+[tool.setuptools]
+py-modules = []

From 2ec86cd49d44e8aa8816f0ecf94c6ab295fff7fc Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 14:04:23 +0100
Subject: [PATCH 08/28] Fixed bug so all metadata fields are extracted

---
 data/.gitignore                                  |  1 +
 dvc.lock                                         | 14 +++++++------
 dvc.yaml                                         |  6 +++---
 scripts/{prepare_data.py => extract_metadata.py} | 14 ++++++++------
 4 files changed, 19 insertions(+), 16 deletions(-)
 rename scripts/{prepare_data.py => extract_metadata.py} (76%)

diff --git a/data/.gitignore b/data/.gitignore
index d703ed1..88737f3 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -3,3 +3,4 @@
 /eidc_metadata.json
 /prepared_data.json
 /prepared_eidc_metadata.json
+/extracted_metadata.json
diff --git a/dvc.lock b/dvc.lock
index 81cd8e2..bf2dea4 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -13,18 +13,18 @@ stages:
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
-    - path: scripts/prepare_data.py
+    - path: scripts/extract_metadata.py
       hash: md5
-      md5: bcbf4413aeee83928054d9c6c6c2bacc
-      size: 1224
+      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
+      size: 1296
     outs:
-    - path: data/prepared_eidc_metadata.json
+    - path: data/extracted_metadata.json
       hash: md5
-      md5: 0b4ca8c49da450bc8fec0e92d577466c
-      size: 411936
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
diff --git a/dvc.yaml b/dvc.yaml
index dc7a55c..517a69d 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -6,9 +6,9 @@ stages:
     outs:
     - data/eidc_metadata.json
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - data/eidc_metadata.json
-    - scripts/prepare_data.py
+    - scripts/extract_metadata.py
     outs:
-    - data/prepared_eidc_metadata.json
+    - data/extracted_metadata.json
diff --git a/scripts/prepare_data.py b/scripts/extract_metadata.py
similarity index 76%
rename from scripts/prepare_data.py
rename to scripts/extract_metadata.py
index 4adffdc..241bc1a 100644
--- a/scripts/prepare_data.py
+++ b/scripts/extract_metadata.py
@@ -3,17 +3,19 @@
 from argparse import ArgumentParser
 
 
-METADATA_FIELDS = ["title", "description", "lineage", "title"]
+METADATA_FIELDS = ["title", "description", "lineage"]
 
 
-def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:
-    metadata = {}
-    metadata["id"] = json_data["identifier"]
+def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]:
+    metadatas = []
     for field in fields:
         if json_data[field]:
+            metadata = {}
+            metadata["id"] = json_data["identifier"]
             metadata["field"] = field
             metadata["value"] = json_data[field]
-    return metadata
+            metadatas.append(metadata)
+    return metadatas
 
 
 def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
@@ -22,7 +24,7 @@ def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
         json_data = json.load(f)
         for dataset in json_data["results"]:
             dataset_metadata = extact_eidc_metadata_fields(dataset)
-            data.append(dataset_metadata)
+            data.extend(dataset_metadata)
     return data

From 24cad77d5539b31a1b374b3695f5531a95d19793 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 11 Oct 2024 14:52:22 +0100
Subject: [PATCH 09/28] Added chunking to dvc pipeline

---
 data/.gitignore       |  1 +
 dvc.lock              | 32 ++++++++++++++++++++++++
 dvc.yaml              |  9 ++++++-
 scripts/chunk_data.py | 57 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 scripts/chunk_data.py

diff --git a/data/.gitignore b/data/.gitignore
index 88737f3..e675951 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -4,3 +4,4 @@
 /prepared_data.json
 /prepared_eidc_metadata.json
 /extracted_metadata.json
+/chunked_data.json
diff --git a/dvc.lock b/dvc.lock
index bf2dea4..a69544a 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -28,3 +28,35 @@ stages:
       hash: md5
       md5: 7d2ae8d6a41a960592f30496eb498af7
       size: 4578493
+  extract-metadata:
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: 423dc3a61ede72e1d5c818d74277c0b4
+      size: 12140491
+    - path: scripts/extract_metadata.py
+      hash: md5
+      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
+      size: 1296
+    outs:
+    - path: data/extracted_metadata.json
+      hash: md5
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
+  chunk-data:
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    deps:
+    - path: data/extracted_metadata.json
+      hash: md5
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
+    - path: scripts/chunk_data.py
+      hash: md5
+      md5: b89a3ae9f6f9a0142149e70dc6fc5735
+      size: 1903
+    outs:
+    - path: data/chunked_data.json
+      hash: md5
+      md5: 7ba3d3785db066283e35d654e11cf28b
+      size: 6373503
diff --git a/dvc.yaml b/dvc.yaml
index 517a69d..bf00465 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -5,10 +5,17 @@ stages:
     - scripts/fetch_eidc_metadata.py
     outs:
     - data/eidc_metadata.json
-  prepare:
+  extract-metadata:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - data/eidc_metadata.json
     - scripts/extract_metadata.py
     outs:
     - data/extracted_metadata.json
+  chunk-data:
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    deps:
+    - data/extracted_metadata.json
+    - scripts/chunk_data.py
+    outs:
+    - data/chunked_data.json
\ No newline at end of file
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
new file mode 100644
index 0000000..cdec705
--- /dev/null
+++ b/scripts/chunk_data.py
@@ -0,0 +1,57 @@
+from typing import List, Dict
+import json
+from argparse import ArgumentParser
+
+
+def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]:
+    chunks = []
+    start = 0
+    while start < len(value):
+        chunks.append(value[start : (start + chunk_size)])
+        start += chunk_size - overlap
+    return chunks
+
+
+def chunk_metadata_value(metada_value, chunk_size, overlap):
+    chunks = chunk_value(metada_value["value"], chunk_size, overlap)
+    return [
+        {
+            "chunk": chunks[i],
+            "field": metada_value["field"],
+            "id": metada_value["id"],
+            "index": i,
+        }
+        for i in range(len(chunks))
+    ]
+
+
+def chunk_metadata_file(file: str, chunk_size: int, overlap: int) -> List[Dict[str, str]]:
+    chunked_metadata = []
+    with open(file) as f:
+        json_data = json.load(f)
+        for metadata in json_data:
+            chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+    return chunked_metadata
+
+
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+    all_chunked_metadata = []
+    for file in files:
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+    with open(ouput_file, "w") as f:
+        json.dump(all_chunked_metadata, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input_files", nargs="+", help="List of files to chunk.")
+    parser.add_argument("-o", "--output", help="The file to write the output to.")
+    parser.add_argument(
+        "-c", "--chunk", help="Desired chunk size in characters.", type=int
+    )
+    parser.add_argument(
+        "-ol", "--overlap", help="Chunk overlap in characters.", type=int
+    )
+    args = parser.parse_args()
+    assert args.chunk > args.overlap
+    main(args.input_files, args.output, args.chunk, args.overlap)

From 40710525cec556eb4cc644e0389eb6d4d0d9bee8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Tue, 15 Oct 2024 15:35:55 +0100
Subject: [PATCH 10/28] Added pipeline step for embeddings and parameter
 options

---
 data/.gitignore                |  2 ++
 dvc.lock                       | 45 +++++++++++++++++++++++----------
 dvc.yaml                       | 23 ++++++++++-------
 params.yaml                    |  9 +++++++
 scripts/chunk_data.py          | 49 +++++++++++++++++++++++++++++-----
 scripts/create_embeddings.py   | 16 +++++++-----
 scripts/fetch_eidc_metadata.py |  2 +-
 scripts/upload_to_docstore.py  |  6 +++++
 8 files changed, 117 insertions(+), 35 deletions(-)
 create mode 100644 params.yaml
 create mode 100644 scripts/upload_to_docstore.py

diff --git a/data/.gitignore b/data/.gitignore
index e675951..133b354 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -5,3 +5,5 @@
 /prepared_eidc_metadata.json
 /extracted_metadata.json
 /chunked_data.json
+/chunked_embeddings.json
+/embeddings.json
diff --git a/dvc.lock b/dvc.lock
index a69544a..e537716 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,13 +5,13 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 43a63d91a3d66caa03738a000c841406
+      md5: ba838a284da239217d0464f08e0a45ce
       size: 674
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 423dc3a61ede72e1d5c818d74277c0b4
-      size: 12140491
+      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
+      size: 12146216
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -26,15 +26,15 @@ stages:
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
   extract-metadata:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: 423dc3a61ede72e1d5c818d74277c0b4
-      size: 12140491
+      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
+      size: 12146216
@@ -42,21 +42,38 @@ stages:
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
+      10 data/extracted_metadata.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 7d2ae8d6a41a960592f30496eb498af7
-      size: 4578493
+      md5: fce18ce3c43175af1cea5d84dac9baf9
+      size: 4579965
     - path: scripts/chunk_data.py
       hash: md5
-      md5: b89a3ae9f6f9a0142149e70dc6fc5735
-      size: 1903
+      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
+      size: 2509
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: 7ba3d3785db066283e35d654e11cf28b
-      size: 6373503
+      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
+      size: 14947
+  create-embeddings:
+    cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
+    deps:
+    - path: data/chunked_data.json
+      hash: md5
+      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
+      size: 14947
+    - path: scripts/create_embeddings.py
+      hash: md5
+      md5: 3dc6ef284730398375a13df4bff41846
+      size: 808
+    outs:
+    - path: data/embeddings.json
+      hash: md5
+      md5: b08299369d1f243eb8d8ffa2cdb9a90f
+      size: 351126
diff --git a/dvc.yaml b/dvc.yaml
index bf00465..f1f20cc 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,21 +1,28 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
-    - data/eidc_metadata.json
+    - ${files.metadata}
   extract-metadata:
-    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
+    cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
     deps:
-    - data/eidc_metadata.json
+    - ${files.metadata}
     - scripts/extract_metadata.py
     outs:
-    - data/extracted_metadata.json
+    - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
     deps:
-    - data/extracted_metadata.json
+    - ${files.extracted}
     - scripts/chunk_data.py
     outs:
-    - data/chunked_data.json
\ No newline at end of file
+    - ${files.chunked}
+  create-embeddings:
+    cmd: python scripts/create_embeddings.py ${files.chunked} ${files.embeddings}
+    deps:
+    - ${files.chunked}
+    - scripts/create_embeddings.py
+    outs:
+    - ${files.embeddings}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 0000000..d079be0
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,9 @@
+hp:
+  chunk-size: 300
+  overlap: 100
+files:
+  metadata: "data/eidc_metadata.json"
+  extracted: "data/extracted_metadata.json"
+  chunked: "data/chunked_data.json"
+  embeddings: "data/embeddings.json"
+sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index cdec705..ace111d 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -25,19 +25,29 @@ def chunk_metadata_value(metada_value, chunk_size, overlap):
     ]
 
 
-def chunk_metadata_file(file: str, chunk_size: int, overlap: int) -> List[Dict[str, str]]:
+def chunk_metadata_file(
+    file: str, chunk_size: int, overlap: int, sample_size: int
+) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
+        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+            count += 1
+            if count == sample_size:
+                break
     return chunked_metadata
 
 
-def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+def main(
+    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
+) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+        all_chunked_metadata.extend(
+            chunk_metadata_file(file, chunk_size, overlap, sample_size)
+        )
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
@@ -45,13 +55,38 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
 if __name__ == "__main__":
     parser = ArgumentParser("prepare_data.py")
     parser.add_argument("input_files", nargs="+", help="List of files to chunk.")
-    parser.add_argument("-o", "--output", help="The file to write the output to.")
     parser.add_argument(
-        "-c", "--chunk", help="Desired chunk size in characters.", type=int
+        "-o",
+        "--output",
+        help="The json file to write the output to.",
+        type=str,
+        nargs="?",
+        const="chunk_data_output.json",
     )
     parser.add_argument(
-        "-ol", "--overlap", help="Chunk overlap in characters.", type=int
+        "-c",
+        "--chunk",
+        help="Desired chunk size in characters.",
+        type=int,
+        nargs="?",
+        const=300,
+    )
+    parser.add_argument(
+        "-ol",
+        "--overlap",
+        help="Chunk overlap in characters.",
+        type=int,
+        nargs="?",
+        const=100,
+    )
+    parser.add_argument(
+        "-s",
+        "--sample",
+        help="Only generate chunks for n datasets",
+        type=int,
+        nargs="?",
+        const=0,
     )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap)
+    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index ccaae20..ce1c37b 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -1,18 +1,24 @@
 import json
 from sentence_transformers import SentenceTransformer
+from argparse import ArgumentParser
+from tqdm import tqdm
 
 def create_embedding(text):
     model = SentenceTransformer("all-MiniLM-L6-v2")
     return model.encode(text)
 
 
-def main():
-    with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
+def main(input_file, output_file):
+    with open(input_file) as input, open(output_file, "w") as output:
         data = json.load(input)
-        for dataset in data["datasets"]:
-            dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist()
+        for chunk in tqdm(data):
+            chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
         json.dump(data, output)
 
 
 if __name__ == "__main__":
-    main()
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input", help="The file to be used as input.")
+    parser.add_argument("output", help="The path to save the processed result.")
+    args = parser.parse_args()
+    main(args.input, args.output)
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 5ea1064..cd56b4e 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -10,7 +10,7 @@ def main(output_file: str) -> None:
         headers={"content-type": "application/json"},
         params={
             "page": 1,
-            "rows": 2000,
+            "rows": 2500,
             "term": "recordType:Dataset",
         },
     )
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
new file mode 100644
index 0000000..6e4fb85
--- /dev/null
+++ b/scripts/upload_to_docstore.py
@@ -0,0 +1,6 @@
+from argparse import ArgumentParser
+
+if __name__ == "__main__":
+    parser = ArgumentParser("prepare_data.py")
+    parser.add_argument("input_file", nargs="+", help="File containing chunks and embeddings to upload to document store")
+    parser.add_argument("-o", "--output", help="The file to write the output to.")
\ No newline at end of file

From ec183d3dacdf4f0fc08fb77b0ef233c0595787d3 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 16 Oct 2024 11:28:34 +0100
Subject: [PATCH 11/28] Added chroma upload to pipeline

---
 data/.gitignore               |  1 +
 dvc.lock                      | 18 ++++++++++
 dvc.yaml                      |  9 +++++-
 params.yaml                   |  2 ++
 pyproject.toml                |  1 +
 scripts/upload_to_docstore.py | 56 +++++++++++++++++++++++++++++++++--
 6 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index 133b354..c3f2331 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -7,3 +7,4 @@
 /chunked_data.json
 /chunked_embeddings.json
 /embeddings.json
+/chroma-data
diff --git a/dvc.lock b/dvc.lock
index e537716..27e749b 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -77,3 +77,21 @@ stages:
       hash: md5
       md5: b08299369d1f243eb8d8ffa2cdb9a90f
       size: 351126
+  upload-to-docstore:
+    cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
+      -em all-MiniLM-L6-v2
+    deps:
+    - path: data/embeddings.json
+      hash: md5
+      md5: b08299369d1f243eb8d8ffa2cdb9a90f
+      size: 351126
+    - path: scripts/upload_to_docstore.py
+      hash: md5
+      md5: ae8755770166dd3d6c1efb9f15723116
+      size: 1836
+    outs:
+    - path: data/chroma-data
+      hash: md5
+      md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
+      size: 2069220
+      nfiles: 5
diff --git a/dvc.yaml b/dvc.yaml
index f1f20cc..5446540 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -25,4 +25,11 @@ stages:
     - ${files.chunked}
     - scripts/create_embeddings.py
     outs:
-    - ${files.embeddings}
\ No newline at end of file
+    - ${files.embeddings}
+  upload-to-docstore:
+    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model}
+    deps:
+    - ${files.embeddings}
+    - scripts/upload_to_docstore.py
+    outs:
+    - ${files.doc-store}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index d079be0..812a62e 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,9 +1,11 @@
 hp:
   chunk-size: 300
   overlap: 100
+  embeddings-model: "all-MiniLM-L6-v2"
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
+  doc-store: "data/chroma-data"
 sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index eb48198..a8e8384 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "haystack-ai",
     "accelerate",
     "sentence-transformers",
+    "chromadb",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
index 6e4fb85..4f2e8af 100644
--- a/scripts/upload_to_docstore.py
+++ b/scripts/upload_to_docstore.py
@@ -1,6 +1,58 @@
 from argparse import ArgumentParser
+import json
+import uuid
+
+import chromadb
+from chromadb.utils import embedding_functions
+
+
+def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
+    print(collection_name)
+    with open(input_file) as f:
+        json_data = json.load(f)
+
+    docs = [chunk["chunk"] for chunk in json_data]
+    metas = [
+        {field: chunk[field] for field in ["field", "id", "index"]}
+        for chunk in json_data
+    ]
+    embs = [chunk["embedding"] for chunk in json_data]
+    ids = [uuid.uuid4().hex for _ in json_data]
+
+    func = embedding_functions.SentenceTransformerEmbeddingFunction(
+        model_name=embedding_model
+    )
+
+    client = chromadb.PersistentClient(output_path)
+    collection = client.create_collection(
+        name=collection_name, embedding_function=func
+    )
+    collection.add(documents=docs, metadatas=metas, embeddings=embs, ids=ids)
+
 
 if __name__ == "__main__":
     parser = ArgumentParser("prepare_data.py")
-    parser.add_argument("input_file", nargs="+", help="File containing chunks and embeddings to upload to document store")
-    parser.add_argument("-o", "--output", help="The file to write the output to.")
\ No newline at end of file
+    parser.add_argument(
+        "input_file",
+        help="File containing chunks and embeddings to upload to document store",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="The file to write the output to.",
+        default="data/chroma-data",
+    )
+    parser.add_argument(
+        "-c",
+        "--collection",
+        help="Collection name to use in doc store.",
+        default="eidc-data",
+    )
+    parser.add_argument(
+        "-em",
+        "--embedding_model",
+        help="Embedding model to use in the doc store (must be the same as the function used to create embeddings.)",
+        default="all-MiniLM-L6-v2",
+    )
+    args = parser.parse_args()
+    main(args.input_file, args.output, args.collection, args.embedding_model)

From aaa3a93ef06c13590c6b3c9f6425a5eeafd7f012 Mon Sep 17 00:00:00 2001
From: mpc
Date: Wed, 16 Oct 2024 15:25:52 +0100
Subject: [PATCH 12/28] Added script for running rag pipeline on eval datasets

---
 dvc.yaml                    |  5 +-
 params.yaml                 | 14 +++++-
 pyproject.toml              |  2 +
 scripts/run_rag_pipeline.py | 99 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/dvc.yaml b/dvc.yaml
index 5446540..ab5a76f 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -32,4 +32,7 @@ stages:
     - ${files.embeddings}
     - scripts/upload_to_docstore.py
     outs:
-    - ${files.doc-store}
\ No newline at end of file
+    - ${files.doc-store}
+  run-rag-pipeline:
+    cmd python scripts/run_rag_pipeline.py ${files.test-set}
+    
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index 812a62e..a4792b9 100644
--- a/params.yaml
+++ b/params.yaml
@@ -8,4 +8,16 @@ files:
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
   doc-store: "data/chroma-data"
-sample-size: 10 # sample size of 0 will process all data
\ No newline at end of file
+  test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
+sample-size: 10 # sample size of 0 will process all data
+rag:
+  model: llama3.1
+  prompt: >
+    You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n
+    Do not use your own knowledge to answer the question.\n
+    The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n
+    Do not refer to "context" in your answer, instead refer to the context as available information.
+    If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n
+    Question: {{query}}\n
+    Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %}
+    Answer:
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a8e8384..f86faac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,8 @@ dependencies = [
     "accelerate",
     "sentence-transformers",
     "chromadb",
+    "ollama-haystack == 0.0.7",
+    "chroma-haystack",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index c7f9258..16fab98 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -1 +1,98 @@
-# Generate RAG responses
\ No newline at end of file
+from argparse import ArgumentParser
+from haystack import Pipeline
+from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
+from haystack.components.builders import PromptBuilder
+from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
+from haystack.components.builders.answer_builder import AnswerBuilder
+import pandas as pd
+
+
+def build_rag_pipeline(model_name: str) -> Pipeline:
+    document_store = ChromaDocumentStore(
+        collection_name="eidc-data", persist_path="data/chroma-data"
+    )
+    retriever = ChromaQueryTextRetriever(document_store, top_k=3)
+    print("Creating prompt template...")
+
+    template = """
+    Given the following information, answer the question.
+
+    Question: {{query}}
+
+    Context:
+    {% for document in documents %}
+    {{ document.content }}
+    {% endfor %}
+
+    Answer:
+    """
+
+    prompt_builder = PromptBuilder(template=template)
+
+    model_name = "llama3.1"
+
+    print(f"Setting up model ({model_name})...")
+    llm = OllamaGenerator(
+        model=model_name,
+        generation_kwargs={"num_ctx": 16384},
+        url="http://localhost:11434/api/generate",
+    )
+
+    answer_builder = AnswerBuilder()
+
+    rag_pipe = Pipeline()
+
+    rag_pipe.add_component("retriever", retriever)
+    rag_pipe.add_component("prompt_builder", prompt_builder)
+    rag_pipe.add_component("llm", llm)
+    rag_pipe.add_component("answer_builder", answer_builder)
+
+    rag_pipe.connect("retriever.documents", "prompt_builder.documents")
+    rag_pipe.connect("retriever.documents", "answer_builder.documents")
+
+    rag_pipe.connect("prompt_builder", "llm")
+
+    rag_pipe.connect("llm.replies", "answer_builder.replies")
+    return rag_pipe
+
+
+def query_pipeline(query: str, pipeline: Pipeline):
+    return pipeline.run(
+        {
+            "retriever": {"query": query},
+            "prompt_builder": {"query": query},
+            "answer_builder": {"query": query},
+        }
+    )
+
+
+def main(test_data_file: str):
+    rag_pipe = build_rag_pipeline("llama3.1")
+
+    df = pd.read_csv(test_data_file)
+    responses = []
+    for q in df["question"]:
+        responses.append(query_pipeline(q, rag_pipe))
+    df["rag_response"] = responses
+    df.to_csv("data/rag_response.csv")
+
+    query = "Who collected the land cover map data?"
+    result = rag_pipe.run(
+        {
+            "retriever": {"query": query},
+            "prompt_builder": {"query": query},
+            "answer_builder": {"query": query},
+        }
+    )
+    print(result)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("run_rag_pipeline.py")
+    parser.add_argument(
+        "test_data_file",
+        help="File containing test queries to generate response from the RAG pipeline.",
+    )
+    args = parser.parse_args()
+    main(args.test_data_file)

From f9b9b3eb80d2abd44d1e4778242e51ff03c7b5d8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 17 Oct 2024 10:31:04 +0100
Subject: [PATCH 13/28] Added dummy steps to pipeline

---
 data/.gitignore             |  3 +++
 data/synthetic-datasets.dvc |  6 ++---
 dvc.lock                    | 46 +++++++++++++++++++++++++++++++++++++
 dvc.yaml                    | 22 ++++++++++++++++--
 params.yaml                 |  4 +++-
 pyproject.toml              |  2 ++
 scripts/run_rag_pipeline.py | 42 ++++++++++++++++++---------------
 7 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index c3f2331..b90999a 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -8,3 +8,6 @@
 /chunked_embeddings.json
 /embeddings.json
 /chroma-data
+/evaluation_data.csv
+/eidc_rag_test_sample.csv
+/supporting-docs.json
diff --git a/data/synthetic-datasets.dvc b/data/synthetic-datasets.dvc
index dc27bb2..cd53100 100644
--- a/data/synthetic-datasets.dvc
+++ b/data/synthetic-datasets.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 61b4177259b03a7227784b5b7560726d.dir
-  size: 144597
-  nfiles: 1
+- md5: 9d87c638c5cc518ea360c474c4e1e9ef.dir
+  size: 152121
+  nfiles: 2
   hash: md5
   path: synthetic-datasets
diff --git a/dvc.lock b/dvc.lock
index 27e749b..3fb8862 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -95,3 +95,49 @@ stages:
       md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
       size: 2069220
      nfiles: 5
+  run-rag-pipeline:
+    cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
+    deps:
+    - path: data/chroma-data
+      hash: md5
+      md5: 1d7c499f71791267391ff4108632988c.dir
+      size: 2069220
+      nfiles: 5
+    - path: data/eidc_rag_test_sample.csv
+      hash: md5
+      md5: a371d83c5822d256286e80d64d58c3fe
+      size: 7524
+    - path: scripts/run_rag_pipeline.py
+      hash: md5
+      md5: 6d1f49fa8b22288ecd50ed0e3898fd60
+      size: 3153
+    outs:
+    - path: data/evaluation_data.csv
+      hash: md5
+      md5: e313cb899c10a2b5ad670b8bc84d059f
+      size: 8407
+  generate-testset:
+    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
+    outs:
+    - path: data/eidc_rag_test_sample.csv
+      hash: md5
+      md5: a371d83c5822d256286e80d64d58c3fe
+      size: 7524
+  fetch-supporting-docs:
+    cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
+    outs:
+    - path: data/supporting-docs.json
+      hash: md5
+      md5: 0febface6f1d23fda46c11bef65284f4
+      size: 34
+  evaluate:
+    cmd: echo "Evaluate responses"
+    deps:
+    - path: data/evaluation_data.csv
+      hash: md5
+      md5: e313cb899c10a2b5ad670b8bc84d059f
+      size: 8407
diff --git a/dvc.yaml b/dvc.yaml
index ab5a76f..2028fa4 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -5,6 +5,10 @@ stages:
     - scripts/fetch_eidc_metadata.py
     outs:
     - ${files.metadata}
+  fetch-supporting-docs:
+    cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
+    outs:
+    - ${files.supporting-docs}
   extract-metadata:
     cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
     deps:
@@ -16,6 +20,7 @@ stages:
     cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
     deps:
     - ${files.extracted}
+    - ${files.supporting-docs}
     - scripts/chunk_data.py
     outs:
     - ${files.chunked}
@@ -33,6 +38,19 @@ stages:
     - scripts/upload_to_docstore.py
     outs:
     - ${files.doc-store}
+  generate-testset:
+    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
+    outs:
+    - ${files.test-set}
   run-rag-pipeline:
-    cmd python scripts/run_rag_pipeline.py ${files.test-set}
-    
+    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
+    deps:
+    - ${files.test-set}
+    - ${files.doc-store}
+    - scripts/run_rag_pipeline.py
+    outs:
+    - ${files.eval-set}
+  evaluate:
+    cmd: echo "Evaluate responses"
+    deps:
+    - ${files.eval-set}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index a4792b9..2f5354f 100644
--- a/params.yaml
+++ b/params.yaml
@@ -5,10 +5,12 @@ hp:
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
+  supporting-docs: "data/supporting-docs.json"
   chunked: "data/chunked_data.json"
   embeddings: "data/embeddings.json"
   doc-store: "data/chroma-data"
-  test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
+  test-set: "data/eidc_rag_test_sample.csv"
+  eval-set: "data/evaluation_data.csv"
 sample-size: 10 # sample size of 0 will process all data
 rag:
   model: llama3.1
diff --git a/pyproject.toml b/pyproject.toml
index f86faac..454973a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,8 @@ dependencies = [
     "chromadb",
     "ollama-haystack == 0.0.7",
     "chroma-haystack",
+    "ragas == 0.1.10",
+    "nltk"
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index 16fab98..830a052 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -57,7 +57,7 @@ def build_rag_pipeline(model_name: str) -> Pipeline:
     return rag_pipe
 
 
-def query_pipeline(query: str, pipeline: Pipeline):
+def run_query(query: str, pipeline: Pipeline):
     return pipeline.run(
         {
             "retriever": {"query": query},
@@ -67,25 +67,27 @@ def query_pipeline(query: str, pipeline: Pipeline):
     )
 
 
-def main(test_data_file: str):
+def query_pipeline(questions, rag_pipe):
+    answers = []
+    contexts = []
+    for q in questions:
+        response = run_query(q, rag_pipe)
+        answers.append(response["answer_builder"]["answers"][0].data)
+        contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+    return answers, contexts
+
+
+def main(test_data_file: str, ouput_file: str):
     rag_pipe = build_rag_pipeline("llama3.1")
 
     df = pd.read_csv(test_data_file)
-    responses = []
-    for q in df["question"]:
-        responses.append(query_pipeline(q, rag_pipe))
-    df["rag_response"] = responses
-    df.to_csv("data/rag_response.csv")
-
-    query = "Who collected the land cover map data?"
-    result = rag_pipe.run(
-        {
-            "retriever": {"query": query},
-            "prompt_builder": {"query": query},
-            "answer_builder": {"query": query},
-        }
-    )
-    print(result)
+    df.drop(columns=["rating", "contexts"], inplace=True)
+
+    answers, contexts = query_pipeline(df["question"], rag_pipe)
+
+    df["answer"] = answers
+    df["contexts"] = contexts
+    df.to_csv(ouput_file, index=False)
 
 
 if __name__ == "__main__":
@@ -94,5 +96,9 @@ def main(test_data_file: str):
         "test_data_file",
         help="File containing test queries to generate response from the RAG pipeline.",
     )
+    parser.add_argument(
+        "output_file",
+        help="File to output results to.",
+    )
     args = parser.parse_args()
-    main(args.test_data_file)
+    main(args.test_data_file, args.output_file)

From 419e83f790e2f36164185ce9ce6d441d4800004a Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 17 Oct 2024 14:44:57 +0100
Subject: [PATCH 14/28] Added evaluation script to pipeline

---
 data/.gitignore            |   2 +
 dvc.lock                   |  26 +++++--
 dvc.yaml                   |   8 ++-
 notebooks/ragas_eval.ipynb | 143 ++++++++++++++++++++++++++++---------
 params.yaml                |   2 +
 pyproject.toml             |   3 +-
 scripts/evaluate.py        |  78 +++++++++++++++++++-
 7 files changed, 220 insertions(+), 42 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index b90999a..09fbf7e 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -11,3 +11,5 @@
 /evaluation_data.csv
 /eidc_rag_test_sample.csv
 /supporting-docs.json
+/metrics.json
+/eval.png
diff --git a/dvc.lock b/dvc.lock
index 3fb8862..8b454b4 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -104,7 +104,7 @@ stages:
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 1d7c499f71791267391ff4108632988c.dir
+      md5: 0254e85bb660da611cfa14e5221dae92.dir
       size: 2069220
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
@@ -118,8 +118,8 @@ stages:
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,9 +133,23 @@ stages:
       md5: 0febface6f1d23fda46c11bef65284f4
       size: 34
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
+      -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
+    - path: scripts/evaluate.py
+      hash: md5
+      md5: 51f036b805f23dd3ebfd5d819bc9d457
+      size: 2489
+    outs:
+    - path: data/eval.png
+      hash: md5
+      md5: 8c11f987449f8718b6f6011078b6c259
+      size: 49498
+    - path: data/metrics.json
+      hash: md5
+      md5: 53fba29cb236fedd3c6446ea94fea3cc
+      size: 215
diff --git a/dvc.yaml b/dvc.yaml
index 2028fa4..fe6f0ea 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -51,6 +51,10 @@ stages:
     outs:
     - ${files.eval-set}
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot}
     deps:
-    - ${files.eval-set}
\ No newline at end of file
+    - ${files.eval-set}
+    - scripts/evaluate.py
+    outs:
+    - ${files.metrics}
+    - ${files.eval-plot}
\ No newline at end of file
diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb
index b1e39b7..53d862b 100644
--- a/notebooks/ragas_eval.ipynb
+++ b/notebooks/ragas_eval.ipynb
@@ -18,7 +18,21 @@ "output_type": "stream",
     "text": [
      "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-     " from .autonotebook import tqdm as notebook_tqdm\n"
+     " from .autonotebook import tqdm as notebook_tqdm\n",
+     "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/metrics/__init__.py:1: LangChainDeprecationWarning: As of langchain-core 0.3.0, LangChain uses pydantic v2 internally. The langchain_core.pydantic_v1 module was a compatibility shim for pydantic v1, and should no longer be used. Please update the code to import from Pydantic directly.\n",
+     "\n",
+     "For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`\n",
+     "with: `from pydantic import BaseModel`\n",
+     "or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. \tfrom pydantic.v1 import BaseModel\n",
+     "\n",
+     " from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness\n",
+     "/home/mpc/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/metrics/__init__.py:4: LangChainDeprecationWarning: As of langchain-core 0.3.0, LangChain uses pydantic v2 internally. The langchain.pydantic_v1 module was a compatibility shim for pydantic v1, and should no longer be used. Please update the code to import from Pydantic directly.\n",
+     "\n",
+     "For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`\n",
+     "with: `from pydantic import BaseModel`\n",
+     "or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. \tfrom pydantic.v1 import BaseModel\n",
+     "\n",
+     " from ragas.metrics._context_entities_recall import (\n"
     ]
    }
   ],
@@ -52,17 +66,61 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 31,
     "metadata": {},
     "outputs": [],
     "source": [
-    "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\")\n",
+    "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval})\n",
     "eval_dataset = Dataset.from_pandas(df)"
    ]
   },
   {
    "cell_type": "code",
-    "execution_count": 7,
+    "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['The dataset entitled \"Snow Survey of Great Britain: transcribed data for Scotland, 1945 to 2007\" contains the following information in it\'s \"description\" metadata field: This dataset comprises observations of snowline from the Snow Survey of Great Britain (SSGB) at 140 sites across Scotland . Daily observations were made between 1945 and 2007. Observations were made by a ground observer who looked out from a given location at 0900 GMT each day and noted the elevation at which snow cover was greater than 50%. \n\nThe initial aim was to \'secure representative data relating to the occurrence of snow cover at different altitudes in the various upland districts over the period October to June\'. \n\nThe data were collated by the British Glaciological Society until 1954 and thereafter by the Met Office. It has been transcribed from paper records held in the Met Office archives in Edinburgh.',\n",
+       " 'The dataset entitled \"Global Navigation Satellite System (GNSS) survey of Ciste Mhearad snow patch perimeter, Cairngorm, Scotland, 2023\" contains the following information in it\'s \"description\" metadata field: This dataset contains geographic locations, including the horizontal and vertical position, of the perimeter of the Ciste Mhearad snow patch on Cairngorm for three dates in the summer of 2023. Points on the perimeter were located using two Global Navigation Satellite System (GNSS) receivers as base and roving stations during visits on 19 June, 27 July and 28 July 2023.',\n",
+       " 'The dataset entitled \"Snow water equivalent estimates using cosmic-ray neutron sensors in the United Kingdom (2014-2019)\" contains the following information in it\'s \"description\" metadata field: This dataset provides daily estimates of the Snow Water Equivalent (SWE) using data from 46 COSMOS-UK sites across the United Kingdom. One set of estimates is derived from the cosmic ray neutron sensor and provides an estimate of the average SWE within the sensor’s large (>100m) footprint. Other SWE estimates are based on either a snowmelt model, or, for certain sites, either a snow depth sensor or a buried \'SnowFox\' neutron sensor. Additionally, daily neutron counts, the albedo, and a collection of figures for each snow event are provided.',\n",
+       " 'The dataset entitled \"Net ecosystem carbon dioxide (CO2) exchange and meteorological observations from an eroding high altitude blanket bog, Scotland, 2018-2020\" contains the following information in it\'s \"description\" metadata field: This record contains time series observations of land surface-atmosphere exchanges of net ecosystem carbon dioxide exchange (NEE), sensible heat (H) and latent heat (LE), and meteorological observations measured at an eroded upland blanket bog peatland (UK-BAL) in the Eastern Cairngorms in Scotland, UK (56.93° N, -3.16° E, 642 m asl). The dataset comprises eddy covariance CO2, water and energy fluxes, originally collected at 20Hz and processed to 30-minute data, as well as accompanying meteorological observations, originally collected at 15 min and processed to 30-minute data. Time period covered in this dataset is 04/07/2018 until 04/11/2020.',\n",
+       " 'The dataset entitled \"Loch Leven Waterfowl 1968-2007\" contains the following information in it\'s \"description\" metadata field: The dataset comprises counts of ten waterfowl species collected from Loch Leven from 1968-2007 by staff at Scottish Natural Heritage (SNH) and its predecessor bodies (Nature Conservancy), as part of their long-term monitoring programme of the lake. Counts are for the whole loch and represent an annual peak count based on the monthly counts recorded from September (of the year indicated) through to March (of the following year). '],\n",
+       " ['The dataset entitled \"Diet, timing of egg laying and breeding success data for Isle of May European shag population 1985-2015\" contains the following information in it\'s \"description\" metadata field: Data on timing of breeding, breeding success and diet of the European shag, sampled from the Isle of May population. The data were collected between 1985 and 2015 by visually checking nests and collecting regurgitated diet samples. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study).',\n",
+       " 'The dataset entitled \"Behaviour, diet, condition and demography data for common guillemots from the Isle of May, 1982-2019\" contains the following information in it\'s \"description\" metadata field: This dataset contains information on the parental behaviour, diet, condition and demography of common guillemots on the Isle of May, south-east Scotland. Annual data are available for 1982 to 2019 inclusive. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study).',\n",
+       " 'The dataset entitled \"Breeding success, population size, and site quality data for a population of common guillemots (Uria aalge) on the Isle of May, Scotland, 1981-2018\" contains the following information in it\'s \"description\" metadata field: This dataset contains information on the breeding outcome, breeding site occupancy, and breeding site quality for a sample of common guillemots breeding on the Isle of May, Scotland. Data is available for all attributes from 1981-2018. These data are part of the Isle of May long-term study to assess population trends of seabirds under environmental change (IMLOTS https://www.ceh.ac.uk/our-science/projects/isle-may-long-term-study). ',\n",
+       " 'The dataset entitled \"The Isle of May long-term study (IMLOTS) seabird annual breeding success 1982-2016\" contains the following information in it\'s \"description\" metadata field: This dataset contains calculated breeding success rates for six seabird species from representative colonies on the Isle of May, off the East coast of Scotland. Annual breeding success has been measured as the number of chicks fledged per active nest for the Atlantic puffin (Fratercula arctica, since 1982), common guillemot (Uria aalge, since 1982), razorbill (Alca torda, since 1982), European shag (Phalacrocorax aristotelis, since 1987), black-legged kittiwake (Rissa tridactyla, since 1987) and northern fulmar (Fulmarus glacialis, since 1987). The number of active nests recorded are also provided. Data were collected as part of the Isle of May long-term study (IMLOTS), which aims to identify the impact of environmental change on seabirds and their associated ecosystems. This monitoring has been ongoing since 1974, by essentially the same team of scientists, using the same well-documented methods throughout this time.',\n",
+       " 'The dataset entitled \"The Isle of May long-term study (IMLOTS) seabird annual breeding success 1982-2012\" contains the following information in it\'s \"description\" metadata field: This dataset contains calculated breeding success rates for six seabird species from representative colonies on the Isle of May, off the East coast of Scotland. Annual breeding success has been measured as the number of chicks fledged per active nest for the Atlantic puffin (Fratercula arctica, since 1982), common guillemot (Uria aalge, since 1982), razorbill (Alca torda, since 1982), European shag (Phalacrocorax aristotelis, since 1987), black-legged kittiwake (Rissa tridactyla, since 1987) and northern fulmar (Fulmarus glacialis, since 1987). The number of active nests recorded are also provided. Data were collected as part of the Isle of May long-term study (IMLOTS), which aims to identify the impact of environmental change on seabirds and their associated ecosystems. This monitoring has been ongoing since 1974, by essentially the same team of scientists, using the same well-documented methods throughout this time.'],\n",
+       " ['The dataset entitled \"Land Cover Map 2020 (land parcels, N. Ireland)\" contains the following information in it\'s \"description\" metadata field: This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland\'s land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\n\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020. These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\n\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. ',\n",
+       " 'The dataset entitled \"Land Cover Map 2019 (land parcels, N. Ireland)\" contains the following information in it\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2019 (LCM2019) representing Northern Ireland. It describes Northern Ireland\'s land cover in 2019 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2019 20m classified pixels dataset. All further LCM2019 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\n\nLCM2019 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2019. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2019. LCM2019 was simultaneously released with LCM2017 and LCM2018.
These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2017 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2017 20m classified pixels dataset. All further LCM2017 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2018 20m classified pixels dataset. All further LCM2018 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. 
These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (25m rasterised land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the 25m rasterised land parcels dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived by rasterising the corresponding LCM2018 land parcels dataset into 25m pixels. It is provided as a 3-band, 8-bit integer raster. The first band is the UKCEH Land Cover Class identifier. Bands 2 and 3 are indicators of classification confidence. For a fuller description please refer to the product documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Northern Ireland (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.'],\n", + " ['The dataset entitled \"Land Cover Map 2017 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2017 20m classified pixels dataset. All further LCM2017 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2018 (land parcels, N. 
Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2018(LCM2018) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2018 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2018 20m classified pixels dataset. All further LCM2018 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help to assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2018 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2018. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2018. LCM2018 was simultaneously released with LCM2017 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2019 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2019 (LCM2019) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2019 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2019 20m classified pixels dataset. All further LCM2019 datasets for Northern Ireland are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nLCM2019 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2019. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2019. LCM2019 was simultaneously released with LCM2017 and LCM2018. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2017 (25m rasterised land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the 25m rasterised land parcels dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. 
This dataset was derived by rasterising the corresponding LCM2017 land parcels dataset into 25m pixels. It is provided as a 3-band, 8-bit integer raster. The first band is the UKCEH Land Cover Class identifier. Bands 2 and 3 are indicators of classification confidence. For a fuller description please refer to the product documentation.\\n\\nLCM2017 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2017. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2017. LCM2017 was simultaneously released with LCM2018 and LCM2019. These are the latest in a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Northern Ireland (now usually referred to as LCM1990) followed by UK-wide land cover maps LCM2000, LCM2007 and LCM2015.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.',\n", + " 'The dataset entitled \"Land Cover Map 2020 (land parcels, N. Ireland)\" contains the following information in it\\'s \"description\" metadata field: This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland\\'s land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\\n\\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020. These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. '],\n", + " ['The dataset entitled \"Ammonia measurements from passive samplers at Fenn\\'s, Whixall, Wem & Cadney Mosses SAC (2018)\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at three sites on Fenn’s, Whixall, Bettisfield, Wem and Cadney Mosses SSSI on the border of Wrexham County Borough (North Wales) and Shropshire (West Midlands). The ammonia measurements are taken from a set of ALPHA (R) (Adapted Low-cost Passive High Absorption) samplers from July to December\\xa0in the year 2018. The sites were established in order to monitor ammonia during implementation of Site Nitrogen Action Plan (SNAP), as part of the Marches Mosses BogLIFE project. 
This project aims to restore Britain\\'s third largest lowland raised bog within the Fenn’s, Whixall & Bettisfield Mosses and Wem Moss National Nature Reserves near Whitchurch, Shropshire and Wrexham in Wales.',\n", + " 'The dataset entitled \" Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2021\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area. The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2021 to December 2021. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \"Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire (2017-2018)\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside in the hall and the other is outside in the garden area . The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2017 to November 2018. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \"Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2019-2020\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area . The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from November 2018 to January 2021. Samplers are exposed in monthly cycles at the beginning of each month.',\n", + " 'The dataset entitled \" Passive sampler ammonia measurements indoors and outdoors at a rural dwelling in South Lanarkshire, 2022\" contains the following information in it\\'s \"description\" metadata field: This dataset consists of ammonia (NH3) measurements at two sites in a rural location in South Lanarkshire. The sites are located in a dwelling, one site is inside and the other is outside in the garden area. The garden backs onto grassland which is part of a large dairy farm. The ammonia measurements are taken from a set of UKCEH ALPHA® (Adapted Low-cost Passive High Absorption) samplers from January 2022 to December 2022. 
Samplers are exposed in monthly cycles at the beginning of each month.']]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_dataset[\"contexts\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -98,35 +156,35 @@ " What was the frequency of snowline observation...\n", " The frequency of snowline observations made da...\n", " The available information does not provide a c...\n", - " ['The dataset entitled \"Snow Survey of Great B...\n", + " [The dataset entitled \"Snow Survey of Great Br...\n", " \n", " \n", " 1\n", " What was the primary focus of studying the Eur...\n", " The primary focus of studying the European sha...\n", " The available information does not clearly sta...\n", - " ['The dataset entitled \"Diet, timing of egg la...\n", + " [The dataset entitled \"Diet, timing of egg lay...\n", " \n", " \n", " 2\n", " What are the UKCEH Land Cover Classes used to ...\n", " The UKCEH Land Cover Classes used to describe ...\n", " The UKCEH Land Cover Classes used to describe ...\n", - " ['The dataset entitled \"Land Cover Map 2020 (l...\n", + " [The dataset entitled \"Land Cover Map 2020 (la...\n", " \n", " \n", " 3\n", " What method was used to classify the pixels in...\n", " The Random Forest classification method was us...\n", " Based on the available information, it appears...\n", - " ['The dataset entitled \"Land Cover Map 2017 (l...\n", + " [The dataset entitled \"Land Cover Map 2017 (la...\n", " \n", " \n", " 4\n", " What were the specific locations where the exp...\n", " The answer to given question is not present in...\n", " Based on the available information, it does no...\n", - " ['The dataset entitled \"Ammonia measurements f...\n", + " [The dataset entitled \"Ammonia measurements fr...\n", " \n", " \n", "\n", @@ -155,14 +213,14 @@ "4 Based on the available information, it does no... \n", "\n", " contexts \n", - "0 ['The dataset entitled \"Snow Survey of Great B... \n", - "1 ['The dataset entitled \"Diet, timing of egg la... \n", - "2 ['The dataset entitled \"Land Cover Map 2020 (l... \n", - "3 ['The dataset entitled \"Land Cover Map 2017 (l... \n", - "4 ['The dataset entitled \"Ammonia measurements f... " + "0 [The dataset entitled \"Snow Survey of Great Br... \n", + "1 [The dataset entitled \"Diet, timing of egg lay... \n", + "2 [The dataset entitled \"Land Cover Map 2020 (la... \n", + "3 [The dataset entitled \"Land Cover Map 2017 (la... \n", + "4 [The dataset entitled \"Ammonia measurements fr... 
" ] }, - "execution_count": 7, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -173,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -183,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -191,7 +249,6 @@ " faithfulness,\n", " answer_relevancy,\n", " context_precision,\n", - " context_utilization,\n", " context_recall,\n", " context_entity_recall,\n", " answer_similarity,\n", @@ -201,21 +258,26 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "Dataset feature \"contexts\" should be of type Sequence[string], got ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43meval_dataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mfaithfulness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_relevancy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_precision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_utilization\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_recall\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext_entity_recall\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_similarity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_correctness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43mllm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mllm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43membeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_async\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mraise_exceptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mRunConfig\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m result\n", - "File \u001b[0;32m~/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/evaluation.py:157\u001b[0m, in \u001b[0;36mevaluate\u001b[0;34m(dataset, metrics, 
llm, embeddings, callbacks, in_ci, is_async, run_config, raise_exceptions, column_map)\u001b[0m\n\u001b[1;32m 155\u001b[0m dataset \u001b[38;5;241m=\u001b[39m handle_deprecated_ground_truths(dataset)\n\u001b[1;32m 156\u001b[0m validate_evaluation_modes(dataset, metrics)\n\u001b[0;32m--> 157\u001b[0m \u001b[43mvalidate_column_dtypes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# set the llm and embeddings\u001b[39;00m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(llm, LangchainLLM):\n", - "File \u001b[0;32m~/github/llm-eval/.venv/lib/python3.12/site-packages/ragas/validation.py:56\u001b[0m, in \u001b[0;36mvalidate_column_dtypes\u001b[0;34m(ds)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column_names \u001b[38;5;129;01min\u001b[39;00m ds\u001b[38;5;241m.\u001b[39mfeatures:\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28misinstance\u001b[39m(ds\u001b[38;5;241m.\u001b[39mfeatures[column_names], Sequence)\n\u001b[1;32m 54\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m ds\u001b[38;5;241m.\u001b[39mfeatures[column_names]\u001b[38;5;241m.\u001b[39mfeature\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstring\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 55\u001b[0m ):\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDataset feature \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcolumn_names\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m should be of type\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Sequence[string], got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(ds\u001b[38;5;241m.\u001b[39mfeatures[column_names])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 59\u001b[0m )\n", - "\u001b[0;31mValueError\u001b[0m: Dataset feature \"contexts\" should be of type Sequence[string], got " + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating: 14%|█▍ | 5/35 [00:35<03:02, 6.07s/it]Failed to parse output. 
Returning None.\n", + "Evaluating: 100%|██████████| 35/35 [03:40<00:00, 6.31s/it]\n" ] + }, + { + "data": { + "text/plain": [ + "{'faithfulness': 0.6956, 'answer_relevancy': 0.1845, 'context_precision': 0.3775, 'context_recall': 0.8000, 'context_entity_recall': 0.3667, 'answer_similarity': 0.2146, 'answer_correctness': 0.0534}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -225,7 +287,6 @@ " faithfulness,\n", " answer_relevancy,\n", " context_precision,\n", - " context_utilization,\n", " context_recall,\n", " context_entity_recall,\n", " answer_similarity,\n", @@ -233,7 +294,6 @@ " ],\n", " llm=llm,\n", " embeddings=embeddings,\n", - " is_async=False,\n", " raise_exceptions=False,\n", " run_config=RunConfig(max_workers=1),\n", ")\n", @@ -245,6 +305,13 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], "source": [ "result_df = result.to_pandas()\n", "pio.templates.default = \"gridon\"\n", @@ -253,7 +320,19 @@ "for metric in metrics:\n", " fig.add_trace(go.Violin(y=result_df[metric], name=metric, points=\"all\", box_visible=True, meanline_visible=True))\n", "fig.update_yaxes(range=[-0.02,1.02])\n", - "fig.show()" + "with open(\"eval.png\", \"wb\") as f:\n", + " f.write(fig.to_image(format=\"png\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open(\"metrics.json\", \"w\") as f:\n", + " json.dump(result, f)" ] } ], diff --git a/params.yaml b/params.yaml index 2f5354f..900e48f 100644 --- a/params.yaml +++ b/params.yaml @@ -11,6 +11,8 @@ files: doc-store: "data/chroma-data" test-set: "data/eidc_rag_test_sample.csv" eval-set: "data/evaluation_data.csv" + metrics: "data/metrics.json" + eval-plot: "data/eval.png" sample-size: 10 # sample size of 0 will process all data rag: model: llama3.1 diff --git a/pyproject.toml b/pyproject.toml index 454973a..4844faf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,8 @@ dependencies = [ "ollama-haystack == 0.0.7", "chroma-haystack", "ragas == 0.1.10", - "nltk" + "nltk", + "nbformat>=4.2.0", ] [project.optional-dependencies] diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 7fcf1a7..10b3a61 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -1 +1,77 @@ -# Run RAGAS to evaluate \ No newline at end of file +from argparse import ArgumentParser +import pandas as pd +from datasets import Dataset +from ragas import evaluate +from ragas.run_config import RunConfig +from langchain_community.embeddings import OllamaEmbeddings +from langchain_community.chat_models import ChatOllama +import plotly.graph_objects as go +import plotly.io as pio +import nest_asyncio +from ragas.metrics import ( + faithfulness, + answer_relevancy, + context_precision, + context_recall, + context_entity_recall, + answer_similarity, + answer_correctness, +) +import json + +def main(eval_dataset: str, metric_output: str, image_output: str) -> None: + nest_asyncio.apply() # apply the event loop async fix + df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval}) + eval_dataset = Dataset.from_pandas(df) + llm = ChatOllama(model='mistral-nemo', num_ctx=16384) + embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384) + result = evaluate( + eval_dataset, + metrics=[ + faithfulness, + answer_relevancy, + context_precision, + context_recall, + context_entity_recall, + 
answer_similarity, + answer_correctness, + ], + llm=llm, + embeddings=embeddings, + raise_exceptions=False, + run_config=RunConfig(max_workers=1), + ) + result_df = result.to_pandas() + pio.templates.default = "gridon" + fig = go.Figure() + + + with open(metric_output, "w") as f: + json.dump(result, f) + metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] + + for metric in metrics: + fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) + fig.update_yaxes(range=[-0.02,1.02]) + with open(image_output, "wb") as f: + f.write(fig.to_image(format="png")) + + + +if __name__ == "__main__": + parser = ArgumentParser("evaluate.py") + parser.add_argument("eval_dataset", help="File containing the evaluation data.") + parser.add_argument( + "-m", + "--metrics_output", + help="File to save evaluation metrics to.", + default="data/metrics.json", + ) + parser.add_argument( + "-img", + "--image_output", + help="File to save image plot to.", + default="data/evaluation.png", + ) + args = parser.parse_args() + main(args.eval_dataset, args.metrics_output, args.image_output) From d119b007645f930c97edd191c032b286346b1aff Mon Sep 17 00:00:00 2001 From: mpc Date: Thu, 17 Oct 2024 14:59:25 +0100 Subject: [PATCH 15/28] Updated readme --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b34c1ce..588b68f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,82 @@ # llm-eval -Scripts and data for LLM evaluation. +This repository contains a reproducible workflow setup using [DVC](https://dvc.org/) backed by a [JASMIN object store](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/). Before working with the repository please contact [Matt Coole](mailto:matcoo@ceh.ac.uk) to request access to the Jasmin object store `llm-eval-o`. Then follow the instructions below. -This repository is setup to work with [DVC](https://dvc.org/) backed by a [JASMIN object store](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/). Please follow the instruction in [`dvc.md`](dvc.md) to get up and running. +## Requirements +- [Ollama](https://ollama.com/download) ([`llama3.1`](https://ollama.com/library/llama3.1) and [`mistral-nemo`](https://ollama.com/library/mistral-nemo) models) -## DVC and CML +## Getting started +First create a new virtual environment and install the required dependencies: +```shell +python -m venv .venv +source .venv/bin/activate +pip install . 
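+# Optional: the notebooks in this repo need extra dependencies; assuming you
+# want to run them too, install the `jupyter` extra defined in pyproject.toml:
+pip install '.[jupyter]'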
+``` +Next setup your local DVC configuration with your [Jasmin object store access key](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/#creating-an-access-key-and-secret): +```shell +dvc remote modify --local jasmin access_key_id '' +dvc remote modify --local jasmin secret_access_key '' +``` +Pull the data from the object store using DVC: +```shell +dvc pull +``` +You should now be ready to re-run the pipeline: +```shell +dvc repro +``` +This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the command: +```shell +dvc dag +``` +``` + +----------------+ + | fetch-metadata | + +----------------+ + * + * + * + +------------------+ +-----------------------+ + | extract-metadata | | fetch-supporting-docs | + +------------------+ +-----------------------+ + ** ** + *** *** + ** ** + +------------+ + | chunk-data | + +------------+ + * + * + * + +-------------------+ + | create-embeddings | + +-------------------+ + * + * + * ++------------------+ +--------------------+ +| generate-testset | | upload-to-docstore | ++------------------+ +--------------------+ + ** ** + *** *** + ** ** + +------------------+ + | run-rag-pipeline | + +------------------+ + * + * + * + +----------+ + | evaluate | + +----------+ +``` + +## Notes + +### DVC and CML Notes on the use of Data Version Control and Continuous Machine Learning: - [DVC](dvc.md) - [CML](cml.md) -## vLLM +### vLLM Notes on running models with vLLM: - [vLLM](vllm.md) \ No newline at end of file From ccd4e3ce832e99c0f7930eedd51793214bddbba9 Mon Sep 17 00:00:00 2001 From: mpc Date: Thu, 17 Oct 2024 15:18:30 +0100 Subject: [PATCH 16/28] Added metrics file to dvc config --- dvc.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dvc.yaml b/dvc.yaml index fe6f0ea..59a6ccc 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -57,4 +57,6 @@ stages: - scripts/evaluate.py outs: - ${files.metrics} - - ${files.eval-plot} \ No newline at end of file + - ${files.eval-plot} +metrics: +- ${files.metrics} \ No newline at end of file From 4f7ab43272ae4dd80d27fbeafa05f1e8ba415844 Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 09:51:39 +0100 Subject: [PATCH 17/28] Added ruff, mypy and cleaned scripts --- dvc.lock | 61 +++++++++++++++++----------------- dvc.yaml | 4 +-- params.yaml | 3 ++ pyproject.toml | 8 +++++ scripts/create_embeddings.py | 3 +- scripts/evaluate.py | 27 ++++++++++----- scripts/extract_metadata.py | 6 ++-- scripts/fetch_eidc_metadata.py | 1 + scripts/run_rag_pipeline.py | 36 ++++++++++++++++---- scripts/upload_to_docstore.py | 6 +++- 10 files changed, 104 insertions(+), 51 deletions(-) diff --git a/dvc.lock b/dvc.lock index 8b454b4..d143f87 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,13 +5,13 @@ stages: deps: - path: scripts/fetch_eidc_metadata.py hash: md5 - md5: ba838a284da239217d0464f08e0a45ce - size: 674 + md5: 53d620665448ef91f2deedb517e2f502 + size: 675 outs: - path: data/eidc_metadata.json hash: md5 - md5: fc2f9ebe92cbd07eb06ff6e39366fdac - size: 12146216 + md5: b4f3774a2921debb4d7740165ac604d4 + size: 12157676 prepare: cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json deps: @@ -33,25 +33,25 @@ stages: deps: - path: data/eidc_metadata.json hash: md5 - md5: fc2f9ebe92cbd07eb06ff6e39366fdac - size: 12146216 + md5: b4f3774a2921debb4d7740165ac604d4 + size: 12157676 - path: scripts/extract_metadata.py hash: md5 - md5: c2fa7d2c4b8f28a6e24536ce0df244fd - size: 1296 + md5: 3f0269a6413845f4425af55e7cea7bf8 + size: 1304 outs: - 
path: data/extracted_metadata.json hash: md5 - md5: fce18ce3c43175af1cea5d84dac9baf9 - size: 4579965 + md5: 789fda7a14f9a85c6ee0e10af8170a95 + size: 4584498 chunk-data: cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s 10 data/extracted_metadata.json deps: - path: data/extracted_metadata.json hash: md5 - md5: fce18ce3c43175af1cea5d84dac9baf9 - size: 4579965 + md5: 789fda7a14f9a85c6ee0e10af8170a95 + size: 4584498 - path: data/supporting-docs.json hash: md5 md5: 0febface6f1d23fda46c11bef65284f4 @@ -74,7 +74,7 @@ stages: size: 14947 - path: scripts/create_embeddings.py hash: md5 - md5: 3dc6ef284730398375a13df4bff41846 + md5: 4649c700dfae922b43b3608ee4f00c1a size: 808 outs: - path: data/embeddings.json @@ -83,7 +83,7 @@ stages: size: 351126 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data - -em all-MiniLM-L6-v2 + -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 @@ -91,20 +91,21 @@ stages: size: 351126 - path: scripts/upload_to_docstore.py hash: md5 - md5: ae8755770166dd3d6c1efb9f15723116 - size: 1836 + md5: 41da88e3bb6d2592bee938ce347f6983 + size: 1905 outs: - path: data/chroma-data hash: md5 - md5: 2f2ba629bf078284bb6d6be73c6166a7.dir + md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir size: 2069220 nfiles: 5 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv + data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: 0254e85bb660da611cfa14e5221dae92.dir + md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir size: 2069220 nfiles: 5 - path: data/eidc_rag_test_sample.csv @@ -113,13 +114,13 @@ stages: size: 7524 - path: scripts/run_rag_pipeline.py hash: md5 - md5: 6d1f49fa8b22288ecd50ed0e3898fd60 - size: 3153 + md5: 8d5fc0669771146562c773186f4f44f6 + size: 3667 outs: - path: data/evaluation_data.csv hash: md5 - md5: 47a0adeb2ee1cb67202048684064d30f - size: 7293 + md5: f6bce3f5c551e84da224d36201858839 + size: 6638 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -140,18 +141,18 @@ stages: deps: - path: data/evaluation_data.csv hash: md5 - md5: 47a0adeb2ee1cb67202048684064d30f - size: 7293 + md5: f6bce3f5c551e84da224d36201858839 + size: 6638 - path: scripts/evaluate.py hash: md5 - md5: 51f036b805f23dd3ebfd5d819bc9d457 - size: 2489 + md5: 10f76511eafc8a1a9b90e9ae92a76bc5 + size: 2633 outs: - path: data/eval.png hash: md5 - md5: 8c11f987449f8718b6f6011078b6c259 - size: 49498 + md5: fd66aa842f93e8f370399dae5b68e2fe + size: 50525 - path: data/metrics.json hash: md5 - md5: 53fba29cb236fedd3c6446ea94fea3cc - size: 215 + md5: 55266ae1bd64a3499508d07651a5aa13 + size: 214 diff --git a/dvc.yaml b/dvc.yaml index 59a6ccc..fa419ff 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -32,7 +32,7 @@ stages: outs: - ${files.embeddings} upload-to-docstore: - cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model} + cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${doc-store.files} -em ${hp.embeddings-model} -c ${doc-store.collection} deps: - ${files.embeddings} - scripts/upload_to_docstore.py @@ -43,7 +43,7 @@ stages: outs: - ${files.test-set} run-rag-pipeline: - cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} + cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection} deps: - ${files.test-set} - ${files.doc-store} diff --git a/params.yaml b/params.yaml 
index 900e48f..988dbdb 100644 --- a/params.yaml +++ b/params.yaml @@ -2,6 +2,9 @@ hp: chunk-size: 300 overlap: 100 embeddings-model: "all-MiniLM-L6-v2" +doc-store: + collection: "eidc-data" + files: "data/chroma-data" files: metadata: "data/eidc_metadata.json" extracted: "data/extracted_metadata.json" diff --git a/pyproject.toml b/pyproject.toml index 4844faf..5abe51d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,11 @@ dependencies = [ "ragas == 0.1.10", "nltk", "nbformat>=4.2.0", + "ruff", + "mypy", + "types-requests", + "types-tqdm", + "pandas-stubs", ] [project.optional-dependencies] @@ -30,3 +35,6 @@ jupyter = [ [tool.setuptools] py-modules = [] + +[tool.mypy] +files = ["scripts"] \ No newline at end of file diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py index ce1c37b..2ad9cc9 100644 --- a/scripts/create_embeddings.py +++ b/scripts/create_embeddings.py @@ -3,9 +3,10 @@ from argparse import ArgumentParser from tqdm import tqdm + def create_embedding(text): model = SentenceTransformer("all-MiniLM-L6-v2") - return model.encode(text) + return model.encode(text) def main(input_file, output_file): diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 10b3a61..d7ac98f 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -19,12 +19,13 @@ ) import json + def main(eval_dataset: str, metric_output: str, image_output: str) -> None: - nest_asyncio.apply() # apply the event loop async fix + nest_asyncio.apply() # apply the event loop async fix df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval}) eval_dataset = Dataset.from_pandas(df) - llm = ChatOllama(model='mistral-nemo', num_ctx=16384) - embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384) + llm = ChatOllama(model="mistral-nemo", num_ctx=16384) + embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384) result = evaluate( eval_dataset, metrics=[ @@ -45,19 +46,29 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None: pio.templates.default = "gridon" fig = go.Figure() - with open(metric_output, "w") as f: json.dump(result, f) - metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] + metrics = [ + metric + for metric in result_df.columns.to_list() + if metric not in ["question", "ground_truth", "answer", "contexts"] + ] for metric in metrics: - fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) - fig.update_yaxes(range=[-0.02,1.02]) + fig.add_trace( + go.Violin( + y=result_df[metric], + name=metric, + points="all", + box_visible=True, + meanline_visible=True, + ) + ) + fig.update_yaxes(range=[-0.02, 1.02]) with open(image_output, "wb") as f: f.write(fig.to_image(format="png")) - if __name__ == "__main__": parser = ArgumentParser("evaluate.py") parser.add_argument("eval_dataset", help="File containing the evaluation data.") diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py index 241bc1a..8007d09 100644 --- a/scripts/extract_metadata.py +++ b/scripts/extract_metadata.py @@ -6,7 +6,9 @@ METADATA_FIELDS = ["title", "description", "lineage"] -def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]: +def extact_eidc_metadata_fields( + json_data: Dict, fields: List[str] = METADATA_FIELDS +) -> List[Dict[str, str]]: metadatas = [] for field in fields: if json_data[field]: @@ -18,7 +20,7 @@ def extact_eidc_metadata_fields(json_data: 
Dict, fields: List[str] = METADATA_FI return metadatas -def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]: +def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]: data = [] with open(file_path) as f: json_data = json.load(f) diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py index cd56b4e..f411c16 100644 --- a/scripts/fetch_eidc_metadata.py +++ b/scripts/fetch_eidc_metadata.py @@ -4,6 +4,7 @@ URL = "https://catalogue.ceh.ac.uk/eidc/documents" + def main(output_file: str) -> None: res = requests.get( URL, diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index 830a052..91408ea 100644 --- a/scripts/run_rag_pipeline.py +++ b/scripts/run_rag_pipeline.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import shutil from haystack import Pipeline from haystack_integrations.document_stores.chroma import ChromaDocumentStore from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever @@ -8,9 +9,12 @@ import pandas as pd -def build_rag_pipeline(model_name: str) -> Pipeline: +TMP_DOC_PATH = ".tmp/doc-store" + + +def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline: document_store = ChromaDocumentStore( - collection_name="eidc-data", persist_path="data/chroma-data" + collection_name=collection_name, persist_path=TMP_DOC_PATH ) retriever = ChromaQueryTextRetriever(document_store, top_k=3) print("Creating prompt template...") @@ -73,22 +77,30 @@ def query_pipeline(questions, rag_pipe): for q in questions: response = run_query(q, rag_pipe) answers.append(response["answer_builder"]["answers"][0].data) - contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents]) + contexts.append( + [doc.content for doc in response["answer_builder"]["answers"][0].documents] + ) return answers, contexts -def main(test_data_file: str, ouput_file: str): - rag_pipe = build_rag_pipeline("llama3.1") +def main( + test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str +): + shutil.copytree(doc_store_path, TMP_DOC_PATH) + + rag_pipe = build_rag_pipeline("llama3.1", collection_name) df = pd.read_csv(test_data_file) df.drop(columns=["rating", "contexts"], inplace=True) answers, contexts = query_pipeline(df["question"], rag_pipe) - + df["answer"] = answers df["contexts"] = contexts df.to_csv(ouput_file, index=False) + shutil.rmtree(TMP_DOC_PATH) + if __name__ == "__main__": parser = ArgumentParser("run_rag_pipeline.py") @@ -100,5 +112,15 @@ def main(test_data_file: str, ouput_file: str): "output_file", help="File to output results to.", ) + parser.add_argument( + "doc_store_path", + help="Path to the doc store.", + ) + parser.add_argument( + "-c", + "--collection", + help="Collection name in doc store.", + default="eidc-data", + ) args = parser.parse_args() - main(args.test_data_file, args.output_file) + main(args.test_data_file, args.output_file, args.doc_store_path, args.collection) diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py index 4f2e8af..7b547d7 100644 --- a/scripts/upload_to_docstore.py +++ b/scripts/upload_to_docstore.py @@ -1,13 +1,17 @@ from argparse import ArgumentParser import json import uuid +import shutil +import os import chromadb from chromadb.utils import embedding_functions def main(input_file: str, output_path: str, collection_name: str, embedding_model: str): - print(collection_name) + if os.path.exists(output_path): + shutil.rmtree(output_path) + with open(input_file) as f: json_data = json.load(f) From 
9705b616b2ddd44f7e632bcd2bec3e606b544ed8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 18 Oct 2024 12:06:21 +0100
Subject: [PATCH 18/28] Added script to download supporting docs

---
 README.md                             | 39 ++++++++++++++---
 dvc.lock                              | 63 +++++++++++++++------------
 dvc.yaml                              |  7 ++-
 params.yaml                           | 44 ++++++++++---------
 scripts/fetch_eidc_supporting_docs.py |  0
 scripts/fetch_supporting_docs.py      | 47 ++++++++++++++++++++
 6 files changed, 144 insertions(+), 56 deletions(-)
 delete mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/fetch_supporting_docs.py

diff --git a/README.md b/README.md
index 588b68f..20d03ac 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co
 dvc dag
 ```
 ```
-  +----------------+
-  | fetch-metadata |
-  +----------------+
-          *
-          *
-          *
+       +----------------+
+       | fetch-metadata |
+       +----------------+
+        **           **
+      ***               ***
+    **                     **
 +------------------+      +-----------------------+
 | extract-metadata |      | fetch-supporting-docs |
 +------------------+      +-----------------------+
@@ -67,9 +67,34 @@ dvc dag
           *
      +----------+
      | evaluate |
-     +----------+ 
+     +----------+
+```
+
+> Note: To re-run the `fetch-supporting-docs` stage of the pipeline, you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file.
+
+## Running Experiments
+By default the pipeline will run using the parameters defined in [`params.yaml`](params.yaml). To experiment with varying these parameters you can change them directly, or use [DVC experiments](https://dvc.org/doc/user-guide/experiment-management).
+
+To run an experiment varying a particular parameter:
+```shell
+dvc exp run -S hp.chunk-size=1000
+```
+This will re-run the pipeline but override the value of the `hp.chunk-size` parameter in [`params.yaml`](params.yaml) and set it to `1000`. Only the necessary stages of the pipeline should be re-run, and the result should appear in your workspace. 
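+
+DVC can also queue several experiments and run them as a batch. The exact
+commands vary slightly between DVC versions; with a recent 3.x release
+something like the following should work (a sketch, not a verified recipe):
+```shell
+dvc exp run --queue -S hp.chunk-size=500
+dvc exp run --queue -S hp.chunk-size=1000
+dvc queue start
+dvc exp show   # tabulate params and metrics across the queued runs
+```
+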
+You can compare the results of your experiment to the results of the baseline run of the pipeline using: +```shell +dvc exp diff +``` +```shell +Path Metric HEAD workspace Change +data/metrics.json answer_correctness 0.049482 0.043685 -0.0057974 +data/metrics.json answer_similarity 0.19793 0.17474 -0.02319 +data/metrics.json context_recall 0.125 0 -0.125 +data/metrics.json faithfulness 0.75 0.69375 -0.05625 + +Path Param HEAD workspace Change +params.yaml hp.chunk-size 300 1000 700 +``` ## Notes ### DVC and CML diff --git a/dvc.lock b/dvc.lock index d143f87..dd7f7b1 100644 --- a/dvc.lock +++ b/dvc.lock @@ -45,8 +45,8 @@ stages: md5: 789fda7a14f9a85c6ee0e10af8170a95 size: 4584498 chunk-data: - cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s - 10 data/extracted_metadata.json + cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s + 10 data/extracted_metadata.json data/supporting-docs.json deps: - path: data/extracted_metadata.json hash: md5 @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: 0febface6f1d23fda46c11bef65284f4 - size: 34 + md5: b0941cc9a7ca7df456157380bcc28f39 + size: 75646 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ stages: outs: - path: data/chunked_data.json hash: md5 - md5: e9160d8c6c0fa7f647c5baa03bd1b5dd - size: 14947 + md5: 97f06c3b76ff05d62ccdecd9d5742712 + size: 137681 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: e9160d8c6c0fa7f647c5baa03bd1b5dd - size: 14947 + md5: 97f06c3b76ff05d62ccdecd9d5742712 + size: 137681 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ stages: outs: - path: data/embeddings.json hash: md5 - md5: b08299369d1f243eb8d8ffa2cdb9a90f - size: 351126 + md5: 8d80ef225c59ede34d026f6f2930bae3 + size: 1894126 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: b08299369d1f243eb8d8ffa2cdb9a90f - size: 351126 + md5: 8d80ef225c59ede34d026f6f2930bae3 + size: 1894126 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,8 +96,8 @@ stages: outs: - path: data/chroma-data hash: md5 - md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir - size: 2069220 + md5: cc85398c596d4c5839714e93e33468bb.dir + size: 3580644 nfiles: 5 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv @@ -105,8 +105,8 @@ stages: deps: - path: data/chroma-data hash: md5 - md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir - size: 2069220 + md5: cc85398c596d4c5839714e93e33468bb.dir + size: 3580644 nfiles: 5 - path: data/eidc_rag_test_sample.csv hash: md5 @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: f6bce3f5c551e84da224d36201858839 - size: 6638 + md5: 9825cf7e7a89ca17634b44e9256eefc9 + size: 9695 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -129,20 +129,29 @@ stages: md5: a371d83c5822d256286e80d64d58c3fe size: 7524 fetch-supporting-docs: - cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json + cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json + deps: + - path: data/eidc_metadata.json + hash: md5 + md5: 
b4f3774a2921debb4d7740165ac604d4 + size: 12157676 + - path: scripts/fetch_supporting_docs.py + hash: md5 + md5: de0c11e81bf10e040bef67e43466b789 + size: 1472 outs: - path: data/supporting-docs.json hash: md5 - md5: 0febface6f1d23fda46c11bef65284f4 - size: 34 + md5: b0941cc9a7ca7df456157380bcc28f39 + size: 75646 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: f6bce3f5c551e84da224d36201858839 - size: 6638 + md5: 9825cf7e7a89ca17634b44e9256eefc9 + size: 9695 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -150,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: fd66aa842f93e8f370399dae5b68e2fe - size: 50525 + md5: 1279778c7e509e972d1f366157d24966 + size: 58228 - path: data/metrics.json hash: md5 - md5: 55266ae1bd64a3499508d07651a5aa13 - size: 214 + md5: 2b93334ba0e8226c916d0964237cb72c + size: 225 diff --git a/dvc.yaml b/dvc.yaml index fa419ff..0e9f154 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -6,7 +6,10 @@ stages: outs: - ${files.metadata} fetch-supporting-docs: - cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs} + cmd: python scripts/fetch_supporting_docs.py ${files.metadata} ${files.supporting-docs} + deps: + - ${files.metadata} + - scripts/fetch_supporting_docs.py outs: - ${files.supporting-docs} extract-metadata: @@ -17,7 +20,7 @@ stages: outs: - ${files.extracted} chunk-data: - cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} + cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs} deps: - ${files.extracted} - ${files.supporting-docs} diff --git a/params.yaml b/params.yaml index 988dbdb..85c3119 100644 --- a/params.yaml +++ b/params.yaml @@ -1,30 +1,34 @@ hp: - chunk-size: 300 + chunk-size: 500 overlap: 100 - embeddings-model: "all-MiniLM-L6-v2" + embeddings-model: all-MiniLM-L6-v2 doc-store: - collection: "eidc-data" - files: "data/chroma-data" + collection: eidc-data + files: data/chroma-data files: - metadata: "data/eidc_metadata.json" - extracted: "data/extracted_metadata.json" - supporting-docs: "data/supporting-docs.json" - chunked: "data/chunked_data.json" - embeddings: "data/embeddings.json" - doc-store: "data/chroma-data" - test-set: "data/eidc_rag_test_sample.csv" - eval-set: "data/evaluation_data.csv" - metrics: "data/metrics.json" - eval-plot: "data/eval.png" + metadata: data/eidc_metadata.json + extracted: data/extracted_metadata.json + supporting-docs: data/supporting-docs.json + chunked: data/chunked_data.json + embeddings: data/embeddings.json + doc-store: data/chroma-data + test-set: data/eidc_rag_test_sample.csv + eval-set: data/evaluation_data.csv + metrics: data/metrics.json + eval-plot: data/eval.png sample-size: 10 # sample size of 0 will process all data rag: model: llama3.1 - prompt: > - You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n + prompt: >- + You are part of a retrieval augmented pipeline. 
You will be given a question and + a context on which to base your answer.\n Do not use your own knowledge to answer the question.\n - The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n - Do not refer to "context" in your answer, instead refer to the context as available information. - If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n + The context provided will be metadata from datasets contained in the Environmental + Information Data Centre (EIDC).\n + Do not refer to "context" in your answer, instead refer to the context as available + information. + If the answer to the question is not clear from the context, suggest which dataset + or datasets might be helpful in answering the question.\n Question: {{query}}\n Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %} - Answer: \ No newline at end of file + Answer: diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py new file mode 100644 index 0000000..36354e7 --- /dev/null +++ b/scripts/fetch_supporting_docs.py @@ -0,0 +1,47 @@ +from argparse import ArgumentParser +import json +from tqdm import tqdm +import requests +import os +from typing import Dict, List +from dotenv import load_dotenv + + +def extract_ids(metadata_file: str): + with open(metadata_file) as f: + json_data = json.load(f) + ids = [dataset["identifier"] for dataset in json_data["results"]] + return ids + + +def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + + +def main(metadata_file: str, supporting_docs_file: str): + load_dotenv() + user = os.getenv("username") + password = os.getenv("password") + ids = extract_ids(metadata_file) + docs = [] + for id in tqdm(ids): + docs.extend(get_supporting_docs(id, user, password)) + if len(docs) > 0: + break + with open(supporting_docs_file, "w") as f: + json.dump(docs, f, indent=4) + + +if __name__ == "__main__": + parser = ArgumentParser("fetch_supporting_docs.py") + parser.add_argument("metadata", help="File containing EIDC metadata.") + parser.add_argument("supporting_docs", help="File to save supporting docs to.") + args = parser.parse_args() + main(args.metadata, args.supporting_docs) From 68e17ac4ce10df582ffa0b941d5c787178f4eab8 Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 14:02:56 +0100 Subject: [PATCH 19/28] Reformatted code with ruff --- dummy-evaluation.py | 25 +++++++++++++++++++++---- notebooks/ragas_eval.ipynb | 29 ++++++++++++++++++++++------- notebooks/ragas_synth.ipynb | 15 +++++++-------- notebooks/vllm_test.ipynb | 15 ++++++++------- 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/dummy-evaluation.py b/dummy-evaluation.py index 318b80d..53ec812 100644 --- a/dummy-evaluation.py +++ b/dummy-evaluation.py @@ -3,7 +3,12 @@ import plotly.graph_objects as go import plotly.io as pio -metrics = {"answer_relevancy", "answer_correctness", "context_precision", "context_recall"} +metrics = { + "answer_relevancy", + "answer_correctness", + 
"context_precision", + "context_recall", +} dummy_data = {metric: np.random.rand(100) for metric in metrics} df = pd.DataFrame(dummy_data) @@ -13,8 +18,20 @@ pio.templates.default = "gridon" fig = go.Figure() -metrics = [metric for metric in df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]] +metrics = [ + metric + for metric in df.columns.to_list() + if metric not in ["question", "ground_truth", "answer", "contexts"] +] for metric in metrics: - fig.add_trace(go.Violin(y=df[metric], name=metric, points="all", box_visible=True, meanline_visible=True)) -fig.update_yaxes(range=[-0.02,1.02]) + fig.add_trace( + go.Violin( + y=df[metric], + name=metric, + points="all", + box_visible=True, + meanline_visible=True, + ) + ) +fig.update_yaxes(range=[-0.02, 1.02]) fig.write_image("metrics.png") diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb index 53d862b..56bcb43 100644 --- a/notebooks/ragas_eval.ipynb +++ b/notebooks/ragas_eval.ipynb @@ -54,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "nest_asyncio.apply() # apply the event loop async fix" + "nest_asyncio.apply() # apply the event loop async fix" ] }, { @@ -70,7 +70,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(\"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval})\n", + "df = pd.read_csv(\n", + " \"../data/evaluation-sets/eidc-eval-sample.csv\", converters={\"contexts\": pd.eval}\n", + ")\n", "eval_dataset = Dataset.from_pandas(df)" ] }, @@ -235,8 +237,8 @@ "metadata": {}, "outputs": [], "source": [ - "llm = ChatOllama(model='mistral-nemo', num_ctx=16384)\n", - "embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)" + "llm = ChatOllama(model=\"mistral-nemo\", num_ctx=16384)\n", + "embeddings = OllamaEmbeddings(model=\"mistral-nemo\", num_ctx=16384)" ] }, { @@ -316,10 +318,22 @@ "result_df = result.to_pandas()\n", "pio.templates.default = \"gridon\"\n", "fig = go.Figure()\n", - "metrics = [metric for metric in result_df.columns.to_list() if metric not in [\"question\", \"ground_truth\", \"answer\", \"contexts\"]]\n", + "metrics = [\n", + " metric\n", + " for metric in result_df.columns.to_list()\n", + " if metric not in [\"question\", \"ground_truth\", \"answer\", \"contexts\"]\n", + "]\n", "for metric in metrics:\n", - " fig.add_trace(go.Violin(y=result_df[metric], name=metric, points=\"all\", box_visible=True, meanline_visible=True))\n", - "fig.update_yaxes(range=[-0.02,1.02])\n", + " fig.add_trace(\n", + " go.Violin(\n", + " y=result_df[metric],\n", + " name=metric,\n", + " points=\"all\",\n", + " box_visible=True,\n", + " meanline_visible=True,\n", + " )\n", + " )\n", + "fig.update_yaxes(range=[-0.02, 1.02])\n", "with open(\"eval.png\", \"wb\") as f:\n", " f.write(fig.to_image(format=\"png\"))" ] @@ -331,6 +345,7 @@ "outputs": [], "source": [ "import json\n", + "\n", "with open(\"metrics.json\", \"w\") as f:\n", " json.dump(result, f)" ] diff --git a/notebooks/ragas_synth.ipynb b/notebooks/ragas_synth.ipynb index b0c4371..cf39b04 100644 --- a/notebooks/ragas_synth.ipynb +++ b/notebooks/ragas_synth.ipynb @@ -19,8 +19,6 @@ "from ragas.testset.generator import TestsetGenerator\n", "from ragas.testset.evolutions import simple, reasoning, multi_context\n", "from ragas.run_config import RunConfig\n", - "from langchain.docstore.document import Document\n", - "import pandas as pd\n", "import nest_asyncio" ] }, @@ -30,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "nest_asyncio.apply() # apply the event loop 
async fix" + "nest_asyncio.apply() # apply the event loop async fix" ] }, { @@ -46,9 +44,11 @@ "metadata": {}, "outputs": [], "source": [ - "llm = ChatOllama(model='mistral-nemo', num_ctx=16384)\n", - "embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)\n", - "gen = TestsetGenerator.from_langchain(llm, llm, embeddings, run_config=RunConfig(max_workers=1, max_retries=1))\n", + "llm = ChatOllama(model=\"mistral-nemo\", num_ctx=16384)\n", + "embeddings = OllamaEmbeddings(model=\"mistral-nemo\", num_ctx=16384)\n", + "gen = TestsetGenerator.from_langchain(\n", + " llm, llm, embeddings, run_config=RunConfig(max_workers=1, max_retries=1)\n", + ")\n", "dist = {simple: 0.6, multi_context: 0.2, reasoning: 0.2}" ] }, @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "docs = [] # load a set of langchain documents to base the synthetic test set generation on" + "docs = [] # load a set of langchain documents to base the synthetic test set generation on" ] }, { @@ -81,7 +81,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "testset = gen.generate_with_langchain_docs(docs, 5, dist, is_async=False)" ] }, diff --git a/notebooks/vllm_test.ipynb b/notebooks/vllm_test.ipynb index 755c34c..86701c7 100644 --- a/notebooks/vllm_test.ipynb +++ b/notebooks/vllm_test.ipynb @@ -15,9 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "prompts = [\n", - " \"Tell me a joke.\"\n", - "]\n", + "prompts = [\"Tell me a joke.\"]\n", "params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)" ] }, @@ -26,9 +24,7 @@ "execution_count": 3, "metadata": {}, "outputs": [], - "source": [ - "import os" - ] + "source": [] }, { "cell_type": "code", @@ -87,7 +83,12 @@ } ], "source": [ - "llm = LLM(model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\", quantization=\"bitsandbytes\", load_format=\"bitsandbytes\", max_model_len=4096)" + "llm = LLM(\n", + " model=\"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit\",\n", + " quantization=\"bitsandbytes\",\n", + " load_format=\"bitsandbytes\",\n", + " max_model_len=4096,\n", + ")" ] }, { From be3852603eb3558d900717f7e4f5e8a8611f2fec Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 18 Oct 2024 18:45:52 +0100 Subject: [PATCH 20/28] Caught exception when supporitng docs not available --- dvc.lock | 56 ++++++++++++++++---------------- scripts/fetch_supporting_docs.py | 24 ++++++++------ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/dvc.lock b/dvc.lock index dd7f7b1..1d52d2e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -54,8 +54,8 @@ stages: size: 4584498 - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 - path: scripts/chunk_data.py hash: md5 md5: 681528e4aa1dc8cfb5fe5e5472e25fdf @@ -63,15 +63,15 @@ stages: outs: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json deps: - path: data/chunked_data.json hash: md5 - md5: 97f06c3b76ff05d62ccdecd9d5742712 - size: 137681 + md5: f6426396e1a3564b53649ef5fc0571fd + size: 993814 - path: scripts/create_embeddings.py hash: md5 md5: 4649c700dfae922b43b3608ee4f00c1a @@ -79,16 +79,16 @@ stages: outs: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 upload-to-docstore: cmd: python 
scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: 8d80ef225c59ede34d026f6f2930bae3 - size: 1894126 + md5: 8fd682131a282736f6a81a6c53040b1e + size: 13422675 - path: scripts/upload_to_docstore.py hash: md5 md5: 41da88e3bb6d2592bee938ce347f6983 @@ -96,18 +96,18 @@ stages: outs: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv data/chroma-data -c eidc-data deps: - path: data/chroma-data hash: md5 - md5: cc85398c596d4c5839714e93e33468bb.dir - size: 3580644 - nfiles: 5 + md5: 5c99644f30def03f87b37c98341c6f25.dir + size: 13758136 + nfiles: 6 - path: data/eidc_rag_test_sample.csv hash: md5 md5: a371d83c5822d256286e80d64d58c3fe @@ -119,8 +119,8 @@ stages: outs: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 generate-testset: cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/ outs: @@ -137,21 +137,21 @@ stages: size: 12157676 - path: scripts/fetch_supporting_docs.py hash: md5 - md5: de0c11e81bf10e040bef67e43466b789 - size: 1472 + md5: 923af3b6ce1447d388b08fab0e3ab77d + size: 1660 outs: - path: data/supporting-docs.json hash: md5 - md5: b0941cc9a7ca7df456157380bcc28f39 - size: 75646 + md5: f3ea9980226e5408497c96a10cc77b80 + size: 72013526 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: 9825cf7e7a89ca17634b44e9256eefc9 - size: 9695 + md5: 8ea0a3f240478e9db41855922ac534a6 + size: 9894 - path: scripts/evaluate.py hash: md5 md5: 10f76511eafc8a1a9b90e9ae92a76bc5 @@ -159,9 +159,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: 1279778c7e509e972d1f366157d24966 - size: 58228 + md5: bae77b1b721bf283a30a64f67af45fea + size: 74438 - path: data/metrics.json hash: md5 - md5: 2b93334ba0e8226c916d0964237cb72c - size: 225 + md5: 0145280f36071a6df551ef57d3f8393e + size: 229 diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 36354e7..66e77ac 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,4 +1,5 @@ from argparse import ArgumentParser +import logging import json from tqdm import tqdm import requests @@ -6,6 +7,7 @@ from typing import Dict, List from dotenv import load_dotenv +logger = logging.getLogger(__name__) def extract_ids(metadata_file: str): with open(metadata_file) as f: @@ -15,14 +17,18 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: - res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) - ) - json_data = res.json() - docs = [] - for key, val in json_data["success"].items(): - docs.append({"id": eidc_id, "field": key, "value": val}) - return docs + try: + res = requests.get( + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + ) + json_data = res.json() + docs = [] + for key, val in json_data["success"].items(): + docs.append({"id": eidc_id, "field": key, "value": val}) + return docs + except Exception as e: + logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + 
return [] def main(metadata_file: str, supporting_docs_file: str): @@ -33,8 +39,6 @@ def main(metadata_file: str, supporting_docs_file: str): docs = [] for id in tqdm(ids): docs.extend(get_supporting_docs(id, user, password)) - if len(docs) > 0: - break with open(supporting_docs_file, "w") as f: json.dump(docs, f, indent=4) From e9b504f66f6253c103c196d20176970e436de632 Mon Sep 17 00:00:00 2001 From: mpc Date: Mon, 21 Oct 2024 13:42:33 +0100 Subject: [PATCH 21/28] Testing dag diagram in mermaid format --- dag.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 dag.md diff --git a/dag.md b/dag.md new file mode 100644 index 0000000..91e16fe --- /dev/null +++ b/dag.md @@ -0,0 +1,23 @@ +```mermaid +flowchart TD + node1["chunk-data"] + node2["create-embeddings"] + node3["evaluate"] + node4["extract-metadata"] + node5["fetch-metadata"] + node6["fetch-supporting-docs"] + node7["generate-testset"] + node8["run-rag-pipeline"] + node9["upload-to-docstore"] + node1-->node2 + node2-->node9 + node4-->node1 + node5-->node4 + node5-->node6 + node6-->node1 + node7-->node8 + node8-->node3 + node9-->node8 + node10["data/evaluation-sets.dvc"] + node11["data/synthetic-datasets.dvc"] +``` From dbfb2bcfc610eab6b8c60b4cf350034e0b58f397 Mon Sep 17 00:00:00 2001 From: mpc Date: Mon, 21 Oct 2024 13:47:32 +0100 Subject: [PATCH 22/28] Updated readme /w mermaid format dag --- README.md | 67 +++++++++++++++++++++---------------------------------- dag.md | 23 ------------------- 2 files changed, 26 insertions(+), 64 deletions(-) delete mode 100644 dag.md diff --git a/README.md b/README.md index 20d03ac..3ccf9c1 100644 --- a/README.md +++ b/README.md @@ -28,48 +28,33 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co ```shell dvc dag ``` +or it can be output to mermaid format to display in markdown: +```shell +dvc dag -md ``` - +----------------+ - | fetch-metadata | - +----------------+ - ** ** - *** *** - ** ** - +------------------+ +-----------------------+ - | extract-metadata | | fetch-supporting-docs | - +------------------+ +-----------------------+ - ** ** - *** *** - ** ** - +------------+ - | chunk-data | - +------------+ - * - * - * - +-------------------+ - | create-embeddings | - +-------------------+ - * - * - * -+------------------+ +--------------------+ -| generate-testset | | upload-to-docstore | -+------------------+ +--------------------+ - ** ** - *** *** - ** ** - +------------------+ - | run-rag-pipeline | - +------------------+ - * - * - * - +----------+ - | evaluate | - +----------+ +```mermaid +flowchart TD + node1["chunk-data"] + node2["create-embeddings"] + node3["evaluate"] + node4["extract-metadata"] + node5["fetch-metadata"] + node6["fetch-supporting-docs"] + node7["generate-testset"] + node8["run-rag-pipeline"] + node9["upload-to-docstore"] + node1-->node2 + node2-->node9 + node4-->node1 + node5-->node4 + node5-->node6 + node6-->node1 + node7-->node8 + node8-->node3 + node9-->node8 + node10["data/evaluation-sets.dvc"] + node11["data/synthetic-datasets.dvc"] ``` - > Note: To re-run the `fetch-supporting-docs` stage of the pipeline you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file. 
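+
+A minimal `.env` file has the following shape (the `username` and `password` keys match what `scripts/fetch_supporting_docs.py` reads via `python-dotenv`; the values are placeholders):
+```shell
+username=<your-username>
+password=<your-password>
+```
+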
 ## Running Experiments
@@ -104,4 +89,4 @@ Notes on the use of Data Version Control and Continuous Machine Learning:
 ### vLLM
 Notes on running models with vLLM:
-- [vLLM](vllm.md)
\ No newline at end of file
+- [vLLM](vllm.md)
diff --git a/dag.md b/dag.md
deleted file mode 100644
index 91e16fe..0000000
--- a/dag.md
+++ /dev/null
@@ -1,23 +0,0 @@
-```mermaid
-flowchart TD
-    node1["chunk-data"]
-    node2["create-embeddings"]
-    node3["evaluate"]
-    node4["extract-metadata"]
-    node5["fetch-metadata"]
-    node6["fetch-supporting-docs"]
-    node7["generate-testset"]
-    node8["run-rag-pipeline"]
-    node9["upload-to-docstore"]
-    node1-->node2
-    node2-->node9
-    node4-->node1
-    node5-->node4
-    node5-->node6
-    node6-->node1
-    node7-->node8
-    node8-->node3
-    node9-->node8
-    node10["data/evaluation-sets.dvc"]
-    node11["data/synthetic-datasets.dvc"]
-```

From 501df1a1e46fc87dde4fd88be7a532b179bdcf62 Mon Sep 17 00:00:00 2001
From: mpc
Date: Mon, 21 Oct 2024 13:51:32 +0100
Subject: [PATCH 23/28] Removed vllm dependency and updated readme

---
 README.md      | 5 +++++
 pyproject.toml | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3ccf9c1..414f241 100644
--- a/README.md
+++ b/README.md
@@ -3,23 +3,28 @@ This repository contains a reproducible workflow setup using [DVC](https://dvc.o
 ## Requirements
 - [Ollama](https://ollama.com/download) ([`llama3.1`](https://ollama.com/library/llama3.1) and [`mistral-nemo`](https://ollama.com/library/mistral-nemo) models)
+- [Python 3.9+](https://www.python.org/downloads/)

 ## Getting started
+### Setup
 First, create a new virtual environment and install the required dependencies:
 ```shell
 python -m venv .venv
 source .venv/bin/activate
 pip install .
 ```
+### Configuration
 Next, set up your local DVC configuration with your [Jasmin object store access key](https://help.jasmin.ac.uk/docs/short-term-project-storage/using-the-jasmin-object-store/#creating-an-access-key-and-secret):
 ```shell
 dvc remote modify --local jasmin access_key_id ''
 dvc remote modify --local jasmin secret_access_key ''
 ```
+### Getting the data
 Pull the data from the object store using DVC:
 ```shell
 dvc pull
 ```
+### Working with the pipeline
 You should now be ready to re-run the pipeline:
 ```shell
 dvc repro
diff --git a/pyproject.toml b/pyproject.toml
index 5abe51d..16077cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ dependencies = [
     "kaleido",
     "dvc",
     "dvc[s3]",
-    "vllm",
     "bitsandbytes",
     "haystack-ai",
     "accelerate",
@@ -37,4 +36,4 @@ jupyter = [
 py-modules = []

 [tool.mypy]
-files = ["scripts"]
\ No newline at end of file
+files = ["scripts"]

From 731544ac35bf41db87dc75bd1de963156ee8a643 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 09:25:21 +0100
Subject: [PATCH 24/28] Froze dependencies to try and fix gh action

---
 .github/workflows/cml.yaml |  2 +-
 pyproject.toml             | 37 ++++++++++++++-----------------------
 2 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
index e48e94c..147f0b3 100644
--- a/.github/workflows/cml.yaml
+++ b/.github/workflows/cml.yaml
@@ -6,7 +6,7 @@ jobs:
     container: docker://ghcr.io/iterative/cml:0-dvc2-base1
     steps:
       - uses: actions/checkout@v3
-      - name: Train model
+      - name: Run pipeline
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
diff --git a/pyproject.toml b/pyproject.toml
index 5abe51d..7ecdec5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,28 +3,22 @@
 name = "llm-eval"
 dynamic = ["version"]

 dependencies = [
-    "plotly",
-    "pandas",
-
"numpy", - "kaleido", - "dvc", - "dvc[s3]", - "vllm", - "bitsandbytes", - "haystack-ai", - "accelerate", - "sentence-transformers", - "chromadb", + "plotly == 5.24.1", + "pandas == 2.2.3", + "numpy == 1.26.4", + "kaleido == 0.2.1", + "dvc[s3] == 3.2.0 ", + "bitsandbytes == 0.44.1", + "haystack-ai == 2.6.0", + "accelerate == 1.0.0", + "sentence-transformers == 3.1.1", + "chromadb == 0.5.15", "ollama-haystack == 0.0.7", - "chroma-haystack", + "chroma-haystack == 0.22.1", "ragas == 0.1.10", - "nltk", - "nbformat>=4.2.0", - "ruff", - "mypy", - "types-requests", - "types-tqdm", - "pandas-stubs", + "nltk == 3.9.1", + "nbformat == 4.2.0", + "ruff == 0.7.0", ] [project.optional-dependencies] @@ -35,6 +29,3 @@ jupyter = [ [tool.setuptools] py-modules = [] - -[tool.mypy] -files = ["scripts"] \ No newline at end of file From 68469993c89783d54b0bf50fa9c1756ad0eced6b Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 25 Oct 2024 10:18:21 +0100 Subject: [PATCH 25/28] Fixed gh actions pipeline --- .github/workflows/cml.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml index 147f0b3..02ca014 100644 --- a/.github/workflows/cml.yaml +++ b/.github/workflows/cml.yaml @@ -3,9 +3,12 @@ on: [push] jobs: train-and-report: runs-on: ubuntu-latest - container: docker://ghcr.io/iterative/cml:0-dvc2-base1 steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - uses: iterative/setup-cml@v2 - name: Run pipeline env: REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 6a2f0b4881afa6c0a97e347335e3b118b12b717a Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 25 Oct 2024 11:13:50 +0100 Subject: [PATCH 26/28] re-added ruff --- notebooks/ragas_eval.ipynb | 20 ++++++++++---------- notebooks/ragas_synth.ipynb | 10 +++++----- pyproject.toml | 27 ++++++++++++++++++++++----- scripts/chunk_data.py | 6 ++++-- scripts/create_embeddings.py | 8 +++++--- scripts/evaluate.py | 23 ++++++++++++----------- scripts/extract_metadata.py | 5 ++--- scripts/fetch_eidc_metadata.py | 3 ++- scripts/fetch_supporting_docs.py | 21 +++++++++++++-------- scripts/run_rag_pipeline.py | 19 ++++++++++--------- scripts/upload_to_docstore.py | 13 ++++++++----- 11 files changed, 93 insertions(+), 62 deletions(-) diff --git a/notebooks/ragas_eval.ipynb b/notebooks/ragas_eval.ipynb index 56bcb43..395269f 100644 --- a/notebooks/ragas_eval.ipynb +++ b/notebooks/ragas_eval.ipynb @@ -37,15 +37,15 @@ } ], "source": [ + "import nest_asyncio\n", "import pandas as pd\n", - "from datasets import Dataset\n", - "from ragas import evaluate\n", - "from ragas.run_config import RunConfig\n", - "from langchain_community.embeddings import OllamaEmbeddings\n", - "from langchain_community.chat_models import ChatOllama\n", "import plotly.graph_objects as go\n", "import plotly.io as pio\n", - "import nest_asyncio" + "from datasets import Dataset\n", + "from langchain_community.chat_models import ChatOllama\n", + "from langchain_community.embeddings import OllamaEmbeddings\n", + "from ragas import evaluate\n", + "from ragas.run_config import RunConfig" ] }, { @@ -248,13 +248,13 @@ "outputs": [], "source": [ "from ragas.metrics import (\n", - " faithfulness,\n", + " answer_correctness,\n", " answer_relevancy,\n", + " answer_similarity,\n", + " context_entity_recall,\n", " context_precision,\n", " context_recall,\n", - " context_entity_recall,\n", - " answer_similarity,\n", - " answer_correctness,\n", + " faithfulness,\n", ")" ] }, diff --git a/notebooks/ragas_synth.ipynb 
b/notebooks/ragas_synth.ipynb index cf39b04..f8057b0 100644 --- a/notebooks/ragas_synth.ipynb +++ b/notebooks/ragas_synth.ipynb @@ -14,12 +14,12 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.embeddings import OllamaEmbeddings\n", + "import nest_asyncio\n", "from langchain_community.chat_models import ChatOllama\n", - "from ragas.testset.generator import TestsetGenerator\n", - "from ragas.testset.evolutions import simple, reasoning, multi_context\n", + "from langchain_community.embeddings import OllamaEmbeddings\n", "from ragas.run_config import RunConfig\n", - "import nest_asyncio" + "from ragas.testset.evolutions import multi_context, reasoning, simple\n", + "from ragas.testset.generator import TestsetGenerator" ] }, { @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "docs = [] # load a set of langchain documents to base the synthetic test set generation on" + "docs = [] # load a set of langchain docs to base the synthetic test set generation on" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 7ecdec5..bb37d7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [project] name = "llm-eval" -dynamic = ["version"] - +version = "0.1.0" dependencies = [ "plotly == 5.24.1", "pandas == 2.2.3", @@ -18,7 +17,6 @@ dependencies = [ "ragas == 0.1.10", "nltk == 3.9.1", "nbformat == 4.2.0", - "ruff == 0.7.0", ] [project.optional-dependencies] @@ -26,6 +24,25 @@ jupyter = [ "ipykernel", "ipywidgets", ] +lint = [ + "ruff == 0.7.1", + "mypy == 1.13.0", +] +dev = [ + "llm-eval[jupyter,lint]" +] + +[tool.ruff.lint] +select = [ + "I", + "E", + "F", + "ANN" +] +fixable = ["ALL"] + +[tool.ruff] +line-length = 88 -[tool.setuptools] -py-modules = [] +[tool.ruff.lint.pydocstyle] +convention = "google" diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py index ace111d..28707ed 100644 --- a/scripts/chunk_data.py +++ b/scripts/chunk_data.py @@ -1,6 +1,6 @@ -from typing import List, Dict import json from argparse import ArgumentParser +from typing import Any, Dict, List def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]: @@ -12,7 +12,9 @@ def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]: return chunks -def chunk_metadata_value(metada_value, chunk_size, overlap): +def chunk_metadata_value( + metada_value: str, chunk_size: int, overlap: int +) -> List[Dict[str, Any]]: chunks = chunk_value(metada_value["value"], chunk_size, overlap) return [ { diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py index 2ad9cc9..7aa507c 100644 --- a/scripts/create_embeddings.py +++ b/scripts/create_embeddings.py @@ -1,15 +1,17 @@ import json -from sentence_transformers import SentenceTransformer from argparse import ArgumentParser + +from sentence_transformers import SentenceTransformer +from torch import Tensor from tqdm import tqdm -def create_embedding(text): +def create_embedding(text: str) -> Tensor: model = SentenceTransformer("all-MiniLM-L6-v2") return model.encode(text) -def main(input_file, output_file): +def main(input_file: str, output_file: str) -> None: with open(input_file) as input, open(output_file, "w") as output: data = json.load(input) for chunk in tqdm(data): diff --git a/scripts/evaluate.py b/scripts/evaluate.py index d7ac98f..c130e96 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -1,23 +1,24 @@ +import json from argparse import ArgumentParser + +import nest_asyncio import pandas as pd -from datasets import Dataset -from ragas import evaluate -from ragas.run_config 
import RunConfig -from langchain_community.embeddings import OllamaEmbeddings -from langchain_community.chat_models import ChatOllama import plotly.graph_objects as go import plotly.io as pio -import nest_asyncio +from datasets import Dataset +from langchain_community.chat_models import ChatOllama +from langchain_community.embeddings import OllamaEmbeddings +from ragas import evaluate from ragas.metrics import ( - faithfulness, + answer_correctness, answer_relevancy, + answer_similarity, + context_entity_recall, context_precision, context_recall, - context_entity_recall, - answer_similarity, - answer_correctness, + faithfulness, ) -import json +from ragas.run_config import RunConfig def main(eval_dataset: str, metric_output: str, image_output: str) -> None: diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py index 8007d09..9bd4c3c 100644 --- a/scripts/extract_metadata.py +++ b/scripts/extract_metadata.py @@ -1,7 +1,6 @@ -from typing import List, Dict import json from argparse import ArgumentParser - +from typing import Dict, List METADATA_FIELDS = ["title", "description", "lineage"] @@ -30,7 +29,7 @@ def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]: return data -def main(input, output) -> None: +def main(input: str, output: str) -> None: data = parse_eidc_metadata(input) with open(output, "w") as f: json.dump(data, f, indent=4) diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py index f411c16..0ab6297 100644 --- a/scripts/fetch_eidc_metadata.py +++ b/scripts/fetch_eidc_metadata.py @@ -1,7 +1,8 @@ -import requests import json from argparse import ArgumentParser +import requests + URL = "https://catalogue.ceh.ac.uk/eidc/documents" diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py index 66e77ac..d95493b 100644 --- a/scripts/fetch_supporting_docs.py +++ b/scripts/fetch_supporting_docs.py @@ -1,15 +1,17 @@ -from argparse import ArgumentParser -import logging import json -from tqdm import tqdm -import requests +import logging import os +from argparse import ArgumentParser from typing import Dict, List + +import requests from dotenv import load_dotenv +from tqdm import tqdm logger = logging.getLogger(__name__) -def extract_ids(metadata_file: str): + +def extract_ids(metadata_file: str) -> List[str]: with open(metadata_file) as f: json_data = json.load(f) ids = [dataset["identifier"] for dataset in json_data["results"]] @@ -19,7 +21,8 @@ def extract_ids(metadata_file: str): def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]: try: res = requests.get( - f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", auth=(user, password) + f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents", + auth=(user, password), ) json_data = res.json() docs = [] @@ -27,11 +30,13 @@ def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str docs.append({"id": eidc_id, "field": key, "value": val}) return docs except Exception as e: - logger.error(f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e) + logger.error( + f"Failed to download supporting docs for dataset {eidc_id}", exc_info=e + ) return [] -def main(metadata_file: str, supporting_docs_file: str): +def main(metadata_file: str, supporting_docs_file: str) -> None: load_dotenv() user = os.getenv("username") password = os.getenv("password") diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index 91408ea..2c620e5 100644 --- a/scripts/run_rag_pipeline.py +++ 
b/scripts/run_rag_pipeline.py
@@ -1,13 +1,14 @@
-from argparse import ArgumentParser
 import shutil
+from argparse import ArgumentParser
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
 from haystack import Pipeline
-from haystack_integrations.document_stores.chroma import ChromaDocumentStore
-from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
 from haystack.components.builders import PromptBuilder
-from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
 from haystack.components.builders.answer_builder import AnswerBuilder
-import pandas as pd
-
+from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
+from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
+from haystack_integrations.document_stores.chroma import ChromaDocumentStore

 TMP_DOC_PATH = ".tmp/doc-store"
@@ -61,7 +62,7 @@ def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline:
     return rag_pipe


-def run_query(query: str, pipeline: Pipeline):
+def run_query(query: str, pipeline: Pipeline) -> Dict[str, Any]:
     return pipeline.run(
         {
             "retriever": {"query": query},
@@ -71,7 +72,7 @@
     )


-def query_pipeline(questions, rag_pipe):
+def query_pipeline(questions: List[str], rag_pipe: Pipeline) -> Tuple[str, List[str]]:
     answers = []
     contexts = []
     for q in questions:
@@ -85,7 +86,7 @@
 def main(
     test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str
-):
+) -> None:
     shutil.copytree(doc_store_path, TMP_DOC_PATH)

     rag_pipe = build_rag_pipeline("llama3.1", collection_name)
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
index 7b547d7..9f1a880 100644
--- a/scripts/upload_to_docstore.py
+++ b/scripts/upload_to_docstore.py
@@ -1,14 +1,16 @@
-from argparse import ArgumentParser
 import json
-import uuid
-import shutil
 import os
+import shutil
+import uuid
+from argparse import ArgumentParser

 import chromadb
 from chromadb.utils import embedding_functions


-def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
+def main(
+    input_file: str, output_path: str, collection_name: str, embedding_model: str
+) -> None:
     if os.path.exists(output_path):
         shutil.rmtree(output_path)
@@ -55,7 +57,8 @@ def main(input_file: str, output_path: str, collection_name: str, embedding_mode
     parser.add_argument(
         "-em",
         "--embedding_model",
-        help="Embedding model to use in the doc store (must be the same as the function used to create embeddings.)",
+        help="""Embedding model to use in the doc store (must be the same as the
+        function used to create embeddings.)""",
         default="all-MiniLM-L6-v2",
     )
     args = parser.parse_args()

From aa292247c15489b050a473e9acb711d6c3393703 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 11:53:29 +0100
Subject: [PATCH 27/28] Re-added py-modules setup

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index bb37d7f..fdd2586 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,3 +46,6 @@ line-length = 88

 [tool.ruff.lint.pydocstyle]
 convention = "google"
+
+- [tool.setuptools]
+- py-modules = []
\ No newline at end of file

From 5d0430116d93c40306bb565a53995e2d1bc8fba5 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 25 Oct 2024 11:47:22 +0000
Subject: [PATCH 28/28] Update pyproject.toml

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml index fdd2586..3dda280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,5 +47,5 @@ line-length = 88 [tool.ruff.lint.pydocstyle] convention = "google" -- [tool.setuptools] -- py-modules = [] \ No newline at end of file +[tool.setuptools] +py-modules = []
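
With the stray `- ` prefixes removed by this final patch, `pyproject.toml` parses as valid TOML again. A quick way to sanity-check the file after an edit like this (a minimal sketch; `tomllib` is in the Python standard library from 3.11 onwards):
```shell
python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['setuptools'])"
```
If the file is well-formed this prints the setuptools table (`{'py-modules': []}`); otherwise it fails with a `TOMLDecodeError` pointing at the offending line.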