From 4f7ab43272ae4dd80d27fbeafa05f1e8ba415844 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 18 Oct 2024 09:51:39 +0100
Subject: [PATCH] Added ruff, mypy and cleaned scripts

---
 dvc.lock                       | 61 +++++++++++++++++-----------------
 dvc.yaml                       |  4 +--
 params.yaml                    |  3 ++
 pyproject.toml                 |  8 +++++
 scripts/create_embeddings.py   |  3 +-
 scripts/evaluate.py            | 27 ++++++++++-----
 scripts/extract_metadata.py    |  6 ++--
 scripts/fetch_eidc_metadata.py |  1 +
 scripts/run_rag_pipeline.py    | 36 ++++++++++++++++----
 scripts/upload_to_docstore.py  |  6 +++-
 10 files changed, 104 insertions(+), 51 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 8b454b4..d143f87 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,13 +5,13 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: ba838a284da239217d0464f08e0a45ce
-      size: 674
+      md5: 53d620665448ef91f2deedb517e2f502
+      size: 675
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
-      size: 12146216
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,25 +33,25 @@ stages:
    - path: data/eidc_metadata.json
       hash: md5
-      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
-      size: 12146216
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
-      size: 1296
+      md5: 3f0269a6413845f4425af55e7cea7bf8
+      size: 1304
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: fce18ce3c43175af1cea5d84dac9baf9
-      size: 4579965
+      md5: 789fda7a14f9a85c6ee0e10af8170a95
+      size: 4584498
   chunk-data:
     cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s 10
       data/extracted_metadata.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: fce18ce3c43175af1cea5d84dac9baf9
-      size: 4579965
+      md5: 789fda7a14f9a85c6ee0e10af8170a95
+      size: 4584498
     - path: data/supporting-docs.json
       hash: md5
       md5: 0febface6f1d23fda46c11bef65284f4
       size:
@@ -74,7 +74,7 @@ stages:
       size: 14947
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 3dc6ef284730398375a13df4bff41846
+      md5: 4649c700dfae922b43b3608ee4f00c1a
       size: 808
     outs:
     - path: data/embeddings.json
@@ -83,7 +83,7 @@ stages:
       size: 351126
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
-      -em all-MiniLM-L6-v2
+      -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
@@ -91,20 +91,21 @@ stages:
       size: 351126
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: ae8755770166dd3d6c1efb9f15723116
-      size: 1836
+      md5: 41da88e3bb6d2592bee938ce347f6983
+      size: 1905
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
+      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
       size: 2069220
       nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
+      data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 0254e85bb660da611cfa14e5221dae92.dir
+      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
       size: 2069220
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
@@ -113,13 +114,13 @@ stages:
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 6d1f49fa8b22288ecd50ed0e3898fd60
-      size: 3153
+      md5: 8d5fc0669771146562c773186f4f44f6
+      size: 3667
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 47a0adeb2ee1cb67202048684064d30f
-      size: 7293
+      md5: f6bce3f5c551e84da224d36201858839
+      size: 6638
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -140,18 +141,18 @@ stages:
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 47a0adeb2ee1cb67202048684064d30f
-      size: 7293
+      md5: f6bce3f5c551e84da224d36201858839
+      size: 6638
     - path: scripts/evaluate.py
       hash: md5
-      md5: 51f036b805f23dd3ebfd5d819bc9d457
-      size: 2489
+      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
+      size: 2633
     outs:
     - path: data/eval.png
       hash: md5
-      md5: 8c11f987449f8718b6f6011078b6c259
-      size: 49498
+      md5: fd66aa842f93e8f370399dae5b68e2fe
+      size: 50525
     - path: data/metrics.json
       hash: md5
-      md5: 53fba29cb236fedd3c6446ea94fea3cc
-      size: 215
+      md5: 55266ae1bd64a3499508d07651a5aa13
+      size: 214
diff --git a/dvc.yaml b/dvc.yaml
index 59a6ccc..fa419ff 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -32,7 +32,7 @@ stages:
     outs:
     - ${files.embeddings}
   upload-to-docstore:
-    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model}
+    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${doc-store.files} -em ${hp.embeddings-model} -c ${doc-store.collection}
     deps:
     - ${files.embeddings}
     - scripts/upload_to_docstore.py
@@ -43,7 +43,7 @@ stages:
     outs:
     - ${files.test-set}
   run-rag-pipeline:
-    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
+    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection}
     deps:
     - ${files.test-set}
     - ${files.doc-store}
diff --git a/params.yaml b/params.yaml
index 900e48f..988dbdb 100644
--- a/params.yaml
+++ b/params.yaml
@@ -2,6 +2,9 @@ hp:
   chunk-size: 300
   overlap: 100
   embeddings-model: "all-MiniLM-L6-v2"
+doc-store:
+  collection: "eidc-data"
+  files: "data/chroma-data"
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
diff --git a/pyproject.toml b/pyproject.toml
index 4844faf..5abe51d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,11 @@ dependencies = [
     "ragas == 0.1.10",
     "nltk",
     "nbformat>=4.2.0",
+    "ruff",
+    "mypy",
+    "types-requests",
+    "types-tqdm",
+    "pandas-stubs",
 ]

 [project.optional-dependencies]
@@ -30,3 +35,6 @@ jupyter = [

 [tool.setuptools]
 py-modules = []
+
+[tool.mypy]
+files = ["scripts"]
\ No newline at end of file
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index ce1c37b..2ad9cc9 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -3,9 +3,10 @@
 from argparse import ArgumentParser
 from tqdm import tqdm

+
 def create_embedding(text):
     model = SentenceTransformer("all-MiniLM-L6-v2")
-    return model.encode(text) 
+    return model.encode(text)


 def main(input_file, output_file):
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index 10b3a61..d7ac98f 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -19,12 +19,13 @@
 )
 import json

+
 def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
-    nest_asyncio.apply() # apply the event loop async fix
+    nest_asyncio.apply()  # apply the event loop async fix
     df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval})
     eval_dataset = Dataset.from_pandas(df)
-    llm = ChatOllama(model='mistral-nemo', num_ctx=16384)
-    embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)
+    llm = ChatOllama(model="mistral-nemo", num_ctx=16384)
+    embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)
     result = evaluate(
         eval_dataset,
         metrics=[
@@ -45,19 +46,29 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         ],
     )
     result_df = result.to_pandas()
     pio.templates.default = "gridon"
     fig = go.Figure()
-
     with open(metric_output, "w") as f:
         json.dump(result, f)
-    metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]]
+    metrics = [
+        metric
+        for metric in result_df.columns.to_list()
+        if metric not in ["question", "ground_truth", "answer", "contexts"]
+    ]
     for metric in metrics:
-        fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True))
-    fig.update_yaxes(range=[-0.02,1.02])
+        fig.add_trace(
+            go.Violin(
+                y=result_df[metric],
+                name=metric,
+                points="all",
+                box_visible=True,
+                meanline_visible=True,
+            )
+        )
+    fig.update_yaxes(range=[-0.02, 1.02])
     with open(image_output, "wb") as f:
         f.write(fig.to_image(format="png"))

-
 if __name__ == "__main__":
     parser = ArgumentParser("evaluate.py")
     parser.add_argument("eval_dataset", help="File containing the evaluation data.")
diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py
index 241bc1a..8007d09 100644
--- a/scripts/extract_metadata.py
+++ b/scripts/extract_metadata.py
@@ -6,7 +6,9 @@
 METADATA_FIELDS = ["title", "description", "lineage"]


-def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]:
+def extact_eidc_metadata_fields(
+    json_data: Dict, fields: List[str] = METADATA_FIELDS
+) -> List[Dict[str, str]]:
     metadatas = []
     for field in fields:
         if json_data[field]:
@@ -18,7 +20,7 @@ def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FI
     return metadatas


-def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
+def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]:
     data = []
     with open(file_path) as f:
         json_data = json.load(f)
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index cd56b4e..f411c16 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -4,6 +4,7 @@

 URL = "https://catalogue.ceh.ac.uk/eidc/documents"

+
 def main(output_file: str) -> None:
     res = requests.get(
         URL,
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index 830a052..91408ea 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import shutil
 from haystack import Pipeline
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
 from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
@@ -8,9 +9,12 @@
 import pandas as pd


-def build_rag_pipeline(model_name: str) -> Pipeline:
+TMP_DOC_PATH = ".tmp/doc-store"
+
+
+def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline:
     document_store = ChromaDocumentStore(
-        collection_name="eidc-data", persist_path="data/chroma-data"
+        collection_name=collection_name, persist_path=TMP_DOC_PATH
     )
     retriever = ChromaQueryTextRetriever(document_store, top_k=3)
     print("Creating prompt template...")
@@ -73,22 +77,30 @@ def query_pipeline(questions, rag_pipe):
     for q in questions:
         response = run_query(q, rag_pipe)
         answers.append(response["answer_builder"]["answers"][0].data)
-        contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+        contexts.append(
+            [doc.content for doc in response["answer_builder"]["answers"][0].documents]
+        )
     return answers, contexts


-def main(test_data_file: str, ouput_file: str):
-    rag_pipe = build_rag_pipeline("llama3.1")
+def main(
+    test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str
+):
+    shutil.copytree(doc_store_path, TMP_DOC_PATH)
+
+    rag_pipe = build_rag_pipeline("llama3.1", collection_name)
     df = pd.read_csv(test_data_file)
     df.drop(columns=["rating", "contexts"], inplace=True)
     answers, contexts = query_pipeline(df["question"], rag_pipe)
-    
+
     df["answer"] = answers
     df["contexts"] = contexts
     df.to_csv(ouput_file, index=False)

+    shutil.rmtree(TMP_DOC_PATH)
+

 if __name__ == "__main__":
     parser = ArgumentParser("run_rag_pipeline.py")
     parser.add_argument(
         "test_data_file",
@@ -100,5 +112,15 @@ def main(test_data_file: str, ouput_file: str):
         "output_file",
         help="File to output results to.",
     )
+    parser.add_argument(
+        "doc_store_path",
+        help="Path to the doc store.",
+    )
+    parser.add_argument(
+        "-c",
+        "--collection",
+        help="Collection name in doc store.",
+        default="eidc-data",
+    )
     args = parser.parse_args()
-    main(args.test_data_file, args.output_file)
+    main(args.test_data_file, args.output_file, args.doc_store_path, args.collection)
diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py
index 4f2e8af..7b547d7 100644
--- a/scripts/upload_to_docstore.py
+++ b/scripts/upload_to_docstore.py
@@ -1,13 +1,17 @@
 from argparse import ArgumentParser
 import json
 import uuid
+import shutil
+import os

 import chromadb
 from chromadb.utils import embedding_functions


 def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
-    print(collection_name)
+    if os.path.exists(output_path):
+        shutil.rmtree(output_path)
+
     with open(input_file) as f:
         json_data = json.load(f)
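
Notes: the commands below are a minimal sketch of how the tooling added by this patch might be exercised locally; the exact invocations are an assumption, not part of the commit. No [tool.ruff] section is defined yet, so ruff's defaults apply, while the new [tool.mypy] section already points `files` at the scripts directory.

    # install the project plus its new dev dependencies (ruff, mypy, stub packages)
    pip install -e .

    # lint the scripts with ruff's default rule set
    ruff check scripts

    # type-check; mypy reads files = ["scripts"] from pyproject.toml
    mypy

    # reproduce the DVC pipeline, now including the doc-store path and collection arguments
    dvc repro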