Commit 4f7ab43: Added ruff, mypy and cleaned scripts

matthewcoole committed Oct 18, 2024
1 parent ccd4e3c commit 4f7ab43
Showing 10 changed files with 104 additions and 51 deletions.
61 changes: 31 additions & 30 deletions dvc.lock
@@ -5,13 +5,13 @@ stages:
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: ba838a284da239217d0464f08e0a45ce
-      size: 674
+      md5: 53d620665448ef91f2deedb517e2f502
+      size: 675
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
-      size: 12146216
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,25 +33,25 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: fc2f9ebe92cbd07eb06ff6e39366fdac
-      size: 12146216
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
-      size: 1296
+      md5: 3f0269a6413845f4425af55e7cea7bf8
+      size: 1304
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: fce18ce3c43175af1cea5d84dac9baf9
-      size: 4579965
+      md5: 789fda7a14f9a85c6ee0e10af8170a95
+      size: 4584498
   chunk-data:
     cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
       10 data/extracted_metadata.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: fce18ce3c43175af1cea5d84dac9baf9
-      size: 4579965
+      md5: 789fda7a14f9a85c6ee0e10af8170a95
+      size: 4584498
     - path: data/supporting-docs.json
       hash: md5
       md5: 0febface6f1d23fda46c11bef65284f4
@@ -74,7 +74,7 @@ stages:
       size: 14947
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 3dc6ef284730398375a13df4bff41846
+      md5: 4649c700dfae922b43b3608ee4f00c1a
       size: 808
     outs:
     - path: data/embeddings.json
@@ -83,28 +83,29 @@ stages:
       size: 351126
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
-      -em all-MiniLM-L6-v2
+      -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
       md5: b08299369d1f243eb8d8ffa2cdb9a90f
       size: 351126
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: ae8755770166dd3d6c1efb9f15723116
-      size: 1836
+      md5: 41da88e3bb6d2592bee938ce347f6983
+      size: 1905
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
+      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
       size: 2069220
       nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
+      data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 0254e85bb660da611cfa14e5221dae92.dir
+      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
       size: 2069220
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
@@ -113,13 +114,13 @@ stages:
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 6d1f49fa8b22288ecd50ed0e3898fd60
-      size: 3153
+      md5: 8d5fc0669771146562c773186f4f44f6
+      size: 3667
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 47a0adeb2ee1cb67202048684064d30f
-      size: 7293
+      md5: f6bce3f5c551e84da224d36201858839
+      size: 6638
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -140,18 +141,18 @@ stages:
     deps:
    - path: data/evaluation_data.csv
       hash: md5
-      md5: 47a0adeb2ee1cb67202048684064d30f
-      size: 7293
+      md5: f6bce3f5c551e84da224d36201858839
+      size: 6638
     - path: scripts/evaluate.py
       hash: md5
-      md5: 51f036b805f23dd3ebfd5d819bc9d457
-      size: 2489
+      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
+      size: 2633
     outs:
     - path: data/eval.png
       hash: md5
-      md5: 8c11f987449f8718b6f6011078b6c259
-      size: 49498
+      md5: fd66aa842f93e8f370399dae5b68e2fe
+      size: 50525
     - path: data/metrics.json
       hash: md5
-      md5: 53fba29cb236fedd3c6446ea94fea3cc
-      size: 215
+      md5: 55266ae1bd64a3499508d07651a5aa13
+      size: 214
4 changes: 2 additions & 2 deletions dvc.yaml
@@ -32,7 +32,7 @@ stages:
     outs:
       - ${files.embeddings}
   upload-to-docstore:
-    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${files.doc-store} -em ${hp.embeddings-model}
+    cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${doc-store.files} -em ${hp.embeddings-model} -c ${doc-store.collection}
     deps:
       - ${files.embeddings}
       - scripts/upload_to_docstore.py
@@ -43,7 +43,7 @@ stages:
     outs:
      - ${files.test-set}
   run-rag-pipeline:
-    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
+    cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set} ${files.doc-store} -c ${doc-store.collection}
     deps:
       - ${files.test-set}
       - ${files.doc-store}
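With the stage definitions updated, the pipeline can be re-run through DVC. A minimal sketch, assuming a standard DVC checkout (stage names are the ones defined in dvc.yaml above):

    dvc repro upload-to-docstore   # rebuild the Chroma store with the new collection name
    dvc repro run-rag-pipeline     # re-run the RAG pipeline against that collection
    dvc repro                      # or reproduce every stage whose dependencies changed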
3 changes: 3 additions & 0 deletions params.yaml
@@ -2,6 +2,9 @@ hp:
   chunk-size: 300
   overlap: 100
   embeddings-model: "all-MiniLM-L6-v2"
+doc-store:
+  collection: "eidc-data"
+  files: "data/chroma-data"
 files:
   metadata: "data/eidc_metadata.json"
   extracted: "data/extracted_metadata.json"
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -20,6 +20,11 @@ dependencies = [
     "ragas == 0.1.10",
     "nltk",
     "nbformat>=4.2.0",
+    "ruff",
+    "mypy",
+    "types-requests",
+    "types-tqdm",
+    "pandas-stubs",
 ]
 
 [project.optional-dependencies]
@@ -30,3 +35,6 @@ jupyter = [
 
 [tool.setuptools]
 py-modules = []
+
+[tool.mypy]
+files = ["scripts"]
3 changes: 2 additions & 1 deletion scripts/create_embeddings.py
@@ -3,9 +3,10 @@
 from argparse import ArgumentParser
 from tqdm import tqdm
 
+
 def create_embedding(text):
     model = SentenceTransformer("all-MiniLM-L6-v2")
-    return model.encode(text)
+    return model.encode(text)
 
 
 def main(input_file, output_file):
27 changes: 19 additions & 8 deletions scripts/evaluate.py
@@ -19,12 +19,13 @@
 )
 import json
 
+
 def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
-    nest_asyncio.apply() # apply the event loop async fix
+    nest_asyncio.apply()  # apply the event loop async fix
     df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval})
     eval_dataset = Dataset.from_pandas(df)
-    llm = ChatOllama(model='mistral-nemo', num_ctx=16384)
-    embeddings = OllamaEmbeddings(model='mistral-nemo', num_ctx=16384)
+    llm = ChatOllama(model="mistral-nemo", num_ctx=16384)
+    embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)
     result = evaluate(
         eval_dataset,
         metrics=[
@@ -45,19 +46,29 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
     pio.templates.default = "gridon"
     fig = go.Figure()
 
+
     with open(metric_output, "w") as f:
         json.dump(result, f)
-    metrics = [metric for metric in result_df.columns.to_list() if metric not in ["question", "ground_truth", "answer", "contexts"]]
+    metrics = [
+        metric
+        for metric in result_df.columns.to_list()
+        if metric not in ["question", "ground_truth", "answer", "contexts"]
+    ]
 
     for metric in metrics:
-        fig.add_trace(go.Violin(y=result_df[metric], name=metric, points="all", box_visible=True, meanline_visible=True))
-    fig.update_yaxes(range=[-0.02,1.02])
+        fig.add_trace(
+            go.Violin(
+                y=result_df[metric],
+                name=metric,
+                points="all",
+                box_visible=True,
+                meanline_visible=True,
+            )
+        )
+    fig.update_yaxes(range=[-0.02, 1.02])
     with open(image_output, "wb") as f:
         f.write(fig.to_image(format="png"))
 
 
 
 if __name__ == "__main__":
     parser = ArgumentParser("evaluate.py")
     parser.add_argument("eval_dataset", help="File containing the evaluation data.")
6 changes: 4 additions & 2 deletions scripts/extract_metadata.py
@@ -6,7 +6,9 @@
 METADATA_FIELDS = ["title", "description", "lineage"]
 
 
-def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]:
+def extact_eidc_metadata_fields(
+    json_data: Dict, fields: List[str] = METADATA_FIELDS
+) -> List[Dict[str, str]]:
     metadatas = []
     for field in fields:
         if json_data[field]:
@@ -18,7 +20,7 @@ def extact_eidc_metadata_fields(
     return metadatas
 
 
-def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
+def parse_eidc_metadata(file_path: str) -> List[Dict[str, str]]:
     data = []
     with open(file_path) as f:
         json_data = json.load(f)
1 change: 1 addition & 0 deletions scripts/fetch_eidc_metadata.py
@@ -4,6 +4,7 @@
 
 URL = "https://catalogue.ceh.ac.uk/eidc/documents"
 
+
 def main(output_file: str) -> None:
     res = requests.get(
         URL,
36 changes: 29 additions & 7 deletions scripts/run_rag_pipeline.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import shutil
 from haystack import Pipeline
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
 from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
@@ -8,9 +9,12 @@
 import pandas as pd
 
 
-def build_rag_pipeline(model_name: str) -> Pipeline:
+TMP_DOC_PATH = ".tmp/doc-store"
+
+
+def build_rag_pipeline(model_name: str, collection_name: str) -> Pipeline:
     document_store = ChromaDocumentStore(
-        collection_name="eidc-data", persist_path="data/chroma-data"
+        collection_name=collection_name, persist_path=TMP_DOC_PATH
     )
     retriever = ChromaQueryTextRetriever(document_store, top_k=3)
     print("Creating prompt template...")
@@ -73,22 +77,30 @@ def query_pipeline(questions, rag_pipe):
     for q in questions:
         response = run_query(q, rag_pipe)
         answers.append(response["answer_builder"]["answers"][0].data)
-        contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+        contexts.append(
+            [doc.content for doc in response["answer_builder"]["answers"][0].documents]
+        )
     return answers, contexts
 
 
-def main(test_data_file: str, ouput_file: str):
-    rag_pipe = build_rag_pipeline("llama3.1")
+def main(
+    test_data_file: str, ouput_file: str, doc_store_path: str, collection_name: str
+):
+    shutil.copytree(doc_store_path, TMP_DOC_PATH)
+
+    rag_pipe = build_rag_pipeline("llama3.1", collection_name)
 
     df = pd.read_csv(test_data_file)
     df.drop(columns=["rating", "contexts"], inplace=True)
 
     answers, contexts = query_pipeline(df["question"], rag_pipe)
 
     df["answer"] = answers
     df["contexts"] = contexts
     df.to_csv(ouput_file, index=False)
 
+    shutil.rmtree(TMP_DOC_PATH)
+
 
 if __name__ == "__main__":
     parser = ArgumentParser("run_rag_pipeline.py")
@@ -100,5 +112,15 @@ def main(test_data_file: str, ouput_file: str):
         "output_file",
         help="File to output results to.",
     )
+    parser.add_argument(
+        "doc_store_path",
+        help="Path to the doc store.",
+    )
+    parser.add_argument(
+        "-c",
+        "--collection",
+        help="Collection name in doc store.",
+        default="eidc-data",
+    )
     args = parser.parse_args()
-    main(args.test_data_file, args.output_file)
+    main(args.test_data_file, args.output_file, args.doc_store_path, args.collection)
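The new positional doc-store argument and -c flag match the updated run-rag-pipeline cmd recorded in dvc.lock:

    python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv data/chroma-data -c eidc-data

Copying the store to .tmp/doc-store for the duration of the run and deleting it afterwards means the DVC-tracked data/chroma-data directory is never opened in place, presumably to keep DVC's view of that output stable.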
6 changes: 5 additions & 1 deletion scripts/upload_to_docstore.py
@@ -1,13 +1,17 @@
 from argparse import ArgumentParser
 import json
 import uuid
+import shutil
+import os
 
 import chromadb
 from chromadb.utils import embedding_functions
 
 
 def main(input_file: str, output_path: str, collection_name: str, embedding_model: str):
+    print(collection_name)
+    if os.path.exists(output_path):
+        shutil.rmtree(output_path)
+
     with open(input_file) as f:
         json_data = json.load(f)
 
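This lines up with the updated upload-to-docstore cmd in dvc.lock:

    python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data

Removing any existing store at the output path before uploading makes the stage safe to re-run from scratch.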
