Added evaluation script to pipeline
matthewcoole committed Oct 17, 2024
1 parent f9b9b3e commit 419e83f
Showing 7 changed files with 220 additions and 42 deletions.
2 changes: 2 additions & 0 deletions data/.gitignore
@@ -11,3 +11,5 @@
 /evaluation_data.csv
 /eidc_rag_test_sample.csv
 /supporting-docs.json
+/metrics.json
+/eval.png
26 changes: 20 additions & 6 deletions dvc.lock
@@ -104,7 +104,7 @@ stages:
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 1d7c499f71791267391ff4108632988c.dir
+      md5: 0254e85bb660da611cfa14e5221dae92.dir
       size: 2069220
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
@@ -118,8 +118,8 @@
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -135,9 +135,23 @@
       md5: 0febface6f1d23fda46c11bef65284f4
       size: 34
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
+      -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: e313cb899c10a2b5ad670b8bc84d059f
-      size: 8407
+      md5: 47a0adeb2ee1cb67202048684064d30f
+      size: 7293
+    - path: scripts/evaluate.py
+      hash: md5
+      md5: 51f036b805f23dd3ebfd5d819bc9d457
+      size: 2489
+    outs:
+    - path: data/eval.png
+      hash: md5
+      md5: 8c11f987449f8718b6f6011078b6c259
+      size: 49498
+    - path: data/metrics.json
+      hash: md5
+      md5: 53fba29cb236fedd3c6446ea94fea3cc
+      size: 215
8 changes: 6 additions & 2 deletions dvc.yaml
@@ -51,6 +51,10 @@ stages:
     outs:
       - ${files.eval-set}
   evaluate:
-    cmd: echo "Evaluate responses"
+    cmd: python scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot}
     deps:
-      - ${files.eval-set}
+      - ${files.eval-set}
+      - scripts/evaluate.py
+    outs:
+      - ${files.metrics}
+      - ${files.eval-plot}
143 changes: 111 additions & 32 deletions notebooks/ragas_eval.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions params.yaml
@@ -11,6 +11,8 @@ files:
   doc-store: "data/chroma-data"
   test-set: "data/eidc_rag_test_sample.csv"
   eval-set: "data/evaluation_data.csv"
+  metrics: "data/metrics.json"
+  eval-plot: "data/eval.png"
 sample-size: 10 # sample size of 0 will process all data
 rag:
   model: llama3.1
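The two new entries are what the ${files.metrics} and ${files.eval-plot} placeholders in the evaluate stage of dvc.yaml resolve to. As a rough illustration only (not part of the commit; assumes PyYAML is installed), the resolved command can be reconstructed like this:

# Illustration only: resolve the ${files.*} placeholders used by the
# evaluate stage against the files section of params.yaml.
import yaml

with open("params.yaml") as f:
    files = yaml.safe_load(f)["files"]

cmd = (
    f"python scripts/evaluate.py {files['eval-set']} "
    f"-m {files['metrics']} -img {files['eval-plot']}"
)
print(cmd)
# expected: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png

This matches the cmd recorded for the evaluate stage in dvc.lock above.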
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -18,7 +18,8 @@ dependencies = [
     "ollama-haystack == 0.0.7",
     "chroma-haystack",
     "ragas == 0.1.10",
-    "nltk"
+    "nltk",
+    "nbformat>=4.2.0",
 ]

 [project.optional-dependencies]
78 changes: 77 additions & 1 deletion scripts/evaluate.py
@@ -1 +1,77 @@
-# Run RAGAS to evaluate
from argparse import ArgumentParser
import json

import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.run_config import RunConfig
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
import plotly.graph_objects as go
import plotly.io as pio
import nest_asyncio
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    context_entity_recall,
    answer_similarity,
    answer_correctness,
)


def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
    nest_asyncio.apply()  # apply the event loop async fix
    # "contexts" is stored as a stringified list in the CSV, so parse it back into a list
    df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval})
    eval_dataset = Dataset.from_pandas(df)
    llm = ChatOllama(model="mistral-nemo", num_ctx=16384)
    embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)
    result = evaluate(
        eval_dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            context_entity_recall,
            answer_similarity,
            answer_correctness,
        ],
        llm=llm,
        embeddings=embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=1),
    )
    result_df = result.to_pandas()
    pio.templates.default = "gridon"
    fig = go.Figure()

    # save the aggregate scores
    with open(metric_output, "w") as f:
        json.dump(result, f)

    # plot the per-question score distribution of each metric as a violin plot
    metrics = [
        metric
        for metric in result_df.columns.to_list()
        if metric not in ["question", "ground_truth", "answer", "contexts"]
    ]
    for metric in metrics:
        fig.add_trace(
            go.Violin(
                y=result_df[metric],
                name=metric,
                points="all",
                box_visible=True,
                meanline_visible=True,
            )
        )
    fig.update_yaxes(range=[-0.02, 1.02])
    with open(image_output, "wb") as f:
        f.write(fig.to_image(format="png"))


if __name__ == "__main__":
    parser = ArgumentParser("evaluate.py")
    parser.add_argument("eval_dataset", help="File containing the evaluation data.")
    parser.add_argument(
        "-m",
        "--metrics_output",
        help="File to save evaluation metrics to.",
        default="data/metrics.json",
    )
    parser.add_argument(
        "-img",
        "--image_output",
        help="File to save image plot to.",
        default="data/evaluation.png",
    )
    args = parser.parse_args()
    main(args.eval_dataset, args.metrics_output, args.image_output)
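For reference, a minimal sketch (not part of this commit) of reading back the aggregate scores the script writes with json.dump(result, f); it assumes data/metrics.json ends up as a flat mapping of metric name to score:

# Assumption: data/metrics.json is a flat {metric_name: aggregate_score} mapping,
# which is what json.dump on the ragas result suggests.
import json

with open("data/metrics.json") as f:
    metrics = json.load(f)

for name, score in sorted(metrics.items()):
    print(f"{name}: {score:.3f}")

Under that assumption, the figures in the comment below are these aggregate scores for this run.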

1 comment on commit 419e83f

@github-actions

answer_relevancy: 0.5007659295747804
context_recall: 0.514847925063943
answer_correctness: 0.4804706386685034
context_precision: 0.5123200844263376
