
Commit

Merge pull request #3 from Arize-ai/juliagomes/benchmark-additional-llm-judge-guards

Juliagomes/benchmark additional llm judge guards
jgomes168 authored Jul 19, 2024
2 parents 5ea109d + 7bbe0c9 commit 7db409d
Showing 4 changed files with 273 additions and 7 deletions.
114 changes: 114 additions & 0 deletions benchmark_context_relevancy_prompt.py
@@ -0,0 +1,114 @@
"""Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset.
* https://huggingface.co/datasets/microsoft/wiki_qa
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       0.59      0.90      0.71        41
        True       0.89      0.56      0.69        59
    accuracy                           0.70       100
   macro avg       0.74      0.73      0.70       100
weighted avg       0.77      0.70      0.70       100
INFO:root:Latency
INFO:root:count 100.000000
mean 2.138843
std 0.908402
min 0.938294
25% 1.466153
50% 1.873620
75% 2.542088
max 5.952361
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report

from guardrails import Guard
from main import ContextRelevancyPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset
from sklearn.utils import shuffle

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
    """Evaluate guard on benchmark dataset.
    :param test_dataset: Dataframe of test examples.
    :param guard: Guard we want to evaluate.
    :return: Tuple where the first list contains latency measurements, and the second list contains booleans indicating whether the guard passed.
    """
    latency_measurements = []
    guard_passed = []
    for _, rag_example in test_dataset.iterrows():
        start_time = time.perf_counter()
        response = guard(
            llm_api=openai.chat.completions.create,
            prompt=rag_example["query_text"],
            model=MODEL,
            max_tokens=1024,
            temperature=0.5,
            metadata={
                "user_message": rag_example["query_text"],
                "context": rag_example["document_text"],
            }
        )
        latency_measurements.append(time.perf_counter() - start_time)
        logging.info(response)
        guard_passed.append(response.validation_passed)
    return latency_measurements, guard_passed


if __name__ == "__main__":
    if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
        openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
    openai.api_key = openai_api_key
    os.environ["OPENAI_API_KEY"] = openai_api_key

    # Columns: Index(['query_id', 'query_text', 'document_title', 'document_text', 'document_text_with_emphasis', 'relevant'])
    test_dataset = download_benchmark_dataset(
        task="binary-relevance-classification",
        dataset_name="wiki_qa-train")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
                llm_evaluator_fail_response="unrelated",
                llm_evaluator_pass_response="relevant",
                llm_callable=MODEL,
                on_fail="noop",
                on="prompt")
        ],
    )

    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
    test_dataset["guard_passed"] = guard_passed
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags irrelevant context)
    logging.info(classification_report(~test_dataset["relevant"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
    logging.info(test_dataset["guard_latency"].describe())
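
For orientation, the guard wired up above can also be exercised on a single example outside the benchmark loop. The following is a minimal sketch, not part of this commit, that reuses only pieces visible in the script (Guard.from_string, LlmRagEvaluator, ContextRelevancyPrompt); it assumes main.py is importable and OPENAI_API_KEY is set, and the query/context strings are made-up illustration data.

# Minimal single-example sketch (not part of this commit); assumes the same
# module layout as the script above and an OPENAI_API_KEY in the environment.
# The query/context strings below are hypothetical illustration data.
import openai

from guardrails import Guard
from main import ContextRelevancyPrompt, LlmRagEvaluator

guard = Guard.from_string(
    validators=[
        LlmRagEvaluator(
            eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
            llm_evaluator_fail_response="unrelated",
            llm_evaluator_pass_response="relevant",
            llm_callable="gpt-4o-mini",
            on_fail="noop",
            on="prompt")
    ],
)

response = guard(
    llm_api=openai.chat.completions.create,
    prompt="How tall is the Eiffel Tower?",
    model="gpt-4o-mini",
    max_tokens=256,
    temperature=0.0,
    metadata={
        "user_message": "How tall is the Eiffel Tower?",
        "context": "The Eiffel Tower is 330 metres tall and stands in Paris, France.",
    },
)
print(response.validation_passed)  # True when the judge LLM deems the retrieved context relevant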
33 changes: 31 additions & 2 deletions benchmark_guard.py → benchmark_hallucination_prompt.py
@@ -1,25 +1,52 @@
"""Script to evaluate Guard on benchmark dataset. Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark:
"""Script to evaluate Hallucination Guard on benchmark dataset.
Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark:
* https://arxiv.org/abs/2305.11747
* https://github.com/RUCAIBox/HaluEval
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       0.83      0.93      0.88        54
        True       0.90      0.78      0.84        46
    accuracy                           0.86       100
   macro avg       0.87      0.85      0.86       100
weighted avg       0.86      0.86      0.86       100
INFO:root:Latency
INFO:root:count 100.000000
mean 1.533940
std 0.552186
min 1.069116
25% 1.256626
50% 1.393182
75% 1.617315
max 4.579247
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

from guardrails import Guard
from main import HallucinationPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


-MODEL = "gpt-4-turbo"
+MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


@@ -63,6 +90,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
    test_dataset = download_benchmark_dataset(
        task="binary-hallucination-classification",
        dataset_name="halueval_qa_data")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
@@ -82,6 +110,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
    logging.info(classification_report(test_dataset["is_hallucination"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
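
The guard construction for this script sits in the collapsed hunk above and is not shown in the diff. As a hedged sketch only, a hallucination guard can be assembled from pieces that are visible elsewhere in this commit: the HallucinationPrompt import above and the "hallucinated"/"factual" responses that were previously the defaults in main.py. The actual committed code may differ.

# Hedged sketch only -- the committed guard construction is in the collapsed hunk
# and may differ. Names come from the import block above and from the former
# defaults in main.py ("hallucinated" / "factual"); the prompt_name keyword
# follows the pattern of the other benchmark scripts.
from guardrails import Guard
from main import HallucinationPrompt, LlmRagEvaluator

guard = Guard.from_string(
    validators=[
        LlmRagEvaluator(
            eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
            llm_evaluator_fail_response="hallucinated",
            llm_evaluator_pass_response="factual",
            llm_callable="gpt-4o-mini",
            on_fail="noop",
            on="prompt")
    ],
)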
117 changes: 117 additions & 0 deletions benchmark_qa_correctness_prompt.py
@@ -0,0 +1,117 @@
"""Script to evaluate QA Correctness Guard on benchmark dataset.
SQuAD 2.0, the second version of the large-scale Stanford Question Answering Dataset, allows
researchers to design AI models for reading comprehension tasks under challenging constraints.
https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       1.00      0.94      0.97        50
        True       0.94      1.00      0.97        50
    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100
INFO:root:Latency
INFO:root:count 100.000000
mean 1.845307
std 0.867450
min 0.982674
25% 1.354958
50% 1.606060
75% 1.928065
max 6.342991
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report

from guardrails import Guard
from main import QACorrectnessPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset
from sklearn.utils import shuffle

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
    """Evaluate guard on benchmark dataset.
    :param test_dataset: Dataframe of test examples.
    :param guard: Guard we want to evaluate.
    :return: Tuple where the first list contains latency measurements, and the second list contains booleans indicating whether the guard passed.
    """
    latency_measurements = []
    guard_passed = []
    for _, rag_example in test_dataset.iterrows():
        start_time = time.perf_counter()
        response = guard(
            llm_api=openai.chat.completions.create,
            prompt=rag_example["question"],
            model=MODEL,
            max_tokens=1024,
            temperature=0.5,
            metadata={
                "user_message": rag_example["question"],
                "context": rag_example["context"],
                "llm_response": rag_example["sampled_answer"],
            }
        )
        latency_measurements.append(time.perf_counter() - start_time)
        logging.info(response)
        guard_passed.append(response.validation_passed)
    return latency_measurements, guard_passed


if __name__ == "__main__":
    if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
        openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
    openai.api_key = openai_api_key
    os.environ["OPENAI_API_KEY"] = openai_api_key

    # Columns: Index(['id', 'title', 'context', 'question', 'answers', 'correct_answer', 'wrong_answer', 'sampled_answer', 'answer_true'])
    test_dataset = download_benchmark_dataset(
        task="qa-classification",
        dataset_name="qa_generated_dataset")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
                llm_evaluator_fail_response="incorrect",
                llm_evaluator_pass_response="correct",
                llm_callable=MODEL,
                on_fail="noop",
                on="prompt")
        ],
    )

    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
    test_dataset["guard_passed"] = guard_passed
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
    logging.info(classification_report(~test_dataset["answer_true"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
    logging.info(test_dataset["guard_latency"].describe())
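
One detail worth calling out: classification_report is fed negated columns so that the positive class is "the guard flagged a bad answer", matching the comment in the script. The small frame below is illustrative only (not from the commit) and makes the alignment explicit.

# Illustrative only (not from the commit): why both columns are negated before
# scoring. Ground truth `answer_true` is True when the sampled answer is correct,
# and `guard_passed` is True when the guard accepts it, so inverting both makes
# "guard flags an incorrect answer" the positive class.
import pandas as pd
from sklearn.metrics import classification_report

toy = pd.DataFrame({
    "answer_true":  [True, True, False, False],   # hypothetical ground truth
    "guard_passed": [True, False, False, True],   # hypothetical guard verdicts
})
# ~answer_true  -> True where the answer is actually wrong
# ~guard_passed -> True where the guard flagged the answer
print(classification_report(~toy["answer_true"], ~toy["guard_passed"]))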
16 changes: 11 additions & 5 deletions main.py
@@ -116,14 +116,20 @@ class LlmRagEvaluator(Validator):

    def __init__(
        self,
-        eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase] = HallucinationPrompt("hallucination_judge_llm"),
-        llm_evaluator_fail_response: str = "hallucinated",
-        llm_evaluator_pass_response: str = "factual",
-        llm_callable: str = "gpt-3.5-turbo",  # str for litellm model name
+        eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],
+        llm_evaluator_fail_response: str,
+        llm_evaluator_pass_response: str,
+        llm_callable: str = "gpt-4o-mini",
        on_fail: Optional[Callable] = "noop",
        **kwargs,
    ):
-        super().__init__(on_fail, llm_callable=llm_callable, **kwargs)
+        super().__init__(
+            on_fail,
+            eval_llm_prompt_generator=eval_llm_prompt_generator,
+            llm_evaluator_fail_response=llm_evaluator_fail_response,
+            llm_evaluator_pass_response=llm_evaluator_pass_response,
+            llm_callable=llm_callable,
+            **kwargs)
        self._llm_evaluator_prompt_generator = eval_llm_prompt_generator
        self._llm_callable = llm_callable
        self._fail_response = llm_evaluator_fail_response
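
The net effect of this change is that LlmRagEvaluator no longer defaults to the hallucination judge: the prompt generator and the pass/fail strings must now be supplied by the caller, and all of them are also forwarded to the parent Validator. A call site therefore looks like the ones in the benchmark scripts above; the sketch below simply restates that pattern for the QA-correctness judge and is not additional code from this commit.

# Example call site under the new signature (mirrors the benchmark scripts above).
from main import LlmRagEvaluator, QACorrectnessPrompt

validator = LlmRagEvaluator(
    eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
    llm_evaluator_fail_response="incorrect",
    llm_evaluator_pass_response="correct",
    llm_callable="gpt-4o-mini",   # litellm-style model name, the new default
    on_fail="noop",
    on="prompt",                  # validate the prompt, as in the scripts above
)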
