From 5cee63faaa3d3c45e03fd2256cf829d4fd7f87bb Mon Sep 17 00:00:00 2001
From: jgomes168 <168474583+jgomes168@users.noreply.github.com>
Date: Tue, 23 Jul 2024 17:47:25 -0700
Subject: [PATCH] Juliagomes/improve validator benchmarking (#4)

* Remove default llm callable

* Median latency

* Remove info

* Switch to logging.debug

* gpt-4o-mini Context Relevancy Guard benchmark results

* Add gpt-3.5-turbo

* Change default model

* Add 4o-mini results

* 3.5 turbo hallucination

* gpt-3.5-turbo for QA Correctness

* gpt-4o-mini benchmark for QA Correctness.

* Remove pycache

* Fix random seed

* Clean up scripts

* Remove logging

* indentation

* Add results

* mini csv

* Hallucination guard

* More results

* Remove pycache

* benchmark gpt-4-turbo on context relevancy

* Remove pycache and shorten csv

* Last benchmarking results

* Remove CSV files

---------

Co-authored-by: Julia Gomes
---
 benchmark_context_relevancy_prompt.py | 137 +++++++++++++++----------
 benchmark_hallucination_prompt.py     | 140 +++++++++++++++----------
 benchmark_qa_correctness_prompt.py    | 142 ++++++++++++++++----------
 main.py                               |   6 +-
 4 files changed, 258 insertions(+), 167 deletions(-)

diff --git a/benchmark_context_relevancy_prompt.py b/benchmark_context_relevancy_prompt.py
index 8447426..cce71ff 100644
--- a/benchmark_context_relevancy_prompt.py
+++ b/benchmark_context_relevancy_prompt.py
@@ -1,33 +1,58 @@
 """Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset.
 * https://huggingface.co/datasets/microsoft/wiki_qa
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       0.59      0.90      0.71        41
-        True       0.89      0.56      0.69        59
-
-    accuracy                           0.70       100
-   macro avg       0.74      0.73      0.70       100
-weighted avg       0.77      0.70      0.70       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       2.138843
-std        0.908402
-min        0.938294
-25%        1.466153
-50%        1.873620
-75%        2.542088
-max        5.952361
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+Guard Results
+              precision    recall  f1-score   support
+
+    relevant       0.70      0.86      0.77        93
+   unrelated       0.85      0.68      0.76       107
+
+    accuracy                           0.77       200
+   macro avg       0.78      0.77      0.76       200
+weighted avg       0.78      0.77      0.76       200
+
+Latency
+count    200.000000
+mean       2.812122
+std        1.753805
+min        1.067620
+25%        1.708051
+50%        2.248962
+75%        3.321251
+max       14.102804
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+2.2489616039965767
+
+Model: gpt-4-turbo
+Guard Results
+              precision    recall  f1-score   support
+
+    relevant       0.64      0.90      0.75        93
+   unrelated       0.87      0.56      0.68       107
+
+    accuracy                           0.72       200
+   macro avg       0.76      0.73      0.72       200
+weighted avg       0.76      0.72      0.71       200
+
+Latency
+count    200.000000
+mean       8.561413
+std        6.425799
+min        1.624563
+25%        3.957226
+50%        5.979291
+75%       11.579224
+max       34.342637
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+5.979290812509134
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -38,17 +63,14 @@
 from phoenix.evals import download_benchmark_dataset
 from sklearn.utils import shuffle
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 200
+SAVE_RESULTS_PATH = "context_relevancy_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
     :param test_dataset: Dataframe of test examples.
@@ -63,7 +85,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["query_text"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -72,7 +94,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -87,28 +108,36 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = download_benchmark_dataset(
         task="binary-relevance-classification", dataset_name="wiki_qa-train")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
-                llm_evaluator_fail_response="unrelated",
-                llm_evaluator_pass_response="relevant",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer)
-    logging.info(classification_report(~test_dataset["relevant"], ~test_dataset["guard_passed"]))
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
+                    llm_evaluator_fail_response="unrelated",
+                    llm_evaluator_pass_response="relevant",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("Guard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer)
+        print(classification_report(
+            test_dataset["relevant"].replace(True, "relevant").replace(False, "unrelated"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "relevant").replace(False, "unrelated")))
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
 
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
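
Note (reviewer sketch, not part of the patch): the new reporting code turns the boolean ground-truth and guard-verdict columns into string labels with chained .replace calls before handing them to classification_report. A minimal equivalent using Series.map, shown only to illustrate what the mapping does; the sample values are made up:

    import pandas as pd
    from sklearn.metrics import classification_report

    relevant = pd.Series([True, True, False])       # ground-truth relevance labels
    guard_passed = pd.Series([True, False, True])   # guard verdict per example

    # Same mapping as the chained .replace(...) calls above.
    labels = {True: "relevant", False: "unrelated"}
    print(classification_report(relevant.map(labels), guard_passed.map(labels)))

Either form produces the per-label precision/recall/f1 table quoted in the docstring above.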
diff --git a/benchmark_hallucination_prompt.py b/benchmark_hallucination_prompt.py
index ba08bfb..bf68f7b 100644
--- a/benchmark_hallucination_prompt.py
+++ b/benchmark_hallucination_prompt.py
@@ -3,33 +3,58 @@
 * https://arxiv.org/abs/2305.11747
 * https://github.com/RUCAIBox/HaluEval
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       0.83      0.93      0.88        54
-        True       0.90      0.78      0.84        46
-
-    accuracy                           0.86       100
-   macro avg       0.87      0.85      0.86       100
-weighted avg       0.86      0.86      0.86       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       1.533940
-std        0.552186
-min        1.069116
-25%        1.256626
-50%        1.393182
-75%        1.617315
-max        4.579247
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+Guard Results
+              precision    recall  f1-score   support
+
+     factual       0.79      0.97      0.87       129
+hallucinated       0.96      0.73      0.83       121
+
+    accuracy                           0.85       250
+   macro avg       0.87      0.85      0.85       250
+weighted avg       0.87      0.85      0.85       250
+
+Latency
+count    250.000000
+mean       1.865513
+std        0.603700
+min        1.139974
+25%        1.531160
+50%        1.758210
+75%        2.026153
+max        6.403010
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+1.7582097915001214
+
+Model: gpt-4-turbo
+Guard Results
+              precision    recall  f1-score   support
+
+     factual       0.83      0.88      0.85       129
+hallucinated       0.87      0.80      0.83       121
+
+    accuracy                           0.84       250
+   macro avg       0.85      0.84      0.84       250
+weighted avg       0.85      0.84      0.84       250
+
+Latency
+count    250.000000
+mean       4.295613
+std        2.393394
+min        1.460899
+25%        2.868255
+50%        3.724649
+75%        4.939440
+max       23.465773
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+3.724648874514969
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -40,17 +65,14 @@
 from main import HallucinationPrompt, LlmRagEvaluator
 from phoenix.evals import download_benchmark_dataset
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 250
+SAVE_RESULTS_PATH = "hallucination_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
     :param test_dataset: Dataframe of test examples.
@@ -65,7 +87,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["query"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -75,7 +97,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -90,28 +111,37 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = download_benchmark_dataset(
         task="binary-hallucination-classification", dataset_name="halueval_qa_data")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=119)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
-                llm_evaluator_fail_response="hallucinated",
-                llm_evaluator_pass_response="factual",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
-    logging.info(classification_report(test_dataset["is_hallucination"], ~test_dataset["guard_passed"]))
-
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
+                    llm_evaluator_fail_response="hallucinated",
+                    llm_evaluator_pass_response="factual",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("Guard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
+        print(classification_report(
+            test_dataset["is_hallucination"].replace(True, "hallucinated").replace(False, "factual"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "factual").replace(False, "hallucinated")))
+
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
+
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
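
Note (reviewer sketch, not part of the patch): in this script the two columns carry opposite boolean polarity, so the label mappings are deliberately inverted: is_hallucination=True becomes "hallucinated" while guard_passed=True becomes "factual". A small illustration of how the two series line up, with made-up values:

    import pandas as pd

    is_hallucination = pd.Series([False, True, False])  # ground truth
    guard_passed = pd.Series([True, False, False])      # guard verdict

    y_true = is_hallucination.map({True: "hallucinated", False: "factual"})
    y_pred = guard_passed.map({True: "factual", False: "hallucinated"})
    # Rows 0 and 1 agree; row 2 is a guard false alarm ("factual" vs "hallucinated").
    print(pd.DataFrame({"y_true": y_true, "y_pred": y_pred}))

One small inconsistency worth noting: this script passes the literal random_state=119 to shuffle, while the other two benchmarks use the RANDOM_STATE constant defined above.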
diff --git a/benchmark_qa_correctness_prompt.py b/benchmark_qa_correctness_prompt.py
index 62c89e2..f821527 100644
--- a/benchmark_qa_correctness_prompt.py
+++ b/benchmark_qa_correctness_prompt.py
@@ -3,33 +3,60 @@
 researchers to design AI models for reading comprehension tasks under challenging constraints.
 https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       1.00      0.94      0.97        50
-        True       0.94      1.00      0.97        50
-
-    accuracy                           0.97       100
-   macro avg       0.97      0.97      0.97       100
-weighted avg       0.97      0.97      0.97       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       1.845307
-std        0.867450
-min        0.982674
-25%        1.354958
-50%        1.606060
-75%        1.928065
-max        6.342991
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+
+Guard Results
+              precision    recall  f1-score   support
+
+     correct       1.00      0.96      0.98       133
+   incorrect       0.96      1.00      0.98       117
+
+    accuracy                           0.98       250
+   macro avg       0.98      0.98      0.98       250
+weighted avg       0.98      0.98      0.98       250
+
+Latency
+count    250.000000
+mean       2.610912
+std        1.415877
+min        1.148114
+25%        1.678278
+50%        2.263149
+75%        2.916726
+max       10.625763
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+2.263148645986803
+
+Model: gpt-4-turbo
+
+Guard Results
+              precision    recall  f1-score   support
+
+     correct       1.00      0.92      0.96       133
+   incorrect       0.91      1.00      0.96       117
+
+    accuracy                           0.96       250
+   macro avg       0.96      0.96      0.96       250
+weighted avg       0.96      0.96      0.96       250
+
+Latency
+count    250.000000
+mean       7.390556
+std        5.804535
+min        1.671949
+25%        3.544383
+50%        5.239343
+75%        8.484112
+max       30.651372
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+5.239343083492713
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -40,17 +67,14 @@
 from phoenix.evals import download_benchmark_dataset
 from sklearn.utils import shuffle
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 250
+SAVE_RESULTS_PATH = "qa_correctness_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
    :param test_dataset: Dataframe of test examples.
@@ -65,7 +89,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["question"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -75,7 +99,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -90,28 +113,37 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = df = download_benchmark_dataset(
         task="qa-classification", dataset_name="qa_generated_dataset")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
-                llm_evaluator_fail_response="incorrect",
-                llm_evaluator_pass_response="correct",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
-    logging.info(classification_report(~test_dataset["answer_true"], ~test_dataset["guard_passed"]))
-
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
+                    llm_evaluator_fail_response="incorrect",
+                    llm_evaluator_pass_response="correct",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("\nGuard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
+        print(classification_report(
+            test_dataset["answer_true"].replace(True, "correct").replace(False, "incorrect"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "correct").replace(False, "incorrect")))
+
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
+
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
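
Note (reviewer sketch, not part of the patch): each script now writes one guard_passed_<model> and one guard_latency_<model> column per model to the CSV named by SAVE_RESULTS_PATH, so the two judge models can be compared offline. A rough sketch of such a comparison, assuming the QA results file produced by this script:

    import pandas as pd

    df = pd.read_csv("qa_correctness_guard_results.csv")  # path set by SAVE_RESULTS_PATH
    for model in ["gpt-4o-mini", "gpt-4-turbo"]:
        # The guard passes exactly when it judges the answer correct, so agreement
        # with answer_true approximates the guard's accuracy on this benchmark.
        agreement = (df[f"guard_passed_{model}"] == df["answer_true"]).mean()
        median_latency = df[f"guard_latency_{model}"].median()
        print(f"{model}: agreement={agreement:.2f}, median latency={median_latency:.2f}s")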
diff --git a/main.py b/main.py
index 78785f2..df23d4a 100644
--- a/main.py
+++ b/main.py
@@ -119,7 +119,7 @@ def __init__(
         eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],
         llm_evaluator_fail_response: str,
         llm_evaluator_pass_response: str,
-        llm_callable: str = "gpt-4o-mini",
+        llm_callable: str,
         on_fail: Optional[Callable] = "noop",
         **kwargs,
     ):
@@ -203,11 +203,11 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:
 
         # 2. Setup the prompt
         prompt = self._llm_evaluator_prompt_generator.generate_prompt(user_input_message=user_input_message, reference_text=reference_text, llm_response=value)
-        logging.info(f"evaluator prompt: {prompt}")
+        logging.debug(f"evaluator prompt: {prompt}")
 
         # 3. Get the LLM response
         llm_response = self.get_llm_response(prompt)
-        logging.info(f"llm evaluator response: {llm_response}")
+        logging.debug(f"llm evaluator response: {llm_response}")
 
         # 4. Check the LLM response and return the result
         if llm_response == self._fail_response:
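
Note (reviewer sketch, not part of the patch): with the default removed, llm_callable is now a required argument of LlmRagEvaluator, so every caller has to name the judge model explicitly, exactly as the updated benchmark scripts do. A minimal construction mirroring those scripts; the ContextRelevancyPrompt import path is an assumption, by analogy with the hallucination script's "from main import HallucinationPrompt, LlmRagEvaluator":

    from guardrails import Guard
    from main import ContextRelevancyPrompt, LlmRagEvaluator  # assumed import path

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
                llm_evaluator_fail_response="unrelated",
                llm_evaluator_pass_response="relevant",
                llm_callable="gpt-4o-mini",  # must now be passed explicitly; no default after this patch
                on_fail="noop",
                on="prompt")
        ],
    )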