
Commit

Merge pull request #3 from Arize-ai/juliagomes/benchmark-additional-llm-judge-guards

Juliagomes/benchmark additional llm judge guards
jgomes168 authored Jul 19, 2024
2 parents 5ea109d + 7bbe0c9 commit 7db409d
Showing 4 changed files with 273 additions and 7 deletions.
114 changes: 114 additions & 0 deletions benchmark_context_relevancy_prompt.py
@@ -0,0 +1,114 @@
"""Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset.
* https://huggingface.co/datasets/microsoft/wiki_qa
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       0.59      0.90      0.71        41
        True       0.89      0.56      0.69        59
    accuracy                           0.70       100
   macro avg       0.74      0.73      0.70       100
weighted avg       0.77      0.70      0.70       100
INFO:root:Latency
INFO:root:count 100.000000
mean 2.138843
std 0.908402
min 0.938294
25% 1.466153
50% 1.873620
75% 2.542088
max 5.952361
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report

from guardrails import Guard
from main import ContextRelevancyPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset
from sklearn.utils import shuffle

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
    """Evaluate guard on benchmark dataset.
    :param test_dataset: Dataframe of test examples.
    :param guard: Guard we want to evaluate.
    :return: Tuple where the first list contains latency measurements, and the second list contains booleans indicating whether the guard passed.
    """
    latency_measurements = []
    guard_passed = []
    for _, rag_example in test_dataset.iterrows():
        start_time = time.perf_counter()
        response = guard(
            llm_api=openai.chat.completions.create,
            prompt=rag_example["query_text"],
            model=MODEL,
            max_tokens=1024,
            temperature=0.5,
            metadata={
                "user_message": rag_example["query_text"],
                "context": rag_example["document_text"],
            }
        )
        latency_measurements.append(time.perf_counter() - start_time)
        logging.info(response)
        guard_passed.append(response.validation_passed)
    return latency_measurements, guard_passed


if __name__ == "__main__":
    if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
        openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
    openai.api_key = openai_api_key
    os.environ["OPENAI_API_KEY"] = openai_api_key

    # Columns: Index(['query_id', 'query_text', 'document_title', 'document_text', 'document_text_with_emphasis', 'relevant'])
    test_dataset = download_benchmark_dataset(
        task="binary-relevance-classification",
        dataset_name="wiki_qa-train")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
                llm_evaluator_fail_response="unrelated",
                llm_evaluator_pass_response="relevant",
                llm_callable=MODEL,
                on_fail="noop",
                on="prompt")
        ],
    )

    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
    test_dataset["guard_passed"] = guard_passed
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags irrelevant context)
    logging.info(classification_report(~test_dataset["relevant"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
    logging.info(test_dataset["guard_latency"].describe())
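
For orientation, the guard wired up above can also be exercised on a single example outside the benchmark loop. The following is a minimal sketch, not part of this commit, that reuses only pieces visible in the script (Guard.from_string, LlmRagEvaluator, ContextRelevancyPrompt); it assumes main.py is importable and OPENAI_API_KEY is set, and the query/context strings are made-up illustration data.

# Minimal single-example sketch (not part of this commit); assumes the same
# module layout as the script above and an OPENAI_API_KEY in the environment.
# The query/context strings below are hypothetical illustration data.
import openai

from guardrails import Guard
from main import ContextRelevancyPrompt, LlmRagEvaluator

guard = Guard.from_string(
    validators=[
        LlmRagEvaluator(
            eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
            llm_evaluator_fail_response="unrelated",
            llm_evaluator_pass_response="relevant",
            llm_callable="gpt-4o-mini",
            on_fail="noop",
            on="prompt")
    ],
)

response = guard(
    llm_api=openai.chat.completions.create,
    prompt="How tall is the Eiffel Tower?",
    model="gpt-4o-mini",
    max_tokens=256,
    temperature=0.0,
    metadata={
        "user_message": "How tall is the Eiffel Tower?",
        "context": "The Eiffel Tower is 330 metres tall and stands in Paris, France.",
    },
)
print(response.validation_passed)  # True when the judge LLM deems the retrieved context relevant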
33 changes: 31 additions & 2 deletions benchmark_guard.py → benchmark_hallucination_prompt.py
@@ -1,25 +1,52 @@
"""Script to evaluate Guard on benchmark dataset. Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark:
"""Script to evaluate Hallucination Guard on benchmark dataset.
Currently supported datasets include "halueval_qa_data" from the HaluEval benchmark:
* https://arxiv.org/abs/2305.11747
* https://github.com/RUCAIBox/HaluEval
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       0.83      0.93      0.88        54
        True       0.90      0.78      0.84        46
    accuracy                           0.86       100
   macro avg       0.87      0.85      0.86       100
weighted avg       0.86      0.86      0.86       100
INFO:root:Latency
INFO:root:count 100.000000
mean 1.533940
std 0.552186
min 1.069116
25% 1.256626
50% 1.393182
75% 1.617315
max 4.579247
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

from guardrails import Guard
from main import HallucinationPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


-MODEL = "gpt-4-turbo"
+MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


@@ -63,6 +90,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
    test_dataset = download_benchmark_dataset(
        task="binary-hallucination-classification",
        dataset_name="halueval_qa_data")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
@@ -82,6 +110,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
    logging.info(classification_report(test_dataset["is_hallucination"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
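
The guard construction for this script sits in the collapsed hunk above and is not shown in the diff. As a hedged sketch only, a hallucination guard can be assembled from pieces that are visible elsewhere in this commit: the HallucinationPrompt import above and the "hallucinated"/"factual" responses that were previously the defaults in main.py. The actual committed code may differ.

# Hedged sketch only -- the committed guard construction is in the collapsed hunk
# and may differ. Names come from the import block above and from the former
# defaults in main.py ("hallucinated" / "factual"); the prompt_name keyword
# follows the pattern of the other benchmark scripts.
from guardrails import Guard
from main import HallucinationPrompt, LlmRagEvaluator

guard = Guard.from_string(
    validators=[
        LlmRagEvaluator(
            eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
            llm_evaluator_fail_response="hallucinated",
            llm_evaluator_pass_response="factual",
            llm_callable="gpt-4o-mini",
            on_fail="noop",
            on="prompt")
    ],
)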
117 changes: 117 additions & 0 deletions benchmark_qa_correctness_prompt.py
@@ -0,0 +1,117 @@
"""Script to evaluate QA Correctness Guard on benchmark dataset.
SQuAD 2.0, the second version of the large-scale Stanford Question Answering Dataset, allows
researchers to design AI models for reading comprehension tasks under challenging constraints.
https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf
INFO:root:Guard Results
INFO:root:              precision    recall  f1-score   support
       False       1.00      0.94      0.97        50
        True       0.94      1.00      0.97        50
    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100
INFO:root:Latency
INFO:root:count 100.000000
mean 1.845307
std 0.867450
min 0.982674
25% 1.354958
50% 1.606060
75% 1.928065
max 6.342991
Name: guard_latency, dtype: float64
"""
import os
import time
from getpass import getpass
from typing import List, Tuple
import logging
import random

import openai
import pandas as pd
from sklearn.metrics import classification_report

from guardrails import Guard
from main import QACorrectnessPrompt, LlmRagEvaluator
from phoenix.evals import download_benchmark_dataset
from sklearn.utils import shuffle

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.INFO)

random.seed(119)


MODEL = "gpt-4o-mini"
N_EVAL_SAMPLE_SIZE = 100


def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
    """Evaluate guard on benchmark dataset.
    :param test_dataset: Dataframe of test examples.
    :param guard: Guard we want to evaluate.
    :return: Tuple where the first list contains latency measurements, and the second list contains booleans indicating whether the guard passed.
    """
    latency_measurements = []
    guard_passed = []
    for _, rag_example in test_dataset.iterrows():
        start_time = time.perf_counter()
        response = guard(
            llm_api=openai.chat.completions.create,
            prompt=rag_example["question"],
            model=MODEL,
            max_tokens=1024,
            temperature=0.5,
            metadata={
                "user_message": rag_example["question"],
                "context": rag_example["context"],
                "llm_response": rag_example["sampled_answer"],
            }
        )
        latency_measurements.append(time.perf_counter() - start_time)
        logging.info(response)
        guard_passed.append(response.validation_passed)
    return latency_measurements, guard_passed


if __name__ == "__main__":
    if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
        openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
    openai.api_key = openai_api_key
    os.environ["OPENAI_API_KEY"] = openai_api_key

    # Columns: Index(['id', 'title', 'context', 'question', 'answers', 'correct_answer', 'wrong_answer', 'sampled_answer', 'answer_true'])
    test_dataset = download_benchmark_dataset(
        task="qa-classification",
        dataset_name="qa_generated_dataset")
    test_dataset = shuffle(test_dataset)
    test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
                llm_evaluator_fail_response="incorrect",
                llm_evaluator_pass_response="correct",
                llm_callable=MODEL,
                on_fail="noop",
                on="prompt")
        ],
    )

    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
    test_dataset["guard_passed"] = guard_passed
    test_dataset["guard_latency"] = latency_measurements

    logging.info("Guard Results")
    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
    logging.info(classification_report(~test_dataset["answer_true"], ~test_dataset["guard_passed"]))

    logging.info("Latency")
    logging.info(test_dataset["guard_latency"].describe())
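
One detail worth calling out: classification_report is fed negated columns so that the positive class is "the guard flagged a bad answer", matching the comment in the script. The small frame below is illustrative only (not from the commit) and makes the alignment explicit.

# Illustrative only (not from the commit): why both columns are negated before
# scoring. Ground truth `answer_true` is True when the sampled answer is correct,
# and `guard_passed` is True when the guard accepts it, so inverting both makes
# "guard flags an incorrect answer" the positive class.
import pandas as pd
from sklearn.metrics import classification_report

toy = pd.DataFrame({
    "answer_true":  [True, True, False, False],   # hypothetical ground truth
    "guard_passed": [True, False, False, True],   # hypothetical guard verdicts
})
# ~answer_true  -> True where the answer is actually wrong
# ~guard_passed -> True where the guard flagged the answer
print(classification_report(~toy["answer_true"], ~toy["guard_passed"]))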
16 changes: 11 additions & 5 deletions main.py
@@ -116,14 +116,20 @@ class LlmRagEvaluator(Validator):

    def __init__(
        self,
-        eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase] = HallucinationPrompt("hallucination_judge_llm"),
-        llm_evaluator_fail_response: str = "hallucinated",
-        llm_evaluator_pass_response: str = "factual",
-        llm_callable: str = "gpt-3.5-turbo",  # str for litellm model name
+        eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],
+        llm_evaluator_fail_response: str,
+        llm_evaluator_pass_response: str,
+        llm_callable: str = "gpt-4o-mini",
        on_fail: Optional[Callable] = "noop",
        **kwargs,
    ):
-        super().__init__(on_fail, llm_callable=llm_callable, **kwargs)
+        super().__init__(
+            on_fail,
+            eval_llm_prompt_generator=eval_llm_prompt_generator,
+            llm_evaluator_fail_response=llm_evaluator_fail_response,
+            llm_evaluator_pass_response=llm_evaluator_pass_response,
+            llm_callable=llm_callable,
+            **kwargs)
        self._llm_evaluator_prompt_generator = eval_llm_prompt_generator
        self._llm_callable = llm_callable
        self._fail_response = llm_evaluator_fail_response
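
The net effect of this change is that LlmRagEvaluator no longer defaults to the hallucination judge: the prompt generator and the pass/fail strings must now be supplied by the caller, and all of them are also forwarded to the parent Validator. A call site therefore looks like the ones in the benchmark scripts above; the sketch below simply restates that pattern for the QA-correctness judge and is not additional code from this commit.

# Example call site under the new signature (mirrors the benchmark scripts above).
from main import LlmRagEvaluator, QACorrectnessPrompt

validator = LlmRagEvaluator(
    eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
    llm_evaluator_fail_response="incorrect",
    llm_evaluator_pass_response="correct",
    llm_callable="gpt-4o-mini",   # litellm-style model name, the new default
    on_fail="noop",
    on="prompt",                  # validate the prompt, as in the scripts above
)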
