From 5cee63faaa3d3c45e03fd2256cf829d4fd7f87bb Mon Sep 17 00:00:00 2001
From: jgomes168 <168474583+jgomes168@users.noreply.github.com>
Date: Tue, 23 Jul 2024 17:47:25 -0700
Subject: [PATCH] Juliagomes/improve validator benchmarking (#4)

* Remove default llm callable

* Median latency

* Remove info

* Switch to logging.debug

* gpt-4o-mini Context Relevancy Guard benchmark results

* Add gpt-3.5-turbo

* Change default model

* Add 4o-mini results

* 3.5 turbo hallucination

* gpt-3.5-turbo for QA Correctness

* gpt-4o-mini benchmark for QA Correctness.

* Remove pycache

* Fix random seed

* Clean up scripts

* Remove logging

* indentation

* Add results

* mini csv

* Hallucination guard

* More results

* Remove pycache

* benchmark gpt-4-turbo on context relevancy

* Remove pycache and shorten csv

* Last benchmarking results

* Remove CSV files

---------

Co-authored-by: Julia Gomes
---
 benchmark_context_relevancy_prompt.py | 137 +++++++++++++++----------
 benchmark_hallucination_prompt.py     | 140 +++++++++++++++----------
 benchmark_qa_correctness_prompt.py    | 142 ++++++++++++++++----------
 main.py                               |   6 +-
 4 files changed, 258 insertions(+), 167 deletions(-)

diff --git a/benchmark_context_relevancy_prompt.py b/benchmark_context_relevancy_prompt.py
index 8447426..cce71ff 100644
--- a/benchmark_context_relevancy_prompt.py
+++ b/benchmark_context_relevancy_prompt.py
@@ -1,33 +1,58 @@
 """Script to evaluate Context Relevancy Guard on "wiki_qa-train" benchmark dataset.
 * https://huggingface.co/datasets/microsoft/wiki_qa
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       0.59      0.90      0.71        41
-        True       0.89      0.56      0.69        59
-
-    accuracy                           0.70       100
-   macro avg       0.74      0.73      0.70       100
-weighted avg       0.77      0.70      0.70       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       2.138843
-std        0.908402
-min        0.938294
-25%        1.466153
-50%        1.873620
-75%        2.542088
-max        5.952361
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+Guard Results
+              precision    recall  f1-score   support
+
+    relevant       0.70      0.86      0.77        93
+   unrelated       0.85      0.68      0.76       107
+
+    accuracy                           0.77       200
+   macro avg       0.78      0.77      0.76       200
+weighted avg       0.78      0.77      0.76       200
+
+Latency
+count    200.000000
+mean       2.812122
+std        1.753805
+min        1.067620
+25%        1.708051
+50%        2.248962
+75%        3.321251
+max       14.102804
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+2.2489616039965767
+
+Model: gpt-4-turbo
+Guard Results
+              precision    recall  f1-score   support
+
+    relevant       0.64      0.90      0.75        93
+   unrelated       0.87      0.56      0.68       107
+
+    accuracy                           0.72       200
+   macro avg       0.76      0.73      0.72       200
+weighted avg       0.76      0.72      0.71       200
+
+Latency
+count    200.000000
+mean       8.561413
+std        6.425799
+min        1.624563
+25%        3.957226
+50%        5.979291
+75%       11.579224
+max       34.342637
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+5.979290812509134
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -38,17 +63,14 @@
 from phoenix.evals import download_benchmark_dataset
 from sklearn.utils import shuffle
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 200
+SAVE_RESULTS_PATH = "context_relevancy_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
     :param test_dataset: Dataframe of test examples.
@@ -63,7 +85,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["query_text"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -72,7 +94,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -87,28 +108,36 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = download_benchmark_dataset(
         task="binary-relevance-classification", dataset_name="wiki_qa-train")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
-                llm_evaluator_fail_response="unrelated",
-                llm_evaluator_pass_response="relevant",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer)
-    logging.info(classification_report(~test_dataset["relevant"], ~test_dataset["guard_passed"]))
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
+                    llm_evaluator_fail_response="unrelated",
+                    llm_evaluator_pass_response="relevant",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("Guard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an irrelevant answer)
+        print(classification_report(
+            test_dataset["relevant"].replace(True, "relevant").replace(False, "unrelated"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "relevant").replace(False, "unrelated")))
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
 
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
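
Note (reviewer sketch, not part of the patch): the new reporting code turns the boolean ground-truth and guard-verdict columns into string labels with chained .replace calls before handing them to classification_report. A minimal equivalent using Series.map, shown only to illustrate what the mapping does; the sample values are made up:

    import pandas as pd
    from sklearn.metrics import classification_report

    relevant = pd.Series([True, True, False])       # ground-truth relevance labels
    guard_passed = pd.Series([True, False, True])   # guard verdict per example

    # Same mapping as the chained .replace(...) calls above.
    labels = {True: "relevant", False: "unrelated"}
    print(classification_report(relevant.map(labels), guard_passed.map(labels)))

Either form produces the per-label precision/recall/f1 table quoted in the docstring above.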
diff --git a/benchmark_hallucination_prompt.py b/benchmark_hallucination_prompt.py
index ba08bfb..bf68f7b 100644
--- a/benchmark_hallucination_prompt.py
+++ b/benchmark_hallucination_prompt.py
@@ -3,33 +3,58 @@
 * https://arxiv.org/abs/2305.11747
 * https://github.com/RUCAIBox/HaluEval
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       0.83      0.93      0.88        54
-        True       0.90      0.78      0.84        46
-
-    accuracy                           0.86       100
-   macro avg       0.87      0.85      0.86       100
-weighted avg       0.86      0.86      0.86       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       1.533940
-std        0.552186
-min        1.069116
-25%        1.256626
-50%        1.393182
-75%        1.617315
-max        4.579247
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+Guard Results
+              precision    recall  f1-score   support
+
+     factual       0.79      0.97      0.87       129
+hallucinated       0.96      0.73      0.83       121
+
+    accuracy                           0.85       250
+   macro avg       0.87      0.85      0.85       250
+weighted avg       0.87      0.85      0.85       250
+
+Latency
+count    250.000000
+mean       1.865513
+std        0.603700
+min        1.139974
+25%        1.531160
+50%        1.758210
+75%        2.026153
+max        6.403010
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+1.7582097915001214
+
+Model: gpt-4-turbo
+Guard Results
+              precision    recall  f1-score   support
+
+     factual       0.83      0.88      0.85       129
+hallucinated       0.87      0.80      0.83       121
+
+    accuracy                           0.84       250
+   macro avg       0.85      0.84      0.84       250
+weighted avg       0.85      0.84      0.84       250
+
+Latency
+count    250.000000
+mean       4.295613
+std        2.393394
+min        1.460899
+25%        2.868255
+50%        3.724649
+75%        4.939440
+max       23.465773
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+3.724648874514969
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -40,17 +65,14 @@
 from main import HallucinationPrompt, LlmRagEvaluator
 from phoenix.evals import download_benchmark_dataset
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 250
+SAVE_RESULTS_PATH = "hallucination_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
     :param test_dataset: Dataframe of test examples.
@@ -65,7 +87,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["query"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -75,7 +97,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -90,28 +111,37 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = download_benchmark_dataset(
         task="binary-hallucination-classification", dataset_name="halueval_qa_data")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=119)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
-                llm_evaluator_fail_response="hallucinated",
-                llm_evaluator_pass_response="factual",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
-    logging.info(classification_report(test_dataset["is_hallucination"], ~test_dataset["guard_passed"]))
-
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=HallucinationPrompt(prompt_name="hallucination_judge_llm"),
+                    llm_evaluator_fail_response="hallucinated",
+                    llm_evaluator_pass_response="factual",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("Guard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags a hallucination)
+        print(classification_report(
+            test_dataset["is_hallucination"].replace(True, "hallucinated").replace(False, "factual"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "factual").replace(False, "hallucinated")))
+
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
+
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
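
Note (reviewer sketch, not part of the patch): in this script the two columns carry opposite boolean polarity, so the label mappings are deliberately inverted: is_hallucination=True becomes "hallucinated" while guard_passed=True becomes "factual". A small illustration of how the two series line up, with made-up values:

    import pandas as pd

    is_hallucination = pd.Series([False, True, False])  # ground truth
    guard_passed = pd.Series([True, False, False])      # guard verdict

    y_true = is_hallucination.map({True: "hallucinated", False: "factual"})
    y_pred = guard_passed.map({True: "factual", False: "hallucinated"})
    # Rows 0 and 1 agree; row 2 is a guard false alarm ("factual" vs "hallucinated").
    print(pd.DataFrame({"y_true": y_true, "y_pred": y_pred}))

One small inconsistency worth noting: this script passes the literal random_state=119 to shuffle, while the other two benchmarks use the RANDOM_STATE constant defined above.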
diff --git a/benchmark_qa_correctness_prompt.py b/benchmark_qa_correctness_prompt.py
index 62c89e2..f821527 100644
--- a/benchmark_qa_correctness_prompt.py
+++ b/benchmark_qa_correctness_prompt.py
@@ -3,33 +3,60 @@
 researchers to design AI models for reading comprehension tasks under challenging constraints.
 https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/default/15785042.pdf
-INFO:root:Guard Results
-INFO:root:              precision    recall  f1-score   support
-
-       False       1.00      0.94      0.97        50
-        True       0.94      1.00      0.97        50
-
-    accuracy                           0.97       100
-   macro avg       0.97      0.97      0.97       100
-weighted avg       0.97      0.97      0.97       100
-
-INFO:root:Latency
-INFO:root:count    100.000000
-mean       1.845307
-std        0.867450
-min        0.982674
-25%        1.354958
-50%        1.606060
-75%        1.928065
-max        6.342991
-Name: guard_latency, dtype: float64
+Model: gpt-4o-mini
+
+Guard Results
+              precision    recall  f1-score   support
+
+     correct       1.00      0.96      0.98       133
+   incorrect       0.96      1.00      0.98       117
+
+    accuracy                           0.98       250
+   macro avg       0.98      0.98      0.98       250
+weighted avg       0.98      0.98      0.98       250
+
+Latency
+count    250.000000
+mean       2.610912
+std        1.415877
+min        1.148114
+25%        1.678278
+50%        2.263149
+75%        2.916726
+max       10.625763
+Name: guard_latency_gpt-4o-mini, dtype: float64
+median latency
+2.263148645986803
+
+Model: gpt-4-turbo
+
+Guard Results
+              precision    recall  f1-score   support
+
+     correct       1.00      0.92      0.96       133
+   incorrect       0.91      1.00      0.96       117
+
+    accuracy                           0.96       250
+   macro avg       0.96      0.96      0.96       250
+weighted avg       0.96      0.96      0.96       250
+
+Latency
+count    250.000000
+mean       7.390556
+std        5.804535
+min        1.671949
+25%        3.544383
+50%        5.239343
+75%        8.484112
+max       30.651372
+Name: guard_latency_gpt-4-turbo, dtype: float64
+median latency
+5.239343083492713
 """
 import os
 import time
 from getpass import getpass
 from typing import List, Tuple
-import logging
-import random
 
 import openai
 import pandas as pd
@@ -40,17 +67,14 @@
 from phoenix.evals import download_benchmark_dataset
 from sklearn.utils import shuffle
 
-logger = logging.getLogger(__name__)
-logging.getLogger().setLevel(logging.INFO)
-random.seed(119)
+RANDOM_STATE = 119
+MODELS = ["gpt-4o-mini", "gpt-4-turbo"]
+N_EVAL_SAMPLE_SIZE = 250
+SAVE_RESULTS_PATH = "qa_correctness_guard_results.csv"
 
-MODEL = "gpt-4o-mini"
-N_EVAL_SAMPLE_SIZE = 100
-
-
-def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple[List[float], List[bool]]:
+def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard, model: str) -> Tuple[List[float], List[bool]]:
     """Evaluate guard on benchmark dataset.
 
    :param test_dataset: Dataframe of test examples.
@@ -65,7 +89,7 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
         response = guard(
             llm_api=openai.chat.completions.create,
             prompt=rag_example["question"],
-            model=MODEL,
+            model=model,
             max_tokens=1024,
             temperature=0.5,
             metadata={
@@ -75,7 +99,6 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
             }
         )
         latency_measurements.append(time.perf_counter() - start_time)
-        logging.info(response)
         guard_passed.append(response.validation_passed)
 
     return latency_measurements, guard_passed
@@ -90,28 +113,37 @@ def evaluate_guard_on_dataset(test_dataset: pd.DataFrame, guard: Guard) -> Tuple
 
     test_dataset = df = download_benchmark_dataset(
         task="qa-classification", dataset_name="qa_generated_dataset")
-    test_dataset = shuffle(test_dataset)
+    test_dataset = shuffle(test_dataset, random_state=RANDOM_STATE)
     test_dataset = test_dataset[:N_EVAL_SAMPLE_SIZE]
 
-    guard = Guard.from_string(
-        validators=[
-            LlmRagEvaluator(
-                eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
-                llm_evaluator_fail_response="incorrect",
-                llm_evaluator_pass_response="correct",
-                llm_callable=MODEL,
-                on_fail="noop",
-                on="prompt")
-        ],
-    )
-
-    latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard)
-    test_dataset["guard_passed"] = guard_passed
-    test_dataset["guard_latency"] = latency_measurements
-
-    logging.info("Guard Results")
-    # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
-    logging.info(classification_report(~test_dataset["answer_true"], ~test_dataset["guard_passed"]))
-
-    logging.info("Latency")
-    logging.info(test_dataset["guard_latency"].describe())
+    for model in MODELS:
+        guard = Guard.from_string(
+            validators=[
+                LlmRagEvaluator(
+                    eval_llm_prompt_generator=QACorrectnessPrompt(prompt_name="qa_correctness_judge_llm"),
+                    llm_evaluator_fail_response="incorrect",
+                    llm_evaluator_pass_response="correct",
+                    llm_callable=model,
+                    on_fail="noop",
+                    on="prompt")
+            ],
+        )
+
+        latency_measurements, guard_passed = evaluate_guard_on_dataset(test_dataset=test_dataset, guard=guard, model=model)
+        test_dataset[f"guard_passed_{model}"] = guard_passed
+        test_dataset[f"guard_latency_{model}"] = latency_measurements
+
+        print(f"\nModel: {model}")
+        print("\nGuard Results")
+        # Calculate precision, recall and f1-score for when the Guard fails (e.g. flags an incorrect answer)
+        print(classification_report(
+            test_dataset["answer_true"].replace(True, "correct").replace(False, "incorrect"),
+            test_dataset[f"guard_passed_{model}"].replace(True, "correct").replace(False, "incorrect")))
+
+        print("Latency")
+        print(test_dataset[f"guard_latency_{model}"].describe())
+        print("median latency")
+        print(test_dataset[f"guard_latency_{model}"].median())
+
+    if SAVE_RESULTS_PATH:
+        test_dataset.to_csv(SAVE_RESULTS_PATH)
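
Note (reviewer sketch, not part of the patch): each script now writes one guard_passed_<model> and one guard_latency_<model> column per model to the CSV named by SAVE_RESULTS_PATH, so the two judge models can be compared offline. A rough sketch of such a comparison, assuming the QA results file produced by this script:

    import pandas as pd

    df = pd.read_csv("qa_correctness_guard_results.csv")  # path set by SAVE_RESULTS_PATH
    for model in ["gpt-4o-mini", "gpt-4-turbo"]:
        # The guard passes exactly when it judges the answer correct, so agreement
        # with answer_true approximates the guard's accuracy on this benchmark.
        agreement = (df[f"guard_passed_{model}"] == df["answer_true"]).mean()
        median_latency = df[f"guard_latency_{model}"].median()
        print(f"{model}: agreement={agreement:.2f}, median latency={median_latency:.2f}s")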
diff --git a/main.py b/main.py
index 78785f2..df23d4a 100644
--- a/main.py
+++ b/main.py
@@ -119,7 +119,7 @@ def __init__(
         eval_llm_prompt_generator: Type[ArizeRagEvalPromptBase],
         llm_evaluator_fail_response: str,
         llm_evaluator_pass_response: str,
-        llm_callable: str = "gpt-4o-mini",
+        llm_callable: str,
         on_fail: Optional[Callable] = "noop",
         **kwargs,
     ):
@@ -203,11 +203,11 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:
 
         # 2. Setup the prompt
         prompt = self._llm_evaluator_prompt_generator.generate_prompt(user_input_message=user_input_message, reference_text=reference_text, llm_response=value)
-        logging.info(f"evaluator prompt: {prompt}")
+        logging.debug(f"evaluator prompt: {prompt}")
 
         # 3. Get the LLM response
         llm_response = self.get_llm_response(prompt)
-        logging.info(f"llm evaluator response: {llm_response}")
+        logging.debug(f"llm evaluator response: {llm_response}")
 
         # 4. Check the LLM response and return the result
         if llm_response == self._fail_response:
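
Note (reviewer sketch, not part of the patch): with the default removed, llm_callable is now a required argument of LlmRagEvaluator, so every caller has to name the judge model explicitly, exactly as the updated benchmark scripts do. A minimal construction mirroring those scripts; the ContextRelevancyPrompt import path is an assumption, by analogy with the hallucination script's "from main import HallucinationPrompt, LlmRagEvaluator":

    from guardrails import Guard
    from main import ContextRelevancyPrompt, LlmRagEvaluator  # assumed import path

    guard = Guard.from_string(
        validators=[
            LlmRagEvaluator(
                eval_llm_prompt_generator=ContextRelevancyPrompt(prompt_name="context_relevancy_judge_llm"),
                llm_evaluator_fail_response="unrelated",
                llm_evaluator_pass_response="relevant",
                llm_callable="gpt-4o-mini",  # must now be passed explicitly; no default after this patch
                on_fail="noop",
                on="prompt")
        ],
    )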