Commit
update demo
jasinliu committed Nov 5, 2024
Parent: c6c72a8 · Commit: 6e82497
Showing 3 changed files with 86 additions and 49 deletions.
hugegraph-llm/requirements.txt (1 addition, 1 deletion)
@@ -14,4 +14,4 @@ python-dotenv>=1.0.1
 pyarrow~=17.0.0 # TODO: a temporary dependency for pandas, figure out why ImportError
 pandas~=2.2.2
 openpyxl~=3.1.5
-ragas~=0.1.20
+git+https://github.com/jasinliu/ragas.git@patch-2 # TODO: wait for release
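
pip treats a git+https://...@<ref> line as a direct VCS requirement: it clones that branch and installs it in place of the PyPI release, still under the ragas distribution name. A quick, hypothetical sanity check (not part of this commit) for confirming which build actually resolved:

import importlib.metadata

# A git install still registers normal package metadata, so the reported
# version tells you whether the patched fork is the one in the environment.
print(importlib.metadata.version("ragas"))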
hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py (56 additions, 34 deletions)
@@ -19,31 +19,33 @@

 import json
 import os
-from typing import Tuple, List, Literal, Optional
+from typing import List, Literal, Optional, Tuple

-from datasets import Dataset
 import gradio as gr
-from gradio.utils import NamedString
 import pandas as pd
+from datasets import Dataset
+from gradio.utils import NamedString
+from langchain_openai.chat_models import ChatOpenAI
 from ragas import evaluate
+from ragas.llms import LangchainLLMWrapper

-from hugegraph_llm.config import resource_path, prompt
+from hugegraph_llm.config import prompt, resource_path, settings
 from hugegraph_llm.operators.graph_rag_task import RAGPipeline
 from hugegraph_llm.utils.log import log
-from hugegraph_llm.utils.ragas_utils import RAGAS_METRICS_DICT
+from hugegraph_llm.utils.ragas_utils import RAGAS_METRICS_DICT, RAGAS_METRICS_ZH_DICT


 def rag_answer(
-        text: str,
-        raw_answer: bool,
-        vector_only_answer: bool,
-        graph_only_answer: bool,
-        graph_vector_answer: bool,
-        graph_ratio: float,
-        rerank_method: Literal["bleu", "reranker"],
-        near_neighbor_first: bool,
-        custom_related_information: str,
-        answer_prompt: str,
+    text: str,
+    raw_answer: bool,
+    vector_only_answer: bool,
+    graph_only_answer: bool,
+    graph_vector_answer: bool,
+    graph_ratio: float,
+    rerank_method: Literal["bleu", "reranker"],
+    near_neighbor_first: bool,
+    custom_related_information: str,
+    answer_prompt: str,
 ) -> Tuple:
     """
     Generate an answer using the RAG (Retrieval-Augmented Generation) pipeline.

@@ -177,8 +179,7 @@ def toggle_slider(enable):
             > 1. Download the template file & fill in the questions you want to test.
             > 2. Upload the file & click the button to generate answers. (Preview shows the first 40 lines)
             > 3. The answer options are the same as the above RAG/Q&A frame
-            """
-        )
+            """)

         # TODO: Replace string with python constant
         tests_df_headers = [

@@ -309,29 +310,45 @@ def several_rag_answer(
         questions_file.change(read_file_to_excel, questions_file, [qa_dataframe, answer_max_line_count])
         answer_max_line_count.change(change_showing_excel, answer_max_line_count, qa_dataframe)

-        def evaluate_rag(metrics: List[str], num: int):
+        def evaluate_rag(metrics: List[str], num: int, language: Literal["english", "chinese"]):
             answers_df = pd.read_excel(answers_path)
             answers_df = answers_df.head(num)
             if not any(answers_df.columns.isin(rag_answer_header_dict)):
                 raise gr.Error("No RAG answers found in the answer file.")
-            rag_answers = [answer for answer in rag_answer_header_dict if answer in answers_df.columns]
-            df = pd.DataFrame()
+            if language == "chinese":
+                eval_metrics = [RAGAS_METRICS_ZH_DICT[metric] for metric in metrics]
+            else:
+                eval_metrics = [RAGAS_METRICS_DICT[metric] for metric in metrics]
+            rag_method_names = [answer for answer in rag_answer_header_dict if answer in answers_df.columns]
+            score_df = pd.DataFrame()

-            for answer in rag_answers:
+            for answer in rag_method_names:
                 context_header = rag_answer_header_dict[answer]
                 answers_df[context_header] = answers_df[context_header].apply(json.loads)
                 rag_data = {
-                    "question": answers_df["Question"].to_list(),
-                    "answer": answers_df[answer].to_list(),
-                    "contexts": answers_df[rag_answer_header_dict[answer]].to_list(),
-                    "ground_truth": answers_df["Expected Answer"].to_list(),
+                    "user_input": answers_df["Question"].to_list(),
+                    "response": answers_df[answer].to_list(),
+                    "retrieved_contexts": answers_df[rag_answer_header_dict[answer]].to_list(),
+                    "reference": answers_df["Expected Answer"].to_list(),
                 }
+                eval_llm = LangchainLLMWrapper(
+                    ChatOpenAI(
+                        model="gpt-4o-mini",
+                        temperature=0,
+                        base_url=settings.openai_api_base,
+                        api_key=settings.openai_api_key,
+                    )
+                )

                 dataset = Dataset.from_dict(rag_data)
-                score = evaluate(dataset, metrics=[RAGAS_METRICS_DICT[metric] for metric in metrics])
-                print(score.scores.to_pandas())
-                df = pd.concat([df, score.scores.to_pandas()])
-            df.insert(0, 'method', rag_answers)
-            return df
+                score = evaluate(
+                    dataset,
+                    metrics=eval_metrics,
+                    llm=eval_llm,
+                )
+                score_df = pd.concat([score_df, score.to_pandas()])
+            score_df.insert(0, "method", rag_method_names)
+            return score_df
+
         with gr.Row():
             with gr.Column():

@@ -340,14 +357,19 @@ def evaluate_rag(metrics: List[str], num: int):
                     value=ragas_metrics_list[:4],
                     multiselect=True,
                     label="Metrics",
-                    info="Several evaluation metrics from `ragas`, please refer to https://docs.ragas.io/en/stable/concepts/metrics/index.html",
+                    info=(
+                        "Several evaluation metrics from `ragas`, "
+                        "please refer to https://docs.ragas.io/en/stable/concepts/metrics/index.html"
+                    ),
                 )
             with gr.Column():
-                dataset_nums = gr.Number(1, label="Dataset Numbers", minimum=1, maximum=1)
+                with gr.Row():
+                    dataset_nums = gr.Number(1, label="Dataset Numbers", minimum=1, maximum=1)
+                    language = gr.Radio(["english", "chinese"], label="Language", value="chinese")
                 ragas_btn = gr.Button("Evaluate RAG", variant="primary")
             ragas_btn.click(
                 evaluate_rag,
-                inputs=[ragas_metrics, dataset_nums],
+                inputs=[ragas_metrics, dataset_nums, language],
                 outputs=[gr.DataFrame(label="RAG Evaluation Results", headers=ragas_metrics_list)],
             )
-    return inp, answer_prompt_input
+    return inp, answer_prompt_input
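
The migration above follows the ragas 0.2-style API: the evaluation columns are renamed (question → user_input, answer → response, contexts → retrieved_contexts, ground_truth → reference), metrics are now instantiated classes, and the judge LLM is passed to evaluate() explicitly. A minimal, self-contained sketch of the new call pattern, mirroring the commit's own calls (the sample rows are illustrative, and gpt-4o-mini is just the judge model this demo happens to use):

from datasets import Dataset
from langchain_openai.chat_models import ChatOpenAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness, LLMContextRecall

# One evaluation sample per row, using the 0.2-era column names.
data = {
    "user_input": ["What is HugeGraph?"],
    "response": ["HugeGraph is an open-source graph database."],
    "retrieved_contexts": [["HugeGraph is a fast-speed and highly-scalable graph database."]],
    "reference": ["HugeGraph is an open-source graph database."],
}

# Wrap a LangChain chat model so ragas can drive it as the judge LLM.
judge = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
score = evaluate(Dataset.from_dict(data), metrics=[Faithfulness(), LLMContextRecall()], llm=judge)
print(score.to_pandas())

Wrapping the judge in LangchainLLMWrapper is what lets ragas reuse whatever LangChain-compatible backend the demo is already configured for.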
hugegraph-llm/src/hugegraph_llm/utils/ragas_utils.py (29 additions, 14 deletions)
@@ -15,22 +15,37 @@
 # specific language governing permissions and limitations
 # under the License.

+from pysbd import Segmenter
 from ragas.metrics import (
-    faithfulness,
-    answer_correctness,
-    context_precision,
-    answer_relevancy,
-    context_recall,
-    context_utilization,
-    context_entity_recall,
+    ContextEntityRecall,
+    FactualCorrectness,
+    Faithfulness,
+    LLMContextPrecisionWithoutReference,
+    LLMContextPrecisionWithReference,
+    LLMContextRecall,
+    NoiseSensitivity,
+    ResponseRelevancy,
 )

 RAGAS_METRICS_DICT = {
-    "context_precision": context_precision,
-    "faithfulness": faithfulness,
-    "answer_relevancy": answer_relevancy,
-    "answer_correctness": answer_correctness,
-    "context_recall": context_recall,
-    "context_utilization": context_utilization,
-    "context_entity_recall": context_entity_recall,
+    "context_entity_recall": ContextEntityRecall(),
+    "factual_correctness": FactualCorrectness(),
+    "faithfulness": Faithfulness(),
+    "llm_context_precision_without_reference": LLMContextPrecisionWithoutReference(),
+    "llm_context_precision_with_reference": LLMContextPrecisionWithReference(),
+    "llm_context_recall": LLMContextRecall(),
+    "noise_sensitivity": NoiseSensitivity(),
+    "response_relevancy": ResponseRelevancy(),
 }
+
+RAGAS_METRICS_ZH_DICT = {
+    "context_entity_recall": ContextEntityRecall(),
+    "factual_correctness": FactualCorrectness(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "faithfulness": Faithfulness(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "llm_context_precision_without_reference": LLMContextPrecisionWithoutReference(),
+    "llm_context_precision_with_reference": LLMContextPrecisionWithReference(),
+    "llm_context_recall": LLMContextRecall(),
+    "noise_sensitivity": NoiseSensitivity(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "response_relevancy": ResponseRelevancy(),
+}
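
The _ZH variants override each metric's default English sentence splitter with a pysbd segmenter, since claim-based metrics such as Faithfulness and NoiseSensitivity first decompose text into sentences. A small sketch of what that segmenter does (the sample text is illustrative, and the shown split is the expected pysbd behaviour for zh rather than verified output):

from pysbd import Segmenter

# clean=True normalizes whitespace before segmenting.
seg = Segmenter(language="zh", clean=True)
print(seg.segment("HugeGraph是一个图数据库。它支持Gremlin查询语言。"))
# Expected: ['HugeGraph是一个图数据库。', '它支持Gremlin查询语言。']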
