diff --git a/hugegraph-llm/requirements.txt b/hugegraph-llm/requirements.txt
index 8f4f1bb5..8631b85d 100644
--- a/hugegraph-llm/requirements.txt
+++ b/hugegraph-llm/requirements.txt
@@ -14,4 +14,4 @@ python-dotenv>=1.0.1
 pyarrow~=17.0.0  # TODO: a temporary dependency for pandas, figure out why ImportError
 pandas~=2.2.2
 openpyxl~=3.1.5
-ragas~=0.1.20
+git+https://github.com/jasinliu/ragas.git@patch-2  # TODO: wait for release
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py
index 21092b1e..63c42571 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py
@@ -19,31 +19,33 @@
 import json
 import os
 
-from typing import Tuple, List, Literal, Optional
+from typing import List, Literal, Optional, Tuple
 
-from datasets import Dataset
 import gradio as gr
-from gradio.utils import NamedString
 import pandas as pd
+from datasets import Dataset
+from gradio.utils import NamedString
+from langchain_openai.chat_models import ChatOpenAI
 from ragas import evaluate
+from ragas.llms import LangchainLLMWrapper
 
-from hugegraph_llm.config import resource_path, prompt
+from hugegraph_llm.config import prompt, resource_path, settings
 from hugegraph_llm.operators.graph_rag_task import RAGPipeline
 from hugegraph_llm.utils.log import log
-from hugegraph_llm.utils.ragas_utils import RAGAS_METRICS_DICT
+from hugegraph_llm.utils.ragas_utils import RAGAS_METRICS_DICT, RAGAS_METRICS_ZH_DICT
 
 
 def rag_answer(
-        text: str,
-        raw_answer: bool,
-        vector_only_answer: bool,
-        graph_only_answer: bool,
-        graph_vector_answer: bool,
-        graph_ratio: float,
-        rerank_method: Literal["bleu", "reranker"],
-        near_neighbor_first: bool,
-        custom_related_information: str,
-        answer_prompt: str,
+    text: str,
+    raw_answer: bool,
+    vector_only_answer: bool,
+    graph_only_answer: bool,
+    graph_vector_answer: bool,
+    graph_ratio: float,
+    rerank_method: Literal["bleu", "reranker"],
+    near_neighbor_first: bool,
+    custom_related_information: str,
+    answer_prompt: str,
 ) -> Tuple:
     """
     Generate an answer using the RAG (Retrieval-Augmented Generation) pipeline.
@@ -177,8 +179,7 @@ def toggle_slider(enable):
         > 1. Download the template file & fill in the questions you want to test.
         > 2. Upload the file & click the button to generate answers. (Preview shows the first 40 lines)
         > 3. The answer options are the same as the above RAG/Q&A frame
-        """
-    )
+        """)
 
     # TODO: Replace string with python constant
     tests_df_headers = [
@@ -309,29 +310,45 @@ def several_rag_answer(
     questions_file.change(read_file_to_excel, questions_file, [qa_dataframe, answer_max_line_count])
     answer_max_line_count.change(change_showing_excel, answer_max_line_count, qa_dataframe)
 
-    def evaluate_rag(metrics: List[str], num: int):
+    def evaluate_rag(metrics: List[str], num: int, language: Literal["english", "chinese"]):
         answers_df = pd.read_excel(answers_path)
         answers_df = answers_df.head(num)
         if not any(answers_df.columns.isin(rag_answer_header_dict)):
             raise gr.Error("No RAG answers found in the answer file.")
-        rag_answers = [answer for answer in rag_answer_header_dict if answer in answers_df.columns]
-        df = pd.DataFrame()
+        if language == "chinese":
+            eval_metrics = [RAGAS_METRICS_ZH_DICT[metric] for metric in metrics]
+        else:
+            eval_metrics = [RAGAS_METRICS_DICT[metric] for metric in metrics]
+        rag_method_names = [answer for answer in rag_answer_header_dict if answer in answers_df.columns]
+        score_df = pd.DataFrame()
 
-        for answer in rag_answers:
+        for answer in rag_method_names:
             context_header = rag_answer_header_dict[answer]
             answers_df[context_header] = answers_df[context_header].apply(json.loads)
             rag_data = {
-                "question": answers_df["Question"].to_list(),
-                "answer": answers_df[answer].to_list(),
-                "contexts": answers_df[rag_answer_header_dict[answer]].to_list(),
-                "ground_truth": answers_df["Expected Answer"].to_list(),
+                "user_input": answers_df["Question"].to_list(),
+                "response": answers_df[answer].to_list(),
+                "retrieved_contexts": answers_df[rag_answer_header_dict[answer]].to_list(),
+                "reference": answers_df["Expected Answer"].to_list(),
             }
+            eval_llm = LangchainLLMWrapper(
+                ChatOpenAI(
+                    model="gpt-4o-mini",
+                    temperature=0,
+                    base_url=settings.openai_api_base,
+                    api_key=settings.openai_api_key,
+                )
+            )
+
             dataset = Dataset.from_dict(rag_data)
-            score = evaluate(dataset, metrics=[RAGAS_METRICS_DICT[metric] for metric in metrics])
-            print(score.scores.to_pandas())
-            df = pd.concat([df, score.scores.to_pandas()])
-        df.insert(0, 'method', rag_answers)
-        return df
+            score = evaluate(
+                dataset,
+                metrics=eval_metrics,
+                llm=eval_llm,
+            )
+            score_df = pd.concat([score_df, score.to_pandas()])
+        score_df.insert(0, "method", rag_method_names)
+        return score_df
 
     with gr.Row():
         with gr.Column():
@@ -340,14 +357,19 @@ def evaluate_rag(metrics: List[str], num: int):
                 value=ragas_metrics_list[:4],
                 multiselect=True,
                 label="Metrics",
-                info="Several evaluation metrics from `ragas`, please refer to https://docs.ragas.io/en/stable/concepts/metrics/index.html",
+                info=(
+                    "Several evaluation metrics from `ragas`, "
+                    "please refer to https://docs.ragas.io/en/stable/concepts/metrics/index.html"
+                ),
             )
         with gr.Column():
-            dataset_nums = gr.Number(1, label="Dataset Numbers", minimum=1, maximum=1)
+            with gr.Row():
+                dataset_nums = gr.Number(1, label="Dataset Numbers", minimum=1, maximum=1)
+                language = gr.Radio(["english", "chinese"], label="Language", value="chinese")
             ragas_btn = gr.Button("Evaluate RAG", variant="primary")
     ragas_btn.click(
         evaluate_rag,
-        inputs=[ragas_metrics, dataset_nums],
+        inputs=[ragas_metrics, dataset_nums, language],
         outputs=[gr.DataFrame(label="RAG Evaluation Results", headers=ragas_metrics_list)],
     )
-    return inp, answer_prompt_input
\ No newline at end of file
+    return inp, answer_prompt_input
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/ragas_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/ragas_utils.py
index d45411c2..f0535b38 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/ragas_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/ragas_utils.py
@@ -15,22 +15,37 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from pysbd import Segmenter
 from ragas.metrics import (
-    faithfulness,
-    answer_correctness,
-    context_precision,
-    answer_relevancy,
-    context_recall,
-    context_utilization,
-    context_entity_recall,
+    ContextEntityRecall,
+    FactualCorrectness,
+    Faithfulness,
+    LLMContextPrecisionWithoutReference,
+    LLMContextPrecisionWithReference,
+    LLMContextRecall,
+    NoiseSensitivity,
+    ResponseRelevancy,
 )
 
 RAGAS_METRICS_DICT = {
-    "context_precision": context_precision,
-    "faithfulness": faithfulness,
-    "answer_relevancy": answer_relevancy,
-    "answer_correctness": answer_correctness,
-    "context_recall": context_recall,
-    "context_utilization": context_utilization,
-    "context_entity_recall": context_entity_recall,
+    "context_entity_recall": ContextEntityRecall(),
+    "factual_correctness": FactualCorrectness(),
+    "faithfulness": Faithfulness(),
+    "llm_context_precision_without_reference": LLMContextPrecisionWithoutReference(),
+    "llm_context_precision_with_reference": LLMContextPrecisionWithReference(),
+    "llm_context_recall": LLMContextRecall(),
+    "noise_sensitivity": NoiseSensitivity(),
+    "response_relevancy": ResponseRelevancy(),
 }
+
+RAGAS_METRICS_ZH_DICT = {
+    "context_entity_recall": ContextEntityRecall(),
+    "factual_correctness": FactualCorrectness(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "faithfulness": Faithfulness(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "llm_context_precision_without_reference": LLMContextPrecisionWithoutReference(),
+    "llm_context_precision_with_reference": LLMContextPrecisionWithReference(),
+    "llm_context_recall": LLMContextRecall(),
+    "noise_sensitivity": NoiseSensitivity(sentence_segmenter=Segmenter(language="zh", clean=True)),
+    "response_relevancy": ResponseRelevancy(),
+}
+
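For reference, a minimal sketch of how the evaluation path looks once this patch is applied. The ragas 0.2 column names (user_input, response, retrieved_contexts, reference), the LangchainLLMWrapper judge LLM, the settings fields, and the RAGAS_METRICS_ZH_DICT keys are taken from the diff above; the sample question, answer, and context strings are purely illustrative.

from datasets import Dataset
from langchain_openai.chat_models import ChatOpenAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

from hugegraph_llm.config import settings
from hugegraph_llm.utils.ragas_utils import RAGAS_METRICS_ZH_DICT

# One illustrative sample in the ragas 0.2 column layout used by evaluate_rag() above.
dataset = Dataset.from_dict({
    "user_input": ["What is HugeGraph?"],
    "response": ["HugeGraph is an open-source graph database."],
    "retrieved_contexts": [["HugeGraph is a fast and scalable open-source graph database."]],
    "reference": ["HugeGraph is an open-source, high-performance graph database."],
})

# Judge LLM wrapped for ragas, mirroring the eval_llm construction in the patch.
eval_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        base_url=settings.openai_api_base,
        api_key=settings.openai_api_key,
    )
)

# The Chinese variants swap in pysbd's "zh" sentence segmenter where the metric uses one.
metrics = [RAGAS_METRICS_ZH_DICT[name] for name in ("faithfulness", "llm_context_recall")]

score = evaluate(dataset, metrics=metrics, llm=eval_llm)
print(score.to_pandas())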