diff --git a/poetry.lock b/poetry.lock index ef40d28..0241ded 100644 --- a/poetry.lock +++ b/poetry.lock @@ -691,19 +691,19 @@ files = [ [[package]] name = "datasets" -version = "2.14.4" +version = "2.14.6" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" files = [ - {file = "datasets-2.14.4-py3-none-any.whl", hash = "sha256:29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b"}, - {file = "datasets-2.14.4.tar.gz", hash = "sha256:ef29c2b5841de488cd343cfc26ab979bff77efa4d2285af51f1ad7db5c46a83b"}, + {file = "datasets-2.14.6-py3-none-any.whl", hash = "sha256:4de857ffce21cfc847236745c69f102e33cd1f0fa8398e7be9964525fd4cd5db"}, + {file = "datasets-2.14.6.tar.gz", hash = "sha256:97ebbace8ec7af11434a87d1215379927f8fee2beab2c4a674003756ecfe920c"}, ] [package.dependencies] aiohttp = "*" dill = ">=0.3.0,<0.3.8" -fsspec = {version = ">=2021.11.1", extras = ["http"]} +fsspec = {version = ">=2023.1.0,<=2023.10.0", extras = ["http"]} huggingface-hub = ">=0.14.0,<1.0.0" multiprocess = "*" numpy = ">=1.17" @@ -764,6 +764,17 @@ files = [ [package.extras] graph = ["objgraph (>=1.7.2)"] +[[package]] +name = "diskcache" +version = "5.6.3" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, + {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, +] + [[package]] name = "elemeta" version = "1.0.7" @@ -1655,6 +1666,27 @@ files = [ {file = "lit-17.0.3.tar.gz", hash = "sha256:e6049032462be1e2928686cbd4a6cc5b3c545d83ecd078737fe79412c1f3fcc1"}, ] +[[package]] +name = "llama-cpp-python" +version = "0.2.11" +description = "Python bindings for the llama.cpp library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "llama_cpp_python-0.2.11.tar.gz", hash = "sha256:aae4820bb24aca61800bac771fb735dcc22b08c1374300782ab47eb65743723a"}, +] + +[package.dependencies] +diskcache = ">=5.6.1" +numpy = ">=1.20.0" +typing-extensions = ">=4.5.0" + +[package.extras] +all = ["llama_cpp_python[dev,server,test]"] +dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] +server = ["fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] +test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)"] + [[package]] name = "markdown" version = "3.5" @@ -2295,6 +2327,28 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openai" +version = "0.28.1" +description = "Python client library for the OpenAI API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-0.28.1-py3-none-any.whl", hash = "sha256:d18690f9e3d31eedb66b57b88c2165d760b24ea0a01f150dd3f068155088ce68"}, + {file = "openai-0.28.1.tar.gz", hash = "sha256:4be1dad329a65b4ce1a660fe6d5431b438f429b5855c883435f0f7fcb6d2dcc8"}, +] + +[package.dependencies] +aiohttp = "*" +requests = ">=2.20" +tqdm = "*" + +[package.extras] +datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +embeddings = ["matplotlib", "numpy", 
"openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] +wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] + [[package]] name = "opt-einsum" version = "3.3.0" @@ -3652,14 +3706,14 @@ werkzeug = ">=1.0.1" [[package]] name = "tensorboard-data-server" -version = "0.7.1" +version = "0.7.2" description = "Fast data loading for TensorBoard" optional = false python-versions = ">=3.7" files = [ - {file = "tensorboard_data_server-0.7.1-py3-none-any.whl", hash = "sha256:9938bd39f5041797b33921066fba0eab03a0dd10d1887a05e62ae58841ad4c3f"}, - {file = "tensorboard_data_server-0.7.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:be8d016a1aa394e6198280d4a3dc37898f56467310c5f5e617cac10a783e055a"}, - {file = "tensorboard_data_server-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:255c02b7f5b03dd5c0a88c928e563441ff39e1d4b4a234cdbe09f016e53d9594"}, + {file = "tensorboard_data_server-0.7.2-py3-none-any.whl", hash = "sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb"}, + {file = "tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60"}, + {file = "tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530"}, ] [[package]] @@ -4636,4 +4690,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.11" -content-hash = "1beee4a28836d0a25d4b2503f697a8d4fc0cebdf52f9f97079378240e29c0e1c" +content-hash = "0a16f3bd30ae275d94809b1e6c347fdf3266610bd1bbd76045d8c5b2494d167f" diff --git a/pyproject.toml b/pyproject.toml index 13e176f..e2caf82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,9 @@ tensorflow = {version = "^2.14.0", platform = "linux"} tensorflow-macos = {version = "^2.14.0", platform = "darwin"} elemeta = "1.0.7" torch = ">=2.0.0, !=2.0.1, !=2.1.0" +openai = "^0.28.1" +huggingface-hub = "^0.18.0" +llama-cpp-python = "^0.2.11" en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl"} fr-core-news-sm = {url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl"} diff --git a/saga_llm_evaluation_ml/model/helpers/__init__.py b/saga_llm_evaluation_ml/helpers/__init__.py similarity index 100% rename from saga_llm_evaluation_ml/model/helpers/__init__.py rename to saga_llm_evaluation_ml/helpers/__init__.py diff --git a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py b/saga_llm_evaluation_ml/helpers/embedding_metrics.py similarity index 90% rename from saga_llm_evaluation_ml/model/helpers/embedding_metrics.py rename to saga_llm_evaluation_ml/helpers/embedding_metrics.py index 4554ddf..cef78d3 100644 --- a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py +++ b/saga_llm_evaluation_ml/helpers/embedding_metrics.py @@ -5,8 +5,8 @@ class BERTScore: def __init__(self, lan="en", model_type=None): """ BERTScore computes a similarity score for each token in the candidate sentence with each - token in the reference sentence. The final score is the average of the similarity scores of - all tokens in the candidate sentence. + token in the reference sentence. 
+ The final score is the average of the similarity scores of all tokens in the candidate sentence. Args: lan (str, optional): language to use. Defaults to "en", It may also be "fr". Depending @@ -51,8 +51,9 @@ def compute(self, references, predictions, **kwargs): class MAUVE: def __init__(self, featurize_model_name="gpt2"): """ - MAUVE score computes the difference between the candidate sentence distribution and the - reference sentence distribution. The bigger the MAUVE score, the better. + MAUVE score computes the difference between the candidate sentence distribution + and the reference sentence distribution. + The bigger the MAUVE score, the better. """ self.metric = load("mauve") self.featurize_model_name = featurize_model_name diff --git a/saga_llm_evaluation_ml/model/helpers/language_metrics.py b/saga_llm_evaluation_ml/helpers/language_metrics.py similarity index 98% rename from saga_llm_evaluation_ml/model/helpers/language_metrics.py rename to saga_llm_evaluation_ml/helpers/language_metrics.py index a38dedb..ae24dd8 100644 --- a/saga_llm_evaluation_ml/model/helpers/language_metrics.py +++ b/saga_llm_evaluation_ml/helpers/language_metrics.py @@ -7,8 +7,8 @@ AutoTokenizer, ) -from saga_llm_evaluation_ml.model.helpers.embedding_metrics import BERTScore -from saga_llm_evaluation_ml.model.helpers.utils import ( +from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore +from saga_llm_evaluation_ml.helpers.utils import ( INVALID_QUESTION, NO_ANS, filter_questions, diff --git a/saga_llm_evaluation_ml/helpers/llm_metrics.py b/saga_llm_evaluation_ml/helpers/llm_metrics.py new file mode 100644 index 0000000..39ce77d --- /dev/null +++ b/saga_llm_evaluation_ml/helpers/llm_metrics.py @@ -0,0 +1,510 @@ +import numpy as np +from huggingface_hub import hf_hub_download +from llama_cpp import Llama + + +class SelfCheckGPT: + def __init__( + self, + model, + eval_model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF", + eval_model_basename="llama-2-7b-chat.Q4_K_M.gguf", + ): + """ + This class implements the self-check GPT evaluation metric for generative language models. + It is inspired by the self-check metric proposed in https://arxiv.org/pdf/2303.08896.pdf. + Args: + model (transformers.PreTrainedModel): GPT model to evaluate. + eval_model_name_or_path (str): Evaluation model name or path. Defaults to "TheBloke/Llama-2-7b-Chat-GGUF". + eval_model_basename (str): Evaluation model basename. Defaults to "llama-2-7b-chat.Q4_K_M.gguf". + """ + assert isinstance( + eval_model_name_or_path, str + ), "eval_model_name_or_path must be a string." + assert isinstance( + eval_model_basename, str + ), "eval_model_basename must be a string." + + self.model = model + self.eval_model_path = hf_hub_download( + repo_id=eval_model_name_or_path, filename=eval_model_basename + ) + + self.eval_model = Llama( + model_path=self.eval_model_path, n_threads=2, verbose=False # CPU cores + ) + + def get_prompt(self, pred, sample, question): + """ + This method returns a prompt template given a candidate sentence, a sample sentence, and a question. + Args: + pred (str): Candidate sentence. + sample (str): Sample sentence. + question (str): Question asked to the model for which it generated $pred. + + Returns: + str: Prompt template. + """ + system_prompt = "You are a helpful, polite and concise assistant. Your task is to check if two texts provide the same answer to a given question. Always answer with a single word. 
The possible answers are either YES or NO.\n\n" + question = "###Question:\n" + question + text1 = "\n###Text 1: " + sample + text2 = "\n###Text 2: " + pred + + prompt_template = f"""SYSTEM: {system_prompt} + USER: {question + text1 + text2} + ASSISTANT (YES or NO):""" + + return prompt_template + + def get_prompts(self, pred, samples, question): + """ + This method returns a list of prompt templates given a candidate sentence, a list + of sample sentences, and a question. + Args: + pred (str): Candidate sentence. + samples (list of str): List of sample sentences. + question (str): Question asked to the model for which it generated $pred. + + Returns: + list: List of prompt templates. + """ + print(samples) + return [self.get_prompt(pred, sample, question) for sample in samples] + + def compute(self, question, pred, n_samples): + """ + Args: + question (str): Question asked to the model for which it generated $pred. + pred (str): Candidate sentence. + n_samples (int): Number of samples to generate. + + Returns: + score (float): Score for the candidate sentence. + """ + assert isinstance(question, str), "Prediction must be a string." + assert isinstance(pred, str), "Prediction must be a string." + assert isinstance(n_samples, int), "Number of samples must be an integer." + assert n_samples > 0, "Number of samples must be greater than 0." + assert question and pred, "Question and prediction must be non-empty." + + # Generate n_samples samples from the model + samples = [] + print("Samples:\n") + for _ in range(n_samples): + system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible." + prompt_template = f"""SYSTEM: {system_prompt} + USER: {question} + ASSISTANT:""" + + response = self.model(prompt_template, max_tokens=200) + sample = response["choices"][0]["text"] + print(sample, "\n") + samples.append(sample) + print("\n") + + # For each sample, ask evaluator model to evaluate the sample + prompts = self.get_prompts(pred, samples, question) + scores = [] + print("Prompts:\n") + for prompt in prompts: + print(prompt, "\n") + answer = self.eval_model(prompt, max_tokens=200)["choices"][0]["text"] + print(answer, "\n") + scores.append(answer) + print("\n") + + # Compute the score: how often the sentence if supported by the sample + score = np.mean([1 if "yes" in score.lower() else 0 for score in scores]) + + return score + + +class GEval: + def __init__( + self, + model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF", + model_basename="llama-2-7b-chat.Q4_K_M.gguf", + ): + """ + This class implements the GEval evaluation metric for generative language models. + It is inspired by the GEval metric proposed in https://arxiv.org/pdf/2303.16634.pdf. + Args: + model_name_or_path (str): Model name or path. Defaults to "TheBloke/Llama-2-7b-Chat-GGUF". + model_basename (str): Model basename. Defaults to "llama-2-7b-chat.Q4_K_M.gguf". + """ + assert isinstance( + model_name_or_path, str + ), "model_name_or_path must be a string." + assert isinstance(model_basename, str), "model_basename must be a string." + + self.model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + + self.lcpp_llm = Llama( + model_path=self.model_path, + n_threads=2, # CPU cores + logits_all=True, + n_ctx=1000, + ) + + self.tasks = { + "summ": "You will be given one summary written for a news article. Your task is to rate the summary on one metric. Please make sure you read and understand these instructions carefully. 
Please keep this document open while reviewing, and refer to it as needed.", + "diag": "You will be given a conversation between two individuals. You will then be given one potential response for the next turn in the conversation. The response concerns an interesting fact, which will be provided as well. Your task is to rate the responses on one metric. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.", + } + self.criteria = { + "COH": { + "name": "Coherence", + "prompt": "Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby ”the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.”", + }, + "CON": { + "name": "Consistency", + "prompt": "Consistency (1-5) - the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts. ", + }, + "ENG": { + "name": "Engagingness", + "prompt": "Engagingness (1-5) - Is the response dull/interesting? - A score of 1 indicates that the response is dull and uninteresting. A score of 5 indicates that the response is interesting and engaging.", + }, + "FLU": { + "name": "Fluency", + "prompt": "Fluency (1-5) - the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure. - 1: Poor. The summary is difficult to read and understand. It contains many grammatical errors, spelling mistakes, and/or punctuation errors. - 2: Fair. The summary is somewhat difficult to read and understand. It contains some grammatical errors, spelling mistakes, and/or punctuation errors. - 3: Good. The summary is easy to read and understand. It contains few grammatical errors, spelling mistakes, and/or punctuation errors. - 4: Very Good. The summary is easy to read and understand. It contains no grammatical errors, spelling mistakes, and/or punctuation errors. - 5: Excellent. The summary is easy to read and understand. It contains no grammatical errors, spelling mistakes, and/or punctuation errors.", + }, + "REL": { + "name": "Relevance", + "prompt": "Relevance (1-5) - selection of important content from the source. The summary should include only important information from the source document. Annotators were instructed to penalize summaries which contained redundancies and excess information.", + }, + "POL": { + "name": "Politeness", + "prompt": "Politeness (1-5) - the degree to which the response is polite. - 1: Very impolite. The response is very impolite. - 2: Somewhat impolite. The response is somewhat impolite. - 3: Neutral. The response is neutral. - 4: Somewhat polite. The response is somewhat polite. - 5: Very polite. The response is very polite.", + }, + } + + def get_prediction(self, prompt): + """ + This method returns a prediction given a prompt template. + Args: + prompt (str): Prompt template. + + Returns: + response (dict): Response from the model. 
+ """ + response = self.lcpp_llm.create_completion( + prompt=prompt, + max_tokens=250, + temperature=0.5, + top_p=0.95, + logprobs=5, + repeat_penalty=1.2, + top_k=50, + echo=True, + ) + return response + + def get_cot(self, prompt): + """ + This method returns a chain of thoughts given a prompt template. + Args: + prompt (str): Prompt template. + + Returns: + cot (str): Chain of thoughts. + """ + title = "\nEvaluation steps:\n" + cot = self.get_prediction(prompt + title)["choices"][0]["text"] + return cot + + # pylint: disable=consider-iterating-dictionary + def get_prompt(self, src, pred, definition, criterion, criterion_name): + """ + Args: + src (str): Source text. + pred (str): Candidate sentence to evaluate. + definition (str): Definition of the task. + crit_code (str): Evaluation criterion code. + """ + definition = ( + "\n Task definition:\n" + self.tasks[definition] + if definition in self.tasks.keys() + else definition + ) + crit = ( + "\n Evaluation criteria:\n" + self.criteria[criterion]["prompt"] + if criterion in self.criteria.keys() + else criterion + ) + crit_name = ( + self.criteria[criterion]["name"] + if criterion in self.criteria.keys() + else criterion_name + ) + + prompt = f"{definition} {crit}" + + # Chain of thoughts, set of intermediate instructions generated by llm detailing evaluation steps + auto_cot = self.get_cot(prompt) + + return ( + prompt + + auto_cot + + "\n Example:\n Source Text:\n" + + src + + "\n Generated text:\n" + + pred + + "\n Evaluation Form (scores ONLY):\n" + + crit_name + + ": " + ) + + def get_score(self, prompt): + """ + Args: + prompt (str): Prompt template. + + Returns: + score (float): Score for the candidate sentence. + """ + response = self.get_prediction(prompt) + tokens = response["choices"][0]["logprobs"]["tokens"] + top_logprobs = response["choices"][0]["logprobs"]["top_logprobs"] + + # Extract evaluation form from tokens () + template_tokens = [ + " E", + "valu", + "ation", + " Form", + " (", + "sc", + "ores", + " ON", + "LY", + "):", + ] + start_index = tokens.index(template_tokens[-1]) + 1 + # Extract number index from the remaining tokens + for token in tokens[start_index:]: + if token.isdigit(): + number_index = tokens.index(token) + break + + # Get logprobs associated with number + logprobs = top_logprobs[number_index] + + # Compute score + # Get only keys that are numbers + number_keys = [int(key) for key in logprobs.keys() if key.isdigit()] + number_logprobs = [logprobs[str(key)] for key in number_keys] + number_probs = [np.exp(logprob) for logprob in number_logprobs] + + score = np.sum(np.multiply(number_keys, number_probs)) / len(number_keys) + + return score + + def compute(self, source, pred, definition, criterion, criterion_name=None): + """ + This method computes the GEval score for a candidate sentence given a source text, + a prompt template, an aspect to evaluate, and a task description. + Args: + source (str): Source text. + pred (str): Candidate sentence to evaluate. + definition (str): Definition of the task. + criterion (str): Evaluation criterion code. + criterion_name (str, optional): Evaluation criterion name. Defaults to None. + + Returns: + score (float): Score for the candidate sentence. + """ + assert isinstance(source, str), "Source must be a string." + assert isinstance(pred, str), "Pred must be a string." + assert isinstance(definition, str), "Definition must be a string." + assert isinstance(criterion, str), "Criterion must be a string." 
+ assert criterion_name is None or isinstance( + criterion_name, str + ), "Criterion name must be a string." + assert ( + criterion in self.criteria.keys() or criterion_name is not None + ), "Criterion name must be given if criterion is not in the list of criteria." + + prompt = self.get_prompt(source, pred, definition, criterion, criterion_name) + return self.get_score(prompt) + + +class GPTScore: + # pylint: disable=f-string-without-interpolation + def __init__( + self, + model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF", + model_basename="llama-2-7b-chat.Q4_K_M.gguf", + ): + """ + This class implements the GPTScore evaluation metric for generative language models. + It is inspired by the GPTScore metric proposed in https://arxiv.org/pdf/2302.04166.pdf. + Args: + model_name_or_path (str): Model name or path. Defaults to "TheBloke/Llama-2-7b-Chat-GGUF". + model_basename (str): Model basename. Defaults to "llama-2-7b-chat.Q4_K_M.gguf". + """ + assert isinstance( + model_name_or_path, str + ), "model_name_or_path must be a string." + assert isinstance(model_basename, str), "model_basename must be a string." + + self.templates = { + "summ": { + "FAC": f"Generate a summary with consistent facts for the following text: {{src}}\n\nTl;dr{{pred}}", + "COV": f"Generate a summary with as much semantic coverage as possible for the following text: {{src}}\n\nTl;dr{{pred}}", + "CON": f"Generate factually consistent summary for the following text: {{src}}\n\nTl;dr{{pred}}", + "INF": f"Generate an informative summary that captures the key points of the following text:{{src}}\n\nTl;dr{{pred}}", + "COH": f"Generate a coherent summary for the following text: {{src}}\n\nTl;dr{{pred}}", + "REL": f"Generate a relevant summary with consistent details for the following text: {{src}}\n\nTl;dr{{pred}}", + "FLU": f"Generate a fluent and grammatical summary for the following text: {{src}}\n\nTl;dr{{pred}}", + }, + "MT": { + "ACC": f"Rewrite the following text with its core information and consistent facts:{{src}} In other words, {{pred}}", + "FLU": f"Rewrite the following text to make it more grammatical and well-written:{{src}} In other words,{{pred}}", + "MQM": f"Rewrite the following text into high-quality text with its core information:{{src}} In other words,{{pred}}", + }, + "D2T": { + "INF": f"Convert the following text to another expression that preserves key information:\n\n{{src}} In other words, {{pred}}", + "NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{{src}} In other words, {{pred}}", + "FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{{src}} In other words, {{pred}}", + }, + "diag": { + "COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. 
(b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + "ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation:\nUser: {{src}}\nAI: {{pred}}\nAnswer:", + }, + } + + self.tasks = self.templates.keys() + self.aspects = list( + {aspect for task in self.tasks for aspect in self.templates[task]} + ) + + self.model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + + self.lcpp_llm = Llama( + model_path=self.model_path, + n_threads=2, # CPU cores + logits_all=True, + ) + + def get_prompts(self, aspect, task, sources, preds): + """ + This method returns a list of prompt templates given a task description, and an aspect to evaluate. + Args: + aspect (str): Aspect to evaluate. + task (str): Task description. + sources (list of str): Source texts. + preds (list of str): Candidate sentences. + Returns: + list: List of prompt templates. + """ + return [ + self.get_prompt(aspect, task, src, pred) + for (src, pred) in zip(sources, preds) + ] + + def get_prompt(self, aspect, task, src, pred): + """ + This method returns a prompt template given a task description, and an aspect to evaluate. + Args: + aspect (str): Aspect to evaluate. + task (str): Task description. + src (str): Source text. + pred (str): Candidate sentence. + Returns: + str: Prompt template. + """ + # Check that the corresponding entry exists in the prompt template + assert ( + aspect in self.templates[task] + ), f"Aspect {aspect} is not available for task {task}." + # Check that the prompt template is not empty + assert self.templates[task][ + aspect + ], f"Prompt template for aspect {aspect} and task {task} is non-existent. Please specify a prompt template." 
+ + template = self.templates[task][aspect] + + # Replace placeholders with source and candidate sentence + template = template.replace("{src}", src) + template = template.replace("{pred}", pred) + + return template + + def compute(self, source, pred, prompt=None, aspect=None, task=None): + """ + This method computes the GPTScore for a candidate sentence given a source text, + a prompt template, an aspect to evaluate, and a task description. + Args: + source (str): Source text. + pred (str): Candidate sentence. + prompt (str, optional): Prompt template. Defaults to None. + aspect (str, optional): Aspect to evaluate. Defaults to None. + task (str, optional): Task description. Defaults to None. + Returns: + score (float): Score for the candidate sentence. + """ + assert isinstance(source, str), "Source must be a string." + assert isinstance(pred, str), "Pred must be a string." + + # If prompt is given, check that it is a list of string + if prompt: + assert isinstance(prompt, str), "Prompt must be a string." + assert not aspect, "Aspect must not be given if prompt is given." + assert not task, "Task must not be given if prompt is given." + else: + # If prompt is not given, check that task and aspect are given + assert aspect, "Aspect must be given if prompt is not given." + assert task, "Task must be given if prompt is not given." + + # If aspect is given, check that it is a string + if aspect: + assert isinstance(aspect, str), "Aspect must be a string." + assert aspect in self.aspects, f"Aspect must be one of {self.aspects}." + + # If task is given, check that it is a string + if task: + assert isinstance(task, str), "Task must be a string." + assert task in self.tasks, f"Task must be one of {self.tasks}." + + # Generative LLM is given a prompt template and some context information + if not prompt: + prompt = self.get_prompt(aspect, task, source, pred) + + response = self.lcpp_llm.create_completion( + prompt=prompt, + max_tokens=500, + temperature=0.5, + top_p=0.95, + logprobs=1, + repeat_penalty=1.2, + top_k=50, + echo=True, + ) + + # Compute logprobs + # Find the end position of the input... + print(response["choices"][0]["logprobs"]["text_offset"]) + i = response["choices"][0]["logprobs"]["text_offset"].index(len(prompt)) + if i == 0: + i = i + 1 + + # Get logprobs + loss = -sum( + response["choices"][0]["logprobs"]["token_logprobs"][i:-1] + ) # ignore the last '.' + avg_loss = loss / ( + len(response["choices"][0]["logprobs"]["text_offset"]) - i - 1 + ) # 1 is the last '.' + + return avg_loss diff --git a/saga_llm_evaluation_ml/model/helpers/utils.py b/saga_llm_evaluation_ml/helpers/utils.py similarity index 96% rename from saga_llm_evaluation_ml/model/helpers/utils.py rename to saga_llm_evaluation_ml/helpers/utils.py index b64f3e6..41593d4 100644 --- a/saga_llm_evaluation_ml/model/helpers/utils.py +++ b/saga_llm_evaluation_ml/helpers/utils.py @@ -103,7 +103,7 @@ class MetadataExtractor: def __init__(self): self.metadata_extractor = MetafeatureExtractorsRunner() - def addWordRegexMatchesCount(self, regex_rule, name=None): + def add_word_regex_matches_count(self, regex_rule, name=None): """ Adds a regex rule to the metadata extractor. For a given regex return the number of words matching the regex. @@ -115,7 +115,7 @@ def addWordRegexMatchesCount(self, regex_rule, name=None): WordRegexMatchesCount(regex=regex_rule, name=name) ) - def addRegexMatchCount(self, regex_rule, name=None): + def add_regex_match_count(self, regex_rule, name=None): """ Adds a regex rule to the metadata extractor. 
For a given regex return the number of matches it has in the text. diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_embedding_metrics.py b/tests/test_embedding_metrics.py index d55eb84..a13c5b2 100644 --- a/tests/test_embedding_metrics.py +++ b/tests/test_embedding_metrics.py @@ -1,6 +1,6 @@ import unittest -from saga_llm_evaluation_ml.model.helpers.embedding_metrics import MAUVE, BERTScore +from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore, MAUVE class TestBERTScore(unittest.TestCase): diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a1c0aa4..39b8cc9 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,6 +1,6 @@ import unittest -from saga_llm_evaluation_ml.model.helpers.utils import MetadataExtractor +from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor class TestMetadataExtractor(unittest.TestCase): @@ -25,8 +25,8 @@ def test_add_regex(self): """Tests that the MetadataExtractor class extracts the correct metadata when regex rules are added.""" text = "The cat sat on the mat." extractor = MetadataExtractor() - extractor.addWordRegexMatchesCount("the") - extractor.addRegexMatchCount("the") + extractor.add_word_regex_matches_count("the") + extractor.add_regex_match_count("the") metadata = extractor.compute(text) # Test a few metadata values @@ -45,8 +45,10 @@ def test_add_regex(self): len_metadata = len(metadata) # Check that the metadata is longer when multiple regex rules are added - extractor.addWordRegexMatchesCount("cat", name="word_regex_matches_count_cat") - extractor.addRegexMatchCount("cat", name="regex_match_count_cat") + extractor.add_word_regex_matches_count( + "cat", name="word_regex_matches_count_cat" + ) + extractor.add_regex_match_count("cat", name="regex_match_count_cat") metadata = extractor.compute(text) self.assertGreater(len(metadata), len_metadata) diff --git a/tests/test_language_metrics.py b/tests/test_language_metrics.py index 7c85f91..05f583d 100644 --- a/tests/test_language_metrics.py +++ b/tests/test_language_metrics.py @@ -1,6 +1,6 @@ import unittest -from saga_llm_evaluation_ml.model.helpers.language_metrics import BLEURTScore, QSquared +from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore, QSquared class TestBLEURTScore(unittest.TestCase): diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py new file mode 100644 index 0000000..a255f20 --- /dev/null +++ b/tests/test_llm_metrics.py @@ -0,0 +1,194 @@ +import unittest + +from llama_cpp import Llama +from huggingface_hub import hf_hub_download +from saga_llm_evaluation_ml.helpers.llm_metrics import GPTScore, GEval, SelfCheckGPT + + +class TestGEval(unittest.TestCase): + def test_init(self): + with self.assertRaises(AssertionError): + GEval(1, 1) + GEval("1", 1) + GEval(1, "1") + + def test_bad_arguments(self): + geval = GEval() + + source = "Hi how are you" + pred = "Im ok" + task = "diag" + aspect = "ENG" + + with self.assertRaises(AssertionError): + geval.compute([source], pred, task, aspect) + geval.compute(source, [pred], task, aspect) + geval.compute(source, pred, 1, aspect) + geval.compute(source, pred, task, 1) + geval.compute(source, pred, task, "notvalid") + geval.compute(source, pred, "notvalid", aspect) + geval.compute(source, pred, task, criterion=None) + geval.compute(source, pred, definition=None, criterion=aspect) + + def test_compute(self): + geval = GEval() + + 
source = "Hi how are you?" + preds = ["Shut up creep!!!", "I am very good, thank you! And you?"] + task = "diag" + aspect = "POL" + + scores = {key: 0 for key in preds} + for pred in preds: + score = geval.compute(source, pred, task, aspect) + self.assertTrue(isinstance(score, float)) + self.assertGreaterEqual(score, 0.0) + scores[pred] = score + + self.assertGreaterEqual( + scores["I am very good, thank you! And you?"], scores["Shut up creep!!!"] + ) + + +class TestSelfCheckGPT(unittest.TestCase): + def test_init(self): + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" # the model is in bin format + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + with self.assertRaises(AssertionError): + SelfCheckGPT(model, eval_model_name_or_path=1, eval_model_basename=1) + SelfCheckGPT(model, eval_model_name_or_path=1, eval_model_basename="1") + SelfCheckGPT(model, eval_model_name_or_path="1", eval_model_basename=1) + + def test_bad_arguments(self): + + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" # the model is in bin format + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + selfcheckgpt = SelfCheckGPT(model) + question = "What is the capital of France?" + pred = "Paris" + n_samples = 1 + + with self.assertRaises(AssertionError): + selfcheckgpt.compute([question], pred, n_samples) + selfcheckgpt.compute(question, [pred], n_samples) + selfcheckgpt.compute(question, pred, "1") + selfcheckgpt.compute(question, pred, 1.0) + selfcheckgpt.compute(question, pred, -1) + selfcheckgpt.compute(question=question, pred=None, n_samples=5) + selfcheckgpt.compute(question=None, pred=pred, n_samples=5) + + def test_compute(self): + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + selfcheckgpt = SelfCheckGPT(model) + question = "What is the capital of France?" 
+ preds = ["Paris", "sandwich"] + n_samples = 10 + + scores = {key: 0 for key in preds} + for pred in preds: + score = selfcheckgpt.compute(question, pred, n_samples) + self.assertTrue(isinstance(score, float)) + self.assertGreaterEqual(score, 0.0) + self.assertLessEqual(score, 1.0) + scores[pred] = score + + self.assertGreaterEqual(scores["Paris"], scores["sandwich"]) + + +class TestGPTScore(unittest.TestCase): + def test_init(self): + with self.assertRaises(AssertionError): + GPTScore(model_basename=1, model_name_or_path=1) + GPTScore(model_basename="1", model_name_or_path=1) + GPTScore(model_basename=1, model_name_or_path="1") + + def test_bad_arguments(self): + gptscore = GPTScore() + + with self.assertRaises(AssertionError): + gptscore.compute(["The cat sat on the mat."], ["The dog sat on the log."]) + gptscore.compute("The cat sat on the mat.", ["The dog sat on the log."]) + gptscore.compute("The cat sat on the mat.", "The dog sat on the log.") + gptscore.compute( + "The cat sat on the mat.", "The dog sat on the log.", prompt=2 + ) + gptscore.compute( + "The cat sat on the mat.", + "The dog sat on the log.", + prompt="2", + aspect="COV", + task="diag", + ) + gptscore.compute( + "The cat sat on the mat.", + "The dog sat on the log.", + aspect=2, + task="diag", + ) + gptscore.compute( + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="COV", + task=2, + ) + gptscore.compute( + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="COV", + task="notvalid", + ) + gptscore.compute( + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="notvalid", + task="diag", + ) + gptscore.compute( + "The cat sat on the mat.", "The dog sat on the log.", aspect="COV" + ) + gptscore.compute( + "The cat sat on the mat.", "The dog sat on the log.", task="diag" + ) + + def test_compute(self): + gptscore = GPTScore() + + source = "Hi how are you?" + preds = [ + "I am very fine. Thanks! What about you?", + "Shut up creep I don't want to talk to you!!!", + ] + # prompt = "Task: evaluate how polite this dialog is." + aspect = "LIK" + task = "diag" + + scores = {key: 0 for key in preds} + for target in preds: + score = gptscore.compute(source, target, aspect=aspect, task=task) + scores[target] = score + self.assertTrue(isinstance(score, float)) + self.assertGreaterEqual(score, 0.0) + + self.assertGreaterEqual( + scores["I am very fine. Thanks! What about you?"], + scores["Shut up creep I don't want to talk to you!!!"], + )