From 09d2cbaf6354cb7d728fd7758bc7fba7c51c58fb Mon Sep 17 00:00:00 2001 From: Lucie Date: Thu, 19 Oct 2023 13:36:01 +0200 Subject: [PATCH 1/6] feat(gptscore): add gptscore support --- poetry.lock | 24 ++- pyproject.toml | 1 + .../model/helpers/llm_metrics.py | 197 ++++++++++++++++++ tests/test_llm_metrics.py | 71 +++++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 tests/test_llm_metrics.py diff --git a/poetry.lock b/poetry.lock index 9415fb4..24edbde 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2261,6 +2261,28 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openai" +version = "0.28.1" +description = "Python client library for the OpenAI API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-0.28.1-py3-none-any.whl", hash = "sha256:d18690f9e3d31eedb66b57b88c2165d760b24ea0a01f150dd3f068155088ce68"}, + {file = "openai-0.28.1.tar.gz", hash = "sha256:4be1dad329a65b4ce1a660fe6d5431b438f429b5855c883435f0f7fcb6d2dcc8"}, +] + +[package.dependencies] +aiohttp = "*" +requests = ">=2.20" +tqdm = "*" + +[package.extras] +datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] +wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] + [[package]] name = "opt-einsum" version = "3.3.0" @@ -4658,4 +4680,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.11" -content-hash = "dab7a7b6060ba4a9e674f1d62d7ee3f5330e51fbd896c70d8a2dd10acd9195ca" +content-hash = "5ce38de044cbc1d3f927898cc730660d9fad1c5f1e27c57b6f8b7caf6f9ba9c1" diff --git a/pyproject.toml b/pyproject.toml index b40a01e..3ab9009 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ bleurt = {git = "https://github.com/google-research/bleurt.git"} tensorflow-macos = {version = "2.14.0", platform = "darwin"} elemeta = "1.0.7" torch = ">=2.0.0, !=2.0.1, !=2.1.0" +openai = "^0.28.1" [tool.poetry.dev-dependencies] pylint = "^2.13" diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py index e69de29..695df85 100644 --- a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py @@ -0,0 +1,197 @@ +import openai +import numpy as np +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class GPTScore: + def __init__(self): + """ + GPTScore is a metric which allows to evaluate generative models on a variety of tasks. 
GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{<t}, T(d, a, S), theta), where h is the hypothesis, d is the task description, a is the aspect to evaluate, S is the context, w_t is the weight of the t-th token, and theta are model parameters. GPTScore does not require any reference text. """ From: Lucie Date: Thu, 19 Oct 2023 14:21:12 +0200 Subject: [PATCH 2/6] feat(gptscore): revamp to make multiple predictions at once --- .../model/helpers/llm_metrics.py | 195 ++++++++++-------- tests/test_llm_metrics.py | 79 ++++--- 2 files changed, 160 insertions(+), 114 deletions(-) diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py index 695df85..acee1fe 100644 --- a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py @@ -16,9 +16,91 @@ def __init__(self): and theta are model parameters. GPTScore does not require any reference text. """ + self.huggingface_models = ["meta-llama/Llama-2-7b-chat-hf", "gpt2", "mistralai/Mistral-7B-v0.1"] + self.aspects = [ + "COV", + "FAC", + "FLU", + "CON", + "INF", + "COH", + "REL", + "ACC", + "MQM", + "INT", + "ENG", + "SPE", + "COR", + "SEM", + "UND", + "ERR", + "DIV", + "DEP", + "LIK", + "FLE", + "INQ", + ] + self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"] + self.tasks = ["summ", "MT", "D2T", "diag"] + + def get_prompt(self, a, d, src, pred): + """ + This method returns a prompt template given a task description, and an aspect to evaluate. + Args: + a (str): Aspect to evaluate. + d (str): Task description. + src (str): Source text. + pred (str): Candidate sentence. + Returns: + str: Prompt template. + """ + + templates = { + "summ": { + "FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}", + "COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}", + "CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}", + "INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}", + "COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}", + "REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}", + "FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}", + }, + "MT": { + "ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}", + "FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}", + "MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}", + }, + "D2T": { + "INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}", + "NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}", + "FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}", + }, + "diag": { + "COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes.
(b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + "ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", + }, + } + + # Check that the corresponding entry exists in the prompt template + assert a in templates[d], f"Aspect {a} is not available for task {d}." + # Check that the prompt template is not empty + assert templates[d][ + a + ], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template." + + + return templates[d][a] def compute( - self, src, pred, model="gpt2", prompt=None, a=None, d=None, api_key=None + self, sources, preds, model="gpt2", prompts=None, a=None, d=None, api_key=None ): """ This method computes GPTScore for a list of candidate sentences given a task description, an aspect to evaluate and context information. @@ -52,8 +134,8 @@ def compute( - (diag): Dialogue. Generate an engaging and informative response based on the dialogue history. Args: - src (str): Source text. - pred (str): Candidate sentence. + sources (list of str): Source texts. + preds (list of str): Candidate sentences. model (str): Model name. If None, a default model is used. prompt (str): Prompt template. If None, a default prompt template is used. a (list): List of aspects to evaluate. @@ -63,50 +145,15 @@ def compute( Returns: list: List of scores for each candidate sentence. 
""" - prompts = { - "summ": { - "FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}", - "COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}", - "CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}", - "INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}", - "COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}", - "REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}", - "FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}", - }, - "MT": { - "ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}", - "FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}", - "MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}", - }, - "D2T": { - "INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}", - "NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}", - "FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}", - }, - "diag": { - "COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. 
(b) No.\nConversation: {src + pred}\nAnswer: Yes.", - "ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.", - }, - } + assert isinstance(sources, list) and isinstance(sources[0], str), "Source must be a list of strings." + assert isinstance(preds, list) and isinstance(preds[0], str), "Prediction must be a list of strings." - assert isinstance(src, str), "Source must be a string." - assert isinstance(pred, str), "Prediction must be a string." assert isinstance(model, str), "Model must be a string." - # If model is not in the list of models, raise an error - models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"] - assert model in models, f"Model must be one of {models}." + assert model in self.models, f"Model must be one of {self.models}." - # If prompt is given, check that it is a string - if prompt: - assert isinstance(prompt, str), "Prompt must be a string." + # If prompt is given, check that it is a list of string + if prompts: + assert isinstance(prompts, list) and isinstance(prompts[0], str), "Prompts must be a list of strings." assert not a, "Aspect must not be given if prompt is given." assert not d, "Task must not be given if prompt is given." else: @@ -117,53 +164,21 @@ def compute( # If aspect is given, check that it is a string if a: assert isinstance(a, str), "Aspect must be a string." - aspects = [ - "COV", - "FAC", - "FLU", - "CON", - "INF", - "COH", - "REL", - "ACC", - "MQM", - "INT", - "ENG", - "SPE", - "COR", - "SEM", - "UND", - "ERR", - "DIV", - "DEP", - "LIK", - "FLE", - "INQ", - ] - assert a in aspects, f"Aspect must be one of {aspects}." + assert a in self.aspects, f"Aspect must be one of {self.aspects}." # If task is given, check that it is a string if d: assert isinstance(d, str), "Task must be a string." - tasks = ["summ", "MT", "D2T", "diag"] - assert d in tasks, f"Task must be one of {tasks}." - - if a and d: - # Check that the corresponding entry exists in the prompt template - assert a in prompts[d], f"Aspect {a} is not available for task {d}." - # Check that the prompt template is not empty - assert prompts[d][ - a - ], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template." + assert d in self.tasks, f"Task must be one of {self.tasks}." 
# Generative LLM is given a prompt template and some context information - prompt = prompt if prompt else prompts[d][a] + prompts = prompts if prompts else [self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)] # Model predicts log-likelihood of the next token given the previous tokens and the prompt template - if model == "meta-llama/Llama-2-7b-chat-hf" or model == "gpt2": + if model in self.huggingface_models: tokenizer = AutoTokenizer.from_pretrained(model) llm = AutoModelForCausalLM.from_pretrained(model) - inputs = tokenizer(prompt, return_tensors="pt") + inputs = tokenizer(prompts, return_tensors="pt") outputs = llm.generate( **inputs, @@ -176,22 +191,24 @@ def compute( outputs.sequences, outputs.scores, normalize_logits=True ) - logprobs = np.array(transition_scores[0].tolist()) - print(logprobs) + logprobs = np.array(transition_scores.tolist()) elif model == "gpt-3.5-turbo": openai.api_key = api_key response = openai.Completion.create( model=model, - prompt=prompt, + prompt=prompts, logprobs=5, ) logprobs = response["choices"][0]["logprobs"] - # Compute GPTScore - score = 0 - for i, _ in enumerate(pred.split()): - score += logprobs[i] + # Compute GPTScores + scores = [] + for i, pred in enumerate(preds): + pred_tokens = pred.split() + pred_logprobs = logprobs[i][: len(pred_tokens)] + score = np.mean(pred_logprobs) + scores.append(score) - return score + return scores diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py index d7d62bf..a8731c8 100644 --- a/tests/test_llm_metrics.py +++ b/tests/test_llm_metrics.py @@ -8,64 +8,93 @@ def test_bad_arguments(self): gptscore = GPTScore() with self.assertRaises(AssertionError): - gptscore.compute(["The cat sat on the mat."], ["The dog sat on the log."]) + gptscore.compute("The cat sat on the mat.", "The dog sat on the log.") gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", model="random" + ["The cat sat on the mat."], ["The dog sat on the log."], model="random" ) gptscore.compute( - "The cat sat on the mat.", - "The dog sat on the log.", + ["The cat sat on the mat."], + ["The dog sat on the log."], model="meta-llama/Llama-2-7b-chat-hf", prompt=10, ) gptscore.compute( - "The cat sat on the mat.", - "The dog sat on the log.", + ["The cat sat on the mat."], + ["The dog sat on the log."], prompt="Summarize", a="ERR", ) gptscore.compute( - "The cat sat on the mat.", - "The dog sat on the log.", + ["The cat sat on the mat."], + ["The dog sat on the log."], prompt="Summarize", d="summ", ) gptscore.compute( - "The cat sat on the mat.", - "The dog sat on the log.", + ["The cat sat on the mat."], + ["The dog sat on the log."], prompt="Summarize", a="ERR", d="summ", ) gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", a="ERR", d=None + ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d=None ) gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", a=None, d="summ" + ["The cat sat on the mat."], ["The dog sat on the log."], a=None, d="summ" ) gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", a=2, d="summ" + ["The cat sat on the mat."], ["The dog sat on the log."], a=2, d="summ" ) gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", a="ERR", d=None + ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d=None ) gptscore.compute( - "The cat sat on the mat.", - "The dog sat on the log.", + ["The cat sat on the mat."], + ["The dog sat on the log."], a="notvalid", d="summ", ) 
gptscore.compute( - "The cat sat on the mat.", "The dog sat on the log.", a="ERR", d="D2T" + ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d="D2T" ) - def test_compute(self): - """Tests that the GPTScore computes a higher score for a better prediction.""" - source = "State something true." - pred = "The cat eats elephants." - better_pred = "The cat eats mice." + def test_compute_gpt2(self): + """Tests that the GPTScore computes a higher score for a better prediction with gpt2.""" + sources = ["State something true.", "State something true."] + preds = ["The cat eats elephants.", "The cat eats mice."] gptscore = GPTScore() - score = gptscore.compute(source, pred, a="ERR", d="diag") - score_2 = gptscore.compute(source, better_pred, a="ERR", d="diag") - self.assertGreater(score_2, score) + + # gpt2 + scores = gptscore.compute(sources, preds, a="ERR", d="diag", model="gpt2") + self.assertGreater(scores[1], scores[0]) + + # def test_compute_mistral(self): + # """Tests that the GPTScore computes a higher score for a better prediction with mistralai/Mistral-7B-v0.1.""" + # source = "State something true." + # pred = "The cat eats elephants." + # better_pred = "The cat eats mice." + + # gptscore = GPTScore() + + # # mistralai/Mistral-7B-v0.1 + # score = gptscore.compute(source, pred, a="ERR", d="diag", model="mistralai/Mistral-7B-v0.1") + # score_2 = gptscore.compute(source, better_pred, a="ERR", d="diag", model="mistralai/Mistral-7B-v0.1") + # self.assertGreater(score_2, score) + + + # def test_compute_llama(self): + # """Tests that the GPTScore computes a higher score for a better prediction with meta-llama/Llama-2-7b-chat-hf.""" + # source = "State something true." + # pred = "The cat eats elephants." + # better_pred = "The cat eats mice." + + # gptscore = GPTScore() + + # # meta-llama/Llama-2-7b-chat-hf + # score = gptscore.compute(source, pred, a="ERR", d="diag", model="meta-llama/Llama-2-7b-chat-hf") + # score_2 = gptscore.compute(source, better_pred, a="ERR", d="diag", model="meta-llama/Llama-2-7b-chat-hf") + # self.assertGreater(score_2, score) + + From 4f44004ea6b7a21a0ee2c86de1883e969942bc9a Mon Sep 17 00:00:00 2001 From: Lucie Date: Thu, 19 Oct 2023 14:27:27 +0200 Subject: [PATCH 3/6] fix(pylint): format code --- .../model/helpers/llm_metrics.py | 73 +++++++++++-------- tests/test_llm_metrics.py | 43 +++++++---- 2 files changed, 73 insertions(+), 43 deletions(-) diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py index acee1fe..3760bf5 100644 --- a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/model/helpers/llm_metrics.py @@ -16,30 +16,34 @@ def __init__(self): and theta are model parameters. GPTScore does not require any reference text. 
""" - self.huggingface_models = ["meta-llama/Llama-2-7b-chat-hf", "gpt2", "mistralai/Mistral-7B-v0.1"] + self.huggingface_models = [ + "meta-llama/Llama-2-7b-chat-hf", + "gpt2", + "mistralai/Mistral-7B-v0.1", + ] self.aspects = [ - "COV", - "FAC", - "FLU", - "CON", - "INF", - "COH", - "REL", - "ACC", - "MQM", - "INT", - "ENG", - "SPE", - "COR", - "SEM", - "UND", - "ERR", - "DIV", - "DEP", - "LIK", - "FLE", - "INQ", - ] + "COV", + "FAC", + "FLU", + "CON", + "INF", + "COH", + "REL", + "ACC", + "MQM", + "INT", + "ENG", + "SPE", + "COR", + "SEM", + "UND", + "ERR", + "DIV", + "DEP", + "LIK", + "FLE", + "INQ", + ] self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"] self.tasks = ["summ", "MT", "D2T", "diag"] @@ -54,7 +58,7 @@ def get_prompt(self, a, d, src, pred): Returns: str: Prompt template. """ - + templates = { "summ": { "FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}", @@ -96,7 +100,6 @@ def get_prompt(self, a, d, src, pred): a ], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template." - return templates[d][a] def compute( @@ -145,15 +148,21 @@ def compute( Returns: list: List of scores for each candidate sentence. """ - assert isinstance(sources, list) and isinstance(sources[0], str), "Source must be a list of strings." - assert isinstance(preds, list) and isinstance(preds[0], str), "Prediction must be a list of strings." + assert isinstance(sources, list) and isinstance( + sources[0], str + ), "Source must be a list of strings." + assert isinstance(preds, list) and isinstance( + preds[0], str + ), "Prediction must be a list of strings." assert isinstance(model, str), "Model must be a string." assert model in self.models, f"Model must be one of {self.models}." # If prompt is given, check that it is a list of string if prompts: - assert isinstance(prompts, list) and isinstance(prompts[0], str), "Prompts must be a list of strings." + assert isinstance(prompts, list) and isinstance( + prompts[0], str + ), "Prompts must be a list of strings." assert not a, "Aspect must not be given if prompt is given." assert not d, "Task must not be given if prompt is given." else: @@ -172,7 +181,13 @@ def compute( assert d in self.tasks, f"Task must be one of {self.tasks}." 
# Generative LLM is given a prompt template and some context information - prompts = prompts if prompts else [self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)] + prompts = ( + prompts + if prompts + else [ + self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds) + ] + ) # Model predicts log-likelihood of the next token given the previous tokens and the prompt template if model in self.huggingface_models: diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py index a8731c8..3eb1e1e 100644 --- a/tests/test_llm_metrics.py +++ b/tests/test_llm_metrics.py @@ -16,38 +16,47 @@ def test_bad_arguments(self): ["The cat sat on the mat."], ["The dog sat on the log."], model="meta-llama/Llama-2-7b-chat-hf", - prompt=10, + prompts=10, ) gptscore.compute( ["The cat sat on the mat."], ["The dog sat on the log."], - prompt="Summarize", + prompts="Summarize", a="ERR", ) gptscore.compute( ["The cat sat on the mat."], ["The dog sat on the log."], - prompt="Summarize", + prompts="Summarize", d="summ", ) gptscore.compute( ["The cat sat on the mat."], ["The dog sat on the log."], - prompt="Summarize", + prompts="Summarize", a="ERR", d="summ", ) gptscore.compute( - ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d=None + ["The cat sat on the mat."], + ["The dog sat on the log."], + a="ERR", + d=None, ) gptscore.compute( - ["The cat sat on the mat."], ["The dog sat on the log."], a=None, d="summ" + ["The cat sat on the mat."], + ["The dog sat on the log."], + a=None, + d="summ", ) gptscore.compute( ["The cat sat on the mat."], ["The dog sat on the log."], a=2, d="summ" ) gptscore.compute( - ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d=None + ["The cat sat on the mat."], + ["The dog sat on the log."], + a="ERR", + d=None, ) gptscore.compute( ["The cat sat on the mat."], @@ -56,13 +65,16 @@ def test_bad_arguments(self): d="summ", ) gptscore.compute( - ["The cat sat on the mat."], ["The dog sat on the log."], a="ERR", d="D2T" + ["The cat sat on the mat."], + ["The dog sat on the log."], + a="ERR", + d="D2T", ) def test_compute_gpt2(self): """Tests that the GPTScore computes a higher score for a better prediction with gpt2.""" sources = ["State something true.", "State something true."] - preds = ["The cat eats elephants.", "The cat eats mice."] + preds = ["The cat eats elephants.", "The cat eats mice."] gptscore = GPTScore() @@ -71,7 +83,10 @@ def test_compute_gpt2(self): self.assertGreater(scores[1], scores[0]) # def test_compute_mistral(self): - # """Tests that the GPTScore computes a higher score for a better prediction with mistralai/Mistral-7B-v0.1.""" + # """ + # Tests that the GPTScore computes a higher score for a better prediction + # with mistralai/Mistral-7B-v0.1. + # """ # source = "State something true." # pred = "The cat eats elephants." # better_pred = "The cat eats mice." @@ -83,9 +98,11 @@ def test_compute_gpt2(self): # score_2 = gptscore.compute(source, better_pred, a="ERR", d="diag", model="mistralai/Mistral-7B-v0.1") # self.assertGreater(score_2, score) - # def test_compute_llama(self): - # """Tests that the GPTScore computes a higher score for a better prediction with meta-llama/Llama-2-7b-chat-hf.""" + # """ + # Tests that the GPTScore computes a higher score for a better prediction + # with meta-llama/Llama-2-7b-chat-hf. + # """ # source = "State something true." # pred = "The cat eats elephants." # better_pred = "The cat eats mice." 
@@ -96,5 +113,3 @@ def test_compute_gpt2(self): # score = gptscore.compute(source, pred, a="ERR", d="diag", model="meta-llama/Llama-2-7b-chat-hf") # score_2 = gptscore.compute(source, better_pred, a="ERR", d="diag", model="meta-llama/Llama-2-7b-chat-hf") # self.assertGreater(score_2, score) - - From 2be80d7d23c944c90529b48c852aa7e55b7d889e Mon Sep 17 00:00:00 2001 From: Lucie Date: Thu, 19 Oct 2023 15:16:51 +0200 Subject: [PATCH 4/6] refactor(structure): refactor file structure for pylint --- .../{model => }/helpers/__init__.py | 0 .../{model => }/helpers/embedding_metrics.py | 9 +- .../{model => }/helpers/language_metrics.py | 3 +- .../{model => }/helpers/llm_metrics.py | 179 +++++++++++------- .../{model => }/helpers/utils.py | 4 +- tests/test_embedding_metrics.py | 2 +- tests/test_helpers.py | 12 +- tests/test_language_metrics.py | 2 +- tests/test_llm_metrics.py | 55 +++--- 9 files changed, 164 insertions(+), 102 deletions(-) rename saga_llm_evaluation_ml/{model => }/helpers/__init__.py (100%) rename saga_llm_evaluation_ml/{model => }/helpers/embedding_metrics.py (90%) rename saga_llm_evaluation_ml/{model => }/helpers/language_metrics.py (91%) rename saga_llm_evaluation_ml/{model => }/helpers/llm_metrics.py (68%) rename saga_llm_evaluation_ml/{model => }/helpers/utils.py (92%) diff --git a/saga_llm_evaluation_ml/model/helpers/__init__.py b/saga_llm_evaluation_ml/helpers/__init__.py similarity index 100% rename from saga_llm_evaluation_ml/model/helpers/__init__.py rename to saga_llm_evaluation_ml/helpers/__init__.py diff --git a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py b/saga_llm_evaluation_ml/helpers/embedding_metrics.py similarity index 90% rename from saga_llm_evaluation_ml/model/helpers/embedding_metrics.py rename to saga_llm_evaluation_ml/helpers/embedding_metrics.py index 6af0c38..fa06db7 100644 --- a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py +++ b/saga_llm_evaluation_ml/helpers/embedding_metrics.py @@ -4,7 +4,8 @@ class BERTScore: def __init__(self, model_type="distilbert-base-uncased"): """ - BERTScore computes a similarity score for each token in the candidate sentence with each token in the reference sentence. + BERTScore computes a similarity score for each token in the candidate sentence with each + token in the reference sentence. The final score is the average of the similarity scores of all tokens in the candidate sentence. Args: @@ -20,7 +21,8 @@ def compute(self, references, predictions, **kwargs): predictions (list): List of candidate sentences. Returns: - list: List of scores for each candidate sentence. Contains a list of scores for precisions, recalls, and F1 scores. + list: List of scores for each candidate sentence. Contains a list of scores + for precisions, recalls, and F1 scores. """ assert len(references) == len( predictions @@ -39,7 +41,8 @@ def compute(self, references, predictions, **kwargs): class MAUVE: def __init__(self, featurize_model_name="gpt2"): """ - MAUVE score computes the difference between the candidate sentence distribution and the reference sentence distribution. + MAUVE score computes the difference between the candidate sentence distribution + and the reference sentence distribution. The bigger the MAUVE score, the better. 
""" self.metric = load("mauve") diff --git a/saga_llm_evaluation_ml/model/helpers/language_metrics.py b/saga_llm_evaluation_ml/helpers/language_metrics.py similarity index 91% rename from saga_llm_evaluation_ml/model/helpers/language_metrics.py rename to saga_llm_evaluation_ml/helpers/language_metrics.py index 490a3f2..0d887d0 100644 --- a/saga_llm_evaluation_ml/model/helpers/language_metrics.py +++ b/saga_llm_evaluation_ml/helpers/language_metrics.py @@ -4,7 +4,8 @@ class BLEURTScore: def __init__(self, checkpoint="BLEURT-tiny"): """ - BLEURT is a learnt metric that uses BERT to compute a similarity score for each token in the candidate sentence with each token in the reference sentence. + BLEURT is a learnt metric that uses BERT to compute a similarity score for + each token in the candidate sentence with each token in the reference sentence. Args: checkpoint (str, optional): Checkpoint to use. Defaults to BLEURT-tiny if not specified. diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/helpers/llm_metrics.py similarity index 68% rename from saga_llm_evaluation_ml/model/helpers/llm_metrics.py rename to saga_llm_evaluation_ml/helpers/llm_metrics.py index 3760bf5..d270915 100644 --- a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/helpers/llm_metrics.py @@ -4,7 +4,7 @@ class GPTScore: - def __init__(self): + def __init__(self, model="gpt2"): """ GPTScore is a metric which allows to evaluate generative models on a variety of tasks. GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{ Date: Tue, 24 Oct 2023 15:55:19 +0200 Subject: [PATCH 5/6] feat(llm_metrics): refactor gptscore and add support for geval and selfcheck --- poetry.lock | 423 +++++++------- pyproject.toml | 2 + saga_llm_evaluation_ml/helpers/llm_metrics.py | 538 +++++++++++++----- tests/test_llm_metrics.py | 246 +++++--- 4 files changed, 752 insertions(+), 457 deletions(-) diff --git a/poetry.lock b/poetry.lock index 24edbde..8934047 100644 --- a/poetry.lock +++ b/poetry.lock @@ -375,101 +375,101 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.3.0" +version = "3.3.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false
"charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:380c4bde80bce25c6e4f77b19386f5ec9db230df9f2f2ac1e5ad7af2caa70459"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0d1e3732768fecb052d90d62b220af62ead5748ac51ef61e7b32c266cac9293"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b2919306936ac6efb3aed1fbf81039f7087ddadb3160882a57ee2ff74fd2382"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f8888e31e3a85943743f8fc15e71536bda1c81d5aa36d014a3c0c44481d7db6e"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:82eb849f085624f6a607538ee7b83a6d8126df6d2f7d3b319cb837b289123078"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7b8b8bf1189b3ba9b8de5c8db4d541b406611a71a955bbbd7385bbc45fcb786c"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5adf257bd58c1b8632046bbe43ee38c04e1038e9d37de9c57a94d6bd6ce5da34"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c350354efb159b8767a6244c166f66e67506e06c8924ed74669b2c70bc8735b1"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-win32.whl", hash = "sha256:02af06682e3590ab952599fbadac535ede5d60d78848e555aa58d0c0abbde786"}, - {file = "charset_normalizer-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:86d1f65ac145e2c9ed71d8ffb1905e9bba3a91ae29ba55b4c46ae6fc31d7c0d4"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3b447982ad46348c02cb90d230b75ac34e9886273df3a93eec0539308a6296d7"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:abf0d9f45ea5fb95051c8bfe43cb40cda383772f7e5023a83cc481ca2604d74e"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b09719a17a2301178fac4470d54b1680b18a5048b481cb8890e1ef820cb80455"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3d9b48ee6e3967b7901c052b670c7dda6deb812c309439adaffdec55c6d7b78"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:edfe077ab09442d4ef3c52cb1f9dab89bff02f4524afc0acf2d46be17dc479f5"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3debd1150027933210c2fc321527c2299118aa929c2f5a0a80ab6953e3bd1908"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f63face3a527284f7bb8a9d4f78988e3c06823f7bea2bd6f0e0e9298ca0403"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24817cb02cbef7cd499f7c9a2735286b4782bd47a5b3516a0e84c50eab44b98e"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c71f16da1ed8949774ef79f4a0260d28b83b3a50c6576f8f4f0288d109777989"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9cf3126b85822c4e53aa28c7ec9869b924d6fcfb76e77a45c44b83d91afd74f9"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b3b2316b25644b23b54a6f6401074cebcecd1244c0b8e80111c9a3f1c8e83d65"}, - {file = 
"charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:03680bb39035fbcffe828eae9c3f8afc0428c91d38e7d61aa992ef7a59fb120e"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cc152c5dd831641e995764f9f0b6589519f6f5123258ccaca8c6d34572fefa8"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-win32.whl", hash = "sha256:b8f3307af845803fb0b060ab76cf6dd3a13adc15b6b451f54281d25911eb92df"}, - {file = "charset_normalizer-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:8eaf82f0eccd1505cf39a45a6bd0a8cf1c70dcfc30dba338207a969d91b965c0"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dc45229747b67ffc441b3de2f3ae5e62877a282ea828a5bdb67883c4ee4a8810"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f4a0033ce9a76e391542c182f0d48d084855b5fcba5010f707c8e8c34663d77"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ada214c6fa40f8d800e575de6b91a40d0548139e5dc457d2ebb61470abf50186"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b1121de0e9d6e6ca08289583d7491e7fcb18a439305b34a30b20d8215922d43c"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1063da2c85b95f2d1a430f1c33b55c9c17ffaf5e612e10aeaad641c55a9e2b9d"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70f1d09c0d7748b73290b29219e854b3207aea922f839437870d8cc2168e31cc"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:250c9eb0f4600361dd80d46112213dff2286231d92d3e52af1e5a6083d10cad9"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:750b446b2ffce1739e8578576092179160f6d26bd5e23eb1789c4d64d5af7dc7"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:fc52b79d83a3fe3a360902d3f5d79073a993597d48114c29485e9431092905d8"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:588245972aca710b5b68802c8cad9edaa98589b1b42ad2b53accd6910dad3545"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e39c7eb31e3f5b1f88caff88bcff1b7f8334975b46f6ac6e9fc725d829bc35d4"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:abecce40dfebbfa6abf8e324e1860092eeca6f7375c8c4e655a8afb61af58f2c"}, - {file = "charset_normalizer-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:24a91a981f185721542a0b7c92e9054b7ab4fea0508a795846bc5b0abf8118d4"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:67b8cc9574bb518ec76dc8e705d4c39ae78bb96237cb533edac149352c1f39fe"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac71b2977fb90c35d41c9453116e283fac47bb9096ad917b8819ca8b943abecd"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3ae38d325b512f63f8da31f826e6cb6c367336f95e418137286ba362925c877e"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:542da1178c1c6af8873e143910e2269add130a299c9106eef2594e15dae5e482"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:30a85aed0b864ac88309b7d94be09f6046c834ef60762a8833b660139cfbad13"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aae32c93e0f64469f74ccc730a7cb21c7610af3a775157e50bbd38f816536b38"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b26ddf78d57f1d143bdf32e820fd8935d36abe8a25eb9ec0b5a71c82eb3895"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f5d10bae5d78e4551b7be7a9b29643a95aded9d0f602aa2ba584f0388e7a557"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:249c6470a2b60935bafd1d1d13cd613f8cd8388d53461c67397ee6a0f5dce741"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c5a74c359b2d47d26cdbbc7845e9662d6b08a1e915eb015d044729e92e7050b7"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:b5bcf60a228acae568e9911f410f9d9e0d43197d030ae5799e20dca8df588287"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:187d18082694a29005ba2944c882344b6748d5be69e3a89bf3cc9d878e548d5a"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:81bf654678e575403736b85ba3a7867e31c2c30a69bc57fe88e3ace52fb17b89"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-win32.whl", hash = "sha256:85a32721ddde63c9df9ebb0d2045b9691d9750cb139c161c80e500d210f5e26e"}, - {file = "charset_normalizer-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:468d2a840567b13a590e67dd276c570f8de00ed767ecc611994c301d0f8c014f"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e0fc42822278451bc13a2e8626cf2218ba570f27856b536e00cfa53099724828"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09c77f964f351a7369cc343911e0df63e762e42bac24cd7d18525961c81754f4"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:12ebea541c44fdc88ccb794a13fe861cc5e35d64ed689513a5c03d05b53b7c82"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:805dfea4ca10411a5296bcc75638017215a93ffb584c9e344731eef0dcfb026a"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96c2b49eb6a72c0e4991d62406e365d87067ca14c1a729a870d22354e6f68115"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf7b34c5bc56b38c931a54f7952f1ff0ae77a2e82496583b247f7c969eb1479"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:619d1c96099be5823db34fe89e2582b336b5b074a7f47f819d6b3a57ff7bdb86"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0ac5e7015a5920cfce654c06618ec40c33e12801711da6b4258af59a8eff00a"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93aa7eef6ee71c629b51ef873991d6911b906d7312c6e8e99790c0f33c576f89"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7966951325782121e67c81299a031f4c115615e68046f79b85856b86ebffc4cd"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:02673e456dc5ab13659f85196c534dc596d4ef260e4d86e856c3b2773ce09843"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:c2af80fb58f0f24b3f3adcb9148e6203fa67dd3f61c4af146ecad033024dde43"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:153e7b6e724761741e0974fc4dcd406d35ba70b92bfe3fedcb497226c93b9da7"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-win32.whl", hash = "sha256:d47ecf253780c90ee181d4d871cd655a789da937454045b17b5798da9393901a"}, - {file = "charset_normalizer-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:d97d85fa63f315a8bdaba2af9a6a686e0eceab77b3089af45133252618e70884"}, - {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"}, + {file = "charset-normalizer-3.3.1.tar.gz", hash = "sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-win32.whl", hash = "sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f"}, + {file = "charset_normalizer-3.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-win32.whl", hash = "sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8"}, + {file = "charset_normalizer-3.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-win32.whl", hash = "sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61"}, + {file = "charset_normalizer-3.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-win32.whl", hash = "sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9"}, + {file = "charset_normalizer-3.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae"}, + {file = 
"charset_normalizer-3.3.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-win32.whl", hash = "sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb"}, + {file = "charset_normalizer-3.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-win32.whl", hash = "sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4"}, + {file = "charset_normalizer-3.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727"}, + {file = "charset_normalizer-3.3.1-py3-none-any.whl", hash = "sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708"}, ] [[package]] @@ -764,6 +764,17 @@ files = [ [package.extras] graph = ["objgraph (>=1.7.2)"] +[[package]] +name = "diskcache" +version = "5.6.3" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, + {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, +] + [[package]] name = "elemeta" version = "1.0.7" @@ -1050,13 +1061,13 @@ files = [ [[package]] name = "fsspec" -version = "2023.9.2" +version = "2023.10.0" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2023.9.2-py3-none-any.whl", hash = "sha256:603dbc52c75b84da501b9b2ec8c11e1f61c25984c4a0dda1f129ef391fbfc9b4"}, - {file = "fsspec-2023.9.2.tar.gz", hash = "sha256:80bfb8c70cc27b2178cc62a935ecf242fc6e8c3fb801f9c571fc01b1e715ba7d"}, + {file = "fsspec-2023.10.0-py3-none-any.whl", hash = "sha256:346a8f024efeb749d2a5fca7ba8854474b1ff9af7c3faaf636a4548781136529"}, + {file = "fsspec-2023.10.0.tar.gz", hash = "sha256:330c66757591df346ad3091a53bd907e15348c2ba17d63fd54f5c39c4457d2a5"}, ] [package.dependencies] @@ -1259,18 +1270,18 @@ numpy = ">=1.17.3" [[package]] name = "huggingface-hub" -version = "0.17.3" +version = "0.18.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.17.3-py3-none-any.whl", hash = "sha256:545eb3665f6ac587add946e73984148f2ea5c7877eac2e845549730570c1933a"}, - {file = "huggingface_hub-0.17.3.tar.gz", hash = "sha256:40439632b211311f788964602bf8b0d9d6b7a2314fba4e8d67b2ce3ecea0e3fd"}, + {file = "huggingface_hub-0.18.0-py3-none-any.whl", hash = "sha256:ee0b6b68acbf6aeb6d083ea081e981c277a1104b82ab67fdf6780ff5396830af"}, + {file = "huggingface_hub-0.18.0.tar.gz", hash = 
"sha256:10eda12b9c1cfa800b4b7c096b3ace8843734c3f28d69d1c243743fb7d7a2e81"}, ] [package.dependencies] filelock = "*" -fsspec = "*" +fsspec = ">=2023.5.0" packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -1621,6 +1632,27 @@ files = [ {file = "lit-17.0.3.tar.gz", hash = "sha256:e6049032462be1e2928686cbd4a6cc5b3c545d83ecd078737fe79412c1f3fcc1"}, ] +[[package]] +name = "llama-cpp-python" +version = "0.2.11" +description = "Python bindings for the llama.cpp library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "llama_cpp_python-0.2.11.tar.gz", hash = "sha256:aae4820bb24aca61800bac771fb735dcc22b08c1374300782ab47eb65743723a"}, +] + +[package.dependencies] +diskcache = ">=5.6.1" +numpy = ">=1.20.0" +typing-extensions = ">=4.5.0" + +[package.extras] +all = ["llama_cpp_python[dev,server,test]"] +dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] +server = ["fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] +test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)"] + [[package]] name = "markdown" version = "3.5" @@ -2005,21 +2037,21 @@ files = [ [[package]] name = "networkx" -version = "3.1" +version = "3.2" description = "Python package for creating and manipulating graphs and networks" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, - {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, + {file = "networkx-3.2-py3-none-any.whl", hash = "sha256:8b25f564bd28f94ac821c58b04ae1a3109e73b001a7d476e4bb0d00d63706bf8"}, + {file = "networkx-3.2.tar.gz", hash = "sha256:bda29edf392d9bfa5602034c767d28549214ec45f620081f0b74dc036a1fbbc1"}, ] [package.extras] -default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] -developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] -doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] -test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "nltk" @@ -3929,117 +3961,56 @@ files = [ [[package]] name = "tokenizers" -version = "0.14.1" -description = "" +version = "0.13.3" +description = "Fast and Customizable Tokenizers" optional = false -python-versions = ">=3.7" +python-versions = "*" files = [ - {file = "tokenizers-0.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:04ec1134a18ede355a05641cdc7700f17280e01f69f2f315769f02f7e295cf1e"}, - {file = "tokenizers-0.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:638abedb39375f0ddce2de536fc9c976639b2d1b7202d715c2e7a25f0ebfd091"}, - {file = 
"tokenizers-0.14.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:901635098565773a44f74068639d265f19deaaca47ea77b428fd9bee13a61d87"}, - {file = "tokenizers-0.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72e95184bf5b9a4c08153ed07c16c130ff174835c9a1e6ee2b311be758c8b3ef"}, - {file = "tokenizers-0.14.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ebefbc26ccff5e96ae7d40772172e7310174f9aa3683d2870a1882313ec3a4d5"}, - {file = "tokenizers-0.14.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3a6330c9f1deda22873e8b4ac849cc06d3ff33d60b3217ac0bb397b541e1509"}, - {file = "tokenizers-0.14.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6cba7483ba45600346a35c466bde32327b108575022f73c35a0f7170b5a71ae2"}, - {file = "tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60fec380778d75cbb492f14ca974f11f37b41d53c057b9c8ba213315b86e1f84"}, - {file = "tokenizers-0.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:930c19b699dd7e1077eac98967adc2fe5f0b104bd96cc1f26778ab82b31ceb24"}, - {file = "tokenizers-0.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a1e30a13376db5329570e09b14c8eb36c017909ed7e88591ca3aa81f3c7d6f32"}, - {file = "tokenizers-0.14.1-cp310-none-win32.whl", hash = "sha256:370b5b86da9bddbe65fa08711f0e8ffdf8b0036558178d1a31dfcb44efcde72a"}, - {file = "tokenizers-0.14.1-cp310-none-win_amd64.whl", hash = "sha256:c2c659f2106b6d154f118ad1b700e68148c46c59b720f04867b1fc5f26a85060"}, - {file = "tokenizers-0.14.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:00df4c5bf25c153b432b98689609b426ae701a44f3d8074dcb619f410bc2a870"}, - {file = "tokenizers-0.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fee553657dcdb7e73df8823c49e8611457ba46e9d7026b7e9c44820c08c327c3"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a480bd902e327dfcaa52b7dd14fdc71e7aa45d73a3d6e41e028a75891d2823cf"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e448b2be0430ab839cf7954715c39d6f34ff6cf2b49393f336283b7a59f485af"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c11444984aecd342f0cf160c3320288edeb1763871fbb560ed466654b2a7016c"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe164a1c72c6be3c5c26753c6c412f81412f4dae0d7d06371e0b396a9cc0fc9"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:72d9967fb1f927542cfb5347207fde01b29f25c9bb8cbc7ced280decfa015983"}, - {file = "tokenizers-0.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37cc955c84ec67c2d11183d372044399342b20a1fa447b7a33040f4889bba318"}, - {file = "tokenizers-0.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:db96cf092d86d4cb543daa9148e299011e0a40770380bb78333b9fd700586fcb"}, - {file = "tokenizers-0.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c84d3cb1349936c2b96ca6175b50f5a9518170bffd76464219ee0ea6022a64a7"}, - {file = "tokenizers-0.14.1-cp311-none-win32.whl", hash = "sha256:8db3a6f3d430ac3dc3793c53fa8e5e665c23ba359484d365a191027ad8b65a30"}, - {file = "tokenizers-0.14.1-cp311-none-win_amd64.whl", hash = "sha256:c65d76052561c60e17cb4fa289885ed00a9995d59e97019fac2138bd45142057"}, - {file = 
"tokenizers-0.14.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:c375161b588982be381c43eb7158c250f430793d0f708ce379a0f196164c6778"}, - {file = "tokenizers-0.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50f03d2330a153a9114c2429061137bd323736059f384de8348d7cb1ca1baa15"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0c8ee283b249c3c3c201c41bc23adc3be2514ae4121eacdb5c5250a461eaa8c6"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9f27399b8d50c5d3f08f0aae961bcc66a1dead1cd0ae9401e4c2a43a623322a"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:89cbeec7e9d5d8773ec4779c64e3cbcbff53d234ca6ad7b1a3736588003bba48"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08e55920b453c30b46d58accc68a38e8e7488d0c03babfdb29c55d3f39dd2052"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91d32bd1056c0e83a0f90e4ffa213c25096b2d8b9f0e2d172a45f138c7d8c081"}, - {file = "tokenizers-0.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44f1748035c36c939848c935715bde41734d9249ab7b844ff9bfbe984be8952c"}, - {file = "tokenizers-0.14.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1ff516d129f01bb7a4aa95bc6aae88e4d86dd63bfc2d57db9302c2624d1be7cb"}, - {file = "tokenizers-0.14.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:acfc8db61c6e919d932448cc7985b85e330c8d745528e12fce6e62d40d268bce"}, - {file = "tokenizers-0.14.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:ba336bc9107acbc1da2ad30967df7b2db93448ca66538ad86aa1fbb91116f631"}, - {file = "tokenizers-0.14.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:f77371b5030e53f8bf92197640af437539e3bba1bc8342b97888c8e26567bfdc"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d72d25c57a9c814240802d188ff0a808b701e2dd2bf1c64721c7088ceeeb1ed7"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:caf0df8657277e32671aa8a4d3cc05f2050ab19d9b49447f2265304168e9032c"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cb3c6bc6e599e46a26ad559ad5dec260ffdf705663cc9b894033d64a69314e86"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8cf2fcdc2368df4317e05571e33810eeed24cd594acc9dfc9788b21dac6b3a8"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f475d5eda41d2ed51ca775a07c80529a923dd759fcff7abf03ccdd83d9f7564e"}, - {file = "tokenizers-0.14.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cce4d1a97a7eb2253b5d3f29f4a478d8c37ba0303ea34024eb9e65506d4209f8"}, - {file = "tokenizers-0.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ff66577ae55114f7d0f6aa0d4d335f27cae96bf245962a745b718ec887bbe7eb"}, - {file = "tokenizers-0.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a687099e085f5162e5b88b3402adb6c2b41046180c015c5075c9504440b6e971"}, - {file = "tokenizers-0.14.1-cp37-none-win32.whl", hash = "sha256:49f5336b82e315a33bef1025d247ca08d95719715b29e33f0e9e8cf15ff1dfb6"}, - {file = "tokenizers-0.14.1-cp37-none-win_amd64.whl", hash = "sha256:117c8da60d1bd95a6df2692926f36de7971baa1d89ff702fae47b6689a4465ad"}, - {file = 
"tokenizers-0.14.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:01d2bd5935642de22a6c6778bb2307f9949cd6eaeeb5c77f9b98f0060b69f0db"}, - {file = "tokenizers-0.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b05ec04132394c20bd6bcb692d557a8eb8ab1bac1646d28e49c67c00907d17c8"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7d9025b185465d9d18679406f6f394850347d5ed2681efc203539d800f36f459"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2539831838ab5393f78a893d7bbf27d5c36e43baf77e91dc9992922b2b97e09d"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec8f46d533092d8e20bc742c47918cbe24b8641dbfbbcb83177c5de3c9d4decb"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8b019c4810903fdea3b230f358b9d27377c0f38454778b607676c9e1b57d14b7"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e8984114fd83ed3913d89526c992395920930c9620a2feee61faf035f41d7b9a"}, - {file = "tokenizers-0.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11284b32f0036fe7ef4b8b00201dda79c00f3fcea173bc0e5c599e09c937ab0f"}, - {file = "tokenizers-0.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:53614f44f36917282a583180e402105bc63d61d1aca067d51cb7f051eb489901"}, - {file = "tokenizers-0.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e3b6082e9532309727273443c8943bb9558d52e36788b246aa278bda7c642116"}, - {file = "tokenizers-0.14.1-cp38-none-win32.whl", hash = "sha256:7560fca3e17a6bc876d20cd825d7721c101fa2b1cd0bfa0abf9a2e781e49b37b"}, - {file = "tokenizers-0.14.1-cp38-none-win_amd64.whl", hash = "sha256:c318a5acb429ca38f632577754235140bbb8c5a27faca1c51b43fbf575596e34"}, - {file = "tokenizers-0.14.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:b886e0f5c72aa4249c609c24b9610a9ca83fd963cbb5066b19302723ea505279"}, - {file = "tokenizers-0.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f522f28c88a0d5b2f9e895cf405dd594cd518e99d61905406aec74d30eb6383b"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5bef76c4d9329913cef2fe79ce1f4dab98f77fa4887e5f0420ffc9386941de32"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59c7df2103052b30b7c76d4fa8251326c9f82689578a912698a127dc1737f43e"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:232445e7b85255ccfe68dfd42185db8a3f3349b34ad7068404856c4a5f67c355"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8e63781da85aa8948864970e529af10abc4084a990d30850c41bbdb5f83eee45"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5760a831c0f3c6d3229b50ef3fafa4c164ec99d7e8c2237fe144e67a9d33b120"}, - {file = "tokenizers-0.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c84b456ff8525ec3ff09762e32ccc27888d036dcd0ba2883e1db491e164dd725"}, - {file = "tokenizers-0.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:463ee5f3afbfec29cbf5652752c9d1032bdad63daf48bb8cb9970064cc81d5f9"}, - {file = "tokenizers-0.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ee6b63aecf929a7bcf885bdc8a8aec96c43bc4442f63fe8c6d48f24fc992b05b"}, - {file = "tokenizers-0.14.1-cp39-none-win32.whl", hash = 
"sha256:aae42798ba1da3bc1572b2048fe42e61dd6bacced2b424cb0f5572c5432f79c2"}, - {file = "tokenizers-0.14.1-cp39-none-win_amd64.whl", hash = "sha256:68c4699147dded6926a3d2c2f948d435d54d027f69909e0ef3c6587933723ed2"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:5f9afdcf701a1aa3c41e0e748c152d2162434d61639a1e5d8523ecf60ae35aea"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:6859d81243cd09854be9054aca3ecab14a2dee5b3c9f6d7ef12061d478ca0c57"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7975178f9478ccedcf613332d5d6f37b67c74ef4e2e47e0c965597506b921f04"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ce2f0ff2e5f12ac5bebaa690606395725239265d7ffa35f35c243a379316297"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7cfc3d42e81cda802f93aa9e92caf79feaa1711426e28ce620560b8aaf5e4d"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:67d3adff654dc7f7c7091dd259b3b847fe119c08d0bda61db91e2ea2b61c38c0"}, - {file = "tokenizers-0.14.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:956729b7dd599020e57133fb95b777e4f81ee069ff0a70e80f6eeac82658972f"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:fe2ea1177146a7ab345ab61e90a490eeea25d5f063e1cb9d4eb1425b169b64d7"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9930f31f603ecc6ea54d5c6dfa299f926ab3e921f72f94babcb02598c32b57c6"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d49567a2754e9991c05c2b5a7e6650b56e24365b7cab504558e58033dcf0edc4"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3678be5db330726f19c1949d8ae1b845a02eeb2a2e1d5a8bb8eaa82087ae25c1"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:42b180ed1bec58ab9bdc65d406577e0c0fb7241b74b8c032846073c7743c9f86"}, - {file = "tokenizers-0.14.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:319e4367596fb0d52be645b3de1616faf0fadaf28507ce1c7595bebd9b4c402c"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:2cda65b689aec63b7c76a77f43a08044fa90bbc6ad9849267cedfee9795913f3"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:ca0bfc79b27d84fcb7fa09339b2ee39077896738d9a30ff99c0332376e985072"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a7093767e070269e22e2c5f845e46510304f124c32d2cd249633c0f27eb29d86"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad759ba39cd32c2c2247864d02c84ea5883b5f6cc6a4ee0c95602a3dde52268f"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26fee36a6d8f2bd9464f3566b95e3e3fb7fd7dad723f775c500aac8204ec98c6"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d091c62cb7abbd32e527a85c41f7c8eb4526a926251891fc4ecbe5f974142ffb"}, - {file = "tokenizers-0.14.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ca304402ea66d58f99c05aa3d7a6052faea61e5a8313b94f6bc36fbf27960e2d"}, - {file = 
"tokenizers-0.14.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:102f118fa9b720b93c3217c1e239ed7bc1ae1e8dbfe9b4983a4f2d7b4ce6f2ec"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:df4f058e96e8b467b7742e5dba7564255cd482d3c1e6cf81f8cb683bb0433340"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:040ee44efc1806900de72b13c1c3036154077d9cde189c9a7e7a50bbbdcbf39f"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7618b84118ae704f7fa23c4a190bd80fc605671841a4427d5ca14b9b8d9ec1a3"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ecdfe9736c4a73343f629586016a137a10faed1a29c6dc699d8ab20c2d3cf64"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:92c34de04fec7f4ff95f7667d4eb085c4e4db46c31ef44c3d35c38df128430da"}, - {file = "tokenizers-0.14.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:628b654ba555b2ba9111c0936d558b14bfc9d5f57b8c323b02fc846036b38b2f"}, - {file = "tokenizers-0.14.1.tar.gz", hash = "sha256:ea3b3f8908a9a5b9d6fc632b5f012ece7240031c44c6d4764809f33736534166"}, + {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, + {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"}, + {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"}, + {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"}, + {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"}, + {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"}, + {file = 
"tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"}, + {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"}, + {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"}, + {file = "tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"}, + {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"}, + {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"}, + {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"}, + {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"}, + {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"}, + {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"}, + {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"}, + {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = "sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"}, + {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, + {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, ] -[package.dependencies] -huggingface_hub = ">=0.16.4,<0.18" - [package.extras] -dev = ["tokenizers[testing]"] -docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] +dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] +docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] [[package]] @@ -4141,39 +4112,39 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.34.0" +version = "4.33.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.34.0-py3-none-any.whl", hash = "sha256:3f0187183a7f22c51ecbbc9eac5145df666c5b86bec6feed10e11f0363f3a1f9"}, - {file = "transformers-4.34.0.tar.gz", hash = "sha256:cc2ae61bfbfaa45337fd9017326669fc60e4f55125f589d50da47819e3d6f504"}, + {file = "transformers-4.33.3-py3-none-any.whl", hash = "sha256:7150bbf6781ddb3338ce7d74f4d6f557e6c236a0a1dd3de57412214caae7fd71"}, + {file = "transformers-4.33.3.tar.gz", hash = "sha256:8ea7c92310dee7c63b14766ce928218f7a9177960b2487ac018c91ae621af03e"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.16.4,<1.0" +huggingface-hub = ">=0.15.1,<1.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" regex = "!=2019.12.17" requests = "*" safetensors = ">=0.3.1" -tokenizers = ">=0.14,<0.15" +tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.14" tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.20.3)"] agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.10,!=1.12.0)"] -all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.15)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] +all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] codecarbon = ["codecarbon (==1.2.0)"] deepspeed = ["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"] deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", 
"beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.15)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.15)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.15)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", 
"unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.15)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] +dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", 
"sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] docs-specific = ["hf-doc-builder"] fairscale = ["fairscale (>0.3)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] @@ -4200,11 +4171,11 @@ tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.15)", tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] timm = ["timm"] -tokenizers = ["tokenizers (>=0.14,<0.15)"] +tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"] torch = ["accelerate (>=0.20.3)", "torch (>=1.10,!=1.12.0)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (<10.0.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.16.4,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.15)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"] video = ["av (==9.2.0)", "decord (==0.6.0)"] vision = ["Pillow (<10.0.0)"] @@ -4680,4 +4651,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.11" -content-hash = "5ce38de044cbc1d3f927898cc730660d9fad1c5f1e27c57b6f8b7caf6f9ba9c1" +content-hash = "a4dba91afe1f1b6f224a6bd9ea1bc73f597217ffa6425884eeab79db52ebedcf" diff --git a/pyproject.toml b/pyproject.toml index 3ab9009..dce0ef5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ tensorflow-macos = {version = "2.14.0", platform = "darwin"} elemeta = "1.0.7" torch = ">=2.0.0, !=2.0.1, !=2.1.0" openai = "^0.28.1" +huggingface-hub = "^0.18.0" +llama-cpp-python = "^0.2.11" [tool.poetry.dev-dependencies] pylint = "^2.13" diff --git a/saga_llm_evaluation_ml/helpers/llm_metrics.py b/saga_llm_evaluation_ml/helpers/llm_metrics.py index d270915..6049fd5 100644 --- a/saga_llm_evaluation_ml/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/helpers/llm_metrics.py @@ -1,31 +1,353 @@ -import openai import numpy as np -from transformers import AutoTokenizer, AutoModelForCausalLM +from huggingface_hub import hf_hub_download +from llama_cpp import Llama -class GPTScore: - def __init__(self, model="gpt2"): +class SelfCheckGPT: 
+    def __init__(
+        self,
+        model,
+        eval_model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF",
+        eval_model_basename="llama-2-7b-chat.Q4_K_M.gguf",
+    ):
         """
-        GPTScore is a metric which allows to evaluate generative models on a variety of tasks.
-        GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{<t}, d, a, S, theta),
-        where w_t is the weight of the t-th token and theta are the model parameters.
-        GPTScore does not require any reference text.
-        """
+        assert n_samples > 0, "Number of samples must be greater than 0."
+        assert question and pred, "Question and prediction must be non-empty."
+
+        # Generate n_samples samples from the model
+        samples = []
+        print("Samples:\n")
+        for _ in range(n_samples):
+            system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."
+            prompt_template = f"""SYSTEM: {system_prompt}
+            USER: {question}
+            ASSISTANT:"""
+
+            response = self.model(prompt_template, max_tokens=200)
+            sample = response["choices"][0]["text"]
+            print(sample, "\n")
+            samples.append(sample)
+        print("\n")
+
+        # For each sample, ask the evaluator model to evaluate the sample
+        prompts = self.get_prompts(pred, samples, question)
+        scores = []
+        print("Prompts:\n")
+        for prompt in prompts:
+            print(prompt, "\n")
+            answer = self.eval_model(prompt, max_tokens=200)["choices"][0]["text"]
+            print(answer, "\n")
+            scores.append(answer)
+        print("\n")
+
+        # Compute the score: how often the sentence is supported by the samples
+        score = np.mean([1 if "yes" in score.lower() else 0 for score in scores])
+
+        return score
+
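+# Usage sketch (illustrative only; compute()'s keyword names follow the asserts
+# above, and `llama_model` stands for any llama_cpp.Llama instance passed as
+# the evaluated model):
+#   selfcheck = SelfCheckGPT(model=llama_model)
+#   score = selfcheck.compute(question="Who wrote Hamlet?",
+#                             pred="William Shakespeare.", n_samples=5)
+#   # a score close to 1.0 means the answer is consistent across resamples
+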
+
+class GEval:
+    def __init__(
+        self,
+        model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF",
+        model_basename="llama-2-7b-chat.Q4_K_M.gguf",
+    ):
+        """
+        This class implements the GEval evaluation metric for generative language models.
+        It is inspired by the GEval metric proposed in https://arxiv.org/pdf/2303.16634.pdf.
+        Args:
+            model_name_or_path (str): Model name or path. Defaults to "TheBloke/Llama-2-7b-Chat-GGUF".
+            model_basename (str): Model basename. Defaults to "llama-2-7b-chat.Q4_K_M.gguf".
+        """
+        assert isinstance(
+            model_name_or_path, str
+        ), "model_name_or_path must be a string."
+        assert isinstance(model_basename, str), "model_basename must be a string."
+
+        self.model_path = hf_hub_download(
+            repo_id=model_name_or_path, filename=model_basename
+        )
+
+        self.lcpp_llm = Llama(
+            model_path=self.model_path,
+            n_threads=2,  # CPU cores
+            logits_all=True,
+            n_ctx=1000,
+        )
+
+        self.tasks = {
+            "summ": "You will be given one summary written for a news article. Your task is to rate the summary on one metric. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.",
+            "diag": "You will be given a conversation between two individuals. You will then be given one potential response for the next turn in the conversation. The response concerns an interesting fact, which will be provided as well. Your task is to rate the responses on one metric. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.",
+        }
+        self.criteria = {
+            "COH": {
+                "name": "Coherence",
+                "prompt": "Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby ”the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.”",
+            },
+            "CON": {
+                "name": "Consistency",
+                "prompt": "Consistency (1-5) - the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts. ",
+            },
+            "ENG": {
+                "name": "Engagingness",
+                "prompt": "Engagingness (1-5) - Is the response dull/interesting? - A score of 1 indicates that the response is dull and uninteresting. A score of 5 indicates that the response is interesting and engaging.",
+            },
+            "FLU": {
+                "name": "Fluency",
+                "prompt": "Fluency (1-5) - the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure. - 1: Poor. The summary is difficult to read and understand. It contains many grammatical errors, spelling mistakes, and/or punctuation errors. - 2: Fair. The summary is somewhat difficult to read and understand. It contains some grammatical errors, spelling mistakes, and/or punctuation errors. - 3: Good. The summary is easy to read and understand. It contains few grammatical errors, spelling mistakes, and/or punctuation errors. - 4: Very Good. The summary is easy to read and understand. It contains no grammatical errors, spelling mistakes, and/or punctuation errors. - 5: Excellent. The summary is easy to read and understand. It contains no grammatical errors, spelling mistakes, and/or punctuation errors.",
+            },
+            "REL": {
+                "name": "Relevance",
+                "prompt": "Relevance (1-5) - selection of important content from the source. The summary should include only important information from the source document. Annotators were instructed to penalize summaries which contained redundancies and excess information.",
+            },
+            "POL": {
+                "name": "Politeness",
+                "prompt": "Politeness (1-5) - the degree to which the response is polite. - 1: Very impolite. The response is very impolite. - 2: Somewhat impolite. The response is somewhat impolite. - 3: Neutral. The response is neutral. - 4: Somewhat polite. The response is somewhat polite. - 5: Very polite. The response is very polite.",
+            },
+        }
+
+    def get_prediction(self, prompt):
+        """
+        This method returns a prediction given a prompt template.
+        Args:
+            prompt (str): Prompt template.
+
+        Returns:
+            response (dict): Response from the model.
+        """
+        response = self.lcpp_llm.create_completion(
+            prompt=prompt,
+            max_tokens=250,
+            temperature=0.5,
+            top_p=0.95,
+            logprobs=5,
+            repeat_penalty=1.2,
+            top_k=50,
+            echo=True,
+        )
+        return response
+
+    def get_cot(self, prompt):
+        """
+        This method returns an auto-generated chain of thought (evaluation steps) given a prompt template.
+        Args:
+            prompt (str): Prompt template.
+
+        Returns:
+            cot (str): Chain of thought.
+        """
+        title = "\nEvaluation steps:\n"
+        cot = self.get_prediction(prompt + title)["choices"][0]["text"]
+        return cot
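+
+    # Note: get_cot() implements G-Eval's "auto chain-of-thought": the evaluator
+    # LLM first writes out its own evaluation steps, which get_prompt() below
+    # prepends to the final scoring prompt.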
+
+    # pylint: disable=consider-iterating-dictionary
+    def get_prompt(self, src, pred, definition, criterion, criterion_name):
+        """
+        Args:
+            src (str): Source text.
+            pred (str): Candidate sentence to evaluate.
+            definition (str): Definition of the task.
+            criterion (str): Evaluation criterion code.
+            criterion_name (str): Evaluation criterion name, used when criterion is not a known code.
+
+        Returns:
+            str: Scoring prompt, including the auto-generated evaluation steps.
+        """
+        definition = (
+            "\n Task definition:\n" + self.tasks[definition]
+            if definition in self.tasks.keys()
+            else definition
+        )
+        crit = (
+            "\n Evaluation criteria:\n" + self.criteria[criterion]["prompt"]
+            if criterion in self.criteria.keys()
+            else criterion
+        )
+        crit_name = (
+            self.criteria[criterion]["name"]
+            if criterion in self.criteria.keys()
+            else criterion_name
+        )
+
+        prompt = f"{definition} {crit}"
+
+        # Chain of thought: intermediate instructions generated by the LLM detailing the evaluation steps
+        auto_cot = self.get_cot(prompt)
+
+        return (
+            prompt
+            + auto_cot
+            + "\n Example:\n Source Text:\n"
+            + src
+            + "\n Generated text:\n"
+            + pred
+            + "\n Evaluation Form (scores ONLY):\n"
+            + crit_name
+            + ": "
+        )
+
+    def get_score(self, prompt):
+        """
+        Args:
+            prompt (str): Prompt template.
+
+        Returns:
+            score (float): Score for the candidate sentence.
+        """
+        response = self.get_prediction(prompt)
+        tokens = response["choices"][0]["logprobs"]["tokens"]
+        top_logprobs = response["choices"][0]["logprobs"]["top_logprobs"]
+
+        # Locate the "Evaluation Form (scores ONLY):" template in the generated tokens
+        template_tokens = [
+            " E",
+            "valu",
+            "ation",
+            " Form",
+            " (",
+            "sc",
+            "ores",
+            " ON",
+            "LY",
+            "):",
         ]
-        self.openai_models = ["gpt-3.5-turbo"]
+        start_index = tokens.index(template_tokens[-1]) + 1
+        # Find the first digit token after the template, i.e. the predicted score
+        for token in tokens[start_index:]:
+            if token.isdigit():
+                number_index = tokens.index(token)
+                break
+
+        # Get logprobs associated with the number
+        logprobs = top_logprobs[number_index]
+
+        # Compute score
+        # Get only keys that are numbers
+        number_keys = [int(key) for key in logprobs.keys() if key.isdigit()]
+        number_logprobs = [logprobs[str(key)] for key in number_keys]
+        number_probs = [np.exp(logprob) for logprob in number_logprobs]
+
+        score = np.sum(np.multiply(number_keys, number_probs)) / len(number_keys)
+
+        return score
+
+    def compute(self, source, pred, definition, criterion, criterion_name=None):
+        """
+        This method computes the GEval score for a candidate sentence given a source text,
+        a task definition, and an evaluation criterion.
+        Args:
+            source (str): Source text.
+            pred (str): Candidate sentence to evaluate.
+            definition (str): Definition of the task.
+            criterion (str): Evaluation criterion code.
+            criterion_name (str, optional): Evaluation criterion name. Defaults to None.
+
+        Returns:
+            score (float): Score for the candidate sentence.
+        """
+        assert isinstance(source, str), "Source must be a string."
+        assert isinstance(pred, str), "Pred must be a string."
+        assert isinstance(definition, str), "Definition must be a string."
+        assert isinstance(criterion, str), "Criterion must be a string."
+        assert criterion_name is None or isinstance(
+            criterion_name, str
+        ), "Criterion name must be a string."
+        assert (
+            criterion in self.criteria.keys() or criterion_name is not None
+        ), "Criterion name must be given if criterion is not in the list of criteria."
+
+        prompt = self.get_prompt(source, pred, definition, criterion, criterion_name)
+        return self.get_score(prompt)
+
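+# Usage sketch (illustrative values; "diag" and "POL" are task and criterion
+# codes defined in GEval.tasks / GEval.criteria above):
+#   geval = GEval()
+#   politeness = geval.compute("Hi how are you?", "I am fine, thanks!", "diag", "POL")
+#   # the score is a probability-weighted combination of the numeric grades
+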
+
+class GPTScore:
+    def __init__(
+        self,
+        model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF",
+        model_basename="llama-2-7b-chat.Q4_K_M.gguf",
+    ):
+        """
+        This class implements the GPTScore evaluation metric for generative language models.
+        It is inspired by the GPTScore metric proposed in https://arxiv.org/pdf/2302.04166.pdf.
+        Args:
+            model_name_or_path (str): Model name or path. Defaults to "TheBloke/Llama-2-7b-Chat-GGUF".
+            model_basename (str): Model basename. Defaults to "llama-2-7b-chat.Q4_K_M.gguf".
+        """
+        assert isinstance(
+            model_name_or_path, str
+        ), "model_name_or_path must be a string."
+        assert isinstance(model_basename, str), "model_basename must be a string."
+
+        self.tasks = ["summ", "MT", "D2T", "diag"]
         self.aspects = [
             "COV",
             "FAC",
@@ -49,65 +371,17 @@ def __init__(self, model="gpt2"):
             "FLE",
             "INQ",
         ]
-        self.models = [
-            "meta-llama/Llama-2-7b-chat-hf",
-            "gpt-3.5-turbo",
-            "gpt2",
-            "mistralai/Mistral-7B-v0.1",
-        ]
-        self.tasks = ["summ", "MT", "D2T", "diag"]
-
-        assert isinstance(model, str), "Model must be a string."
-        assert model in self.models, f"Model must be one of {self.models}."
-
-    def huggingface_logprobs(self, prompts):
-        """
-        This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
-        Args:
-            prompts (list of str): List of prompt templates.
-        Returns:
-            list: List of log-likelihoods for each candidate sentence.
-        """
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model)
-        llm = AutoModelForCausalLM.from_pretrained(self.model)
-        inputs = tokenizer(prompts, return_tensors="pt")
-        outputs = llm.generate(
-            **inputs,
-            max_new_tokens=50,
-            return_dict_in_generate=True,
-            output_scores=True,
+        self.model_path = hf_hub_download(
+            repo_id=model_name_or_path, filename=model_basename
         )
-        logprobs = np.array(
-            llm.compute_transition_scores(
-                outputs.sequences, outputs.scores, normalize_logits=True
-            ).tolist()
+        self.lcpp_llm = Llama(
+            model_path=self.model_path,
+            n_threads=2,  # CPU cores
+            logits_all=True,
         )
-        return logprobs
-
-    def openai_logprobs(self, prompts, api_key):
-        """
-        This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
-        Args:
-            prompts (list of str): List of prompt templates.
-            api_key (str): OpenAI API key.
-        Returns:
-            list: List of log-likelihoods for each candidate sentence.
-        """
-        openai.api_key = api_key
-        outputs = openai.Completion.create(
-            model=self.model,
-            prompt=prompts,
-            logprobs=5,
-        )
-
-        logprobs = outputs["choices"][0]["logprobs"]
-
-        return logprobs
-
     def get_prompts(self, aspect, task, sources, preds):
         """
         This method returns a list of prompt templates given a task description, and an aspect to evaluate.
@@ -181,68 +455,25 @@ def get_prompt(self, aspect, task, src, pred):
 
         return templates[task][aspect]
 
-    def compute(
-        self, sources, preds, prompts=None, aspect=None, task=None, api_key=None
-    ):
+    def compute(self, source, pred, prompt=None, aspect=None, task=None):
         """
-        This method computes GPTScore for a list of candidate sentences given a task description,
-        an aspect to evaluate and context information.
-        The possible values for aspect are:
-        - (COV): Semantic coverage. How many semantic content units from the reference text
-        are covered by the generated text?
-        - (FAC): Factuality. Does the generated text preserve the factual statements of the source text?)
-        - (FLU): Fluency. Is the generated text well-written and grammatical?
-        - (CON): Consistency. Is the generated text consistent in the information it provides?
-        - (INF): Informativeness. How well does the generated text capture the key ideas of its source text?
-        - (COH): Coherence. How much does the generated text make sense?
-        - (REL): Relevance. How well is the generated text relevant to its source text?
-        - (ACC): Accuracy. Are there inaccuracies, missing, or unfactual content in the generated text?
-        - (MQM): Multidimensional MT How is the overall quality of the generated text?
-        - (INT): Interest. Is the generated text interesting?
-        - (ENG): Engagement. Is the generated text engaging?
-        - (SPE): Specific. Is the generated text generic or specific to the source text?
-        - (COR): Correctness. Is the generated text correct or was there a misunderstanding of the source text?
-        - (SEM): Semantically appropriate. Is the generated text semantically appropriate?
-        - (UND): Understandability. Is the generated text understandable?
-        - (ERR): Error Recovery. Is the system able to recover from errors that it makes?
-        - (DIV): Diversity. Is there diversity in the system responses?
-        - (DEP): Depth. Does the system discuss topics in depth?
-        - (LIK): Likeability. Does the system display a likeable personality?
-        - (FLE): Flexibility. Is the system flexible and adaptable to the user and their interests?
-        - (INQ): Inquisitiveness. Is the system inquisitive throughout the conversation?
-
-        Possible tasks are for pre-made prompts are:
-        - (summ): Summarization. Generating an informative and fluent summary for a given long text.
-        - (MT): Machine Translation. Translate a sentence from one language to another.
-        - (D2T): Data to Text. Automatically generate a fluent and factual description for a given table.
-        - (diag): Dialogue. Generate an engaging and informative response based on the dialogue history.
-
+        This method computes the GPTScore for a candidate sentence given a source text
+        and either a custom prompt template or an aspect to evaluate and a task description.
         Args:
-            sources (list of str): Source texts.
-            preds (list of str): Candidate sentences.
-            prompts (str): Prompt template. If None, a default prompt template is used.
-            aspect (list): List of aspects to evaluate.
-            task (str): Task description.
-            api_key (str): OpenAI API key.
-
+            source (str): Source text.
+            pred (str): Candidate sentence.
+            prompt (str, optional): Prompt template. Defaults to None.
+            aspect (str, optional): Aspect to evaluate. Defaults to None.
+            task (str, optional): Task description. Defaults to None.
         Returns:
-            list: List of scores for each candidate sentence.
+            score (float): Score for the candidate sentence.
         """
-        assert isinstance(sources, list) and isinstance(
-            sources[0], str
-        ), "Source must be a list of strings."
-        assert isinstance(preds, list) and isinstance(
-            preds[0], str
-        ), "Prediction must be a list of strings."
-
-        assert isinstance(self.model, str), "Model must be a string."
-        assert self.model in self.models, f"Model must be one of {self.models}."
+        assert isinstance(source, str), "Source must be a string."
+        assert isinstance(pred, str), "Pred must be a string."

-        # If prompt is given, check that it is a list of string
-        if prompts:
-            assert isinstance(prompts, list) and isinstance(
-                prompts[0], str
-            ), "Prompts must be a list of strings."
+        # If a prompt is given, check that it is a string and that no aspect or task is given
+        if prompt:
+            assert isinstance(prompt, str), "Prompt must be a string."
             assert not aspect, "Aspect must not be given if prompt is given."
             assert not task, "Task must not be given if prompt is given."
         else:
@@ -261,20 +492,41 @@ def compute(
             assert task in self.tasks, f"Task must be one of {self.tasks}."
         # Generative LLM is given a prompt template and some context information
-        prompts = prompts or self.get_prompts(aspect, task, sources, preds)
+        # Either wrap the user-provided template around the example, or fall
+        # back to a pre-made template for the given aspect and task
+        prompt = (
+            prompt + "\nQuestion:" + source + "\nAnswer:" + pred + "\n\nEvaluation: "
+            if prompt
+            else self.get_prompt(aspect, task, source, pred)
+        )

-        # Model predicts log-likelihood of the next token given the previous tokens and the prompt template
-        if self.model in self.huggingface_models:
-            logprobs = self.huggingface_logprobs(prompts)
+        response = self.lcpp_llm.create_completion(
+            prompt=prompt,
+            max_tokens=500,
+            temperature=0.5,
+            top_p=0.95,
+            logprobs=1,
+            repeat_penalty=1.2,
+            top_k=50,
+            echo=True,
+        )

-        elif self.model in self.openai_models:
-            logprobs = self.openai_logprobs(prompts, api_key)
+        # Find the position in the token offsets where the prompt ends,
+        # i.e. the first token whose logprob should be scored
+        i = response["choices"][0]["logprobs"]["text_offset"].index(len(prompt) - 1)
+        if i == 0:
+            i = i + 1

-        # Compute GPTScores
-        scores = []
-        for i, pred in enumerate(preds):
-            pred_tokens = pred.split()
-            pred_logprobs = logprobs[i][: len(pred_tokens)]
-            scores.append(np.mean(pred_logprobs))
+        # Negative log-likelihood of the evaluated tokens, ignoring the final token
+        loss = -sum(
+            response["choices"][0]["logprobs"]["token_logprobs"][i:-1]
+        )
+        # Average the loss over the number of scored tokens
+        avg_loss = loss / (
+            len(response["choices"][0]["logprobs"]["text_offset"]) - i - 1
+        )

-        return scores
+        return avg_loss
diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py
index 9561ce1..6b9c6e7 100644
--- a/tests/test_llm_metrics.py
+++ b/tests/test_llm_metrics.py
@@ -1,120 +1,190 @@
 import unittest

-from saga_llm_evaluation_ml.helpers.llm_metrics import GPTScore
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+from saga_llm_evaluation_ml.helpers.llm_metrics import GPTScore, GEval, SelfCheckGPT
+
+
+class TestGEval(unittest.TestCase):
+    def test_init(self):
+        with self.assertRaises(AssertionError):
+            GEval(1, 1)
+            GEval("1", 1)
+            GEval(1, "1")
+
+    def test_bad_arguments(self):
+        geval = GEval()
+
+        source = "Hi how are you"
+        pred = "Im ok"
+        task = "diag"
+        aspect = "ENG"
+
+        with self.assertRaises(AssertionError):
+            geval.compute([source], pred, task, aspect)
+            geval.compute(source, [pred], task, aspect)
+            geval.compute(source, pred, 1, aspect)
+            geval.compute(source, pred, task, 1)
+            geval.compute(source, pred, task, "notvalid")
+            geval.compute(source, pred, "notvalid", aspect)
+            geval.compute(source, pred, task, criterion=None)
+            geval.compute(source, pred, definition=None, criterion=aspect)
+
+    def test_compute(self):
+        geval = GEval()
+
+        source = "Hi how are you?"
+        preds = ["Shut up creep!!!", "I am very good, thank you! And you?"]
+        task = "diag"
+        aspect = "POL"
+
+        scores = {key: 0 for key in preds}
+        for pred in preds:
+            score = geval.compute(source, pred, task, aspect)
+            self.assertTrue(isinstance(score, float))
+            self.assertGreaterEqual(score, 0.0)
+            scores[pred] = score
+
+        self.assertGreaterEqual(
+            scores["I am very good, thank you! And you?"], scores["Shut up creep!!!"]
And you?"], scores["Shut up creep!!!"] + ) + + +class TestSelfCheckGPT(unittest.TestCase): + def test_init(self): + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" # the model is in bin format + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + with self.assertRaises(AssertionError): + SelfCheckGPT(model, eval_model_name_or_path=1, eval_model_basename=1) + SelfCheckGPT(model, eval_model_name_or_path=1, eval_model_basename="1") + SelfCheckGPT(model, eval_model_name_or_path="1", eval_model_basename=1) + + def test_bad_arguments(self): + + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" # the model is in bin format + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + selfcheckgpt = SelfCheckGPT(model) + question = "What is the capital of France?" + pred = "Paris" + n_samples = 1 + + with self.assertRaises(AssertionError): + selfcheckgpt.compute([question], pred, n_samples) + selfcheckgpt.compute(question, [pred], n_samples) + selfcheckgpt.compute(question, pred, "1") + selfcheckgpt.compute(question, pred, 1.0) + selfcheckgpt.compute(question, pred, -1) + selfcheckgpt.compute(question=question, pred=None, n_samples=5) + selfcheckgpt.compute(question=None, pred=pred, n_samples=5) + + def test_compute(self): + model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF" + model_basename = "llama-2-7b-chat.Q4_K_M.gguf" + + model_path = hf_hub_download( + repo_id=model_name_or_path, filename=model_basename + ) + model = Llama(model_path=model_path, n_threads=2, verbose=False) # CPU cores + + selfcheckgpt = SelfCheckGPT(model) + question = "What is the capital of France?" 
+ preds = ["Paris", "dragon"] + n_samples = 6 + + scores = {key: 0 for key in preds} + for pred in preds: + score = selfcheckgpt.compute(question, pred, n_samples) + self.assertTrue(isinstance(score, float)) + self.assertGreaterEqual(score, 0.0) + self.assertLessEqual(score, 1.0) + scores[pred] = score + + self.assertGreaterEqual(scores["Paris"], scores["dragon"]) class TestGPTScore(unittest.TestCase): def test_init(self): with self.assertRaises(AssertionError): - GPTScore(model=100) - GPTScore(model="notvalid") + GPTScore(model_basename=1, model_name_or_path=1) + GPTScore(model_basename="1", model_name_or_path=1) + GPTScore(model_basename=1, model_name_or_path="1") def test_bad_arguments(self): gptscore = GPTScore() with self.assertRaises(AssertionError): + gptscore.compute(["The cat sat on the mat."], ["The dog sat on the log."]) + gptscore.compute("The cat sat on the mat.", ["The dog sat on the log."]) gptscore.compute("The cat sat on the mat.", "The dog sat on the log.") gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - prompts=10, + "The cat sat on the mat.", "The dog sat on the log.", prompt=2 ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - prompts="Summarize", - aspect="ERR", + "The cat sat on the mat.", + "The dog sat on the log.", + prompt="2", + aspect="COV", + task="diag", ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - prompts="Summarize", - task="summ", - ) - gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - prompts="Summarize", - aspect="ERR", - task="summ", - ) - gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect="ERR", - task=None, + "The cat sat on the mat.", + "The dog sat on the log.", + aspect=2, + task="diag", ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect=None, - task="summ", + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="COV", + task=2, ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect=2, - task="summ", + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="COV", + task="notvalid", ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect="ERR", - task=None, + "The cat sat on the mat.", + "The dog sat on the log.", + aspect="notvalid", + task="diag", ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect="notvalid", - task="summ", + "The cat sat on the mat.", "The dog sat on the log.", aspect="COV" ) gptscore.compute( - ["The cat sat on the mat."], - ["The dog sat on the log."], - aspect="ERR", - task="D2T", + "The cat sat on the mat.", "The dog sat on the log.", task="diag" ) - def test_compute_gpt2(self): - """Tests that the GPTScore computes a higher score for a better prediction with gpt2.""" - sources = ["State something true.", "State something true."] - preds = ["The cat eats elephants.", "The cat eats mice."] - + def test_compute(self): gptscore = GPTScore() - # gpt2 - scores = gptscore.compute(sources, preds, aspect="ERR", task="diag") - self.assertGreater(scores[1], scores[0]) - - # def test_compute_mistral(self): - # """ - # Tests that the GPTScore computes a higher score for a better prediction - # with mistralai/Mistral-7B-v0.1. - # """ - # source = "State something true." - # pred = "The cat eats elephants." - # better_pred = "The cat eats mice." 
- - # gptscore = GPTScore() - - # # mistralai/Mistral-7B-v0.1 - # score = gptscore.compute(source, pred, aspect="ERR", task="diag", model="mistralai/Mistral-7B-v0.1") - # score_2 = gptscore.compute(source, better_pred, aspect="ERR", task="diag", model="mistralai/Mistral-7B-v0.1") - # self.assertGreater(score_2, score) - - # def test_compute_llama(self): - # """ - # Tests that the GPTScore computes a higher score for a better prediction - # with meta-llama/Llama-2-7b-chat-hf. - # """ - # source = "State something true." - # pred = "The cat eats elephants." - # better_pred = "The cat eats mice." - - # gptscore = GPTScore() - - # # meta-llama/Llama-2-7b-chat-hf - # score = gptscore.compute(source, pred, aspect="ERR", task="diag", model="meta-llama/Llama-2-7b-chat-hf") - # score_2 = gptscore.compute(source, better_pred, aspect="ERR", task="diag", - # model="meta-llama/Llama-2-7b-chat-hf") - # self.assertGreater(score_2, score) + source = "USER: Hi how are you?" + preds = ["AI: I am very fine! Thanks! And you?", "AI: Shut up creep!!!"] + prompt = "Task: evaluate how polite the AI is in this dialog." + + scores = {key: 0 for key in preds} + for target in preds: + score = gptscore.compute(source, target, prompt) + scores[target] = score + self.assertTrue(isinstance(score, float)) + self.assertGreaterEqual(score, 0.0) + self.assertLessEqual(score, 1.0) + + self.assertGreaterEqual( + scores["AI: I am very fine! Thanks! And you?"], + scores["AI: Shut up creep!!!"], + ) From 7986b7f580e93ef20cdb57d132144f880bb077aa Mon Sep 17 00:00:00 2001 From: Lucie Date: Wed, 25 Oct 2023 09:30:10 +0200 Subject: [PATCH 6/6] fix(format): run pylint and black --- saga_llm_evaluation_ml/helpers/llm_metrics.py | 6 ++++-- tests/test_llm_metrics.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/saga_llm_evaluation_ml/helpers/llm_metrics.py b/saga_llm_evaluation_ml/helpers/llm_metrics.py index 45491d8..39ce77d 100644 --- a/saga_llm_evaluation_ml/helpers/llm_metrics.py +++ b/saga_llm_evaluation_ml/helpers/llm_metrics.py @@ -329,8 +329,8 @@ def compute(self, source, pred, definition, criterion, criterion_name=None): return self.get_score(prompt) - class GPTScore: + # pylint: disable=f-string-without-interpolation def __init__( self, model_name_or_path="TheBloke/Llama-2-7b-Chat-GGUF", @@ -383,7 +383,9 @@ def __init__( } self.tasks = self.templates.keys() - self.aspects = list({aspect for task in self.tasks for aspect in self.templates[task]}) + self.aspects = list( + {aspect for task in self.tasks for aspect in self.templates[task]} + ) self.model_path = hf_hub_download( repo_id=model_name_or_path, filename=model_basename diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py index 8b27aa8..a255f20 100644 --- a/tests/test_llm_metrics.py +++ b/tests/test_llm_metrics.py @@ -173,8 +173,11 @@ def test_compute(self): gptscore = GPTScore() source = "Hi how are you?" - preds = ["I am very fine. Thanks! What about you?", "Shut up creep I don't want to talk to you!!!"] - #prompt = "Task: evaluate how polite this dialog is." + preds = [ + "I am very fine. Thanks! What about you?", + "Shut up creep I don't want to talk to you!!!", + ] + # prompt = "Task: evaluate how polite this dialog is." aspect = "LIK" task = "diag"
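
Note on the scoring step: GEval.get_score turns the model's top logprobs at the
rating position into a probability-weighted rating. That aggregation can be
exercised in isolation; below is a minimal sketch, assuming a toy OpenAI-style
top_logprobs dict with made-up values rather than real model output.

    import numpy as np

    # Hypothetical top_logprobs entry at the rating position: candidate tokens
    # mapped to log-probabilities. Toy values, for illustration only.
    logprobs = {"5": -0.3, "4": -1.8, "3": -3.2, "\n": -4.0}

    # Keep only the integer ratings and convert their logprobs to probabilities
    number_keys = [int(key) for key in logprobs.keys() if key.isdigit()]
    number_probs = [np.exp(logprobs[str(key)]) for key in number_keys]

    # Probability-weighted rating. The patch normalizes by len(number_keys);
    # normalizing by the total probability mass, as done here, instead yields
    # an expected rating expressed on the scale used in the prompt.
    score = float(np.dot(number_keys, number_probs) / np.sum(number_probs))
    print(round(score, 2))  # ~4.74 with the toy values above

Either normalization is a design choice; the variant in the patch keeps scores
comparable across prompts with the same number of rating options.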