diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
index d532c9f..9be7146 100644
--- a/.github/workflows/cd.yaml
+++ b/.github/workflows/cd.yaml
@@ -52,4 +52,4 @@ jobs:
       - name: Push to private PyPI registry
         if: ${{ steps.new_version.outputs.version != steps.current_version.outputs.version }}
-        run: poetry publish --repository sagacify
\ No newline at end of file
+        run: poetry publish
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f561c02..d16e4ab 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -40,11 +40,11 @@ jobs:
       - name: Run formatter
         run: |
-          poetry run black --check saga_llm_evaluation_ml tests
+          poetry run black --check saga_llm_evaluation tests

       - name: Run linter
         run: |
-          poetry run pylint saga_llm_evaluation_ml tests
+          poetry run pylint saga_llm_evaluation tests

       - name: Run tests
         run: |
diff --git a/.pylintrc b/.pylintrc
index 853c5ec..1c61e8c 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -98,7 +98,6 @@ disable=missing-module-docstring,
         fixme,
         unspecified-encoding,
         duplicate-code,
-        no-self-use,
         too-few-public-methods,
         attribute-defined-outside-init
diff --git a/README.md b/README.md
index 097b3f5..4a59fbb 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Each of these metrics uses the [LLAMA model](https://ai.meta.com/llama/) to eval

 ## Installation
 To install the Saga LLM Evaluation ML library, use the following command:
-```pip install saga_llm_evaluation_ml```
+```pip install saga-llm-evaluation```

 Be aware that by default the library will run pytorch on the CPU. If you want to run it on the GPU, you need to install pytorch with GPU support. You can find the instructions [here](https://pytorch.org/get-started/locally/).
@@ -40,7 +40,7 @@ The Scorer is a class that allows you to run multiple metrics at once. The metri

 ```python
-from saga_llm_evaluation_ml import LLMScorer
+from saga_llm_evaluation import LLMScorer
 scorer = LLMScorer(
     metrics = ["bertscore", "mauve", "bleurtscore", "q_squared", "selcheckgpt", "geval", "gptscore"],
     model = transformers.PreTrainedModel, # language model that inherits from transformers.PreTrainedModel which needs to be evaluated. Needed for SelCheck-GPT
@@ -94,7 +94,7 @@ scorer.score(

 ### BERTScore
 ```python
-from saga_llm_evaluation_ml import BERTScore
+from saga_llm_evaluation import BERTScore
 bert_score = BERTScore()
 scores = bert_score.compute(
@@ -105,7 +105,7 @@ scores = bert_score.compute(

 ### MAUVE
 ```python
-from saga_llm_evaluation_ml import MAUVE
+from saga_llm_evaluation import MAUVE
 mauve = MAUVE()
 scores = mauve.compute(
     references=["This is a reference sentence"],
@@ -115,7 +115,7 @@ scores = mauve.compute(

 ### BLEURTScore
 ```python
-from saga_llm_evaluation_ml import BLEURTScore
+from saga_llm_evaluation import BLEURTScore
 bleurt_score = BLEURTScore()
 scores = bleurt_score.compute(
     references=["This is a reference sentence"],
@@ -125,7 +125,7 @@ scores = bleurt_score.compute(

 ### Q-Squared
 ```python
-from saga_llm_evaluation_ml import QSquared
+from saga_llm_evaluation import QSquared
 q_squared = QSquared()
 scores = q_squared.compute(
     knowledges=["This is the text gave to the LLM as knowledge"],
@@ -135,7 +135,7 @@ scores = q_squared.compute(

 ### SelCheck-GPT
 ```python
-from saga_llm_evaluation_ml import SelCheckGPT
+from saga_llm_evaluation import SelCheckGPT
 selcheck_gpt = SelCheckGPT(
     model = transformers.PreTrainedModel, # language model that inherits from transformers.PreTrainedModel which needs to be evaluated.
     eval_model = transformers.PreTrainedModel, # language model that inherits from transformers.PreTrainedModel which is used to evaluate the model.
@@ -148,7 +148,7 @@ scores = selcheck_gpt.compute(

 ### G-Eval
 ```python
-from saga_llm_evaluation_ml import GEval
+from saga_llm_evaluation import GEval
 g_eval = GEval(
     model = transformers.PreTrainedModel, # language model that inherits from transformers.PreTrainedModel which is used to evaluate the model.
 )
@@ -175,7 +175,7 @@ scores = g_eval.compute(

 ### GPT-Score
 ```python
-from saga_llm_evaluation_ml import GPTScore
+from saga_llm_evaluation import GPTScore
 gpt_score = GPTScore(
     model = transformers.PreTrainedModel, # language model that inherits from transformers.PreTrainedModel which is used to evaluate the model.
 )
@@ -206,7 +206,7 @@ You can use a different LLAMA model as evaluator by using the get_llama_model fu
 The full list of quantized LLAMA models that may be used is available [here](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF).

 ```python
-from saga_llm_evaluation_ml import get_llama_model
+from saga_llm_evaluation import get_llama_model

 llama_model = get_llama_model(
     repo_id = "TheBloke/Llama-2-7b-Chat-GGUF",
@@ -219,7 +219,7 @@ You can also download the LLAMA model manually and specify the local path to the
 ```huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q2_K.gguf --local-dir path_to_model_folder --local-dir-use-symlinks False```

 ```python
-from saga_llm_evaluation_ml import get_llama_model
+from saga_llm_evaluation import get_llama_model

 llama_model = get_llama_model(
     model_path = "path_to_model_folder",
diff --git a/pyproject.toml b/pyproject.toml
index 181ac35..5cb8b59 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,12 @@
 [tool.poetry]
-name = "saga_llm_evaluation_ml"
+name = "saga-llm-evaluation"
 version = "0.7.2"
 description = "Versatile Python library designed for evaluating the performance of large language models in Natural Language Processing (NLP) tasks. Developed by Sagacify"
 readme = "README.md"
-authors = ["Leonardo Remondini ", "Lucie Navez "]
+authors = [
+    "Leonardo Remondini ",
+    "Lucie Navez ",
+]

 [tool.poetry.dependencies]
 python = ">=3.9,<3.11"
@@ -13,15 +16,15 @@ spacy = "^3.1.3"
 evaluate = "^0.4.1"
 mauve-text = "^0.3.0"
 bert-score = "^0.3.13"
-bleurt = {git = "https://github.com/google-research/bleurt.git"}
+bleurt = { git = "https://github.com/google-research/bleurt.git" }
 torch = "2.1.1"
-tensorflow = {version = "^2.14", markers = "sys_platform == 'linux'"}
-tensorflow-macos = {version = "^2.14", markers = "sys_platform == 'darwin' and platform_machine == 'arm64'"}
+tensorflow = { version = "^2.14", markers = "sys_platform == 'linux'" }
+tensorflow-macos = { version = "^2.14", markers = "sys_platform == 'darwin' and platform_machine == 'arm64'" }
 elemeta = "1.0.7"
 huggingface-hub = "^0.18.0"
 llama-cpp-python = "^0.2.11"
-en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl"}
-fr-core-news-sm = {url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl"}
+en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl" }
+fr-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl" }
 faiss-cpu = "^1.7.4"

 [tool.poetry.dev-dependencies]
@@ -31,12 +34,6 @@ black = "^22.10.0"
 pytest = "^7.1.3"
 pytest-env = "^0.8.1"

-
-[[tool.poetry.source]]
-name = "sagacify"
-url = "https://pypiserver.sagacify.com/"
-priority = "supplemental"
-
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.2"
 jupyterlab = "^4.0.7"
@@ -46,7 +43,7 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"

 [tool.semantic_release]
-version_variable = "saga_llm_evaluation_ml/__init__.py:__version__"
+version_variable = "saga_llm_evaluation/__init__.py:__version__"
 branch = "master"
 version_toml = "pyproject.toml:tool.poetry.version"
 commit_subject = 'chore(release): Release v{version} [skip ci]'
diff --git a/saga_llm_evaluation_ml/__init__.py b/saga_llm_evaluation/__init__.py
similarity index 100%
rename from saga_llm_evaluation_ml/__init__.py
rename to saga_llm_evaluation/__init__.py
diff --git a/saga_llm_evaluation_ml/helpers/__init__.py b/saga_llm_evaluation/helpers/__init__.py
similarity index 100%
rename from saga_llm_evaluation_ml/helpers/__init__.py
rename to saga_llm_evaluation/helpers/__init__.py
diff --git a/saga_llm_evaluation_ml/helpers/embedding_metrics.py b/saga_llm_evaluation/helpers/embedding_metrics.py
similarity index 100%
rename from saga_llm_evaluation_ml/helpers/embedding_metrics.py
rename to saga_llm_evaluation/helpers/embedding_metrics.py
diff --git a/saga_llm_evaluation_ml/helpers/language_metrics.py b/saga_llm_evaluation/helpers/language_metrics.py
similarity index 98%
rename from saga_llm_evaluation_ml/helpers/language_metrics.py
rename to saga_llm_evaluation/helpers/language_metrics.py
index 1c02e83..eb5b7d0 100644
--- a/saga_llm_evaluation_ml/helpers/language_metrics.py
+++ b/saga_llm_evaluation/helpers/language_metrics.py
@@ -7,8 +7,8 @@
     AutoTokenizer,
 )

-from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore
-from saga_llm_evaluation_ml.helpers.utils import (
+from saga_llm_evaluation.helpers.embedding_metrics import BERTScore
+from saga_llm_evaluation.helpers.utils import (
     INVALID_QUESTION,
     NO_ANS,
     check_list_type,
diff --git a/saga_llm_evaluation_ml/helpers/llm_metrics.py b/saga_llm_evaluation/helpers/llm_metrics.py
similarity index 99%
rename from saga_llm_evaluation_ml/helpers/llm_metrics.py
rename to saga_llm_evaluation/helpers/llm_metrics.py
index 65d85fd..52568dc 100644
--- a/saga_llm_evaluation_ml/helpers/llm_metrics.py
+++ b/saga_llm_evaluation/helpers/llm_metrics.py
@@ -2,7 +2,7 @@
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama

-from saga_llm_evaluation_ml.helpers.utils import check_list_type
+from saga_llm_evaluation.helpers.utils import check_list_type

 # pylint: disable=consider-iterating-dictionary
 # pylint: disable=too-many-locals
diff --git a/saga_llm_evaluation_ml/helpers/utils.py b/saga_llm_evaluation/helpers/utils.py
similarity index 100%
rename from saga_llm_evaluation_ml/helpers/utils.py
rename to saga_llm_evaluation/helpers/utils.py
diff --git a/saga_llm_evaluation_ml/model/__init__.py b/saga_llm_evaluation/resources/__init__.py
similarity index 100%
rename from saga_llm_evaluation_ml/model/__init__.py
rename to saga_llm_evaluation/resources/__init__.py
diff --git a/saga_llm_evaluation_ml/score.py b/saga_llm_evaluation/score.py
similarity index 97%
rename from saga_llm_evaluation_ml/score.py
rename to saga_llm_evaluation/score.py
index 3a04cda..dd4d222 100644
--- a/saga_llm_evaluation_ml/score.py
+++ b/saga_llm_evaluation/score.py
@@ -1,7 +1,7 @@
-from saga_llm_evaluation_ml.helpers.embedding_metrics import MAUVE, BERTScore
-from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore, QSquared
-from saga_llm_evaluation_ml.helpers.llm_metrics import GEval, GPTScore, SelfCheckGPT
-from saga_llm_evaluation_ml.helpers.utils import (
+from saga_llm_evaluation.helpers.embedding_metrics import MAUVE, BERTScore
+from saga_llm_evaluation.helpers.language_metrics import BLEURTScore, QSquared
+from saga_llm_evaluation.helpers.llm_metrics import GEval, GPTScore, SelfCheckGPT
+from saga_llm_evaluation.helpers.utils import (
     MetadataExtractor,
     check_list_type,
     filter_class_input,
@@ -65,7 +65,7 @@ def __init__(
     ) -> None:

         self.config = (
-            config if config else load_json("./saga_llm_evaluation_ml/scorer.json")
+            config if config else load_json("./saga_llm_evaluation/scorer.json")
         )
         assert isinstance(metrics, list), "metrics must be a list."
         assert isinstance(self.config, dict), "config file must be a dict."
diff --git a/saga_llm_evaluation_ml/scorer.json b/saga_llm_evaluation/scorer.json
similarity index 100%
rename from saga_llm_evaluation_ml/scorer.json
rename to saga_llm_evaluation/scorer.json
diff --git a/saga_llm_evaluation_ml/resources/__init__.py b/saga_llm_evaluation_ml/resources/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/__init__.py b/tests/__init__.py
index 6986b32..9e28cd1 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,7 +1,7 @@
 import os
 import sys

-from saga_llm_evaluation_ml.helpers.utils import get_llama_model
+from saga_llm_evaluation.helpers.utils import get_llama_model

 MODULE_ROOT = os.path.abspath("/www/app/src")
 sys.path.append(MODULE_ROOT)
diff --git a/tests/test_embedding_metrics.py b/tests/test_embedding_metrics.py
index d2ac30b..74d99b0 100644
--- a/tests/test_embedding_metrics.py
+++ b/tests/test_embedding_metrics.py
@@ -1,6 +1,6 @@
 import unittest

-from saga_llm_evaluation_ml.helpers.embedding_metrics import MAUVE, BERTScore
+from saga_llm_evaluation.helpers.embedding_metrics import MAUVE, BERTScore


 class TestBERTScore(unittest.TestCase):
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 88259f7..e6b1ba1 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -1,6 +1,6 @@
 import unittest

-from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor
+from saga_llm_evaluation.helpers.utils import MetadataExtractor


 class TestMetadataExtractor(unittest.TestCase):
diff --git a/tests/test_language_metrics.py b/tests/test_language_metrics.py
index 0c73931..b20ebd7 100644
--- a/tests/test_language_metrics.py
+++ b/tests/test_language_metrics.py
@@ -1,6 +1,11 @@
 import unittest

-from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore, QSquared
+import pytest
+
+from saga_llm_evaluation.helpers.language_metrics import BLEURTScore, QSquared
+
+# skip it for github actions, too many resources needed. Test locally
+pytest.skip(allow_module_level=True)


 class TestBLEURTScore(unittest.TestCase):
diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py
index 1ad5636..ce37008 100644
--- a/tests/test_llm_metrics.py
+++ b/tests/test_llm_metrics.py
@@ -2,7 +2,7 @@

 import pytest

-from saga_llm_evaluation_ml.helpers.llm_metrics import GEval, GPTScore, SelfCheckGPT
+from saga_llm_evaluation.helpers.llm_metrics import GEval, GPTScore, SelfCheckGPT
 from tests import LLAMA_MODEL

 # skip it for github actions, too many resources needed. Test locally
diff --git a/tests/test_score.py b/tests/test_score.py
index 751a3e1..a94d441 100644
--- a/tests/test_score.py
+++ b/tests/test_score.py
@@ -2,8 +2,8 @@

 import pytest

-from saga_llm_evaluation_ml.helpers.utils import load_json
-from saga_llm_evaluation_ml.score import LLMScorer
+from saga_llm_evaluation.helpers.utils import load_json
+from saga_llm_evaluation.score import LLMScorer
 from tests import LLAMA_MODEL

 # skip it for github actions, too many resources needed. Test locally
@@ -27,7 +27,7 @@ def test_score_bad_arguments(self):
         knowledge = "You are a cat. You don't like dogs."
         prediction = "I am a cat, I don't like dogs."
         reference = "I am a cat, I don't like dogs, miau."
-        config = load_json("saga_llm_evaluation_ml/scorer.json")
+        config = load_json("saga_llm_evaluation/scorer.json")

         with self.assertRaises(AssertionError):
             self.scorer.score(False, knowledge, prediction, reference, config)
@@ -56,7 +56,7 @@ def test_score(self):
         knowledge = "Example: Eww, I hate dogs."
         prediction = "I am a cat, I don't like dogs."
reference = "I am a cat, I don't like dogs, miau." - config = load_json("saga_llm_evaluation_ml/scorer.json") + config = load_json("saga_llm_evaluation/scorer.json") scores = self.scorer.score( user_prompt=user_prompt,