From 4cca9040764b03eb4a4883cbb8549fc57e4e37f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marlene=20Kre=C3=9F?= Date: Wed, 7 Aug 2024 12:08:51 +0200 Subject: [PATCH] Added task_type parameter for correct model loading (#245) * started adding task_type * started adding quality control tests * fixed integration tests, added separate models for tasks * [CodeBuild] fix saas db naming error * Use batch build for AWS CodeBuild to speed up tests against backends. [CodeBuild] * Build and export SLC before running SaaS integration tests to avoid waiting for the SLC build while the SaaS DB is already running [CodeBuild] * Use itde_config fixture instead itde fiture to avoid starting the itde without need and make db_conn a session fixture [CodeBuild] * Save SaaS Database id in pytest stash to not recreate a SaaS DB for each test. It seems to be a bug that a session scope fixture is called for every test. This might happen because backend is parameterized. [CodeBuild] * Increase DB Mem Size for ITDE to hopefully stabalize onprem tests in CodeBuild [CodeBuild] * Increase VM Size for onprem tests in CodeBuild to hopefully stabalize them [CodeBuild] Co-authored-by: Torsten Kilias --- buildspec.yml | 45 +++--- buildspec_onprem.yml | 26 +++ buildspec_saas.yml | 29 ++++ buildspec_without_db.yml | 25 +++ doc/changes/changes_2.0.0.md | 9 +- doc/user_guide/user_guide.md | 46 +++--- .../templates/model_downloader_udf.jinja.sql | 1 + .../udfs/models/base_model_udf.py | 10 +- .../udfs/models/filling_mask_udf.py | 2 +- .../udfs/models/model_downloader_udf.py | 16 +- ...sequence_classification_single_text_udf.py | 2 +- .../sequence_classification_text_pair_udf.py | 2 +- .../udfs/models/text_generation_udf.py | 2 +- .../udfs/models/token_classification_udf.py | 2 +- .../udfs/models/translation_udf.py | 2 +- .../zero_shot_text_classification_udf.py | 2 +- exasol_transformers_extension/upload_model.py | 38 +++-- ...ion.py => bucketfs_model_specification.py} | 33 ++-- 
.../utils/bucketfs_operations.py | 2 +- ...gingface_hub_bucketfs_model_transfer_sp.py | 1 + .../utils/load_local_model.py | 12 +- .../utils/model_specification.py | 54 ++++++- noxfile.py | 21 ++- poetry.lock | 1 + tests/conftest.py | 15 ++ tests/fixtures/bucketfs_fixture.py | 24 ++- tests/fixtures/database_connection_fixture.py | 78 +++++---- tests/fixtures/language_container_fixture.py | 49 +++--- tests/fixtures/model_fixture.py | 152 +++++++++++++++--- tests/fixtures/script_deployment_fixture.py | 9 +- tests/fixtures/setup_database_fixture.py | 7 +- .../deployment/test_scripts_deployer.py | 3 +- .../deployment/test_scripts_deployer_cli.py | 5 +- .../with_db/test_upload_model.py | 28 ++-- .../with_db/udfs/test_filling_mask_script.py | 22 ++- .../udfs/test_model_downloader_udf_script.py | 11 +- .../test_prediction_with_downloader_udf.py | 19 ++- .../udfs/test_question_answering_script.py | 28 +++- ...uence_classification_single_text_script.py | 27 +++- ...equence_classification_text_pair_script.py | 28 +++- .../udfs/test_text_generation_script.py | 26 ++- .../udfs/test_token_classification_script.py | 22 ++- .../with_db/udfs/test_translation_script.py | 15 +- ...st_zero_shot_text_classification_script.py | 30 +++- .../without_db/udfs/test_filling_mask_udf.py | 10 +- .../udfs/test_model_downloader_udf.py | 12 +- .../udfs/test_question_answering_udf.py | 11 +- ...sequence_classification_single_text_udf.py | 19 ++- ...t_sequence_classification_text_pair_udf.py | 19 ++- .../udfs/test_text_generation_udf.py | 11 +- .../udfs/test_token_classification_udf.py | 18 +-- .../test_zero_shot_text_classification_udf.py | 15 +- .../without_db/utils/test_load_local_model.py | 11 +- ...ot_cached_multiple_model_multiple_batch.py | 4 +- ..._not_cached_single_model_multiple_batch.py | 2 +- ...rediction_multiple_model_multiple_batch.py | 4 +- ..._prediction_single_model_multiple_batch.py | 2 +- ...ngle_subdir_single_model_multiple_batch.py | 4 +- 
...single_subdir_single_model_single_batch.py | 4 +- .../multiple_model_multiple_batch_complete.py | 4 +- ...ultiple_model_multiple_batch_incomplete.py | 4 +- ...ultiple_batch_multiple_models_per_batch.py | 8 +- .../multiple_model_single_batch_complete.py | 4 +- .../multiple_model_single_batch_incomplete.py | 4 +- ...ltiple_topk_single_model_multiple_batch.py | 2 +- ...multiple_topk_single_model_single_batch.py | 2 +- ...iple_subdir_single_model_multiple_batch.py | 4 +- ...ltiple_subdir_single_model_single_batch.py | 4 +- .../single_model_multiple_batch_complete.py | 2 +- .../single_model_multiple_batch_incomplete.py | 2 +- .../single_model_single_batch_complete.py | 2 +- .../single_model_single_batch_incomplete.py | 2 +- ...ngle_topk_multiple_model_multiple_batch.py | 4 +- ...single_topk_multiple_model_single_batch.py | 4 +- tests/unit_tests/udfs/test_base_udf.py | 2 +- .../udfs/test_model_downloader_udf.py | 30 ++-- .../unit_tests/utils/test_load_local_model.py | 11 +- tests/utils/parameters.py | 26 ++- 78 files changed, 881 insertions(+), 367 deletions(-) create mode 100644 buildspec_onprem.yml create mode 100644 buildspec_saas.yml create mode 100644 buildspec_without_db.yml rename exasol_transformers_extension/utils/{current_model_specification.py => bucketfs_model_specification.py} (52%) diff --git a/buildspec.yml b/buildspec.yml index 6595e21a..290779e3 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -1,29 +1,20 @@ version: 0.2 -env: - shell: bash - secrets-manager: - DOCKER_USER: "Dockerhub:User" - DOCKER_PASSWORD: "Dockerhub:AccessToken" - SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST" - SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID" - SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT" - -phases: - install: - runtime-versions: - python: 3.10 - commands: - - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - - - export PATH=$PATH:$HOME/.local/bin - - poetry env use $(command -v "python3.10") - - poetry --version - - poetry install - - 
poetry build - pre_build: - commands: - - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin - build: - commands: - - poetry run nox -s start_database - - poetry run nox -s integration_tests +batch: + fast-fail: false + build-graph: + - identifier: without_db_tests + env: + compute-type: BUILD_GENERAL1_MEDIUM + privileged-mode: true + buildspec: ./buildspec_without_db.yml + - identifier: saas_tests + env: + compute-type: BUILD_GENERAL1_MEDIUM + privileged-mode: true + buildspec: ./buildspec_saas.yml + - identifier: onprem_tests + env: + compute-type: BUILD_GENERAL1_LARGE + privileged-mode: true + buildspec: ./buildspec_onprem.yml diff --git a/buildspec_onprem.yml b/buildspec_onprem.yml new file mode 100644 index 00000000..90a96a92 --- /dev/null +++ b/buildspec_onprem.yml @@ -0,0 +1,26 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s start_database + - poetry run nox -s onprem_integration_tests diff --git a/buildspec_saas.yml b/buildspec_saas.yml new file mode 100644 index 00000000..cf46a37c --- /dev/null +++ b/buildspec_saas.yml @@ -0,0 +1,29 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST" + SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID" + SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL 
https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s export_slc + - poetry run nox -s saas_integration_tests diff --git a/buildspec_without_db.yml b/buildspec_without_db.yml new file mode 100644 index 00000000..6f3a7782 --- /dev/null +++ b/buildspec_without_db.yml @@ -0,0 +1,25 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s without_db_integration_tests diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 09ad262b..70f35458 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -1,9 +1,12 @@ -# Transformers Extension 2.0.0, t.b.d +# Transformers Extension 2.0.0, 2024-08-07 -Code name: +Code name: Fixed model saving, added SaaS support and update to Python 3.10 ## Summary +This release Fixes an error in saving and loading of the model metadata. 
It also adds Exasol SaaS support and +updated the project to Python 3.10 + ### Features @@ -13,6 +16,7 @@ Code name: ### Bugs - #237: Fixed reference to python-extension-common +- #245: Added task_type parameter to fix model saving and loading ### Documentation @@ -27,5 +31,6 @@ Code name: - #217: Refactored PredictionUDFs and LoadLocalModel so that LoadLocalModel constructs the bucketfs model file path - #230: Updated supported python version to >= Python 3.10 - #236: Moved to the PathLike bucketfs interface. +- #218: Changed upload_model_udf to load model from Huggingface ### Security diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index c976750b..8bc67ea6 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -263,14 +263,17 @@ Once you have internet access, invoke the UDF like this: ```sql SELECT TE_MODEL_DOWNLOADER_UDF( model_name, + task_type, sub_dir, bucketfs_conn, token_conn ) + ``` - Parameters: - ```model_name```: The name of the model to use for prediction. You can find the details of the models on the [huggingface models page](https://huggingface.co/models). + - ```task_type```: The name of the task you want to use the model for. - ```sub_dir```: The directory where the model is stored in the BucketFS. - ```bucketfs_conn```: The BucketFS connection name. - ```token_conn```: The connection name containing the token required for @@ -278,14 +281,18 @@ SELECT TE_MODEL_DOWNLOADER_UDF( on how to create a connection object with token information, please check [here](#getting-started). - +"task_type" is a variable for the type of task you plan to use the model for. +Some models can be used for multiple types of tasks, but transformers stores +different metadata depending on the task of the model, which affects how the model +is loaded later. Setting an incorrect task_type, or leaving the task_type empty may affect the model's performance +severely. 
Available task_types are the same as the names of our available UDFs, namely: +`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`, +`translation` and`zero_shot_classification`. + ### 2. Model Uploader Script -You can invoke the python script as below which allows to load the transformer -models from the local filesystem into BucketFS: +You can invoke the python script as below which allows to download the transformer +models from The Hugging Face hub to the local filesystem, and then from there to the BucketFS. - ```buildoutcfg - python -m exasol_transformers_extension.upload_model - ``` #### List of options @@ -309,26 +316,19 @@ Unless stated otherwise in the comments column, the option is required for eithe | model-name | [x] | [x] | | | path-in-bucket | [x] | [x] | Root location in the bucket for all models | | sub-dir | [x] | [x] | Sub-directory where this model should be stored | +| task_type | [x] | [x] | Name of the task you want to use the model for | +| token | [x] | [x] | Huggingface token (needed for private models) | | [no_]use-ssl-cert-validation | [x] | [x] | Optional boolean, defaults to True | -**Note**: The options --local-model-path needs to point to a path which contains the model and its tokenizer. -These should have been saved using transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline) -function to ensure proper loading by the Transformers Extension UDFs. 
-You can download the model using python like this: - -```python - for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: - # download the model and tokenizer from Hugging Face - model = model_factory.from_pretrained(model_name) - # save the downloaded model using the save_pretrained function - model_save_path = - model.save_pretrained(model_save_path) -``` -***Note:*** Hugging Face models consist of two parts, the model and the tokenizer. -Make sure to download and save both into the same save directory so the upload model script uploads them together. -And then upload it using exasol_transformers_extension.upload_model script where ```--local-model-path = ``` - +"task_type" is a variable for the type of task you plan to use the model for. +Some models can be used for multiple types of tasks, but transformers stores +different metadata depending on the task of the model, which affects how the model +is loaded later. Setting an incorrect task_type, or leaving the task_type empty may affect the model's performance +severely. Available task_types are the same as the names of our available UDFs, namely: +`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`, +`translation` and `zero_shot_classification`. + ## Using Prediction UDFs We provide 7 prediction UDFs in this Transformers Extension, each performing an NLP task through the [transformers API](https://huggingface.co/docs/transformers/task_summary). 
diff --git a/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql b/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql index 31e7470b..60e5af05 100644 --- a/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql +++ b/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql @@ -1,5 +1,6 @@ CREATE OR REPLACE {{ language_alias }} SET SCRIPT "TE_MODEL_DOWNLOADER_UDF"( model_name VARCHAR(2000000), + task_type VARCHAR(2000000), sub_dir VARCHAR(2000000), bfs_conn VARCHAR(2000000), token_conn VARCHAR(2000000) diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py index 21b96d60..490c0f1e 100644 --- a/exasol_transformers_extension/udfs/models/base_model_udf.py +++ b/exasol_transformers_extension/udfs/models/base_model_udf.py @@ -10,7 +10,7 @@ from exasol_transformers_extension.deployment import constants from exasol_transformers_extension.utils import device_management, \ bucketfs_operations, dataframe_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -40,13 +40,13 @@ def __init__(self, pipeline: transformers.Pipeline, base_model: ModelFactoryProtocol, tokenizer: ModelFactoryProtocol, - task_name: str): + task_type: str): self.exa = exa self.batch_size = batch_size self.pipeline = pipeline self.base_model = base_model self.tokenizer = tokenizer - self.task_name = task_name + self.task_type = task_type self.device = None self.model_loader = None 
self.last_created_pipeline = None @@ -74,7 +74,7 @@ def create_model_loader(self): self.model_loader = LoadLocalModel(pipeline_factory=self.pipeline, base_model_factory=self.base_model, tokenizer_factory=self.tokenizer, - task_name=self.task_name, + task_type=self.task_type, device=self.device) def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame: @@ -185,7 +185,7 @@ def check_cache(self, model_df: pd.DataFrame) -> None: model_name = model_df["model_name"].iloc[0] bucketfs_conn = model_df["bucketfs_conn"].iloc[0] sub_dir = model_df["sub_dir"].iloc[0] - current_model_specification = CurrentModelSpecification(model_name, bucketfs_conn, sub_dir) + current_model_specification = BucketFSModelSpecification(model_name, self.task_type, bucketfs_conn, sub_dir) if self.model_loader.current_model_specification != current_model_specification: bucketfs_location = \ diff --git a/exasol_transformers_extension/udfs/models/filling_mask_udf.py b/exasol_transformers_extension/udfs/models/filling_mask_udf.py index d7fb1170..30d7c761 100644 --- a/exasol_transformers_extension/udfs/models/filling_mask_udf.py +++ b/exasol_transformers_extension/udfs/models/filling_mask_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForMaskedLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='fill-mask') + tokenizer, task_type='fill-mask') self._mask_token = "" self._desired_fields_in_prediction = ["sequence", "score"] self.new_columns = ["filled_text", "score", "rank", "error_message"] diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py index 8d313289..a021be51 100644 --- a/exasol_transformers_extension/udfs/models/model_downloader_udf.py +++ b/exasol_transformers_extension/udfs/models/model_downloader_udf.py @@ -3,8 +3,8 @@ import transformers from exasol_transformers_extension.utils 
import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import \ - CurrentModelSpecificationFactory +from exasol_transformers_extension.utils.bucketfs_model_specification import \ + BucketFSModelSpecificationFactory from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ HuggingFaceHubBucketFSModelTransferSPFactory @@ -24,13 +24,11 @@ class ModelDownloaderUDF: """ def __init__(self, exa, - base_model_factory: ModelFactoryProtocol = transformers.AutoModel, tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer, huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory = HuggingFaceHubBucketFSModelTransferSPFactory(), - current_model_specification_factory: CurrentModelSpecificationFactory = CurrentModelSpecificationFactory()): + current_model_specification_factory: BucketFSModelSpecificationFactory = BucketFSModelSpecificationFactory()): self._exa = exa - self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory self._huggingface_hub_bucketfs_model_transfer = huggingface_hub_bucketfs_model_transfer self._current_model_specification_factory = current_model_specification_factory @@ -47,9 +45,11 @@ def _download_model(self, ctx) -> Tuple[str, str]: bfs_conn = ctx.bfs_conn # BucketFS connection token_conn = ctx.token_conn # name of token connection current_model_specification = self._current_model_specification_factory.create(ctx.model_name, - bfs_conn, - ctx.sub_dir) # specifies details of Huggingface model + ctx.task_type, + bfs_conn, + ctx.sub_dir) # specifies details of Huggingface model + model_factory = current_model_specification.get_model_factory() # extract token from the connection if token connection name is given. # note that, token is required for private models. 
It doesn't matter # whether there is a token for public model or even what the token is. @@ -72,7 +72,7 @@ def _download_model(self, ctx) -> Tuple[str, str]: model_path=model_path, token=token ) as downloader: - for model in [self._base_model_factory, self._tokenizer_factory]: + for model in [model_factory, self._tokenizer_factory]: downloader.download_from_huggingface_hub(model) # upload model files to BucketFS model_tar_file_path = downloader.upload_to_bucketfs() diff --git a/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py b/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py index 3060d9ae..b5d6b640 100644 --- a/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py +++ b/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py @@ -13,7 +13,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-classification') + tokenizer, task_type='text-classification') self.new_columns = ["label", "score", "error_message"] def extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py b/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py index 393e8dba..71df6e4d 100644 --- a/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py +++ b/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py @@ -13,7 +13,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-classification') + tokenizer, task_type='text-classification') self.new_columns = ["label", "score", "error_message"] def 
extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/text_generation_udf.py b/exasol_transformers_extension/udfs/models/text_generation_udf.py index 081f3122..5455a625 100644 --- a/exasol_transformers_extension/udfs/models/text_generation_udf.py +++ b/exasol_transformers_extension/udfs/models/text_generation_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForCausalLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-generation') + tokenizer, task_type='text-generation') self.new_columns = ["generated_text", "error_message"] def extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/token_classification_udf.py b/exasol_transformers_extension/udfs/models/token_classification_udf.py index 1f2fa3a1..81c6c3d5 100644 --- a/exasol_transformers_extension/udfs/models/token_classification_udf.py +++ b/exasol_transformers_extension/udfs/models/token_classification_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForTokenClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='token-classification') + tokenizer, task_type='token-classification') self._default_aggregation_strategy = 'simple' self._desired_fields_in_prediction = [ "start", "end", "word", "entity", "score"] diff --git a/exasol_transformers_extension/udfs/models/translation_udf.py b/exasol_transformers_extension/udfs/models/translation_udf.py index 4c0d9e04..36a5b470 100644 --- a/exasol_transformers_extension/udfs/models/translation_udf.py +++ b/exasol_transformers_extension/udfs/models/translation_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForSeq2SeqLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='translation') + 
tokenizer, task_type='translation') self._translation_prefix = "translate {src_lang} to {target_lang}: " self.new_columns = ["translation_text", "error_message"] diff --git a/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py b/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py index dba47a51..652a17eb 100644 --- a/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py +++ b/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='zero-shot-classification') + tokenizer, task_type='zero-shot-classification') self._desired_fields_in_prediction = ["labels", "scores"] self.new_columns = ["label", "score", "rank", "error_message"] diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index 9c05dc03..a923a62b 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -2,21 +2,24 @@ from pathlib import Path import click +import transformers from exasol.python_extension_common.deployment.language_container_deployer_cli import ( SECRET_DISPLAY, SecretParams, secret_callback) from exasol_transformers_extension.utils import bucketfs_operations from exasol_transformers_extension.deployment import deployment_utils as utils -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification +from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ + HuggingFaceHubBucketFSModelTransferSP @click.command() @click.option('--model-name', type=str, required=True, help="name of the model") 
+@click.option('--task-type', type=str, required=True) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") -@click.option('--local-model-path', type=click.Path(exists=True, file_okay=True), - required=True, help="local path where model is located") +@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") @click.option('--bucketfs-name', type=str) @click.option('--bucketfs-host', type=str) @click.option('--bucketfs-port', type=int) @@ -42,8 +45,9 @@ @click.option('--use-ssl-cert-validation/--no-use-ssl-cert-validation', type=bool, default=True) def main( model_name: str, + task_type: str, sub_dir: str, - local_model_path: str, + token: str | None, bucketfs_name: str, bucketfs_host: str, bucketfs_port: int, @@ -59,9 +63,13 @@ def main( path_in_bucket: str, use_ssl_cert_validation: bool) -> None: """ - Script for uploading locally saved model files to BucketFS. Files should have been saved locally - using Transformers save_pretrained function. 
This ensures proper loading from the BucketFS later + Downloads model from Huggingface hub and the transfers model to database """ + # create BucketFSModelSpecification for model to be loaded + current_model_spec = BucketFSModelSpecification(model_name, task_type, "", Path(sub_dir)) + # upload the downloaded model files into bucketfs + upload_path = current_model_spec.get_bucketfs_model_save_path() + # create bucketfs location bucketfs_location = bucketfs_operations.create_bucketfs_location( bucketfs_name=bucketfs_name, @@ -79,12 +87,18 @@ def main( path_in_bucket=path_in_bucket, use_ssl_cert_validation=use_ssl_cert_validation) - # create CurrentModelSpecification for model to be loaded - current_model_specs = CurrentModelSpecification(model_name, "", Path(sub_dir)) - # upload the downloaded model files into bucketfs - upload_path = current_model_specs.get_bucketfs_model_save_path() - bucketfs_operations.upload_model_files_to_bucketfs( - local_model_path, upload_path, bucketfs_location) + model_factory = current_model_spec.get_model_factory() + + downloader = HuggingFaceHubBucketFSModelTransferSP(bucketfs_location=bucketfs_location, + model_specification=current_model_spec, + bucketfs_model_path=upload_path, + token=token) + + for model in [model_factory, transformers.AutoTokenizer]: + downloader.download_from_huggingface_hub(model) + # upload model files to BucketFS + model_tar_file_path = downloader.upload_to_bucketfs() + print("Your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) if __name__ == '__main__': diff --git a/exasol_transformers_extension/utils/current_model_specification.py b/exasol_transformers_extension/utils/bucketfs_model_specification.py similarity index 52% rename from exasol_transformers_extension/utils/current_model_specification.py rename to exasol_transformers_extension/utils/bucketfs_model_specification.py index fb1c14bd..8e8130fe 100644 --- 
a/exasol_transformers_extension/utils/current_model_specification.py +++ b/exasol_transformers_extension/utils/bucketfs_model_specification.py @@ -1,22 +1,23 @@ from exasol_transformers_extension.utils.model_specification import ModelSpecification from pathlib import PurePosixPath, Path -class CurrentModelSpecification(ModelSpecification): +class BucketFSModelSpecification(ModelSpecification): """ Class describing a model with additional information about the bucketFS connection and the subdir in the bucketfs the model can be found at. """ def __init__(self, model_name: str, + task_type: str, bucketfs_conn_name: str, sub_dir: Path): - ModelSpecification.__init__(self, model_name) + ModelSpecification.__init__(self, model_name, task_type) self.bucketfs_conn_name = bucketfs_conn_name self.sub_dir = sub_dir def __eq__(self, other): """Overrides the default implementation""" - if isinstance(other, CurrentModelSpecification): + if isinstance(other, BucketFSModelSpecification): return (super().__eq__(other) and self.sub_dir == other.sub_dir and self.bucketfs_conn_name == other.bucketfs_conn_name) @@ -26,24 +27,24 @@ def get_bucketfs_model_save_path(self) -> Path: """ path model is saved at in the bucketfs """ - model_name = self.get_model_specific_path_suffix() - return Path(self.sub_dir, model_name) + model_path_suffix = self.get_model_specific_path_suffix() + return Path(self.sub_dir, model_path_suffix) - -class CurrentModelSpecificationFactory: +class BucketFSModelSpecificationFactory: def create(self, model_name: str, + task_type: str, bucketfs_conn_name: str, sub_dir: Path): - return CurrentModelSpecification(model_name, bucketfs_conn_name, sub_dir) + return BucketFSModelSpecification(model_name, task_type, bucketfs_conn_name, sub_dir) -class CurrentModelSpecificationFromModelSpecs: - def transform(self, - model_specification: ModelSpecification, - bucketfs_conn_name: str, - sub_dir: Path): - return 
CurrentModelSpecification(model_name=model_specification.model_name, - bucketfs_conn_name=bucketfs_conn_name, - sub_dir=sub_dir) +def get_BucketFSModelSpecification_from_model_Specs( + model_specification: ModelSpecification, + bucketfs_conn_name: str, + sub_dir: Path): + return BucketFSModelSpecification(model_name=model_specification.model_name, + task_type=model_specification.task_type, + bucketfs_conn_name=bucketfs_conn_name, + sub_dir=sub_dir) diff --git a/exasol_transformers_extension/utils/bucketfs_operations.py b/exasol_transformers_extension/utils/bucketfs_operations.py index 1e6a4f4b..6ab2671b 100644 --- a/exasol_transformers_extension/utils/bucketfs_operations.py +++ b/exasol_transformers_extension/utils/bucketfs_operations.py @@ -115,6 +115,6 @@ def create_save_pretrained_model_path(_tmpdir_name, model_specification: ModelSp before it is uploaded to the bucketfs """ model_specific_path_suffix = model_specification.get_model_specific_path_suffix() - return Path(_tmpdir_name, "pretrained", model_specific_path_suffix) #todo move to modespecstring eventually? + return Path(_tmpdir_name, "pretrained", model_specific_path_suffix) diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py index 0ae9a9a0..a5369175 100644 --- a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py +++ b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py @@ -58,6 +58,7 @@ def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol): use_auth_token=self._token) model.save_pretrained(self._save_pretrained_model_path) + def upload_to_bucketfs(self) -> Path: """ Upload the downloaded models into the BucketFS. 
diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py index 211ed44b..6ccd9dba 100644 --- a/exasol_transformers_extension/utils/load_local_model.py +++ b/exasol_transformers_extension/utils/load_local_model.py @@ -3,7 +3,7 @@ from typing import Optional from pathlib import Path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils import bucketfs_operations from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -13,20 +13,20 @@ class LoadLocalModel: Class for loading locally saved models and tokenizers. Also stores information regarding the model and pipeline. :_pipeline_factory: a function to create a transformers pipeline - :task_name: name of the current task + :task_type: name of the current task :device: device to be used for pipeline creation, i.e "CPU" :_base_model_factory: a ModelFactoryProtocol for creating the loaded model :_tokenizer_factory: a ModelFactoryProtocol for creating the loaded tokenizer """ def __init__(self, pipeline_factory, - task_name: str, + task_type: str, device: str, base_model_factory: ModelFactoryProtocol, tokenizer_factory: ModelFactoryProtocol ): self.pipeline_factory = pipeline_factory - self.task_name = task_name + self.task_type = task_type self.device = device self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory @@ -40,7 +40,7 @@ def current_model_specification(self): """Get the current current_model_specification.""" return self._current_model_specification - def set_current_model_specification(self, current_model_specification: CurrentModelSpecification): + def set_current_model_specification(self, 
current_model_specification: BucketFSModelSpecification): """Set the current_model_specification.""" self._current_model_specification = current_model_specification @@ -57,7 +57,7 @@ def load_models(self) -> transformers.pipelines.Pipeline: loaded_tokenizer = self._tokenizer_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) last_created_pipeline = self.pipeline_factory( - self.task_name, + self.task_type, model=loaded_model, tokenizer=loaded_tokenizer, device=self.device, diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py index d8db8b11..a1245abd 100644 --- a/exasol_transformers_extension/utils/model_specification.py +++ b/exasol_transformers_extension/utils/model_specification.py @@ -1,25 +1,67 @@ from pathlib import PurePosixPath, Path +import transformers + class ModelSpecification: """ Class describing a model. """ - def __init__(self, model_name: str): + def __init__(self, model_name: str, task_type: str): # task_type, model_version self.model_name = model_name + self.task_type = self._set_task_type_from_udf_name(task_type) - def get_model_specs_for_download(self): + def _set_task_type_from_udf_name(self, text): """ - returns all attributes necessary for downloading the model from Huggingface. 
+ switches user input(matching udf name) to transformers task types """ - return self.model_name + if text == "filling_mask": + task_type = "fill-mask" + elif text == "question_answering": + task_type = "question-answering" + elif text == "sequence_classification": + task_type = "text-classification" + elif text == "text_generation": + task_type = "text-generation" + elif text == "token_classification": + task_type = "token-classification" + elif text == "translation": + task_type = "translation" + elif text == "zero_shot_classification": + task_type = "zero-shot-classification" + else: + task_type = text + return task_type def __eq__(self, other): """Overrides the default implementation""" if isinstance(other, ModelSpecification): - return self.model_name == other.model_name + return (self.model_name == other.model_name + and self.task_type == other.task_type) return False def get_model_specific_path_suffix(self) -> PurePosixPath: - return PurePosixPath(self.model_name) #model_name-version-task + return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# + def get_model_factory(self): + """ + sets model factory depending on the task_type of the specific model + """ + model_task_type = self.task_type + if model_task_type == "fill-mask": + model_factory = transformers.AutoModelForMaskedLM + elif model_task_type == "translation": + model_factory = transformers.AutoModelForSeq2SeqLM + elif model_task_type == "zero-shot-classification": + model_factory = transformers.AutoModelForSequenceClassification + elif model_task_type == "text-classification": + model_factory = transformers.AutoModelForSequenceClassification + elif model_task_type == "question-answering": + model_factory = transformers.AutoModelForQuestionAnswering + elif model_task_type == "text-generation": + model_factory = transformers.AutoModelForCausalLM + elif model_task_type == "token-classification": + model_factory = 
transformers.AutoModelForTokenClassification + else: + model_factory = transformers.AutoModel + return model_factory diff --git a/noxfile.py b/noxfile.py index 2b7d644a..8cd86546 100644 --- a/noxfile.py +++ b/noxfile.py @@ -39,8 +39,25 @@ def unit_tests(session): def integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. - session.run('pytest', '--itde-db-version=external', 'tests/integration_tests') + session.run('pytest', '--setup-show', '-s', '--itde-db-version=external', 'tests/integration_tests') +@nox.session(python=False) +def saas_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. + session.run('pytest', '--setup-show', '-s', '--backend=saas', 'tests/integration_tests/with_db') + +@nox.session(python=False) +def onprem_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. + session.run('pytest', '--setup-show', '-s', '--backend=onprem', '--itde-db-version=external', 'tests/integration_tests/with_db') + +@nox.session(python=False) +def without_db_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. 
+ session.run('pytest', '--setup-show', '-s', '--itde-db-version=external', 'tests/integration_tests/without_db') @nox.session(python=False) def start_database(session): @@ -48,5 +65,5 @@ def start_database(session): '--environment-name', 'test', '--database-port-forward', '8888', '--bucketfs-port-forward', '6666', - '--db-mem-size', '4GB', + '--db-mem-size', '8GB', '--nameserver', '8.8.8.8') diff --git a/poetry.lock b/poetry.lock index c9bd3e95..f65d46fd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1749,6 +1749,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/tests/conftest.py b/tests/conftest.py index 1432f627..ab546df3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,3 +7,18 @@ "tests.fixtures.model_fixture", "tests.fixtures.script_deployment_fixture" ] + +_BACKEND_OPTION = '--backend' + +def pytest_addoption(parser): + parser.addoption( + _BACKEND_OPTION, + action="append", + default=[], + help=f"""List of test backends (onprem, saas). 
By default, the tests will be + run on both backends. To select only one of the backends add the + argument {_BACKEND_OPTION}= to the command line. Both + backends can be selected like ... {_BACKEND_OPTION}=onprem {_BACKEND_OPTION}=saas, + but this is the same as the default. + """, + ) diff --git a/tests/fixtures/bucketfs_fixture.py b/tests/fixtures/bucketfs_fixture.py index 2190c8c2..316217b4 100644 --- a/tests/fixtures/bucketfs_fixture.py +++ b/tests/fixtures/bucketfs_fixture.py @@ -1,24 +1,24 @@ from __future__ import annotations -import pytest -from pytest_itde.config import TestConfig import exasol.bucketfs as bfs +import pytest +import pytest_itde from exasol_transformers_extension.utils.bucketfs_operations import create_bucketfs_location +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params @pytest.fixture(scope="session") def bucketfs_location_onprem(backend, - itde: TestConfig) -> bfs.path.PathLike | None: - - if backend == bfs.path.StorageBackend.onprem: + bucketfs_config: pytest_itde.config.BucketFs) -> bfs.path.PathLike | None: + if backend == BACKEND_ONPREM: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, bucketfs_name=bucketfs_params.name, - bucketfs_url=itde.bucketfs.url, - bucketfs_user=itde.bucketfs.username, - bucketfs_password=itde.bucketfs.password, + bucketfs_url=bucketfs_config.url, + bucketfs_user=bucketfs_config.username, + bucketfs_password=bucketfs_config.password, bucket=bucketfs_params.bucket) return None @@ -29,8 +29,7 @@ def bucketfs_location_saas(backend, saas_account_id, saas_database_id, saas_token) -> bfs.path.PathLike | None: - - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, saas_url=saas_url, @@ -44,11 +43,10 @@ def bucketfs_location_saas(backend, def bucketfs_location(backend, bucketfs_location_onprem, 
bucketfs_location_saas) -> bfs.path.PathLike: - - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: assert bucketfs_location_onprem is not None return bucketfs_location_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: assert bucketfs_location_saas is not None return bucketfs_location_saas else: diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index c6882cbd..a734ade7 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -7,6 +7,7 @@ import pyexasol import pytest import exasol.bucketfs as bfs +from _pytest.fixtures import FixtureRequest from exasol.saas.client.api_access import ( OpenApiAccess, create_saas_client, @@ -15,6 +16,8 @@ ) from pytest_itde import config +CURRENT_SAAS_DATABASE_ID = pytest.StashKey[str]() + def _env(var: str) -> str: result = os.environ.get(var) @@ -23,45 +26,57 @@ def _env(var: str) -> str: raise RuntimeError(f"Environment variable {var} is empty.") -@pytest.fixture(scope='session', params=[bfs.path.StorageBackend.onprem, bfs.path.StorageBackend.saas]) -def backend(request) -> bfs.path.StorageBackend: - # Here we are going to add - # pytest.skip() - # if there is an instruction to skip a particular backed in the command line or an envar. 
+_BACKEND_OPTION = '--backend' +BACKEND_ONPREM = 'onprem' +BACKEND_SAAS = 'saas' + + +@pytest.fixture(scope='session', params=[BACKEND_ONPREM, BACKEND_SAAS]) +def backend(request) -> str: + backend_options = request.config.getoption(_BACKEND_OPTION) + if backend_options and (request.param not in backend_options): + pytest.skip() return request.param @pytest.fixture(scope="session") -def saas_url() -> str: - return _env("SAAS_HOST") +def saas_url(backend) -> str: + if backend == BACKEND_SAAS: + return _env("SAAS_HOST") @pytest.fixture(scope="session") -def saas_account_id() -> str: - return _env("SAAS_ACCOUNT_ID") +def saas_account_id(backend) -> str: + if backend == BACKEND_SAAS: + return _env("SAAS_ACCOUNT_ID") @pytest.fixture(scope="session") -def saas_token() -> str: - return _env("SAAS_PAT") +def saas_token(backend) -> str: + if backend == BACKEND_SAAS: + return _env("SAAS_PAT") + @pytest.fixture(scope="session") -def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: - - if backend == bfs.path.StorageBackend.saas: - with ExitStack() as stack: - # Create and configure the SaaS client. - client = create_saas_client(host=saas_url, pat=saas_token) - api_access = OpenApiAccess(client=client, account_id=saas_account_id) - stack.enter_context(api_access.allowed_ip()) - - # Create a temporary database and waite till it becomes operational - db = stack.enter_context(api_access.database( - name=timestamp_name('SME_CI'), - idle_time=timedelta(hours=12))) - api_access.wait_until_running(db.id) - yield db.id +def saas_database_id(request: FixtureRequest, backend, saas_url, saas_account_id, saas_token) -> str: + if backend == BACKEND_SAAS: + if CURRENT_SAAS_DATABASE_ID not in request.session.stash: + with ExitStack() as stack: + # Create and configure the SaaS client. 
+ client = create_saas_client(host=saas_url, pat=saas_token) + api_access = OpenApiAccess(client=client, account_id=saas_account_id) + stack.enter_context(api_access.allowed_ip()) + + # Create a temporary database and waite till it becomes operational + db = stack.enter_context(api_access.database( + name=timestamp_name('TE_CI'), + idle_time=timedelta(hours=12))) + api_access.wait_until_running(db.id) + request.session.stash[CURRENT_SAAS_DATABASE_ID] = db.id + yield db.id + else: + yield request.session.stash[CURRENT_SAAS_DATABASE_ID] else: yield '' @@ -70,8 +85,7 @@ def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: def pyexasol_connection_onprem(backend, connection_factory, exasol_config: config.Exasol) -> pyexasol.ExaConnection | None: - - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: with connection_factory(exasol_config) as conn: yield conn else: @@ -84,8 +98,7 @@ def pyexasol_connection_saas(backend, saas_account_id, saas_database_id, saas_token) -> pyexasol.ExaConnection | None: - - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: # Create a connection to the database. 
conn_params = get_connection_params(host=saas_url, account_id=saas_account_id, @@ -104,11 +117,10 @@ def pyexasol_connection_saas(backend, def pyexasol_connection(backend, pyexasol_connection_onprem, pyexasol_connection_saas) -> pyexasol.ExaConnection: - - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: assert pyexasol_connection_onprem is not None yield pyexasol_connection_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: assert pyexasol_connection_saas is not None yield pyexasol_connection_saas else: diff --git a/tests/fixtures/language_container_fixture.py b/tests/fixtures/language_container_fixture.py index 9ac044b6..5f49eda1 100644 --- a/tests/fixtures/language_container_fixture.py +++ b/tests/fixtures/language_container_fixture.py @@ -2,17 +2,23 @@ import subprocess from pathlib import Path import time +from typing import Dict import pytest +from _pytest.fixtures import FixtureRequest from exasol_script_languages_container_tool.lib.tasks.export.export_info import ExportInfo from exasol.python_extension_common.deployment.language_container_deployer import LanguageContainerDeployer import exasol.bucketfs as bfs from exasol_transformers_extension.deployment import language_container +from tests.fixtures.database_connection_fixture import BACKEND_SAAS LANGUAGE_ALIAS = "PYTHON3_TE" CONTAINER_FILE_NAME = "exasol_transformers_extension_container.tar.gz" +SLC_EXPORT = pytest.StashKey[ExportInfo]() +SLC_UPLOADED = pytest.StashKey[Dict[str, bool]]() + @pytest.fixture(scope="session") def flavor_path() -> Path: @@ -20,31 +26,38 @@ def flavor_path() -> Path: @pytest.fixture(scope="session") -def export_slc(flavor_path: Path) -> ExportInfo: - language_container.prepare_flavor(flavor_path=flavor_path) - export_result = language_container.export(flavor_path=flavor_path) - export_info = export_result.export_infos[str(flavor_path)]["release"] - return export_info +def export_slc(request: FixtureRequest, 
flavor_path: Path) -> ExportInfo: + if SLC_EXPORT not in request.session.stash: + language_container.prepare_flavor(flavor_path=flavor_path) + export_result = language_container.export(flavor_path=flavor_path) + export_info = export_result.export_infos[str(flavor_path)]["release"] + request.session.stash[SLC_EXPORT] = export_info + return request.session.stash[SLC_EXPORT] @pytest.fixture(scope="session") -def upload_slc(backend, bucketfs_location, pyexasol_connection, export_slc: ExportInfo) -> None: - cleanup_images() +def upload_slc(request: FixtureRequest, backend, bucketfs_location, pyexasol_connection, + export_slc: ExportInfo) -> None: + if SLC_UPLOADED not in request.session.stash: + request.session.stash[SLC_UPLOADED] = dict() + if backend not in request.session.stash[SLC_UPLOADED]: + cleanup_images() - container_file_path = Path(export_slc.cache_file) + container_file_path = Path(export_slc.cache_file) - deployer = LanguageContainerDeployer(pyexasol_connection=pyexasol_connection, - language_alias=LANGUAGE_ALIAS, - bucketfs_path=bucketfs_location) + deployer = LanguageContainerDeployer(pyexasol_connection=pyexasol_connection, + language_alias=LANGUAGE_ALIAS, + bucketfs_path=bucketfs_location) - deployer.run(container_file=container_file_path, - bucket_file_path=CONTAINER_FILE_NAME, - allow_override=True, - wait_for_completion=True) + deployer.run(container_file=container_file_path, + bucket_file_path=CONTAINER_FILE_NAME, + allow_override=True, + wait_for_completion=True) - # Let's see if this helps - if backend == bfs.path.StorageBackend.saas: - time.sleep(300) + # Let's see if this helps + if backend == BACKEND_SAAS: + time.sleep(300) + request.session.stash[SLC_UPLOADED][backend] = True def cleanup_images(): diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index dd1b3d0b..2f6bcb0d 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -7,8 +7,8 @@ import exasol.bucketfs as bfs -from 
exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \ - CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification, \ + get_BucketFSModelSpecification_from_model_Specs from exasol_transformers_extension.utils.model_specification import ModelSpecification from tests.utils import postprocessing from tests.utils.parameters import model_params @@ -21,9 +21,10 @@ def download_model_to_standard_local_save_path(model_specification: ModelSpecifi local_model_save_path = bucketfs_operations.create_save_pretrained_model_path(tmpdir_name, model_specification) model_name = model_specification.model_name - for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: - model = model_factory.from_pretrained(model_name, cache_dir=tmpdir_name / "cache" / model_name) - model.save_pretrained(local_model_save_path) + model_factory = model_specification.get_model_factory() + for model in [model_factory, transformers.AutoTokenizer]: + downloaded_model = model.from_pretrained(model_name, cache_dir=tmpdir_name / "cache" / model_name) + downloaded_model.save_pretrained(local_model_save_path) return local_model_save_path @@ -39,7 +40,7 @@ def download_model_to_path(model_specification: ModelSpecification, @contextmanager def upload_model(bucketfs_location: bfs.path.PathLike, - current_model_specification: CurrentModelSpecification, + current_model_specification: BucketFSModelSpecification, model_dir: Path) -> Path: model_path = current_model_specification.get_bucketfs_model_save_path() bucketfs_operations.upload_model_files_to_bucketfs( @@ -52,20 +53,63 @@ def upload_model(bucketfs_location: bfs.path.PathLike, def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, tmpdir_factory): - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", - model_params.sub_dir) - tmpdir = 
tmpdir_factory.mktemp(current_model_specs.get_model_specific_path_suffix()) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, + "", + model_params.sub_dir) + + tmpdir = tmpdir_factory.mktemp(current_model_specs.task_type) model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path() bucketfs_path_for_model = tmpdir / model_path_in_bucketfs download_model_to_path(current_model_specs, bucketfs_path_for_model) return tmpdir - @pytest.fixture(scope="session") -def prepare_base_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: +def prepare_filling_mask_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: model_specification = model_params.base_model_specs + model_specification.task_type = "fill-mask" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_question_answering_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.q_a_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_sequence_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.sequence_class_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_sequence_classification_pair_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.sequence_class_pair_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_text_generation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.text_gen_model_specs + bucketfs_path = 
prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_token_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.token_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_translation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.seq2seq_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_zero_shot_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.zero_shot_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @@ -80,14 +124,14 @@ def prepare_seq2seq_model_in_local_bucketfs(tmpdir_factory) -> PurePosixPath: @contextmanager def upload_model_to_bucketfs( model_specification: ModelSpecification, - download_tmpdir: Path, + local_model_save_path: Path, bucketfs_location: bfs.path.PathLike) -> str: - download_tmpdir = download_model_to_standard_local_save_path(model_specification, download_tmpdir) - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", - model_params.sub_dir) + local_model_save_path = download_model_to_standard_local_save_path(model_specification, local_model_save_path) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, + "", + model_params.sub_dir) with upload_model( - bucketfs_location, current_model_specs, download_tmpdir) as model_path: + bucketfs_location, current_model_specs, local_model_save_path) as model_path: try: yield model_path finally: @@ -95,20 +139,86 @@ def upload_model_to_bucketfs( 
@pytest.fixture(scope="session") -def upload_base_model_to_bucketfs( +def upload_filling_mask_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: base_model_specs = model_params.base_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs.task_type = "fill-mask" + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path +@pytest.fixture(scope="session") +def upload_question_answering_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.q_a_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_sequence_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.sequence_class_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_sequence_classification_pair_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.sequence_class_pair_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_text_generation_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.text_gen_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + 
+@pytest.fixture(scope="session") +def upload_token_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.token_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_translation_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.seq2seq_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_zero_shot_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + model_specs = model_params.zero_shot_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) + with upload_model_to_bucketfs( + model_specs, tmpdir, bucketfs_location) as path: + yield path + + + @pytest.fixture(scope="session") def upload_seq2seq_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: model_specification = model_params.seq2seq_model_specs - tmpdir = tmpdir_factory.mktemp(model_specification.get_model_specific_path_suffix()) + tmpdir = tmpdir_factory.mktemp(model_specification.task_type) with upload_model_to_bucketfs( model_specification, tmpdir, bucketfs_location) as path: yield path diff --git a/tests/fixtures/script_deployment_fixture.py b/tests/fixtures/script_deployment_fixture.py index 83646120..5e8be0ef 100644 --- a/tests/fixtures/script_deployment_fixture.py +++ b/tests/fixtures/script_deployment_fixture.py @@ -6,6 +6,7 @@ from pytest_itde import config import exasol.bucketfs as bfs +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params @@ -48,9 
+49,9 @@ def deploy_params_saas(saas_url, saas_account_id, saas_database_id, saas_token) def deploy_params(backend, deploy_params_onprem, deploy_params_saas) -> dict[str, Any]: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: yield deploy_params_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: yield deploy_params_saas else: raise ValueError(f'No deploy_params fixture for the backend {backend}') @@ -60,9 +61,9 @@ def deploy_params(backend, def upload_params(backend, upload_params_onprem, deploy_params_saas) -> dict[str, Any]: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: yield upload_params_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: yield deploy_params_saas else: raise ValueError(f'No deploy_params fixture for the backend {backend}') diff --git a/tests/fixtures/setup_database_fixture.py b/tests/fixtures/setup_database_fixture.py index beb21124..59c5be2a 100644 --- a/tests/fixtures/setup_database_fixture.py +++ b/tests/fixtures/setup_database_fixture.py @@ -11,6 +11,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS @@ -94,9 +95,9 @@ def setup_database(backend: bfs.path.StorageBackend, _create_schema(pyexasol_connection) _deploy_scripts(pyexasol_connection) - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: _create_bucketfs_connection_onprem(bucketfs_config, pyexasol_connection) - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: _create_bucketfs_connection_saas(saas_url, saas_account_id, saas_database_id, saas_token, pyexasol_connection) else: @@ -105,7 +106,7 @@ def setup_database(backend: bfs.path.StorageBackend, return 
BUCKETFS_CONNECTION_NAME, SCHEMA_NAME -@pytest.fixture +@pytest.fixture(scope="session") def db_conn(setup_database, pyexasol_connection) -> pyexasol.ExaConnection: """ Per-test fixture that returns the same session-wide pyexasol connection, diff --git a/tests/integration_tests/with_db/deployment/test_scripts_deployer.py b/tests/integration_tests/with_db/deployment/test_scripts_deployer.py index e7138ca1..3e7209aa 100644 --- a/tests/integration_tests/with_db/deployment/test_scripts_deployer.py +++ b/tests/integration_tests/with_db/deployment/test_scripts_deployer.py @@ -9,6 +9,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer +from tests.fixtures.database_connection_fixture import BACKEND_ONPREM from tests.utils.db_queries import DBQueries from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS @@ -36,7 +37,7 @@ def test_scripts_deployer_no_schema_creation_permission( exasol_config: config.Exasol, upload_slc): - if backend != bfs.path.StorageBackend.onprem: + if backend != BACKEND_ONPREM: pytest.skip(("We run this test only with the Docker-DB, " "since the script deployer doesn't use the DB user login and password in SaaS.")) diff --git a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py index e9170566..a2783c0e 100644 --- a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py +++ b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py @@ -7,6 +7,7 @@ import exasol.bucketfs as bfs from exasol.python_extension_common.deployment.language_container_validator import temp_schema +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS from exasol_transformers_extension import deploy @@ -23,7 +24,7 @@ def test_scripts_deployer_cli(backend, args_list = get_arg_list(**deploy_params, 
schema=schema_name, language_alias=LANGUAGE_ALIAS) args_list.insert(0, "scripts") # We validate the server certificate in SaaS, but not in the Docker DB - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: args_list.append("--use-ssl-cert-validation") else: args_list.append("--no-use-ssl-cert-validation") @@ -40,7 +41,7 @@ def test_scripts_deployer_cli_with_encryption_verify(backend, deploy_params: dict[str, Any], pyexasol_connection: ExaConnection, upload_slc): - if backend != bfs.path.StorageBackend.onprem: + if backend != BACKEND_ONPREM: pytest.skip(("We run this test only with the Docker-DB " "because SaaS always verifies the SSL certificate")) diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 1e2a7d2b..65029313 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -4,14 +4,21 @@ from click.testing import CliRunner import exasol.bucketfs as bfs -from exasol_transformers_extension import upload_model -from exasol_transformers_extension.utils.current_model_specification import \ - CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension import upload_model as upload_model_cli +from exasol_transformers_extension.utils.bucketfs_model_specification import ( + get_BucketFSModelSpecification_from_model_Specs) from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params, get_arg_list from tests.fixtures.model_fixture import download_model_to_standard_local_save_path +from tests.fixtures.script_deployment_fixture import * +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from 
tests.fixtures.database_connection_fixture import * + def adapt_file_to_upload(path: PosixPath, download_path: PosixPath): if path.is_dir(): @@ -32,20 +39,23 @@ def test_model_upload(upload_params, sub_dir = 'sub_dir' model_specification = model_params.base_model_specs + model_specification.task_type = "filling_mask" model_name = model_specification.model_name - download_path = download_model_to_standard_local_save_path(model_specification, tmp_path) - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", Path(sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, "", Path(sub_dir)) upload_path = current_model_specs.get_bucketfs_model_save_path() + args_list = get_arg_list(**upload_params, path_in_bucket=bucketfs_params.path_in_bucket, model_name=model_name, sub_dir=sub_dir, - local_model_path=str(download_path)) + task_type="filling_mask") + try: runner = CliRunner() - result = runner.invoke(upload_model.main, args_list) + print(args_list) + result = runner.invoke(upload_model_cli.main, args_list) + print(result) assert result.exit_code == 0 time.sleep(20) bucketfs_upload_location = bucketfs_location / upload_path.with_suffix(".tar.gz") @@ -79,4 +89,4 @@ def test_model_upload(upload_params, result = db_conn.execute(query).fetchall() assert len(result) == 1 and result[0][-1] is None finally: - postprocessing.cleanup_buckets(bucketfs_location, sub_dir) + postprocessing.cleanup_buckets(bucketfs_location, sub_dir) \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py index 485566b2..924d473f 100644 --- a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py +++ b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py @@ -1,12 +1,17 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql +from 
tests.fixtures.model_fixture import upload_filling_mask_model_to_bucketfs +from tests.fixtures.bucketfs_fixture import bucketfs_location +from tests.fixtures.database_connection_fixture import pyexasol_connection +from tests.fixtures.setup_database_fixture import setup_database +from tests.fixtures.language_container_fixture import flavor_path, upload_slc, export_slc from tests.utils.parameters import model_params def test_filling_mask_script( - setup_database, db_conn, upload_base_model_to_bucketfs): + setup_database, db_conn, upload_filling_mask_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database + text_data = "I you so much." - text_data = "Exasol is an analytics management software company." n_rows = 100 top_k = 3 input_data = [] @@ -40,3 +45,16 @@ def test_filling_mask_script( n_rows_result = n_rows * top_k n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py index 0a8c215c..8823145c 100644 --- a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py +++ b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py @@ -1,6 +1,7 @@ from pathlib import Path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecificationFromModelSpecs + +from 
exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs from tests.utils import postprocessing from tests.utils.parameters import model_params from tests.utils.bucketfs_file_list import get_bucketfs_file_list @@ -18,11 +19,12 @@ def test_model_downloader_udf_script( for i in range(n_rows): sub_dir = SUB_DIR.format(id=i) sub_dirs.append(sub_dir) - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_params.tiny_model_specs, - bucketfs_conn_name, Path(sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_params.tiny_model_specs, + bucketfs_conn_name, Path(sub_dir)) model_paths.append(current_model_specs.get_bucketfs_model_save_path()) input_data.append(( current_model_specs.model_name, + current_model_specs.task_type, sub_dir, bucketfs_conn_name, '' @@ -33,11 +35,12 @@ def test_model_downloader_udf_script( query = f""" SELECT TE_MODEL_DOWNLOADER_UDF( t.model_name, + t.task_type, t.sub_dir, t.bucketfs_conn_name, t.token_conn_name ) FROM (VALUES {str(tuple(input_data))} AS - t(model_name, sub_dir, bucketfs_conn_name, token_conn_name)); + t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name)); """ # execute downloader UDF diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py index 06a83094..73d33f73 100644 --- a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py +++ b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py @@ -1,6 +1,7 @@ import time from tests.utils import postprocessing +TASK_TYPE = "filling_mask" SUB_DIR = 'test_downloader_with_prediction_sub_dir' MODEL_NAME = 'gaunernst/bert-tiny-uncased' @@ -13,6 +14,7 @@ def test_prediction_with_downloader_udf( # execute downloader UDF input_data = ( MODEL_NAME, + TASK_TYPE, SUB_DIR, bucketfs_conn_name, '' @@ -20,14 
+22,15 @@ def test_prediction_with_downloader_udf( query = f""" SELECT TE_MODEL_DOWNLOADER_UDF( t.model_name, + t.task_type, t.sub_dir, t.bucketfs_conn_name, t.token_conn_name ) FROM (VALUES {str(input_data)} AS - t(model_name, sub_dir, bucketfs_conn_name, token_conn_name)); + t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name)); """ - db_conn.execute(query).fetchall() + result = db_conn.execute(query).fetchall() time.sleep(10) # execute the filling mask UDF @@ -59,5 +62,17 @@ def test_prediction_with_downloader_udf( assert len(result) == top_k assert all(row[-1] is None for row in result) + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string, list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > top_k / 2 + finally: postprocessing.cleanup_buckets(bucketfs_location, SUB_DIR) diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py index d578110c..52e1989f 100644 --- a/tests/integration_tests/with_db/udfs/test_question_answering_script.py +++ b/tests/integration_tests/with_db/udfs/test_question_answering_script.py @@ -1,11 +1,19 @@ +from tests.fixtures.model_fixture import upload_question_answering_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + def test_question_answering_script( - setup_database, db_conn, 
upload_base_model_to_bucketfs): + setup_database, db_conn, upload_question_answering_model_to_bucketfs): bucketfs_conn_name, _ = setup_database - question = "Where is the Exasol?" + question = "Where is Exasol based?" + n_rows = 100 top_k = 1 input_data = [] @@ -14,9 +22,9 @@ def test_question_answering_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.q_a_model_specs.model_name, question, - ' '.join((model_params.text_data, str(i))), + 'The database software company Exasol is based in Nuremberg', top_k )) @@ -42,3 +50,15 @@ def test_question_answering_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + results = [result[i][6] for i in range(len(result))] + acceptable_results = ["Nuremberg", "Germany"] + number_accepted_results = 0 + + def contains(string, list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > top_k / 2 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py index ad91c2cb..674bbe9a 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py @@ -1,11 +1,19 @@ +from tests.fixtures.model_fixture import upload_sequence_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +#debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from 
tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_sequence_classification_single_text_script( - setup_database, db_conn, upload_base_model_to_bucketfs): + setup_database, db_conn, upload_sequence_classification_model_to_bucketfs): bucketfs_conn_name, _ = setup_database - n_labels = 2 + n_labels = 3 # negative, neutral, positive + n_rows = 100 input_data = [] for i in range(n_rows): @@ -13,8 +21,8 @@ def test_sequence_classification_single_text_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, - model_params.text_data)) + model_params.sequence_class_model_specs.model_name, + "I am so happy to be working on the Transformers Extension.")) query = f"SELECT TE_SEQUENCE_CLASSIFICATION_SINGLE_TEXT_UDF(" \ f"t.device_id, " \ @@ -36,3 +44,14 @@ def test_sequence_classification_single_text_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + + number_accepted_results = 0 + for i in range(len(result)): + if (result[i][4] == "positive" and + result[i][5] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][5] < 0.2: + number_accepted_results += 1 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py index 5bfbf103..928415a4 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py @@ -1,11 +1,18 @@ +from tests.fixtures.model_fixture import upload_sequence_classification_model_to_bucketfs from 
tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +# debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_sequence_classification_text_pair_script( - setup_database, db_conn, upload_base_model_to_bucketfs): + setup_database, db_conn, upload_sequence_classification_pair_model_to_bucketfs): bucketfs_conn_name, _ = setup_database - n_labels = 2 + n_labels = 3 n_rows = 100 input_data = [] for i in range(n_rows): @@ -13,9 +20,9 @@ def test_sequence_classification_text_pair_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, - model_params.text_data, - ' '.join((model_params.text_data, str(i))))) + model_params.sequence_class_pair_model_specs.model_name, + 'The database software company Exasol is based in Nuremberg', + 'The main Exasol office is located in Flensburg')) query = f"SELECT TE_SEQUENCE_CLASSIFICATION_TEXT_PAIR_UDF(" \ f"t.device_id, " \ @@ -38,3 +45,14 @@ def test_sequence_classification_text_pair_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + + # lenient test for quality of results, will be replaced by deterministic test later + number_accepted_results = 0 + for i in range(len(result)): + if (result[i][5] == "contradiction" and # possible labels: contradiction, entailment, neutral + result[i][6] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][6] < 0.2: + number_accepted_results += 1 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/with_db/udfs/test_text_generation_script.py 
b/tests/integration_tests/with_db/udfs/test_text_generation_script.py index 585d67c7..bebf1915 100644 --- a/tests/integration_tests/with_db/udfs/test_text_generation_script.py +++ b/tests/integration_tests/with_db/udfs/test_text_generation_script.py @@ -1,13 +1,21 @@ +from tests.fixtures.model_fixture import upload_text_generation_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +#for debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.script_deployment_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_text_generation_script( - setup_database, db_conn, upload_base_model_to_bucketfs): + setup_database, db_conn, upload_text_generation_model_to_bucketfs): bucketfs_conn_name, _ = setup_database text_data = "Exasol is an analytics database management" n_rows = 100 - max_length = 10 + max_length = 12 return_full_text = True input_data = [] for i in range(n_rows): @@ -15,7 +23,7 @@ def test_text_generation_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.text_gen_model_specs.model_name, text_data, max_length, return_full_text @@ -43,3 +51,15 @@ def test_text_generation_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][6] for i in range(len(result))] + acceptable_results = ["software", "system", "solution", "tool"] + number_accepted_results = 0 + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if 
contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_token_classification_script.py b/tests/integration_tests/with_db/udfs/test_token_classification_script.py index 7a43493d..3ac674fe 100644 --- a/tests/integration_tests/with_db/udfs/test_token_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_token_classification_script.py @@ -1,9 +1,15 @@ +from tests.fixtures.model_fixture import upload_token_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_token_classification_script( - setup_database, db_conn, upload_base_model_to_bucketfs): + setup_database, db_conn, upload_token_classification_model_to_bucketfs): bucketfs_conn_name, _ = setup_database aggregation_strategy = "simple" n_rows = 100 @@ -13,8 +19,8 @@ def test_token_classification_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, - model_params.text_data, + model_params.token_model_specs.model_name, + 'The database software company Exasol is based in Nuremberg', aggregation_strategy )) @@ -38,3 +44,13 @@ def test_token_classification_script( removed_columns = 1 # device_id n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) >= n_rows and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + results = [[result[i][7], result[i][8]] for i in range(len(result))] + acceptable_result_sets = [["Exasol", "ORG"], ["Nuremberg", "LOC"]] + 
number_accepted_results = 0 + + for i in range(len(results)): + if results[i] in acceptable_result_sets: + number_accepted_results += 1 + assert number_accepted_results > len(result)/1.5 \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_translation_script.py b/tests/integration_tests/with_db/udfs/test_translation_script.py index 67d37511..6fc60941 100644 --- a/tests/integration_tests/with_db/udfs/test_translation_script.py +++ b/tests/integration_tests/with_db/udfs/test_translation_script.py @@ -16,7 +16,7 @@ def test_translation_script( bucketfs_conn_name, str(model_params.sub_dir), model_params.seq2seq_model_specs.model_name, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', src_lang, target_lang, max_length @@ -45,3 +45,16 @@ def test_translation_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][7] for i in range(len(result))] + acceptable_results = ["Die Datenbanksoftware Exasol hat ihren Sitz in Nürnberg"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py index 34ef053f..5124d6b7 100644 --- a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py @@ -1,9 +1,16 @@ +from tests.fixtures.model_fixture import 
upload_zero_shot_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +# debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * -def test_sequence_classification_single_text_script( - setup_database, db_conn, upload_base_model_to_bucketfs): +def test_zero_shot_classification_single_text_script( + setup_database, db_conn, upload_zero_shot_classification_model_to_bucketfs): bucketfs_conn_name, _ = setup_database n_rows = 100 input_data = [] @@ -14,8 +21,8 @@ def test_sequence_classification_single_text_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, - model_params.text_data, + model_params.zero_shot_model_specs.model_name, + 'The database software company Exasol is based in Nuremberg', candidate_labels )) @@ -40,3 +47,18 @@ def test_sequence_classification_single_text_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + acceptable_results = ["Analytics", "Database", "Germany"] + + def contains(string, list): + return any(map(lambda x: x in string, list)) + + number_accepted_results = 0 + for i in range(len(result)): + if (contains(result[i][5], acceptable_results) and + result[i][6] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][6] < 0.2: + number_accepted_results += 1 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py 
b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py index ea1966ca..3aaf2506 100644 --- a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py +++ b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py @@ -11,7 +11,7 @@ NoErrorMessageMatcher, NewColumnsEmptyMatcher, ErrorMessageMatcher, RankMonotonicMatcher, ColumnsMatcher from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection -from tests.fixtures.model_fixture import prepare_base_model_for_local_bucketfs +from tests.fixtures.model_fixture import prepare_filling_mask_model_for_local_bucketfs class ExaEnvironment: @@ -54,12 +54,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_filling_mask_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_filling_mask_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -111,12 +111,12 @@ def test_filling_mask_udf( ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf_on_error_handling( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_filling_mask_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_filling_mask_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = 
Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py index af6f229c..af774d61 100644 --- a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py +++ b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py @@ -7,13 +7,12 @@ from exasol_transformers_extension.udfs.models.model_downloader_udf import \ ModelDownloaderUDF from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs from tests.utils.parameters import model_params from tests.utils.mock_connections import ( create_mounted_bucketfs_connection, create_hf_token_connection) from tests.utils.bucketfs_file_list import get_bucketfs_file_list - class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): self._connections = connections @@ -38,6 +37,10 @@ def model_name(self): def sub_dir(self): return self.ctx_data[self.index]['sub_dir'] + @property + def task_type(self): + return self.ctx_data[self.index]['task_type'] + @property def bfs_conn(self): return self.ctx_data[self.index]['bucketfs_conn_name'] @@ -63,11 +66,12 @@ class TestEnvironmentSetup: def __init__(self, id: str, tmp_dir: Path, token_conn_name: str): self.bucketfs_conn_name = "bucketfs_connection" + id self.sub_dir = model_params.sub_dir + id - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_params.tiny_model_specs, - self.bucketfs_conn_name, Path(self.sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_params.tiny_model_specs, + self.bucketfs_conn_name, Path(self.sub_dir)) self.token_conn_name = token_conn_name self.ctx_data = { 
'tiny_model': current_model_specs.model_name, + 'task_type': current_model_specs.task_type, 'sub_dir': self.sub_dir, 'bucketfs_conn_name': self.bucketfs_conn_name, 'token_conn_name': self.token_conn_name diff --git a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py index b2598cea..f1c051a5 100644 --- a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py +++ b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py @@ -3,6 +3,7 @@ import pandas as pd from typing import Dict +from tests.fixtures.model_fixture import prepare_question_answering_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ScoreMatcher, RankDTypeMatcher, NoErrorMessageMatcher, RankMonotonicMatcher, ColumnsMatcher from tests.utils.parameters import model_params @@ -57,12 +58,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_question_answering_udf( description, device_id, n_rows, - top_k, prepare_base_model_for_local_bucketfs): + top_k, prepare_question_answering_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_question_answering_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -72,7 +73,7 @@ def test_question_answering_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.q_a_model_specs.model_name, question, model_params.text_data, top_k @@ -121,12 +122,12 @@ def test_question_answering_udf( ]) def test_question_answering_udf_on_error_handling( description, device_id, n_rows, - top_k, 
prepare_base_model_for_local_bucketfs): + top_k, prepare_question_answering_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_question_answering_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py index b819df1f..ddf0c754 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py @@ -5,11 +5,18 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.sequence_classification_single_text_udf import \ SequenceClassificationSingleTextUDF +from tests.fixtures.model_fixture import prepare_sequence_classification_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, ColumnsMatcher, NoErrorMessageMatcher, \ NewColumnsEmptyMatcher, ErrorMessageMatcher from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): @@ -49,12 +56,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_single_text_udf( - 
description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -64,7 +71,7 @@ def test_sequence_classification_single_text_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.sequence_class_model_specs.model_name, model_params.text_data + str(i) ) for i in range(n_rows)] columns = [ @@ -88,7 +95,7 @@ def test_sequence_classification_single_text_udf( grouped_by_inputs = result_df.groupby('text_data') n_unique_labels_per_input = grouped_by_inputs['label'].nunique().to_list() - n_labels = 2 + n_labels = 3 n_labels_per_input_expected = [n_labels] * n_rows result = Result(result_df) assert ( @@ -105,12 +112,12 @@ def test_sequence_classification_single_text_udf( ("on GPU", 0) ]) def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py 
b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py index 7c2d5831..6c5d2410 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py @@ -5,11 +5,18 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.sequence_classification_text_pair_udf import \ SequenceClassificationTextPairUDF +from tests.fixtures.model_fixture import prepare_sequence_classification_pair_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ColumnsMatcher, NoErrorMessageMatcher from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): @@ -49,12 +56,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_pair_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_pair_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -64,7 +71,7 @@ def 
test_sequence_classification_text_pair_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.sequence_class_pair_model_specs.model_name, model_params.text_data + str(i), model_params.text_data + str(i * i)) for i in range(n_rows)] @@ -89,7 +96,7 @@ def test_sequence_classification_text_pair_udf( grouped_by_inputs = result_df.groupby('first_text') n_unique_labels_per_input = grouped_by_inputs['label'].nunique().to_list() - n_labels = 2 + n_labels = 3 n_labels_per_input_expected = [n_labels] * n_rows result = Result(result_df) @@ -107,12 +114,12 @@ def test_sequence_classification_text_pair_udf( ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_pair_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_pair_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py index 413c8dbc..90c22c9b 100644 --- a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py +++ b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py @@ -7,6 +7,7 @@ from exasol_transformers_extension.udfs.models.text_generation_udf import \ TextGenerationUDF +from tests.fixtures.model_fixture import prepare_text_generation_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ScoreMatcher, ColumnsMatcher, 
NoErrorMessageMatcher from tests.utils.parameters import model_params @@ -53,12 +54,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_text_generation_udf( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_text_generation_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_text_generation_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -70,7 +71,7 @@ def test_text_generation_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.text_gen_model_specs.model_name, text_data, max_length, return_full_text @@ -111,12 +112,12 @@ def test_text_generation_udf( ("on GPU with single input", 0, 1) ]) def test_text_generation_udf_on_error_handlig( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_text_generation_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_text_generation_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py index 337d47a5..a53f72e8 100644 --- a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py +++ 
b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py @@ -12,7 +12,7 @@ TokenClassificationUDF # debugging -from tests.fixtures.model_fixture import prepare_base_model_for_local_bucketfs +from tests.fixtures.model_fixture import prepare_token_classification_model_for_local_bucketfs class ExaEnvironment: @@ -64,12 +64,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_token_classification_udf( description, device_id, n_rows, agg, - prepare_base_model_for_local_bucketfs): + prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -78,7 +78,7 @@ def test_token_classification_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.token_model_specs.model_name, model_params.text_data * (i + 1), agg ) for i in range(n_rows)] @@ -115,12 +115,12 @@ def test_token_classification_udf( ("on GPU", 0) ]) def test_token_classification_udf_with_multiple_aggregation_strategies( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -130,7 +130,7 @@ def 
test_token_classification_udf_with_multiple_aggregation_strategies( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.token_model_specs.model_name, model_params.text_data * (i + 1), agg_strategy ) for i, agg_strategy in enumerate(agg_strategies)] @@ -178,12 +178,12 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( ]) def test_token_classification_udf_on_error_handling( description, device_id, n_rows, agg, - prepare_base_model_for_local_bucketfs): + prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) diff --git a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py index f3091384..935c7d64 100644 --- a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py @@ -5,6 +5,7 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.zero_shot_text_classification_udf import \ ZeroShotTextClassificationUDF +from tests.fixtures.model_fixture import prepare_zero_shot_classification_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, NoErrorMessageMatcher, \ ShapeMatcher, RankMonotonicMatcher, RankDTypeMatcher, ScoreMatcher, NewColumnsEmptyMatcher, ErrorMessageMatcher, \ ColumnsMatcher @@ -49,13 +50,13 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on CPU", None), ("on GPU", 0) 
]) -def test_sequence_classification_single_text_udf( - description, device_id, prepare_base_model_for_local_bucketfs): +def test_zero_shot_classification_single_text_udf( + description, device_id, prepare_zero_shot_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_zero_shot_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -66,7 +67,7 @@ def test_sequence_classification_single_text_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.zero_shot_model_specs.model_name, model_params.text_data + str(i), candidate_labels + str(i) ) for i in range(n_rows)] @@ -112,13 +113,13 @@ def test_sequence_classification_single_text_udf( ("on CPU", None), ("on GPU", 0) ]) -def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): +def test_zero_shot_classification_single_text_udf_on_error_handling( + description, device_id, prepare_zero_shot_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_zero_shot_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py b/tests/integration_tests/without_db/utils/test_load_local_model.py index f44568c1..00ee4195 100644 --- 
a/tests/integration_tests/without_db/utils/test_load_local_model.py +++ b/tests/integration_tests/without_db/utils/test_load_local_model.py @@ -5,7 +5,7 @@ from transformers import AutoModel, AutoTokenizer, pipeline import tarfile -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ @@ -16,8 +16,6 @@ from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection -#todo rename all modelspecification strings - class TestSetup: def __init__(self): @@ -28,11 +26,11 @@ def __init__(self): self.token = "token" self.model_specification = model_params.tiny_model_specs - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) + self.mock_current_model_specification: Union[BucketFSModelSpecification, MagicMock] = create_autospec(BucketFSModelSpecification) test_pipeline = pipeline self.loader = LoadLocalModel( test_pipeline, - task_name="token-classification", + task_type="token-classification", device="cpu", base_model_factory=self.base_model_factory, tokenizer_factory=self.tokenizer_factory @@ -63,7 +61,6 @@ def test_load_local_model(tmp_path): test_setup.loader.set_current_model_specification(current_model_specification= test_setup.mock_current_model_specification) - #test_setup.loader.set_bucketfs_model_cache_dir(bucketfs_location=) #todo macke a mock? 
or add test for set_bucketfs_model_cache_dir test_setup.loader._bucketfs_model_cache_dir = model_save_path test_setup.loader.load_models() @@ -86,6 +83,6 @@ def test_load_local_model_with_huggingface_model_transfer(tmp_path): test_setup.loader.set_current_model_specification(current_model_specification= test_setup.mock_current_model_specification) - #test_setup.loader.set_bucketfs_model_cache_dir(bucketfs_location=) #todo macke a mock? or add test for set_bucketfs_model_cache_dir + test_setup.loader._bucketfs_model_cache_dir = sub_dir_path test_setup.loader.load_models() diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py index bd5a07db..cf35b310 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py @@ -53,9 +53,9 @@ class ErrorNotCachedMultipleModelMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py index 95fe1623..8a9741d0 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py +++ 
b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py @@ -48,7 +48,7 @@ class ErrorNotCachedSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py index a57ebf19..f0474156 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py @@ -52,9 +52,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py index dc4a50a0..715bd71a 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py @@ -47,7 +47,7 @@ class ErrorOnPredictionSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - 
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py index e3ba8d4a..0b0dd61e 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py index 1743857f..099a5147 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameSingleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + 
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py index f10d4443..04b32975 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelMultipleBatchComplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py index f7af363b..f2ec5586 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py @@ -53,9 +53,9 @@ class MultipleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, 
rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py index 49fbe430..f658add1 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py @@ -64,13 +64,13 @@ class MultipleModelMultipleBatchMultipleModelsPerBatch: "bfs_conn4": Connection(address=f"file://{base_cache_dir4}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1), - PurePosixPath(base_cache_dir3, "sub_dir3", "model3"): + PurePosixPath(base_cache_dir3, "sub_dir3", "model3_fill-mask"): MockFillingMaskModel(sequence="text valid 3", score=0.3, rank=1), - PurePosixPath(base_cache_dir4, "sub_dir4", "model4"): + PurePosixPath(base_cache_dir4, "sub_dir4", "model4_fill-mask"): MockFillingMaskModel(sequence="text valid 4", score=0.4, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py index 3087de66..8a99d6dd 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py +++ 
b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelSingleBatchComplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2,"sub_dir2", "model2"): + PurePosixPath(base_cache_dir2,"sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py index b47f664b..b2ae8ffa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py @@ -53,9 +53,9 @@ class MultipleModelSingleBatchIncomplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py index 86d3f87e..21fe9eb2 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py +++ 
b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py index 4279d260..ad0bbd43 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py index 198b8e21..c13a909f 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir2", "model1"): + 
PurePosixPath(base_cache_dir1, "sub_dir2", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py index e6080336..2ef50292 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir2", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir2", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py index 5352f52a..d88a07d5 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py @@ -47,7 +47,7 @@ class SingleModelMultipleBatchComplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py index de72ea3a..09cf5c8b 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py index a627df57..66cd1be3 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py @@ -47,7 +47,7 @@ class SingleModelSingleBatchComplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py index 7c91df61..d750cafa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelSingleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, 
rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py index 9166249f..4c8400a0 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir1", "model2"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py index 7d38b5dc..6e3a88cd 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir1", "model2"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udfs/test_base_udf.py b/tests/unit_tests/udfs/test_base_udf.py index 40a0d24a..46914d7e 100644 --- 
a/tests/unit_tests/udfs/test_base_udf.py +++ b/tests/unit_tests/udfs/test_base_udf.py @@ -113,7 +113,7 @@ def test_model_downloader_all_parameters(mock_local_path, mock_create_loc, descr ("sub_dir missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), None, "test_model"), ("model_name missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", None) + "test_subdir", None), ]) @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path') diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index 8dae4e06..2b85c17c 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -7,8 +7,8 @@ from exasol_udf_mock_python.connection import Connection from exasol_udf_mock_python.mock_meta_data import MockMetaData -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \ - CurrentModelSpecificationFactory +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification, \ + BucketFSModelSpecificationFactory from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context from exasol_transformers_extension.udfs.models.model_downloader_udf import \ ModelDownloaderUDF @@ -18,7 +18,6 @@ from tests.utils.matchers import AnyOrder from tests.utils.mock_cast import mock_cast - def create_mock_metadata() -> MockMetaData: def udf_wrapper(): pass @@ -29,6 +28,7 @@ def udf_wrapper(): input_columns=[ Column("model_name", str, "VARCHAR(2000000)"), Column("sub_dir", str, "VARCHAR(2000000)"), + Column("task_type", str, "VARCHAR(2000000)"), Column("bfs_conn", str, "VARCHAR(2000000)"), Column("token_conn", str, "VARCHAR(2000000)"), ], @@ -49,7 +49,6 @@ def 
udf_wrapper(): @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') def test_model_downloader(mock_create_loc, description, count, token_conn_name, token_conn_obj, expected_token): - mock_base_model_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_tokenizer_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_model_downloader_factory: Union[HuggingFaceHubBucketFSModelTransferSPFactory, MagicMock] = create_autospec( @@ -64,22 +63,26 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, mock_create_loc.side_effect = mock_bucketfs_locations base_model_names = [f"base_model_name_{i}" for i in range(count)] sub_directory_names = [f"sub_dir_{i}" for i in range(count)] + task_type = [f"task_type_{i}" for i in range(count)] bucketfs_connections = [Connection(address=f"file:///test{i}") for i in range(count)] bfs_conn_name = [f"bfs_conn_name_{i}" for i in bucketfs_connections] - mock_cmss = [create_autospec(CurrentModelSpecification, - model_name=base_model_names[i], - sub_dir=Path(sub_directory_names[i])) for i in range(count)] + mock_bucketfs_model_specs = [create_autospec(BucketFSModelSpecification, + model_name=base_model_names[i], + task_type=task_type[i], + sub_dir=Path(sub_directory_names[i]), + get_model_factory=BucketFSModelSpecification.get_model_factory) for i in range(count)] for i in range(count): - mock_cast(mock_cmss[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] - mock_current_model_specification_factory: Union[CurrentModelSpecificationFactory, MagicMock] = ( - create_autospec(CurrentModelSpecificationFactory)) - mock_cast(mock_current_model_specification_factory.create).side_effect = mock_cmss + mock_cast(mock_bucketfs_model_specs[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] + 
mock_current_model_specification_factory: Union[BucketFSModelSpecificationFactory, MagicMock] = ( + create_autospec(BucketFSModelSpecificationFactory)) + mock_cast(mock_current_model_specification_factory.create).side_effect = mock_bucketfs_model_specs input_data = [ ( base_model_names[i], sub_directory_names[i], + task_type[i], bfs_conn_name[i], token_conn_name ) @@ -95,21 +98,20 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, mock_ctx = create_mock_udf_context(input_data, mock_meta) udf = ModelDownloaderUDF(exa=mock_exa, - base_model_factory=mock_base_model_factory, tokenizer_factory=mock_tokenizer_factory, huggingface_hub_bucketfs_model_transfer=mock_model_downloader_factory, current_model_specification_factory=mock_current_model_specification_factory) udf.run(mock_ctx) assert mock_cast(mock_model_downloader_factory.create).mock_calls == [ call(bucketfs_location=mock_bucketfs_locations[i], - model_specification=mock_cmss[i], + model_specification=mock_bucketfs_model_specs[i], model_path=f'{sub_directory_names[i]}/{base_model_names[i]}', token=expected_token) for i in range(count) ] for i in range(count): assert mock_cast(mock_model_downloaders[i].download_from_huggingface_hub).mock_calls == [ - call(mock_base_model_factory), + call(mock_bucketfs_model_specs[i].get_model_factory()), call(mock_tokenizer_factory) ] assert call() in mock_cast(mock_model_downloaders[i].upload_to_bucketfs).mock_calls diff --git a/tests/unit_tests/utils/test_load_local_model.py b/tests/unit_tests/utils/test_load_local_model.py index af6284d9..89a0c856 100644 --- a/tests/unit_tests/utils/test_load_local_model.py +++ b/tests/unit_tests/utils/test_load_local_model.py @@ -6,7 +6,7 @@ import transformers from exasol_transformers_extension.utils.bucketfs_operations import create_save_pretrained_model_path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from 
exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -21,13 +21,14 @@ def __init__(self): self.tokenizer_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) self.token = "token" self.model_name = "model_name" - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) + self.model_task = "test_task" + self.mock_current_model_specification: Union[BucketFSModelSpecification, MagicMock] = create_autospec(BucketFSModelSpecification) self.cache_dir = "test/Path" self.mock_pipeline = Mock() self.loader = LoadLocalModel( self.mock_pipeline, - task_name="test_task", + self.model_task, device="cpu", base_model_factory=self.model_factory_mock, tokenizer_factory=self.tokenizer_factory_mock) @@ -35,7 +36,9 @@ def __init__(self): def test_load_function_call(): test_setup = TestSetup() - model_save_path = create_save_pretrained_model_path(test_setup.cache_dir, ModelSpecification(test_setup.model_name)) + model_save_path = create_save_pretrained_model_path(test_setup.cache_dir, + ModelSpecification(test_setup.model_name, + test_setup.model_task)) test_setup.loader._bucketfs_model_cache_dir = model_save_path test_setup.loader.set_current_model_specification(test_setup.mock_current_model_specification) diff --git a/tests/utils/parameters.py b/tests/utils/parameters.py index 4af3d07a..433b6e59 100644 --- a/tests/utils/parameters.py +++ b/tests/utils/parameters.py @@ -14,9 +14,15 @@ class BucketFSParams: @dataclass(frozen=True) class ModelParams: - base_model_specs: ModelSpecification - seq2seq_model_specs: ModelSpecification - tiny_model_specs: ModelSpecification + 
base_model_specs: ModelSpecification # this is used for other tests, task_name should be set per test + seq2seq_model_specs: ModelSpecification # this model is used for testing translation_udf + q_a_model_specs: ModelSpecification # this model is used for testing question answering + text_gen_model_specs: ModelSpecification # used for text generation tests + token_model_specs: ModelSpecification # this model is used for token classification tests + sequence_class_model_specs: ModelSpecification # this model is used for sequence classification single text tests + sequence_class_pair_model_specs: ModelSpecification # this model is used for sequence classification text pair tests + zero_shot_model_specs: ModelSpecification # this model is used for zero-shot-classification tests + tiny_model_specs: ModelSpecification # this model is used for upload/download tests text_data: str sub_dir: str @@ -28,10 +34,16 @@ class ModelParams: path_in_bucket="container") model_params = ModelParams( - base_model_specs=ModelSpecification('bert-base-uncased'), - seq2seq_model_specs=ModelSpecification("t5-small"), - tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny"), - text_data='The company Exasol is based in Nuremberg', + base_model_specs=ModelSpecification('bert-base-uncased', "need to set this task_type"), #fill mask + seq2seq_model_specs=ModelSpecification("t5-small", "translation"), + q_a_model_specs=ModelSpecification("deepset/tinybert-6l-768d-squad2", "question-answering"), + text_gen_model_specs=ModelSpecification("openai-community/gpt2", "text-generation"), + token_model_specs=ModelSpecification("dslim/bert-base-NER", "token-classification"), + sequence_class_model_specs=ModelSpecification("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis", "text-classification"), + sequence_class_pair_model_specs=ModelSpecification("MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli", "text-classification"), + 
zero_shot_model_specs=ModelSpecification("MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33", "zero-shot-classification"), + tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny", "task"), + text_data='The database software company Exasol is based in Nuremberg', sub_dir='model_sub_dir')