Added task_type parameter for correct model loading (#245)
* started adding task_type
* started adding quality control tests
* fixed integration tests, added separate models for tasks
* [CodeBuild] fix SaaS DB naming error
* Use batch build for AWS CodeBuild to speed up tests against backends. [CodeBuild]
* Build and export the SLC before running the SaaS integration tests, to avoid waiting for the SLC build while the SaaS DB is already running [CodeBuild]
* Use the itde_config fixture instead of the itde fixture, to avoid starting the ITDE when it is not needed, and make db_conn a session fixture [CodeBuild]
* Save the SaaS database id in the pytest stash so a SaaS DB is not recreated for each test (see the sketch after this list). It seems to be a bug that a session-scoped fixture is called for every test; this might happen because the backend is parameterized. [CodeBuild]
* Increase the DB memory size for the ITDE to hopefully stabilize the onprem tests in CodeBuild [CodeBuild]
* Increase the VM size for the onprem tests in CodeBuild to hopefully stabilize them [CodeBuild]
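
The pytest-stash caching mentioned above could look roughly like the following. This is a hedged sketch, not the project's actual fixture code; `create_saas_db` is a hypothetical helper standing in for the expensive SaaS database creation:

```python
import pytest

# Session-wide key under which the SaaS database id is cached.
_saas_db_id_key = pytest.StashKey[str]()

@pytest.fixture
def saas_db_id(request) -> str:
    stash = request.config.stash
    if _saas_db_id_key not in stash:
        # create_saas_db() is a hypothetical helper; caching its result in the
        # config stash ensures the database is created at most once per run,
        # even if the fixture itself is re-evaluated for every test.
        stash[_saas_db_id_key] = create_saas_db()
    return stash[_saas_db_id_key]
```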

Co-authored-by: Torsten Kilias <[email protected]>
MarleneKress79789 and tkilias authored Aug 7, 2024
1 parent 65b80ae commit 4cca904
Showing 78 changed files with 881 additions and 367 deletions.
45 changes: 18 additions & 27 deletions buildspec.yml
@@ -1,29 +1,20 @@
 version: 0.2

-env:
-  shell: bash
-  secrets-manager:
-    DOCKER_USER: "Dockerhub:User"
-    DOCKER_PASSWORD: "Dockerhub:AccessToken"
-    SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST"
-    SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID"
-    SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT"
-
-phases:
-  install:
-    runtime-versions:
-      python: 3.10
-    commands:
-      - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 -
-      - export PATH=$PATH:$HOME/.local/bin
-      - poetry env use $(command -v "python3.10")
-      - poetry --version
-      - poetry install
-      - poetry build
-  pre_build:
-    commands:
-      - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin
-  build:
-    commands:
-      - poetry run nox -s start_database
-      - poetry run nox -s integration_tests
+batch:
+  fast-fail: false
+  build-graph:
+    - identifier: without_db_tests
+      env:
+        compute-type: BUILD_GENERAL1_MEDIUM
+        privileged-mode: true
+      buildspec: ./buildspec_without_db.yml
+    - identifier: saas_tests
+      env:
+        compute-type: BUILD_GENERAL1_MEDIUM
+        privileged-mode: true
+      buildspec: ./buildspec_saas.yml
+    - identifier: onprem_tests
+      env:
+        compute-type: BUILD_GENERAL1_LARGE
+        privileged-mode: true
+      buildspec: ./buildspec_onprem.yml
26 changes: 26 additions & 0 deletions buildspec_onprem.yml
@@ -0,0 +1,26 @@
version: 0.2

env:
  shell: bash
  secrets-manager:
    DOCKER_USER: "Dockerhub:User"
    DOCKER_PASSWORD: "Dockerhub:AccessToken"

phases:
  install:
    runtime-versions:
      python: 3.10
    commands:
      - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 -
      - export PATH=$PATH:$HOME/.local/bin
      - poetry env use $(command -v "python3.10")
      - poetry --version
      - poetry install
      - poetry build
  pre_build:
    commands:
      - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin
  build:
    commands:
      - poetry run nox -s start_database
      - poetry run nox -s onprem_integration_tests
29 changes: 29 additions & 0 deletions buildspec_saas.yml
@@ -0,0 +1,29 @@
version: 0.2

env:
  shell: bash
  secrets-manager:
    DOCKER_USER: "Dockerhub:User"
    DOCKER_PASSWORD: "Dockerhub:AccessToken"
    SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST"
    SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID"
    SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT"

phases:
  install:
    runtime-versions:
      python: 3.10
    commands:
      - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 -
      - export PATH=$PATH:$HOME/.local/bin
      - poetry env use $(command -v "python3.10")
      - poetry --version
      - poetry install
      - poetry build
  pre_build:
    commands:
      - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin
  build:
    commands:
      - poetry run nox -s export_slc
      - poetry run nox -s saas_integration_tests
25 changes: 25 additions & 0 deletions buildspec_without_db.yml
@@ -0,0 +1,25 @@
version: 0.2

env:
  shell: bash
  secrets-manager:
    DOCKER_USER: "Dockerhub:User"
    DOCKER_PASSWORD: "Dockerhub:AccessToken"

phases:
  install:
    runtime-versions:
      python: 3.10
    commands:
      - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 -
      - export PATH=$PATH:$HOME/.local/bin
      - poetry env use $(command -v "python3.10")
      - poetry --version
      - poetry install
      - poetry build
  pre_build:
    commands:
      - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin
  build:
    commands:
      - poetry run nox -s without_db_integration_tests
9 changes: 7 additions & 2 deletions doc/changes/changes_2.0.0.md
@@ -1,9 +1,12 @@
# Transformers Extension 2.0.0, t.b.d
# Transformers Extension 2.0.0, 2024-08-07

Code name:
Code name: Fixed model saving, added SaaS support, and updated to Python 3.10

## Summary

This release fixes an error in the saving and loading of model metadata. It also adds Exasol SaaS support and
updates the project to Python 3.10.


### Features

@@ -13,6 +16,7 @@ Code name:
### Bugs

- #237: Fixed reference to python-extension-common
- #245: Added task_type parameter to fix model saving and loading

### Documentation

@@ -27,5 +31,6 @@ Code name:
- #217: Refactored PredictionUDFs and LoadLocalModel so that LoadLocalModel constructs the bucketfs model file path
- #230: Updated supported python version to >= Python 3.10
- #236: Moved to the PathLike bucketfs interface.
- #218: Changed upload_model_udf to load model from Huggingface

### Security
46 changes: 23 additions & 23 deletions doc/user_guide/user_guide.md
@@ -263,29 +263,36 @@ Once you have internet access, invoke the UDF like this:
```sql
SELECT TE_MODEL_DOWNLOADER_UDF(
    model_name,
    task_type,
    sub_dir,
    bucketfs_conn,
    token_conn
)
```
- Parameters:
- ```model_name```: The name of the model to use for prediction. You can find the
details of the models on the [huggingface models page](https://huggingface.co/models).
- ```task_type```: The name of the task you want to use the model for.
- ```sub_dir```: The directory where the model is stored in the BucketFS.
- ```bucketfs_conn```: The BucketFS connection name.
- ```token_conn```: The connection name containing the token required for
private models. You can use an empty string ('') for public models. For details
on how to create a connection object with token information, please check
[here](#getting-started).


"task_type" is a variable for the type of task you plan to use the model for.
Some models can be used for multiple types of tasks, but transformers stores
different metadata depending on the task of the model, which affects how the model
is loaded later. Setting an Incorrect task_type, o leaving the task_type empty may affect the models performance
severely. Available task_types are the same as the names of our available UDFs, namely:
`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`,
`translation` and`zero_shot_classification`.
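
To see why the task type matters when loading, here is a hedged sketch of how a task type could map to the transformers Auto-class used for loading. The mapping and the model name below are illustrative assumptions, not the extension's internal table:

```python
import transformers

# Illustrative mapping (assumption): each task type loads the model through a
# different Auto* class, which expects task-specific heads and metadata.
TASK_TO_FACTORY = {
    "filling_mask": transformers.AutoModelForMaskedLM,
    "question_answering": transformers.AutoModelForQuestionAnswering,
    "sequence_classification": transformers.AutoModelForSequenceClassification,
    "text_generation": transformers.AutoModelForCausalLM,
    "token_classification": transformers.AutoModelForTokenClassification,
    "translation": transformers.AutoModelForSeq2SeqLM,
    "zero_shot_classification": transformers.AutoModelForSequenceClassification,
}

# Loading with the wrong class can drop or randomly initialize task-specific
# weights, which is why an incorrect or empty task_type can hurt performance.
model = TASK_TO_FACTORY["filling_mask"].from_pretrained("bert-base-uncased")
```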

### 2. Model Uploader Script
You can invoke the python script as below which allows to load the transformer
models from the local filesystem into BucketFS:
You can invoke the Python script as shown below to download transformer
models from the Hugging Face Hub to the local filesystem and, from there, upload them to the BucketFS:

```sh
python -m exasol_transformers_extension.upload_model <options>
```

#### List of options

@@ -309,26 +316,19 @@ Unless stated otherwise in the comments column, the option is required for either
| model-name | [x] | [x] | |
| path-in-bucket | [x] | [x] | Root location in the bucket for all models |
| sub-dir | [x] | [x] | Sub-directory where this model should be stored |
| task_type | [x] | [x] | Name of the task you want to use the model for |
| token | [x] | [x] | Huggingface token (needed for private models) |
| [no_]use-ssl-cert-validation | [x] | [x] | Optional boolean, defaults to True |

**Note**: The option --local-model-path needs to point to a path which contains the model and its tokenizer.
These should have been saved using the transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline)
function to ensure proper loading by the Transformers Extension UDFs.
You can download the model using Python like this:

```python
import transformers

model_name = "<model name on the Hugging Face Hub>"  # placeholder
model_save_path = "<your local model save path>"     # placeholder

for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]:
    # download the model or tokenizer from the Hugging Face Hub
    model = model_factory.from_pretrained(model_name)
    # save the download locally using the save_pretrained function
    model.save_pretrained(model_save_path)
```
***Note:*** Hugging Face models consist of two parts, the model and the tokenizer.
Make sure to download and save both into the same directory so the upload_model script uploads them together.
Then upload them using the exasol_transformers_extension.upload_model script with ```--local-model-path <your local model save path>```.


"task_type" is a variable for the type of task you plan to use the model for.
Some models can be used for multiple types of tasks, but transformers stores
different metadata depending on the task of the model, which affects how the model
is loaded later. Setting an Incorrect task_type, o leaving the task_type empty may affect the models performance
severely. Available task_types are the same as the names of our available UDFs, namely:
`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`,
`translation` and`zero_shot_classification`.

## Using Prediction UDFs
We provide 7 prediction UDFs in this Transformers Extension, each performing an NLP
task through the [transformers API](https://huggingface.co/docs/transformers/task_summary).
@@ -1,5 +1,6 @@
CREATE OR REPLACE {{ language_alias }} SET SCRIPT "TE_MODEL_DOWNLOADER_UDF"(
    model_name VARCHAR(2000000),
    task_type VARCHAR(2000000),
    sub_dir VARCHAR(2000000),
    bfs_conn VARCHAR(2000000),
    token_conn VARCHAR(2000000)
10 changes: 5 additions & 5 deletions exasol_transformers_extension/udfs/models/base_model_udf.py
@@ -10,7 +10,7 @@
from exasol_transformers_extension.deployment import constants
from exasol_transformers_extension.utils import device_management, \
bucketfs_operations, dataframe_operations
from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification
from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification
from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.model_specification import ModelSpecification
@@ -40,13 +40,13 @@ def __init__(self,
pipeline: transformers.Pipeline,
base_model: ModelFactoryProtocol,
tokenizer: ModelFactoryProtocol,
task_name: str):
task_type: str):
self.exa = exa
self.batch_size = batch_size
self.pipeline = pipeline
self.base_model = base_model
self.tokenizer = tokenizer
self.task_name = task_name
self.task_type = task_type
self.device = None
self.model_loader = None
self.last_created_pipeline = None
@@ -74,7 +74,7 @@ def create_model_loader(self):
self.model_loader = LoadLocalModel(pipeline_factory=self.pipeline,
base_model_factory=self.base_model,
tokenizer_factory=self.tokenizer,
task_name=self.task_name,
task_type=self.task_type,
device=self.device)

def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame:
@@ -185,7 +185,7 @@ def check_cache(self, model_df: pd.DataFrame) -> None:
model_name = model_df["model_name"].iloc[0]
bucketfs_conn = model_df["bucketfs_conn"].iloc[0]
sub_dir = model_df["sub_dir"].iloc[0]
current_model_specification = CurrentModelSpecification(model_name, bucketfs_conn, sub_dir)
current_model_specification = BucketFSModelSpecification(model_name, self.task_type, bucketfs_conn, sub_dir)

if self.model_loader.current_model_specification != current_model_specification:
bucketfs_location = \
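The check_cache logic above reloads the model only when the requested specification differs from the one currently loaded, and the specification now includes task_type. A minimal sketch of that reload-on-change pattern, with simplified, hypothetical names rather than the extension's actual classes:

```python
from dataclasses import dataclass
from typing import Callable, Optional

@dataclass(frozen=True)
class ModelSpec:
    model_name: str
    task_type: str
    bucketfs_conn: str
    sub_dir: str

class CachedLoader:
    def __init__(self, load_fn: Callable[[ModelSpec], object]):
        self._load_fn = load_fn
        self._current_spec: Optional[ModelSpec] = None
        self._model: Optional[object] = None

    def get(self, spec: ModelSpec):
        # Reload only when the requested spec differs from the cached one;
        # dataclass equality compares all fields, including task_type.
        if spec != self._current_spec:
            self._model = self._load_fn(spec)
            self._current_spec = spec
        return self._model
```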
@@ -14,7 +14,7 @@ def __init__(self,
base_model=transformers.AutoModelForMaskedLM,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='fill-mask')
tokenizer, task_type='fill-mask')
self._mask_token = "<mask>"
self._desired_fields_in_prediction = ["sequence", "score"]
self.new_columns = ["filled_text", "score", "rank", "error_message"]
@@ -3,8 +3,8 @@
import transformers

from exasol_transformers_extension.utils import bucketfs_operations
from exasol_transformers_extension.utils.current_model_specification import \
CurrentModelSpecificationFactory
from exasol_transformers_extension.utils.bucketfs_model_specification import \
BucketFSModelSpecificationFactory
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \
HuggingFaceHubBucketFSModelTransferSPFactory
@@ -24,13 +24,11 @@ class ModelDownloaderUDF:
"""
def __init__(self,
exa,
base_model_factory: ModelFactoryProtocol = transformers.AutoModel,
tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer,
huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory =
HuggingFaceHubBucketFSModelTransferSPFactory(),
current_model_specification_factory: CurrentModelSpecificationFactory = CurrentModelSpecificationFactory()):
current_model_specification_factory: BucketFSModelSpecificationFactory = BucketFSModelSpecificationFactory()):
self._exa = exa
self._base_model_factory = base_model_factory
self._tokenizer_factory = tokenizer_factory
self._huggingface_hub_bucketfs_model_transfer = huggingface_hub_bucketfs_model_transfer
self._current_model_specification_factory = current_model_specification_factory
@@ -47,9 +45,11 @@ def _download_model(self, ctx) -> Tuple[str, str]:
bfs_conn = ctx.bfs_conn # BucketFS connection
token_conn = ctx.token_conn # name of token connection
current_model_specification = self._current_model_specification_factory.create(ctx.model_name,
bfs_conn,
ctx.sub_dir) # specifies details of Huggingface model
ctx.task_type,
bfs_conn,
ctx.sub_dir) # specifies details of Huggingface model

model_factory = current_model_specification.get_model_factory()
# extract token from the connection if token connection name is given.
# note that, token is required for private models. It doesn't matter
# whether there is a token for public model or even what the token is.
@@ -72,7 +72,7 @@ def _download_model(self, ctx) -> Tuple[str, str]:
model_path=model_path,
token=token
) as downloader:
for model in [self._base_model_factory, self._tokenizer_factory]:
for model in [model_factory, self._tokenizer_factory]:
downloader.download_from_huggingface_hub(model)
# upload model files to BucketFS
model_tar_file_path = downloader.upload_to_bucketfs()
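With this change, the downloader derives the base model factory from the model specification (and therefore from task_type) instead of always using transformers.AutoModel. A hedged sketch of what get_model_factory could do internally; the mapping below is an illustrative assumption, not the extension's actual implementation:

```python
import transformers

# Illustrative stand-in (assumption) for
# BucketFSModelSpecification.get_model_factory().
def get_model_factory(task_type: str):
    mapping = {
        "fill-mask": transformers.AutoModelForMaskedLM,
        "text-classification": transformers.AutoModelForSequenceClassification,
        "text-generation": transformers.AutoModelForCausalLM,
        "token-classification": transformers.AutoModelForTokenClassification,
        "translation": transformers.AutoModelForSeq2SeqLM,
    }
    # Fall back to the generic AutoModel for unrecognized task types.
    return mapping.get(task_type, transformers.AutoModel)
```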
@@ -13,7 +13,7 @@ def __init__(self,
base_model=transformers.AutoModelForSequenceClassification,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='text-classification')
tokenizer, task_type='text-classification')
self.new_columns = ["label", "score", "error_message"]

def extract_unique_param_based_dataframes(
@@ -13,7 +13,7 @@ def __init__(self,
base_model=transformers.AutoModelForSequenceClassification,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='text-classification')
tokenizer, task_type='text-classification')
self.new_columns = ["label", "score", "error_message"]

def extract_unique_param_based_dataframes(
@@ -14,7 +14,7 @@ def __init__(self,
base_model=transformers.AutoModelForCausalLM,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='text-generation')
tokenizer, task_type='text-generation')
self.new_columns = ["generated_text", "error_message"]

def extract_unique_param_based_dataframes(
@@ -14,7 +14,7 @@ def __init__(self,
base_model=transformers.AutoModelForTokenClassification,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='token-classification')
tokenizer, task_type='token-classification')
self._default_aggregation_strategy = 'simple'
self._desired_fields_in_prediction = [
"start", "end", "word", "entity", "score"]
@@ -14,7 +14,7 @@ def __init__(self,
base_model=transformers.AutoModelForSeq2SeqLM,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, task_name='translation')
tokenizer, task_type='translation')
self._translation_prefix = "translate {src_lang} to {target_lang}: "
self.new_columns = ["translation_text", "error_message"]
