Skip to content

Commit

Permalink
documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
MarleneKress79789 committed Feb 9, 2024
1 parent 93efc77 commit fb944bc
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 21 deletions.
1 change: 1 addition & 0 deletions doc/changes/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Changelog

* [0.10.0](changes_0.10.0.md)
* [0.9.0](changes_0.9.0.md)
* [0.8.0](changes_0.8.0.md)
* [0.7.0](changes_0.7.0.md)
Expand Down
19 changes: 19 additions & 0 deletions doc/changes/changes_0.10.0.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Transformers Extension 0.10.0, T.B.D

Code name: T.B.D


## Summary
T.B.D


### Features

- #146: Integrated new download and load functions using save_pretrained
### Refactorings


### Security


5 changes: 4 additions & 1 deletion doc/user_guide/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,10 @@ Before you can use pre-trained models, the models must be stored in the
BucketFS. We provide two different ways to load transformers models
into BucketFS:


### 1. Model Downloader UDF
Using the `TE_MODEL_DOWNLOADER_UDF` below, you can download the desired model
from the huggingface hub and upload it to bucketfs.
from the huggingface hub and upload it to BucketFS.

```sql
SELECT TE_MODEL_DOWNLOADER_UDF(
Expand Down Expand Up @@ -274,6 +275,8 @@ models from the local filesystem into BucketFS:
```

*Note*: The option --local-model-path needs to point to a path which contains the model and its tokenizer.
These should have been saved using transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline)
function to ensure proper loading by the Transformers Extension UDFs.

## Prediction UDFs
We provided 7 prediction UDFs, each performing an NLP task through the [transformers API](https://huggingface.co/docs/transformers/task_summary).
Expand Down
21 changes: 17 additions & 4 deletions exasol_transformers_extension/udfs/models/model_downloader_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@


class ModelDownloaderUDF:
"""
UDF which downloads a pretrained model from Huggingface using Huggingface's transformers API,
    and uploads it to the BucketFS, from where it can then be loaded without accessing Huggingface again.
Must be called with the following Input Table:
model_name | sub_dir | bfs_conn | token_conn
---------------------------------------------------------------------------------------------------
name of Huggingface model | directory to save model | BucketFS connection | name of token connection
    returns <sub_dir/model_name>, <path of the model in BucketFS>
"""
#todo docu
def __init__(self,
exa,
base_model_factory: ModelFactoryProtocol = transformers.AutoModel,
Expand All @@ -32,10 +44,10 @@ def run(self, ctx) -> None:

def _download_model(self, ctx) -> Tuple[str, str]:
# parameters
model_name = ctx.model_name
sub_dir = ctx.sub_dir
bfs_conn = ctx.bfs_conn
token_conn = ctx.token_conn
model_name = ctx.model_name # name of Huggingface model
sub_dir = ctx.sub_dir # directory to save model
bfs_conn = ctx.bfs_conn # BucketFS connection
token_conn = ctx.token_conn # name of token connection

# extract token from the connection if token connection name is given.
# note that, token is required for private models. It doesn't matter
Expand Down Expand Up @@ -65,6 +77,7 @@ def _download_model(self, ctx) -> Tuple[str, str]:
) as downloader:
for model in [self._base_model_factory, self._tokenizer_factory]:
downloader.download_from_huggingface_hub(model)
# upload model files to BucketFS
model_tar_file_path = downloader.upload_to_bucketfs()

return str(model_path), str(model_tar_file_path)
4 changes: 4 additions & 0 deletions exasol_transformers_extension/upload_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ def main(
model_name: str,
sub_dir: str,
local_model_path: str):
"""
Script for uploading locally saved model files to BucketFS. Files should have been saved locally
using Transformers save_pretrained function. This ensures proper loading from the BucketFS later
"""
# create bucketfs location
bucketfs_location = bucketfs_operations.create_bucketfs_location(
bucketfs_name, bucketfs_host, bucketfs_port, bucketfs_use_https,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,15 @@
from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory





class HuggingFaceHubBucketFSModelTransferSP:
"""
Class for downloading a model using the Huggingface Transformers API, and loading it into the BucketFS
using save_pretrained.
Class for downloading a model using the Huggingface Transformers API, saving it locally using
transformers save_pretrained, and loading the saved model files into the BucketFS.
:bucketfs_location: BucketFSLocation the model should be loaded to
:model_name: Name of the model to be downloaded using Huggingface Transformers API
:model_path: Path the model will be loaded into the BucketFS at
:token: Huggingface token, only needed for private models
:bucketfs_location: BucketFSLocation the model should be loaded to
:model_name: Name of the model to be downloaded using Huggingface Transformers API
:model_path: Path the model will be loaded into the BucketFS at
:token: Huggingface token, only needed for private models
:temporary_directory_factory: Optional. Default is TemporaryDirectoryFactory. Mainly change for testing.
:bucketfs_model_uploader_factory: Optional. Default is BucketFSModelUploaderFactory. Mainly change for testing.
"""
Expand Down Expand Up @@ -51,9 +48,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol):
"""
Download a model from HuggingFace Hub into a temporary directory and save it with save_pretrained
in temporary directory / pretrained .
in temporary directory / pretrained / model_name.
"""
model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache", use_auth_token=self._token)
model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache",
use_auth_token=self._token)
model.save_pretrained(self._tmpdir_name / "pretrained" / self._model_name)

def upload_to_bucketfs(self) -> Path:
Expand All @@ -62,7 +60,7 @@ def upload_to_bucketfs(self) -> Path:
returns: Path of the uploaded model in the BucketFS
"""
return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name)
        return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name) # TODO: should we do replace(-,_) here too?


class HuggingFaceHubBucketFSModelTransferSPFactory:
Expand Down
6 changes: 3 additions & 3 deletions exasol_transformers_extension/utils/load_local_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class LoadLocalModel:
Class for loading locally saved models and tokenizers. Also stores information regarding the model and pipeline.
:_pipeline_factory: a function to create a transformers pipeline
:task_name: name of the current task
:device: device to be used for pipeline creation
:task_name: name of the current task
    :device: device to be used for pipeline creation, e.g. "CPU"
:_base_model_factory: a ModelFactoryProtocol for creating the loaded model
:_tokenizer_factory: a ModelFactoryProtocol for creating the loaded tokenizer
"""
Expand Down Expand Up @@ -39,7 +39,7 @@ def load_models(self,
current_model_key: str
) -> transformers.pipelines.Pipeline:
"""
Loads a locally saved model and tokenizer from "cache_dir / "pretrained" / model_name".
Loads a locally saved model and tokenizer from model_path.
Returns new pipeline corresponding to the model and task.
:model_path: location of the saved model and tokenizer
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "exasol-transformers-extension"
version = "0.9.0"
version = "0.10.0"
description = "An Exasol extension to use state-of-the-art pretrained machine learning models via the transformers api."

authors = [
Expand Down

0 comments on commit fb944bc

Please sign in to comment.