diff --git a/doc/changes/changes_0.8.0.md b/doc/changes/changes_0.8.0.md
index 751d219a..1d88f3c7 100644
--- a/doc/changes/changes_0.8.0.md
+++ b/doc/changes/changes_0.8.0.md
@@ -9,7 +9,7 @@ T.B.D
 
 ### Features
 
- - n/a
+ - #145: Added `LoadLocalModel` for loading locally saved models and tokenizers
 
 ### Bug Fixes
 
diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py
new file mode 100644
index 00000000..067d3ea5
--- /dev/null
+++ b/exasol_transformers_extension/utils/load_local_model.py
@@ -0,0 +1,58 @@
+import torch
+import transformers.pipelines
+from transformers import AutoModel, AutoTokenizer
+from pathlib import Path
+
+
+class LoadLocalModel:
+    """
+    Loads locally saved models and tokenizers and keeps track of the currently loaded model, tokenizer, and pipeline.
+
+    :param pipeline: factory for creating the model pipeline
+    :param task_name: name of the current task
+    :param device: device to be used for pipeline creation
+    """
+    def __init__(self,
+                 pipeline,
+                 task_name,
+                 device
+                 ):
+        self.pipeline = pipeline
+        self.task_name = task_name
+        self.device = device
+        self.last_loaded_model = None
+        self.last_loaded_tokenizer = None
+        self.last_loaded_model_key = None
+
+    def load_models(self, model_name: str,
+                    current_model_key,
+                    cache_dir: Path
+                    ) -> transformers.pipelines.Pipeline:
+        """
+        Loads a locally saved model and tokenizer from cache_dir / "pretrained" / model_name
+        and returns a new pipeline corresponding to the model and task.
+
+        :param model_name: name of the model to be loaded
+        :param current_model_key: key identifying the model to be loaded
+        :param cache_dir: location of the saved model
+        """
+        model_path = str(cache_dir / "pretrained" / model_name)
+        self.last_loaded_model = AutoModel.from_pretrained(model_path)
+        self.last_loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        last_created_pipeline = self.pipeline(
+            self.task_name,
+            model=self.last_loaded_model,
+            tokenizer=self.last_loaded_tokenizer,
+            device=self.device,
+            framework="pt")
+        self.last_loaded_model_key = current_model_key
+        return last_created_pipeline
+
+    def clear_device_memory(self):
+        """
+        Drops the references to the loaded model and tokenizer and frees the device memory.
+        """
+        self.last_loaded_model = None
+        self.last_loaded_tokenizer = None
+        torch.cuda.empty_cache()
diff --git a/exasol_transformers_extension/utils/model_factory_protocol.py b/exasol_transformers_extension/utils/model_factory_protocol.py
index 48fc390f..a87afccf 100644
--- a/exasol_transformers_extension/utils/model_factory_protocol.py
+++ b/exasol_transformers_extension/utils/model_factory_protocol.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Protocol, Union, runtime_checkable
+from typing import Protocol, Union, runtime_checkable, Optional
 
 import transformers
 
@@ -9,8 +9,17 @@ class ModelFactoryProtocol(Protocol):
     """
     Protocol for better type hints.
     """
-    def from_pretrained(self, model_name: str, cache_dir: Path, use_auth_token: str) -> transformers.PreTrainedModel:
+    def from_pretrained(self, model_name: str, cache_dir: Optional[Path] = None,
+                        use_auth_token: Optional[str] = None) -> transformers.PreTrainedModel:
+        """
+        Either downloads a model from the Huggingface Hub (all parameters required)
+        or loads a locally saved model from file (only the model path is required).
+
+        :param model_name: model name, or path to the locally saved model files
+        :param cache_dir: optional path where the downloaded model should be cached
+        :param use_auth_token: optional token for private models on the Huggingface Hub
+        """
         pass
 
     def save_pretrained(self, save_directory: Union[str, Path]):
-        pass
\ No newline at end of file
+        pass
diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py b/tests/integration_tests/without_db/utils/test_load_local_model.py
new file mode 100644
index 00000000..7e67a117
--- /dev/null
+++ b/tests/integration_tests/without_db/utils/test_load_local_model.py
@@ -0,0 +1,42 @@
+import tempfile
+from pathlib import Path
+
+from transformers import AutoModel, AutoTokenizer
+
+from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
+
+from tests.utils.parameters import model_params
+
+from tests.fixtures.bucketfs_fixture import bucketfs_location
+
+
+class TestSetup:
+    def __init__(self, bucketfs_location):
+        self.bucketfs_location = bucketfs_location
+        self.model_name = model_params.tiny_model
+        self.mock_current_model_key = None
+
+        mock_pipeline = lambda task_name, model, tokenizer, device, framework: None
+        self.loader = LoadLocalModel(
+            mock_pipeline,
+            task_name="test_task",
+            device=0)
+
+
+def test_integration(bucketfs_location):
+    test_setup = TestSetup(bucketfs_location)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        cache_dir = Path(tmpdir)
+        model_save_path = cache_dir / "pretrained" / test_setup.model_name
+        # Download the model and save it where load_models expects to find it.
+        model = AutoModel.from_pretrained(test_setup.model_name)
+        tokenizer = AutoTokenizer.from_pretrained(test_setup.model_name)
+        model.save_pretrained(model_save_path)
+        tokenizer.save_pretrained(model_save_path)
+
+        test_setup.loader.load_models(model_name=test_setup.model_name,
+                                      current_model_key=test_setup.mock_current_model_key,
+                                      cache_dir=cache_dir)
+        assert test_setup.loader.last_loaded_model is not None
+        assert test_setup.loader.last_loaded_tokenizer is not None
diff --git a/tests/unit_tests/utils/test_load_local_model.py b/tests/unit_tests/utils/test_load_local_model.py
new file mode 100644
index 00000000..4fb4d27f
--- /dev/null
+++ b/tests/unit_tests/utils/test_load_local_model.py
@@ -0,0 +1,41 @@
+import tempfile
+from pathlib import Path
+from typing import Union
+from unittest.mock import create_autospec, patch, MagicMock, call
+
+from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation
+
+from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
+
+from tests.utils.parameters import model_params
+
+
+class TestSetup:
+    def __init__(self):
+        self.bucketfs_location_mock: Union[BucketFSLocation, MagicMock] = create_autospec(BucketFSLocation)
+        self.model_name = model_params.tiny_model
+
+        mock_pipeline = lambda task_name, model, tokenizer, device, framework: None
+        self.loader = LoadLocalModel(
+            mock_pipeline,
+            task_name="test_task",
+            device=0)
+
+
+def test_load_function_call():
+    test_setup = TestSetup()
+    mock_current_model_key = None
+    with tempfile.TemporaryDirectory() as tmpdir:
+        cache_dir = Path(tmpdir)
+        model_save_path = cache_dir / "pretrained" / test_setup.model_name
+
+        # Patch the transformers factories so the test runs without saved model files.
+        with patch("exasol_transformers_extension.utils.load_local_model.AutoModel") as mock_model_factory, \
+                patch("exasol_transformers_extension.utils.load_local_model.AutoTokenizer") as mock_tokenizer_factory:
+            test_setup.loader.load_models(model_name=test_setup.model_name,
+                                          current_model_key=mock_current_model_key,
+                                          cache_dir=cache_dir)
+
+        assert mock_model_factory.mock_calls == [call.from_pretrained(str(model_save_path))]
+        assert mock_tokenizer_factory.mock_calls == [call.from_pretrained(str(model_save_path))]
+        assert test_setup.loader.last_loaded_model_key == mock_current_model_key
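
Note for reviewers: below is a minimal usage sketch of the new `LoadLocalModel` class (not part of the diff). It assumes a model was previously saved under `cache_dir / "pretrained" / model_name` via `save_pretrained`; the cache path, model name, and model key are hypothetical placeholder values, and `transformers.pipeline` is one possible pipeline factory matching the call signature used in `load_models`.

```python
from pathlib import Path

import transformers

from exasol_transformers_extension.utils.load_local_model import LoadLocalModel

# Hypothetical values: any model previously saved under <cache_dir>/pretrained/<model_name>
# (e.g. with AutoModel.save_pretrained / AutoTokenizer.save_pretrained) would work here.
cache_dir = Path("/tmp/model_cache")
model_name = "prajjwal1/bert-tiny"

loader = LoadLocalModel(
    transformers.pipeline,           # factory called as pipeline(task, model=..., tokenizer=..., device=..., framework=...)
    task_name="feature-extraction",
    device=-1)                       # -1 selects the CPU for transformers pipelines

my_pipeline = loader.load_models(model_name=model_name,
                                 current_model_key="bert-tiny-v1",  # hypothetical model key
                                 cache_dir=cache_dir)

result = my_pipeline("some input text")

loader.clear_device_memory()         # drop model references and empty the CUDA cache
```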