Skip to content

Commit

Permalink
#221 Fixing the uploader directory structure (#212)
Browse files Browse the repository at this point in the history
* #221 Added an integration test for a model upload and prediction (should fail) [CodeBuild]

* #221 Moved the version number (the test should fail) [CodeBuild]

* #221 Fixed the directory structure the uploader creates [CodeBuild]

* #221 Fixed the directory structure the uploader creates [CodeBuild]

* #221 Fixed the integration tests [CodeBuild]

* #221 Updated the unit test test_model_downloader [CodeBuild]
  • Loading branch information
ahsimb authored Apr 25, 2024
1 parent 1c7c26f commit 14e7007
Show file tree
Hide file tree
Showing 9 changed files with 597 additions and 580 deletions.
1 change: 1 addition & 0 deletions doc/changes/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Changelog

* [1.0.1](changes_1.0.1.md)
* [1.0.0](changes_1.0.0.md)
* [0.10.0](changes_0.10.0.md)
* [0.9.2](changes_0.9.2.md)
Expand Down
24 changes: 24 additions & 0 deletions doc/changes/changes_1.0.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Transformers Extension 1.0.1, 2024-04-25

Code name: Fixed the directory structure bug


## Summary

Fixed the directory structure made by the model upload UDF.

### Bugs

- #221: The directory structure that the Model Upload UDF creates is different from what the prediction UDFs expect.

### Features

N/A

### Refactorings

N/A

### Security

N/A
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _download_model(self, ctx) -> Tuple[str, str]:
token = token_conn_obj.password

# set model path in buckets
model_path = bucketfs_operations.get_model_path(sub_dir, model_name)
model_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name)

# create bucketfs location
bfs_conn_obj = self._exa.get_connection(bfs_conn)
Expand Down
1,069 changes: 499 additions & 570 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "exasol-transformers-extension"
version = "1.0.0"
version = "1.0.1"
description = "An Exasol extension for using state-of-the-art pretrained machine learning models via the Hugging Face Transformers API."

authors = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_model_downloader_udf_script(
for i in range(n_rows):
sub_dir = SUB_DIR.format(id=i)
sub_dirs.append(sub_dir)
model_paths.append(bucketfs_operations.get_model_path(
model_paths.append(bucketfs_operations.get_model_path_with_pretrained(
sub_dir, model_params.tiny_model))
input_data.append((
model_params.tiny_model,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import time
from tests.utils import postprocessing

# Sub-directory passed as `sub_dir` to both UDF calls and to the bucket cleanup
# that runs after the test.
SUB_DIR = 'test_downloader_with_prediction_sub_dir'
# Model name passed to the downloader UDF and then to the filling mask UDF.
MODEL_NAME = 'gaunernst/bert-tiny-uncased'


def test_prediction_with_downloader_udf(
        setup_database, pyexasol_connection, bucketfs_location):
    """Download a model with the downloader UDF, then predict with it.

    The model is first fetched through ``TE_MODEL_DOWNLOADER_UDF`` and is then
    used by ``TE_FILLING_MASK_UDF``.  The test passes when the prediction
    returns exactly ``top_k`` rows and the last column of every row is
    ``None``.  The sub-directory is cleaned up afterwards, whether or not the
    test succeeded.
    """
    bucketfs_conn_name, _ = setup_database

    try:
        # Step 1: run the downloader UDF (empty token connection name).
        download_args = (MODEL_NAME, SUB_DIR, bucketfs_conn_name, '')
        download_query = (
            "\n"
            "SELECT TE_MODEL_DOWNLOADER_UDF(\n"
            "t.model_name,\n"
            "t.sub_dir,\n"
            "t.bucketfs_conn_name,\n"
            "t.token_conn_name\n"
            f") FROM (VALUES {str(download_args)} AS\n"
            "t(model_name, sub_dir, bucketfs_conn_name, token_conn_name));\n"
        )
        pyexasol_connection.execute(download_query).fetchall()

        # NOTE(review): presumably gives BucketFS time to make the uploaded
        # model available before the prediction runs -- confirm.
        time.sleep(10)

        # Step 2: run the filling mask UDF against the downloaded model
        # (empty device id).
        top_k = 3
        masked_text = "I <mask> you so much."
        predict_args = (
            '', bucketfs_conn_name, SUB_DIR, MODEL_NAME, masked_text, top_k)
        predict_query = (
            "SELECT TE_FILLING_MASK_UDF("
            "t.device_id, t.bucketfs_conn_name, t.sub_dir, "
            "t.model_name, t.text_data,t.top_k"
            f") FROM (VALUES {str(predict_args)} "
            "AS t(device_id, bucketfs_conn_name, sub_dir, "
            "model_name, text_data, top_k));"
        )
        rows = pyexasol_connection.execute(predict_query).fetchall()

        # One row per requested candidate; the trailing column must be empty
        # (presumably the error message column -- confirm against the UDF).
        assert len(rows) == top_k
        assert not any(row[-1] is not None for row in rows)

    finally:
        postprocessing.cleanup_buckets(bucketfs_location, SUB_DIR)
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, id: str, url_localfs: str, token_conn_name: str):
'bucketfs_conn_name': self.bucketfs_conn_name,
'token_conn_name': self.token_conn_name
}
self.model_path = bucketfs_operations.get_model_path(
self.model_path = bucketfs_operations.get_model_path_with_pretrained(
self.sub_dir, self.tiny_model)
self.bucketfs_connection = Connection(
address=f"{url_localfs}/bucket{id}",
Expand Down Expand Up @@ -118,7 +118,7 @@ def test_model_downloader_udf_implementation():
# assertions
env1_bucketfs_files = env1.list_files_in_bucketfs()
env2_bucketfs_files = env2.list_files_in_bucketfs()
assert ctx.get_emitted()[0] == (str(env1.model_path), str(env1.model_path.with_suffix(".tar.gz"))) \
and ctx.get_emitted()[1] == (str(env2.model_path), str(env2.model_path.with_suffix(".tar.gz"))) \
and str(Path(ctx.get_emitted()[0][1]).relative_to(env1.sub_dir)) in env1_bucketfs_files \
and str(Path(ctx.get_emitted()[1][1]).relative_to(env2.sub_dir)) in env2_bucketfs_files
assert ctx.get_emitted()[0] == (str(env1.model_path), str(env1.model_path.with_suffix(".tar.gz")))
assert ctx.get_emitted()[1] == (str(env2.model_path), str(env2.model_path.with_suffix(".tar.gz")))
assert str(Path(ctx.get_emitted()[0][1]).relative_to(env1.sub_dir)) in env1_bucketfs_files
assert str(Path(ctx.get_emitted()[1][1]).relative_to(env2.sub_dir)) in env2_bucketfs_files
4 changes: 2 additions & 2 deletions tests/unit_tests/udfs/test_model_downloader_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_model_downloader(description, count, token_conn_name, token_conn_obj, e
assert mock_cast(mock_model_downloader_factory.create).mock_calls == [
call(bucketfs_location=mock_bucketfs_locations[i],
model_name=base_model_names[i],
model_path=PosixPath(f'{sub_directory_names[i]}/{base_model_names[i]}'),
model_path=PosixPath(f'{sub_directory_names[i]}/{base_model_names[i]}/pretrained/{base_model_names[i]}'),
token=expected_token)
for i in range(count)
]
Expand All @@ -107,7 +107,7 @@ def test_model_downloader(description, count, token_conn_name, token_conn_obj, e
])
assert mock_ctx.output == [
(
f'{sub_directory_names[i]}/{base_model_names[i]}',
f'{sub_directory_names[i]}/{base_model_names[i]}/pretrained/{base_model_names[i]}',
str(mock_model_downloaders[i].upload_to_bucketfs())
)
for i in range(count)
Expand Down

0 comments on commit 14e7007

Please sign in to comment.