
Commit

docu update
MarleneKress79789 committed Nov 21, 2024
1 parent 11dcedc commit 35e5fe8
Showing 3 changed files with 19 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/user_guide/user_guide.md
@@ -498,6 +498,7 @@ SELECT TE_TOKEN_CLASSIFICATION_UDF(
The inference results are presented in additional columns: _START_POS_ holds the index of the token's starting character,
_END_POS_ the index of its ending character, _WORD_ the token itself, _ENTITY_ the predicted entity, and
_SCORE_ the prediction confidence; these columns are combined with the inputs used when calling this UDF.
If the model returns an empty result for an input row, that row is dropped entirely and does not appear in the result set.
If any error occurs during model loading or prediction, these new
columns are set to `null`, and the _ERROR_MESSAGE_ column is set
to the stacktrace of the error. For example:
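The guide's own example output lies outside the lines shown in this hunk. As an independent illustration of the behaviour described above, the following sketch uses plain pandas with hypothetical function and column names (it is not the extension's UDF code): rows whose model result is empty are dropped, while rows that raise an error keep their inputs but get `null` result columns and the stacktrace in _ERROR_MESSAGE_.

```python
# Illustrative sketch only -- not the extension's implementation.
import traceback

import pandas as pd


def fake_pipeline(text):
    """Stand-in for a token-classification model (hypothetical)."""
    if text == "no entities here":
        return []                       # empty result -> row is dropped
    if text == "broken input":
        raise ValueError("model failed")
    return [{"word": "Wyden", "entity": "PER", "score": 0.98,
             "start": 0, "end": 5}]


rows = []
for text in ["Wyden spoke.", "no entities here", "broken input"]:
    try:
        entities = fake_pipeline(text)
    except Exception:
        # error: keep the input, null result columns, stacktrace in error_message
        rows.append({"text_data": text, "start_pos": None, "end_pos": None,
                     "word": None, "entity": None, "score": None,
                     "error_message": traceback.format_exc()})
        continue
    for ent in entities:                # empty list -> no output row at all
        rows.append({"text_data": text, "start_pos": ent["start"],
                     "end_pos": ent["end"], "word": ent["word"],
                     "entity": ent["entity"], "score": ent["score"],
                     "error_message": None})

print(pd.DataFrame(rows))               # "no entities here" does not appear
```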
@@ -7,6 +7,8 @@

from exasol_transformers_extension.udfs.models.token_classification_udf import \
TokenClassificationUDF
from exasol_transformers_extension.utils.model_specification import ModelSpecification
from tests.fixtures.model_fixture_utils import prepare_model_for_local_bucketfs

from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \
ErrorMessageMatcher, NoErrorMessageMatcher, ColumnsMatcher
@@ -125,27 +127,31 @@ def test_token_classification_udf(
("on GPU with single input with max aggregation", 0, 1, "max")
])
def test_token_classification_udf_with_span(
description, device_id, n_rows, agg,
description, device_id, n_rows, agg, tmpdir_factory,
prepare_token_classification_model_for_local_bucketfs):
if device_id is not None and not torch.cuda.is_available():
pytest.skip(f"There is no available device({device_id}) "
f"to execute the test")

model_spec = ModelSpecification("guishe/nuner-v2_fewnerd_fine_super", "token-classification")
bucketfs_path = prepare_model_for_local_bucketfs(model_spec, tmpdir_factory)

bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs
bucketfs_conn_name = "bucketfs_connection"
bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path)

bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_path)
text_data = "Foreign governments may be spying on your smartphone notifications, senator says. Washington (CNN) — Foreign governments have reportedly attempted to spy on iPhone and Android users through the mobile app notifications they receive on their smartphones - and the US government has forced Apple and Google to keep quiet about it, according to a top US senator. Through legal demands sent to the tech giants, governments have allegedly tried to force Apple and Google to turn over sensitive information that could include the contents of a notification - such as previews of a text message displayed on a lock screen, or an update about app activity, Oregon Democratic Sen. Ron Wyden said in a new report. Wyden''s report reflects the latest example of long-running tensions between tech companies and governments over law enforcement demands, which have stretched on for more than a decade. Governments around the world have particularly battled with tech companies over encryption, which provides critical protections to users and businesses while in some cases preventing law enforcement from pursuing investigations into messages sent over the internet.'"
text_data2 = "This is a test."
batch_size = 2
sample_data = [(
None,
bucketfs_conn_name,
model_params.sub_dir,
model_params.token_model_specs.model_name,
model_params.text_data * (i + 1),
'guishe/nuner-v2_fewnerd_fine_super',
text_data2,
i,
0,
len(model_params.text_data),
agg
"simple"
) for i in range(n_rows)]
columns = [
'device_id',
@@ -177,12 +183,15 @@ def test_token_classification_udf_with_span(
sequence_classifier.run(ctx)

result_df = ctx.get_emitted()[0][0]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(result_df)
new_columns = \
['entity_covered_text', 'entity_type', 'score', 'entity_docid',
'entity_char_begin', 'entity_char_end', 'error_message']

result = Result(result_df)
assert (
assert (False and
result == ColumnsMatcher(columns=old_columns, new_columns=new_columns)
and result == NoErrorMessageMatcher()
)
@@ -217,7 +226,7 @@ def test_token_classification_udf_with_multiple_aggregation_strategies(
'device_id',
'bucketfs_conn',
'sub_dir',
'model_name',
'guishe/nuner-v2_fewnerd_fine_super',
'text_data',
'aggregation_strategy'
]
@@ -15,12 +15,11 @@ class PredictionReturnsEmptyResult:
n_entities = 3

text_data = "error_result_empty"
# TODO mention in docu if result is empty row not in output
input_data = make_input_row() * data_size + \
make_input_row(text_data=text_data) * data_size + \
make_input_row() * data_size
output_data = make_output_row() * n_entities * data_size + \
make_output_row() * n_entities * data_size# Result of input #2 is empty, so the row does not appear in the output
make_output_row() * n_entities * data_size # Result of input #2 is empty, so the row does not appear in the output

work_with_span_input_data = make_input_row_with_span() * data_size + \
make_input_row_with_span(text_data=text_data) * data_size + \
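The expectation encoded in `output_data` above follows from simple row counting. A minimal sketch of that arithmetic, with `make_input_row`/`make_output_row` re-created as trivial stand-ins and `data_size` assumed (only `n_entities = 3` is visible in the snippet):

```python
# Row-count sketch for PredictionReturnsEmptyResult (stand-in helpers,
# assumed data_size; not the repository's actual fixture code).
data_size = 5       # assumed value, not shown in the snippet
n_entities = 3


def make_input_row(text_data="some text"):
    return [{"text_data": text_data}]


def make_output_row():
    return [{"entity_covered_text": "x"}]


input_data = (make_input_row() * data_size
              + make_input_row(text_data="error_result_empty") * data_size
              + make_input_row() * data_size)

# The middle block yields an empty model result, so it contributes no rows
# to the expected output; only the first and third blocks appear, each
# expanded to n_entities output rows per input row.
output_data = (make_output_row() * n_entities * data_size
               + make_output_row() * n_entities * data_size)

assert len(input_data) == 3 * data_size
assert len(output_data) == 2 * n_entities * data_size
```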
