diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index 7e851d0e..fb6f1a51 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -498,6 +498,7 @@ SELECT TE_TOKEN_CLASSIFICATION_UDF( The inference results are presented with _START_POS_ indicating the index of the starting character of the token, _END_POS_ indicating the index of the ending character of the token, _WORD_ indicating the token, predicted _ENTITY_, and confidence _SCORE_ columns, combined with the inputs used when calling this UDF. +If the model returns an empty result for an input row, that row is dropped entirely and does not appear in the result set. In case of any error during model loading or prediction, these new columns are set to `null`, and column _ERROR_MESSAGE_ is set to the stacktrace of the error. For example: diff --git a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py index 46fbe2e5..81f94416 100644 --- a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py @@ -7,6 +7,8 @@ from exasol_transformers_extension.udfs.models.token_classification_udf import \ TokenClassificationUDF +from exasol_transformers_extension.utils.model_specification import ModelSpecification +from tests.fixtures.model_fixture_utils import prepare_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, NoErrorMessageMatcher, ColumnsMatcher @@ -125,27 +127,31 @@ def test_token_classification_udf( ("on GPU with single input with max aggregation", 0, 1, "max") ]) def test_token_classification_udf_with_span( - description, device_id, n_rows, agg, + description, device_id, n_rows, agg,tmpdir_factory, prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not 
torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") + model_spec = ModelSpecification("guishe/nuner-v2_fewnerd_fine_super", "token-classification") + bucketfs_path = prepare_model_for_local_bucketfs(model_spec, tmpdir_factory) + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" - bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) - + bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_path) + text_data = "Foreign governments may be spying on your smartphone notifications, senator says. Washington (CNN) — Foreign governments have reportedly attempted to spy on iPhone and Android users through the mobile app notifications they receive on their smartphones - and the US government has forced Apple and Google to keep quiet about it, according to a top US senator. Through legal demands sent to the tech giants, governments have allegedly tried to force Apple and Google to turn over sensitive information that could include the contents of a notification - such as previews of a text message displayed on a lock screen, or an update about app activity, Oregon Democratic Sen. Ron Wyden said in a new report. Wyden''s report reflects the latest example of long-running tensions between tech companies and governments over law enforcement demands, which have stretched on for more than a decade. Governments around the world have particularly battled with tech companies over encryption, which provides critical protections to users and businesses while in some cases preventing law enforcement from pursuing investigations into messages sent over the internet.'" + text_data2 = "This is a test." 
batch_size = 2 sample_data = [( None, bucketfs_conn_name, model_params.sub_dir, - model_params.token_model_specs.model_name, - model_params.text_data * (i + 1), + 'guishe/nuner-v2_fewnerd_fine_super', + text_data2, i, 0, len(model_params.text_data), - agg + "simple" ) for i in range(n_rows)] columns = [ 'device_id', @@ -177,12 +183,15 @@ def test_token_classification_udf_with_span( sequence_classifier.run(ctx) result_df = ctx.get_emitted()[0][0] + with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also + + print(result_df) new_columns = \ ['entity_covered_text', 'entity_type', 'score', 'entity_docid', 'entity_char_begin', 'entity_char_end', 'error_message'] result = Result(result_df) - assert ( + assert (False and result == ColumnsMatcher(columns=old_columns, new_columns=new_columns) and result == NoErrorMessageMatcher() ) @@ -217,7 +226,7 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( 'device_id', 'bucketfs_conn', 'sub_dir', - 'model_name', + 'guishe/nuner-v2_fewnerd_fine_super', 'text_data', 'aggregation_strategy' ] diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py index aa3e7a04..450cad58 100644 --- a/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py +++ b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py @@ -15,12 +15,11 @@ class PredictionReturnsEmptyResult: n_entities = 3 text_data = "error_result_empty" - # TODO mention in docu if result is empty row not in output input_data = make_input_row() * data_size + \ make_input_row(text_data=text_data) * data_size + \ make_input_row() * data_size output_data = make_output_row() * n_entities * data_size + \ - make_output_row() * n_entities * data_size# Result of input #2 is empty, so the row does not 
appear in the output + make_output_row() * n_entities * data_size # Result of input #2 is empty, so the row does not appear in the output work_with_span_input_data = make_input_row_with_span() * data_size + \ make_input_row_with_span(text_data=text_data) * data_size + \