From c1bf935105a0e9cb24d9a8aac8605cf24cc9f393 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marlene=20Kre=C3=9F?= <marlene.kress@gmx.de>
Date: Thu, 5 Dec 2024 14:23:42 +0100
Subject: [PATCH] Bug: #275 fix error on no token found (#277)

* added new test cases

* moved exception throw in model to test params, udf now drops input rows with empty result set, better error message

* updated type hints and made sure empty results are dealt with correctly
---
 doc/changes/changes_2.2.0.md                  |  1 +
 doc/user_guide/user_guide.md                  |  1 +
 .../udfs/models/token_classification_udf.py   | 34 ++++++++----
 ...rediction_multiple_model_multiple_batch.py |  9 ++--
 ..._prediction_single_model_multiple_batch.py |  8 ++-
 ...ediction_containing_only_unknown_fields.py | 43 +++++++++++++++
 ...error_prediction_missing_expected_field.py | 50 ++++++++++++++++++
 .../prediction_contains_additional_keys.py    | 36 +++++++++++++
 .../prediction_returns_empty_result.py        | 52 +++++++++++++++++++
 .../udfs/test_token_classification.py         | 28 +++++++---
 10 files changed, 234 insertions(+), 28 deletions(-)
 create mode 100644 tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_containing_only_unknown_fields.py
 create mode 100644 tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_missing_expected_field.py
 create mode 100644 tests/unit_tests/udf_wrapper_params/token_classification/prediction_contains_additional_keys.py
 create mode 100644 tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py

diff --git a/doc/changes/changes_2.2.0.md b/doc/changes/changes_2.2.0.md
index 84d9914a..201bec81 100644
--- a/doc/changes/changes_2.2.0.md
+++ b/doc/changes/changes_2.2.0.md
@@ -13,6 +13,7 @@ n/a
 ### Bugs
 
 - #272: Fixed unit tests assertions not working correctly
+- #275: Fixed a bug where models returning unexpected results were not handled correctly
 
 ### Documentation
 
diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md
index 7e851d0e..fb6f1a51 100644
--- a/doc/user_guide/user_guide.md
+++ b/doc/user_guide/user_guide.md
@@ -498,6 +498,7 @@ SELECT TE_TOKEN_CLASSIFICATION_UDF(
 The inference results are presented with _START_POS_ indicating the index of the starting character of the token, 
 _END_POS_ indicating the index of the ending character of the token, _WORD_ indicating the token, predicted _ENTITY_, and 
 confidence _SCORE_ columns, combined with the inputs used when calling this UDF.
+If the model returns an empty result for an input row, that row is dropped entirely and does not appear in the result set.
 In case of any error during model loading or prediction, these new 
 columns are set to `null`, and column _ERROR_MESSAGE_ is set 
 to the stacktrace of the error. For example:
diff --git a/exasol_transformers_extension/udfs/models/token_classification_udf.py b/exasol_transformers_extension/udfs/models/token_classification_udf.py
index 4114b617..d3baa5e7 100644
--- a/exasol_transformers_extension/udfs/models/token_classification_udf.py
+++ b/exasol_transformers_extension/udfs/models/token_classification_udf.py
@@ -52,8 +52,7 @@ def extract_unique_param_based_dataframes(
 
             yield param_based_model_df
 
-    def execute_prediction(self, model_df: pd.DataFrame) -> List[Union[
-                Dict[str, Any], List[Dict[str, Any]]]]:
+    def execute_prediction(self, model_df: pd.DataFrame) -> List[List[Dict[str, Any]]]:
         """
         Predict the given text list using recently loaded models, return
         probability scores, entities and associated words
@@ -66,6 +65,7 @@ def execute_prediction(self, model_df: pd.DataFrame) -> List[Union[
         aggregation_strategy = model_df['aggregation_strategy'].iloc[0]
         results = self.last_created_pipeline(
             text_data, aggregation_strategy=aggregation_strategy)
+
         results = results if type(results[0]) == list else [results]
 
         if aggregation_strategy == "none":
@@ -120,7 +120,8 @@ def append_predictions_to_input_dataframe(
 
         # Concat predictions and model_df
         pred_df = pd.concat(pred_df_list, axis=0).reset_index(drop=True)
-        model_df = pd.concat([model_df, pred_df], axis=1)
+        model_df = pd.concat([model_df, pred_df], axis=1, join='inner') # join='inner' -> drop rows where results are empty
+
         if self.work_with_spans:
             model_df = self.create_new_span_columns(model_df)
             model_df[["entity_docid", "entity_char_begin", "entity_char_end"]] =\
@@ -129,8 +130,7 @@ def append_predictions_to_input_dataframe(
         return model_df
 
     def create_dataframes_from_predictions(
-            self, predictions:  List[Union[
-                Dict[str, Any], List[Dict[str, Any]]]]) -> List[pd.DataFrame]:
+            self, predictions:  List[List[Dict[str, Any]]]) -> List[pd.DataFrame]:
         """
         Convert predictions to dataframe. Only score and answer fields are
         presented.
@@ -141,12 +141,24 @@ def create_dataframes_from_predictions(
         """
         results_df_list = []
         for result in predictions:
-            result_df = pd.DataFrame(result)
-            result_df = result_df[self._desired_fields_in_prediction].rename(
-                columns={
-                    "start": "start_pos",
-                    "end": "end_pos",
-                    "entity_group": "entity"})
+            if result and result[0]:
+                result_df = pd.DataFrame(result)
+                # need to save before trying to rename, otherwise they get lost and can't be printed in the error message
+                result_df_column_names = result_df.columns
+                try:
+                    result_df = result_df[self._desired_fields_in_prediction].rename(
+                        columns={
+                            "start": "start_pos",
+                            "end": "end_pos",
+                            "entity_group": "entity"})
+                except KeyError as e:
+                    # adding more detailed error message
+                    raise KeyError(f"Some expected column was not found in prediction results. "
+                                   f"Expected columns are: {self._desired_fields_in_prediction}. "
+                                   f"Prediction results contain columns: {result_df_column_names}") from e
+            else:
+                # if the result for an input is empty, just append an empty result df, and the input will be dropped during concatenation
+                result_df = pd.DataFrame({})
             results_df_list.append(result_df)
 
         return results_df_list
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_multiple_model_multiple_batch.py
index 85520293..fabffdfb 100644
--- a/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_multiple_model_multiple_batch.py
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_multiple_model_multiple_batch.py
@@ -38,12 +38,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch:
     work_with_span_output_data = work_with_span_output_data1 + work_with_span_output_data2
 
     tokenizer_model_output_df_model1 =  [make_model_output_for_one_input_row(number_entities=n_entities) * data_size]
-    tokenizer_model_output_df_model2_batch1 =  [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
-                                  score=None, start=None, end=None, word=None, entity_group=None,
-                                  )]
-    tokenizer_model_output_df_model2_batch2 =  [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
-                                  score=None, start=None, end=None, word=None, entity_group=None,
-                                  )]
+    tokenizer_model_output_df_model2_batch1 =  [[Exception("Traceback mock_pipeline is throwing an error intentionally")]] #error on pred -> only one output per input
+
+    tokenizer_model_output_df_model2_batch2 =  [[Exception("Traceback mock_pipeline is throwing an error intentionally")]]#error on pred -> only one output per input
 
     tokenizer_models_output_df = [tokenizer_model_output_df_model1, tokenizer_model_output_df_model2_batch1, tokenizer_model_output_df_model2_batch2]
 
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_single_model_multiple_batch.py
index 1b4c777d..b2ad91fb 100644
--- a/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_single_model_multiple_batch.py
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/error_on_prediction_single_model_multiple_batch.py
@@ -28,12 +28,10 @@ class ErrorOnPredictionSingleModelMultipleBatch:
 
     number_complete_batches = data_size // batch_size
     number_remaining_data_entries_in_last_batch = data_size % batch_size
-    tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
-                                  score=None, start=None, end=None, word=None, entity_group=None) * batch_size] * \
+    tokenizer_model_output_df_model1 = [[Exception("Traceback mock_pipeline is throwing an error intentionally")] #error on pred -> only one output per input
+                                   * batch_size] * \
                                 number_complete_batches + \
-                                [make_model_output_for_one_input_row(number_entities=1,
-                                  score=None, start=None, end=None, word=None, entity_group=None,
-                                  ) * number_remaining_data_entries_in_last_batch]
+                                [[Exception("Traceback mock_pipeline is throwing an error intentionally")] * number_remaining_data_entries_in_last_batch]
     tokenizer_models_output_df = [tokenizer_model_output_df_model1]
 
     tmpdir_name = "_".join(("/tmpdir", __qualname__))
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_containing_only_unknown_fields.py b/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_containing_only_unknown_fields.py
new file mode 100644
index 00000000..f5036508
--- /dev/null
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_containing_only_unknown_fields.py
@@ -0,0 +1,43 @@
+from pathlib import PurePosixPath
+from exasol_udf_mock_python.connection import Connection
+from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
+    make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
+    text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name
+
+
+class ErrorPredictionOnlyContainsUnknownFields:
+    """
+    Model returns only unexpected output columns. UDF returns an error message and empty results.
+    """
+    expected_model_counter = 1
+    batch_size = 2
+    data_size = 5
+    n_entities = 3
+
+    text_data = "error_result_contains_only_unknown fields"
+
+    input_data = make_input_row(text_data=text_data) * data_size
+    output_data = make_output_row(text_data=text_data,
+                                  score=None, start=None, end=None, word=None, entity=None,
+                                  error_msg="Traceback") * data_size # only one output per input
+
+    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
+    work_with_span_output_data =  [(bucketfs_conn, sub_dir, model_name,
+                                text_docid, text_start, text_end, agg_strategy_simple,
+                                None, None, None, None, None, None,
+                                "Traceback")] * data_size # only one output per input
+
+
+    number_complete_batches = data_size // batch_size
+    number_remaining_data_entries_in_last_batch = data_size % batch_size
+    model_output_row_wrong_keys = [[{"unknown key": "some value", "diff unknown key": i}] for i in range(n_entities)]
+    tokenizer_model_output_df_model1 = [model_output_row_wrong_keys * batch_size] * \
+                                number_complete_batches + \
+                                [model_output_row_wrong_keys * number_remaining_data_entries_in_last_batch]
+    tokenizer_models_output_df = [tokenizer_model_output_df_model1]
+
+    tmpdir_name = "_".join(("/tmpdir", __qualname__))
+    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
+    bfs_connections = {
+        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
+    }
\ No newline at end of file
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_missing_expected_field.py b/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_missing_expected_field.py
new file mode 100644
index 00000000..b0b86055
--- /dev/null
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/error_prediction_missing_expected_field.py
@@ -0,0 +1,50 @@
+from pathlib import PurePosixPath
+from exasol_udf_mock_python.connection import Connection
+from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
+    make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
+    text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name
+
+
+class ErrorPredictionMissingExpectedFields:
+    """
+    Model output is missing the column "score"; an error message about the missing column is returned, with empty output columns.
+    Existing output columns are dropped for all rows where one output column is missing.
+    """
+    expected_model_counter = 1
+    batch_size = 2
+    data_size = 5
+    n_entities = 3
+
+    text_data = "error_result_missing_field_'word'"
+
+    input_data = make_input_row(text_data=text_data) * data_size
+    output_data = make_output_row(text_data=text_data, score=None,
+                                  start=None, end=None, word=None, entity=None,
+                                  error_msg="Traceback") * data_size # only one output per input
+
+    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
+    work_with_span_output_data = [(bucketfs_conn, sub_dir, model_name,
+                                text_docid, text_start, text_end, agg_strategy_simple,
+                                None, None, None, text_docid, None, None,
+                                "Traceback")] * data_size # only one output per input
+
+
+    number_complete_batches = data_size // batch_size
+    number_remaining_data_entries_in_last_batch = data_size % batch_size
+
+    model_output_rows = make_model_output_for_one_input_row(number_entities=n_entities)
+    model_output_row_missing_key = []
+    for model_output_row in model_output_rows:
+        removed = model_output_row[0].pop("score")
+        model_output_row_missing_key.append(model_output_row)
+
+    tokenizer_model_output_df_model1 = [model_output_row_missing_key * batch_size] * \
+                                number_complete_batches + \
+                                [model_output_row_missing_key * number_remaining_data_entries_in_last_batch]
+    tokenizer_models_output_df = [tokenizer_model_output_df_model1]
+
+    tmpdir_name = "_".join(("/tmpdir", __qualname__))
+    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
+    bfs_connections = {
+        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
+    }
\ No newline at end of file
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/prediction_contains_additional_keys.py b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_contains_additional_keys.py
new file mode 100644
index 00000000..f756539a
--- /dev/null
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_contains_additional_keys.py
@@ -0,0 +1,36 @@
+from pathlib import PurePosixPath
+from exasol_udf_mock_python.connection import Connection
+from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
+    make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
+    text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name
+
+class PredictionContainsAdditionalFields:
+    """
+    Output from model contains additional unrecognized columns. These are ignored and expected columns returned as normal.
+    """
+    expected_model_counter = 1
+    batch_size = 2
+    data_size = 2
+    n_entities = 3
+
+    text_data = "result contains additional keys"
+
+    # TODO: these are not filled out
+    input_data = make_input_row(text_data=text_data) * data_size
+    output_data = make_output_row(text_data=text_data) * n_entities * data_size
+
+    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
+    work_with_span_output_data = make_output_row_with_span() * n_entities * data_size
+
+    model_output_rows = make_model_output_for_one_input_row(number_entities=n_entities)
+    for model_output_row in model_output_rows:
+        model_output_row[0].update({"unknown key": "some value", "diff unknown key": 1})
+
+    tokenizer_model_output_df_model1 = [model_output_rows * data_size]
+    tokenizer_models_output_df = [tokenizer_model_output_df_model1]
+
+    tmpdir_name = "_".join(("/tmpdir", __qualname__))
+    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
+    bfs_connections = {
+        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
+    }
\ No newline at end of file
diff --git a/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py
new file mode 100644
index 00000000..9696ab19
--- /dev/null
+++ b/tests/unit_tests/udf_wrapper_params/token_classification/prediction_returns_empty_result.py
@@ -0,0 +1,52 @@
+from pathlib import PurePosixPath
+from exasol_udf_mock_python.connection import Connection
+from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
+    make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
+    text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name
+
+
+class PredictionReturnsEmptyResult:
+    """
+    Output from model is empty. Respective input row should be dropped and remaining output returned normally.
+    Tests different formats for empty result.
+    """
+    expected_model_counter = 1
+    batch_size = 6
+    data_size = 1
+    n_entities = 3
+
+    text_data = "error_result_empty"
+    input_data = make_input_row() * data_size  + \
+                 make_input_row(text_data=text_data) * data_size  + \
+                  make_input_row(text_data=text_data) * data_size + \
+                  make_input_row(text_data=text_data) * data_size + \
+                  make_input_row(text_data=text_data) * data_size + \
+                  make_input_row() * data_size
+    output_data = make_output_row() * n_entities * data_size + \
+                  make_output_row() * n_entities * data_size # Results of inputs #2-#5 are empty, so those rows do not appear in the output
+
+    work_with_span_input_data = make_input_row_with_span() * data_size  + \
+                                make_input_row_with_span(text_data=text_data) * data_size  + \
+                                make_input_row_with_span(text_data=text_data) * data_size + \
+                                make_input_row_with_span(text_data=text_data) * data_size + \
+                                make_input_row_with_span(text_data=text_data) * data_size + \
+                                make_input_row_with_span() * data_size
+
+    work_with_span_output_data =  make_output_row_with_span() * n_entities * data_size  + \
+                                  make_output_row_with_span() * n_entities * data_size # Results of inputs #2-#5 are empty, so those rows do not appear in the output
+
+
+    tokenizer_model_output_df_model1 = make_model_output_for_one_input_row(number_entities=n_entities) * data_size
+    tokenizer_model_output_df_model1.append([])
+    tokenizer_model_output_df_model1.append({})
+    tokenizer_model_output_df_model1.append([[]])
+    tokenizer_model_output_df_model1.append([{}])
+    tokenizer_model_output_df_model1 = tokenizer_model_output_df_model1 + make_model_output_for_one_input_row(number_entities=n_entities) * data_size
+
+    tokenizer_models_output_df = [[tokenizer_model_output_df_model1]]
+
+    tmpdir_name = "_".join(("/tmpdir", __qualname__))
+    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
+    bfs_connections = {
+        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
+    }
\ No newline at end of file
diff --git a/tests/unit_tests/udfs/test_token_classification.py b/tests/unit_tests/udfs/test_token_classification.py
index 98f335c9..cd13a384 100644
--- a/tests/unit_tests/udfs/test_token_classification.py
+++ b/tests/unit_tests/udfs/test_token_classification.py
@@ -10,6 +10,14 @@
 
 from exasol_transformers_extension.udfs.models.token_classification_udf import TokenClassificationUDF
 from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
+from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_containing_only_unknown_fields import \
+    ErrorPredictionOnlyContainsUnknownFields
+from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_missing_expected_field import \
+    ErrorPredictionMissingExpectedFields
+from tests.unit_tests.udf_wrapper_params.token_classification.prediction_returns_empty_result import \
+    PredictionReturnsEmptyResult
+from tests.unit_tests.udf_wrapper_params.token_classification.prediction_contains_additional_keys import \
+    PredictionContainsAdditionalFields
 from tests.unit_tests.udfs.output_matcher import Output, OutputMatcher
 from tests.utils.mock_bucketfs_location import fake_bucketfs_location_from_conn_object, fake_local_bucketfs_path
 from tests.utils.mock_cast import mock_cast
@@ -55,6 +63,7 @@
     SingleModelSingleBatchIncomplete
 
 
+
 def create_mock_metadata_with_span():
     meta = MockMetaData(
         script_code_wrapper_function=None,
@@ -153,10 +162,9 @@ def create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_
     This mock_pipeline is feed into a mock_pipeline_factory.
     """
     mock_pipeline: List[Union[AutoModel, MagicMock]] = [
-        create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["word"]
-        else [Exception("Traceback mock_pipeline is throwing an error intentionally")]
-    for i in range(0, number_of_intended_used_models)
-    ]
+        create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i])
+        for i in range(0, number_of_intended_used_models)
+        ]
 
     mock_pipeline_factory: Union[Pipeline, MagicMock] = create_autospec(Pipeline,
                                                                         side_effect=mock_pipeline)
@@ -198,7 +206,11 @@ def assert_result_matches_expected_output(result, expected_output_data, input_co
     ErrorNotCachedSingleModelMultipleBatch,
     ErrorNotCachedMultipleModelMultipleBatch,
     ErrorOnPredictionMultipleModelMultipleBatch,
-    ErrorOnPredictionSingleModelMultipleBatch
+    ErrorOnPredictionSingleModelMultipleBatch,
+    PredictionReturnsEmptyResult,
+    ErrorPredictionMissingExpectedFields,
+    ErrorPredictionOnlyContainsUnknownFields,
+    PredictionContainsAdditionalFields
 ])
 
 @patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
@@ -258,7 +270,11 @@ def test_token_classification_with_span(mock_local_path, mock_create_loc, params
     ErrorNotCachedSingleModelMultipleBatch,
     ErrorNotCachedMultipleModelMultipleBatch,
     ErrorOnPredictionMultipleModelMultipleBatch,
-    ErrorOnPredictionSingleModelMultipleBatch
+    ErrorOnPredictionSingleModelMultipleBatch,
+    PredictionReturnsEmptyResult,
+    ErrorPredictionMissingExpectedFields,
+    ErrorPredictionOnlyContainsUnknownFields,
+    PredictionContainsAdditionalFields
 ])
 @patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
 @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path')