From f28a3c5c45efdd930974b90838ee769f90501faf Mon Sep 17 00:00:00 2001
From: अमन || Aman <7098967+amankhandelia@users.noreply.github.com>
Date: Mon, 8 Apr 2024 21:17:37 +0530
Subject: [PATCH] Minor change to fix the incorrect response truncation (#3986)

---
 ludwig/features/text_feature.py            |  2 +-
 tests/ludwig/features/test_text_feature.py | 27 ++++++++++++++++------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/ludwig/features/text_feature.py b/ludwig/features/text_feature.py
index 749d867878e..1056ae820c1 100644
--- a/ludwig/features/text_feature.py
+++ b/ludwig/features/text_feature.py
@@ -68,7 +68,7 @@ def get_decoded_targets_and_predictions(
     """Returns the decoded targets and predictions, accounting for IGNORE_INDEX_TOKEN_ID."""
     sanitized_targets = torch.where(targets != IGNORE_INDEX_TOKEN_ID, targets, tokenizer.pad_token_id)
     sanitized_predictions = torch.where(
-        predictions[PREDICTIONS] != IGNORE_INDEX_TOKEN_ID,
+        targets != IGNORE_INDEX_TOKEN_ID,
         predictions[PREDICTIONS],
         tokenizer.pad_token_id,
     )
diff --git a/tests/ludwig/features/test_text_feature.py b/tests/ludwig/features/test_text_feature.py
index 6d22c4cb69c..c3574baccd0 100644
--- a/tests/ludwig/features/test_text_feature.py
+++ b/tests/ludwig/features/test_text_feature.py
@@ -1,9 +1,10 @@
 import pandas as pd
+import pytest
 import torch
 from transformers import AutoTokenizer
 
 from ludwig.backend import LocalBackend
-from ludwig.constants import LOGITS, PREDICTIONS, PROBABILITIES
+from ludwig.constants import IGNORE_INDEX_TOKEN_ID, LOGITS, PREDICTIONS, PROBABILITIES
 from ludwig.features import text_feature
 
 TEST_MODEL_NAME = "hf-internal-testing/tiny-random-OPTForCausalLM"
@@ -108,14 +109,23 @@ def test_backwards_compatibility():
     assert list(feature_data[1]) == [1, 3, 3]
 
 
-def test_get_decoded_targets_and_predictions():
+@pytest.mark.parametrize("vocab_size", [8])
+@pytest.mark.parametrize(
+    "targets",
+    [
+        ([78, 79, 504, 76, 397, 84, 0], [" first she 18 yearman our"]),
+        ([IGNORE_INDEX_TOKEN_ID, IGNORE_INDEX_TOKEN_ID, IGNORE_INDEX_TOKEN_ID, 76, 397, 84, 0], [" yearman our"]),
+    ],
+)
+@pytest.mark.parametrize("predictions", [[78, 79, 504, 76, 397, 84, 0]])
+def test_get_decoded_targets_and_predictions(vocab_size, targets, predictions):
     tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_NAME)
-    vocab_size = 8
 
     # Scenario 1: Prediction and target tensors have the same length, so nothing should change
-    targets = torch.tensor([[78, 79, 504, 76, 397, 84, 0]])
+    targets, decoded_texts_gt = targets
+    targets = torch.tensor([targets])
     predictions = {
-        PREDICTIONS: torch.tensor([[78, 79, 504, 76, 397, 84, 0]], dtype=torch.int64),
+        PREDICTIONS: torch.tensor([predictions], dtype=torch.int64),
         PROBABILITIES: torch.randn(1, 7, vocab_size).to(torch.float32),
         LOGITS: torch.randn(1, 7, vocab_size).to(torch.float32),
     }
@@ -124,5 +134,8 @@ def test_get_decoded_targets_and_predictions():
         decoded_predictions,
     ) = text_feature.get_decoded_targets_and_predictions(targets, predictions, tokenizer)
 
-    assert decoded_targets == [" first she 18 yearman our"]
-    assert decoded_predictions == [" first she 18 yearman our"]
+    assert isinstance(decoded_targets, list)
+    assert isinstance(decoded_predictions, list)
+    assert all(isinstance(x, str) for x in decoded_targets)
+    assert all(isinstance(x, str) for x in decoded_predictions)
+    assert decoded_targets == decoded_predictions == decoded_texts_gt
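
For context, a minimal standalone sketch of the behavior this one-line change
fixes. This is not Ludwig code: IGNORE_INDEX_TOKEN_ID == -100 (the usual
HuggingFace label-masking convention) and a pad token id of 0 are assumptions
made for illustration.

import torch

IGNORE_INDEX_TOKEN_ID = -100  # assumed masking sentinel, per HF convention
PAD_TOKEN_ID = 0              # assumed pad id for this sketch

# Prompt positions in the targets are masked with -100; the model still
# emits real token ids at those positions.
targets = torch.tensor([[-100, -100, -100, 76, 397, 84, 0]])
predictions = torch.tensor([[78, 79, 504, 76, 397, 84, 0]])

# Old condition: predicted token ids are never -100, so the mask never
# fired and prompt-aligned tokens leaked into the decoded predictions.
old = torch.where(predictions != IGNORE_INDEX_TOKEN_ID, predictions, PAD_TOKEN_ID)
assert torch.equal(old, predictions)

# Fixed condition: mask predictions wherever the *targets* are ignored, so
# targets and predictions decode over the same response-only positions.
new = torch.where(targets != IGNORE_INDEX_TOKEN_ID, predictions, PAD_TOKEN_ID)
assert new.tolist() == [[0, 0, 0, 76, 397, 84, 0]]

Since pad tokens are then dropped during decoding (as the expected strings in
the parametrized test imply), the fixed mask is what lets the second test case
decode to " yearman our" for predictions as well as targets.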