diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 12aedaca8cf986..bf0da2c2757bbf 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -913,7 +913,7 @@ def test_flash_attn_2_inference_equivalence(self): logits_fa = outputs_fa.decoder_hidden_states[-1] # whisper FA2 needs very high tolerance - assert torch.allclose(logits_fa, logits, atol=4e-1) + self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-1)) # check with inference + dropout model.train() @@ -958,7 +958,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): logits_fa = outputs_fa.decoder_hidden_states[-1] # whisper FA2 needs very high tolerance - assert torch.allclose(logits_fa, logits, atol=4e-1) + self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-1)) other_inputs = { "decoder_input_ids": decoder_input_ids, @@ -973,7 +973,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): logits_fa = outputs_fa.decoder_hidden_states[-1] # whisper FA2 needs very high tolerance - assert torch.allclose(logits_fa[:, -2:], logits[:, -2:], atol=4e-1) + self.assertTrue(torch.allclose(logits_fa[:, -2:], logits[:, -2:], atol=4e-1)) def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: @@ -1246,48 +1246,6 @@ def test_mask_time_prob(self): encoder_last_hidden_state = model(**input_dict).encoder_last_hidden_state self.assertTrue(encoder_last_hidden_state.shape, (13, 30, 16)) - def test_generate_with_prompt_ids_and_task_and_language(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).eval().to(torch_device) - input_features = input_dict["input_features"] - prompt_ids = torch.arange(5).to(torch_device) - language = "<|de|>" - task = "translate" - lang_id = 6 - task_id = 7 - model.generation_config.__setattr__("lang_to_id", {language: lang_id}) - 
model.generation_config.__setattr__("task_to_id", {task: task_id}) - - output = model.generate(input_features, max_new_tokens=5, task=task, language=language, prompt_ids=prompt_ids) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - lang_id, - task_id, - ] - for row in output.tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - - def test_generate_with_prompt_ids_and_forced_decoder_ids(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).eval().to(torch_device) - input_features = input_dict["input_features"] - prompt_ids = torch.arange(5).to(torch_device) - forced_decoder_ids = [(1, 6), (2, 7), (3, 8)] - - output = model.generate( - input_features, max_new_tokens=5, forced_decoder_ids=forced_decoder_ids, prompt_ids=prompt_ids - ) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - *[token for _rank, token in forced_decoder_ids], - ] - for row in output.tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - def test_generate_with_prompt_ids_max_length(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() config.max_target_positions = 7 @@ -1349,7 +1307,7 @@ def test_generate_longform_with_prompt_ids(self): ) for row in output.tolist(): # make sure no token below 10 is in generated output => this means for long-form prompt ids should NOT be returned - assert not any(i in row for i in model.generation_config.suppress_tokens) + self.assertTrue(not any(i in row for i in model.generation_config.suppress_tokens)) def _check_longform_generate_single_batch(self, condition_on_prev_tokens): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1414,13 +1372,15 @@ def _check_longform_generate_single_batch(self, condition_on_prev_tokens): segments = 
outputs["segments"][0] for _, segment in enumerate(segments): - assert segment["start"] <= segment["end"], "start has to be smaller equal end" - assert any( - s > timestamp_begin for s in segment["tokens"][1:] - ), f"At least one segment token should be a timestamp token, but not first., {segment['tokens']}" - assert ( - segment["tokens"].shape[-1] <= max_length - ), "make sure that no segment is larger than max generation length" + self.assertTrue(segment["start"] <= segment["end"], "start has to be smaller equal end") + self.assertTrue( + any(s > timestamp_begin for s in segment["tokens"][1:]), + f"At least one segment token should be a timestamp token, but not first., {segment['tokens']}", + ) + self.assertTrue( + segment["tokens"].shape[-1] <= max_length, + "make sure that no segment is larger than max generation length", + ) def test_longform_generate_single_batch(self): self._check_longform_generate_single_batch(condition_on_prev_tokens=False) @@ -1503,12 +1463,12 @@ def _check_longform_generate_multi_batch(self, condition_on_prev_tokens): segments = outputs["segments"][1] # make sure batched and non-batched is the same - assert tokens_2.tolist() == tokens[: tokens_2.shape[-1]].tolist() + self.assertEqual(tokens_2.tolist(), tokens[: tokens_2.shape[-1]].tolist()) for seg1, seg2 in zip(segments_2, segments): - assert seg1["start"] == seg2["start"] - assert seg1["end"] == seg2["end"] - assert seg1["tokens"].tolist() == seg2["tokens"].tolist() + self.assertEqual(seg1["start"], seg2["start"]) + self.assertEqual(seg1["end"], seg2["end"]) + self.assertEqual(seg1["tokens"].tolist(), seg2["tokens"].tolist()) def test_longform_generate_multi_batch(self): self._check_longform_generate_multi_batch(condition_on_prev_tokens=False) @@ -1563,7 +1523,7 @@ def test_generate_output_type(self, return_dict_in_generate): # short-form generation without fallback pred_ids = model.generate(**inputs, return_dict_in_generate=return_dict_in_generate) - assert isinstance(pred_ids, 
expected_output_type) + self.assertIsInstance(pred_ids, expected_output_type) # short-form generation with fallback pred_ids = model.generate( @@ -1572,7 +1532,7 @@ def test_generate_output_type(self, return_dict_in_generate): temperature=[0.0, 0.1], return_dict_in_generate=return_dict_in_generate, ) - assert isinstance(pred_ids, expected_output_type) + self.assertIsInstance(pred_ids, expected_output_type) def test_labels_sequence_max_length_correct(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1803,10 +1763,7 @@ def test_tiny_en_generation(self): generated_ids = model.generate(input_features, num_beams=5, max_length=20) transcript = processor.tokenizer.batch_decode(generated_ids)[0] - EXPECTED_TRANSCRIPT = ( - "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle" - " classes, and we are glad to" - ) + EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) @slow @@ -1824,18 +1781,15 @@ def test_tiny_generation(self): generated_ids = model.generate(input_features, num_beams=5, max_length=20) transcript = processor.tokenizer.decode(generated_ids[0]) - EXPECTED_TRANSCRIPT = ( - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle" - " classes and we are glad" - ) + EXPECTED_TRANSCRIPT = " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) @slow def test_large_generation(self): torch_device = "cpu" set_seed(0) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") model.to(torch_device) input_speech = self._load_datasamples(1) @@ -1847,14 +1801,14 @@ def test_large_generation(self): ) transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad" + EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) @slow def test_large_generation_multilingual(self): set_seed(0) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") model.to(torch_device) ds = load_dataset( @@ -1870,21 +1824,21 @@ def test_large_generation_multilingual(self): input_features, do_sample=False, max_length=20, language="<|de|>", task="transcribe" ) transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " Denken Sie, soeben walten meine Gedanken bei Ihnen in Adela" + EXPECTED_TRANSCRIPT = " denken sie soeben weilten meine gedanken bei ihnen in adelaide und ich wünsch" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) generated_ids = model.generate( input_features, do_sample=False, max_length=20, language="<|de|>", task="translate" ) transcript = 
processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " Think, my thoughts were just rolling with you in Adelaide, and I" + EXPECTED_TRANSCRIPT = " Think, my thoughts were just now in Adelaide with you, and I wished to be able" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) @slow def test_large_batched_generation(self): set_seed(0) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") model.to(torch_device) input_speech = self._load_datasamples(4) @@ -1895,10 +1849,10 @@ def test_large_batched_generation(self): # fmt: off EXPECTED_LOGITS = torch.tensor( [ - [50258, 50259, 50358, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404], - [50258, 50259, 50358, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257], - [50258, 50259, 50358, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904], - [50258, 50259, 50358, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439] + [2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404, 281, 2928, 702, 14943], + [6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257, 50257, 50257, 50257, 50257], + [415, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 365, 26586, 3799, 293, 12904, 9256, 450, 10539, 949], + [634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 293, 393] ] ) # fmt: on @@ -1907,10 +1861,10 @@ def test_large_batched_generation(self): # fmt: off EXPECTED_TRANSCRIPT = [ - " Mr. Quilter is the apostle of the middle classes and we are glad", + " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel", " Nor is Mr. Quilter's manner less interesting than his matter.", - " He tells us that at this festive season of the year, with Christmas and roast", - " He has grave doubts whether Sir Frederick Layton's work is really Greek after all", + " he tells us that at this festive season of the year with christmas and roast beef looming before", + " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can", ] # fmt: on @@ -1968,10 +1922,10 @@ def test_tiny_en_batched_generation(self): # fmt: off EXPECTED_LOGITS = torch.tensor( [ - [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284], - [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256], - [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236], - [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460] + [1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284, 7062, 465], + [5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256, 50256, 50256], + [679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236, 878, 514], + [679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460, 7073, 287] ] ) @@ -1981,10 +1935,10 @@ def test_tiny_en_batched_generation(self): # fmt: off EXPECTED_TRANSCRIPT = [ - " Mr. Quilter is the apostle of the middle classes, and we are glad to", + " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his", " Nor is Mr. 
Quilter's manner less interesting than his matter.", - " He tells us that at this festive season of the year, with Christmas and roast beef looming", - " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can", + " He tells us that at this festive season of the year, with Christmas and roast beef looming before us", + " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in", ] # fmt: on @@ -2004,7 +1958,11 @@ def test_tiny_timestamp_generation(self): generated_ids = model.generate(input_features, max_length=448, return_timestamps=True).to("cpu") - EXPECTED_OUTPUT = torch.tensor([50258, 50259, 50359, 50364, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50692, 50692, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50926, 50926, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 51208, 51208, 949, 505, 11, 14138, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51552, 51552, 634, 575, 12525, 22618, 1968, 6144, 35617, 7354, 1292, 6, 589, 307, 534, 10281, 934, 439, 11, 293, 51836, 51836, 50257]) # fmt: skip + # fmt: off + EXPECTED_OUTPUT = torch.tensor([ + 50364, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50692, 50692, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50926, 50926, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 51208, 51208, 949, 505, 11, 14138, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51552, 51552, 634, 575, 12525, 22618, 1968, 6144, 35617, 7354, 1292, 6, 589, 307, 534, 10281, 934, 439, 11, 293, 51836, 51836, 50364, 393, 4411, 13, 50514 + ]) + # fmt: on self.assertTrue(torch.allclose(generated_ids, 
EXPECTED_OUTPUT)) @@ -2015,7 +1973,7 @@ def test_tiny_timestamp_generation(self): " Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season" " of the year, with Christmas and roast beef looming before us, similarly drawn from eating and" " its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins'" - " work is really Greek after all, and" + " work is really Greek after all, and can discover." ), "offsets": [ { @@ -2047,6 +2005,10 @@ def test_tiny_timestamp_generation(self): ), "timestamp": (23.76, 29.44), }, + { + "text": " can discover.", + "timestamp": (29.44, 32.44), + }, ], } ] @@ -2115,12 +2077,12 @@ def test_tiny_longform_timestamps_generation(self): {"text": " are as national as a jingo poem.", "timestamp": (40.32, 44.72)}, { "text": " Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used", - "timestamp": (44.72, 50.4), + "timestamp": (44.72, 50.400000000000006), }, - {"text": " to flash his teeth.", "timestamp": (50.4, 52.96)}, + {"text": " to flash his teeth.", "timestamp": (50.400000000000006, 52.96)}, { "text": " And Mr. 
John Collier gives his sitter a cheerful slap on the back before he says, like", - "timestamp": (52.96, 58.68), + "timestamp": (52.96, 58.68000000000001), }, {"text": " a shampoo and a Turkish bath next man.", "timestamp": (58.68, 61.96)}, ] @@ -2137,14 +2099,18 @@ def test_large_timestamp_generation(self): input_speech = np.concatenate(self._load_datasamples(4)) input_features = processor( - input_speech, return_tensors="pt", sampling_rate=16_000, return_token_timestamps=True + input_speech, + return_tensors="pt", + sampling_rate=16_000, ).input_features input_features = input_features.to(torch_device) generated_ids = model.generate(input_features, max_length=448, return_timestamps=True).to("cpu") # fmt: off - EXPECTED_OUTPUT = torch.tensor([50258, 50259, 50360, 50365, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50629, 50682, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50870, 50911, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 949, 505, 11, 51245, 51287, 1034, 4680, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51494, 51523, 634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 51799, 51815, 50257]) + EXPECTED_OUTPUT = torch.tensor([ + 50365, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50629, 50682, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50870, 50911, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 949, 505, 11, 51245, 51287, 1034, 4680, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51494, 51523, 634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 51799, 51815, 50365, 293, 393, 4411, 
50430 + ]) # fmt: on self.assertTrue(torch.allclose(generated_ids, EXPECTED_OUTPUT)) @@ -2155,7 +2121,7 @@ def test_large_timestamp_generation(self): " Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive" " season of the year, with Christmas and roast beef looming before us, similes drawn from eating" " and its results occur most readily to the mind. He has grave doubts whether Sir Frederick " - "Leighton's work is really Greek after all," + "Leighton's work is really Greek after all, and can discover" ), "offsets": [ { @@ -2184,6 +2150,10 @@ def test_large_timestamp_generation(self): ), "timestamp": (23.16, 28.68), }, + { + "text": (" and can discover"), + "timestamp": (28.68, 29.98), + }, ], } ] @@ -2211,10 +2181,10 @@ def test_tiny_token_timestamp_generation(self): # fmt: off EXPECTED_OUTPUT = torch.tensor([ - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4800, 0.8200, 0.9600, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0000, 2.3400, 2.5000, 2.6600, 3.1800, 3.5600, 3.6800, 3.8000, 4.1000, 4.3000, 4.5800, 4.9400, 5.3800, 12.4200, 12.8400, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9400, 26.9400, 26.9400, 26.9400, 29.8400 ], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.5200, 0.9000, 1.1400, 1.4200, 1.5200, 1.6800, 1.6800, 1.8800, 2.1000, 2.2200, 2.6200, 3.1400, 3.5800, 3.9600, 4.4000, 17.3000, 17.3000, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 28.0000 ], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7600, 1.0000, 1.4200, 1.8000, 1.9400, 2.1800, 2.5200, 3.0200, 3.3200, 3.5400, 3.9400, 4.5600, 4.9200, 5.2800, 5.5600, 5.9000, 6.1600, 6.3000, 6.4800, 6.4800, 6.6400, 7.8200, 7.9600, 8.2200, 8.6000, 8.9200, 9.2200, 9.5200, 9.7200, 10.0600, 10.5400, 10.8800, 11.2600, 11.5400, 11.7400, 12.0800, 15.6800, 15.6800], - [ 0.0000, 0.0000, 0.0000, 
0.0000, 0.0000, 0.7400, 1.0400, 1.3200, 1.6800, 2.1400, 2.4800, 2.7800, 3.0800, 3.1600, 3.4000, 3.6000, 4.0200, 4.2200, 4.8600, 5.2400, 5.7400, 6.3400, 6.6200, 6.7600, 6.7600, 6.8600, 7.2400, 7.4200, 7.6800, 7.9200, 8.4800, 8.7600, 9.2000, 9.2000, 9.4200, 15.8200, 15.8200, 29.6400, 29.6600, 29.6600, 29.6600, 29.6600, 29.7600] + [0.0000, 0.4800, 0.8200, 0.9600, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0000, 2.3400, 2.5000, 2.6600, 3.1800, 3.5600, 3.6800, 3.8000, 4.1000, 4.3000, 4.5800, 4.9400, 5.3800, 12.4200, 12.8400, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9400, 26.9400, 26.9400, 26.9400], + [0.0000, 0.5200, 0.9000, 1.1400, 1.4200, 1.5200, 1.6800, 1.6800, 1.8800, 2.1000, 2.2200, 2.6200, 3.1400, 3.5800, 3.9600, 4.4000, 17.3000, 17.3000, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400], + [0.0000, 0.0000, 0.7600, 1.0000, 1.4200, 1.8000, 1.9400, 2.1800, 2.5200, 3.0200, 3.3200, 3.5400, 3.9400, 4.5600, 4.9200, 5.2800, 5.5600, 5.9000, 6.1600, 6.3000, 6.4800, 6.4800, 6.6400, 7.8200, 7.9600, 8.2200, 8.6000, 8.9200, 9.2200, 9.5200, 9.7200, 10.0600, 10.5400, 10.8800, 11.2600, 11.5400, 11.7400, 12.0800, 15.6800], + [0.0000, 0.0000, 0.7400, 1.0400, 1.3200, 1.6800, 2.1400, 2.4800, 2.7800, 3.0800, 3.1600, 3.4000, 3.6000, 4.0200, 4.2200, 4.8600, 5.2400, 5.7400, 6.3400, 6.6200, 6.7600, 6.7600, 6.8600, 7.2400, 7.4200, 7.6800, 7.9200, 8.4800, 8.7600, 9.2000, 9.2000, 9.4200, 15.8200, 15.8200, 29.6400, 29.6600, 29.6600, 29.6600, 29.6600] ]) # fmt: on @@ -2241,10 +2211,10 @@ def test_large_token_timestamp_generation(self): # fmt: off EXPECTED_OUTPUT = torch.tensor([ - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6200, 0.7400, 0.8600, 1.0000, 1.0400, 1.3000, 1.4400, 1.7800, 2.1800, 2.2800, 2.5000, 2.9200, 3.0000, 3.3800, 3.5000, 3.6000, 3.8400, 4.1000, 4.4000, 4.6800, 5.1400, 5.3600, 
5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6000, 0.9200, 1.2200, 1.3400, 1.4200, 1.5400, 1.5800, 1.7400, 2.0600, 2.3800, 3.0400, 3.3800, 3.6400, 4.1200, 4.3600, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5400, 0.8200, 1.1600, 1.4600, 1.7400, 1.8800, 2.3400, 2.7400, 3.1400, 3.2200, 3.5400, 4.2800, 4.5600, 4.8200, 5.0600, 5.3200, 5.6600, 5.9600, 6.1400, 6.4000, 6.8400, 7.8800, 8.0200, 8.3600, 8.7000, 9.0200, 9.3200, 9.5000, 9.8400, 10.3000, 10.6600, 11.0800, 11.3600, 11.4600, 11.8000, 12.4600], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5600, 0.7600, 1.0600, 1.4000, 1.8800, 2.2600, 2.6200, 2.8000, 2.9600, 3.0000, 3.2000, 3.4400, 3.6800, 4.0000, 4.6000, 5.0000, 5.3200, 5.4800, 6.0600, 6.0600, 6.1000, 6.3200, 6.7400, 7.0000, 7.2200, 7.4000, 7.7600, 8.0600, 8.5600, 8.8600, 8.9400, 9.1000, 9.3400, 9.8800, 9.8800, 9.8800] + [0.0000, 0.0000, 0.6200, 0.7400, 0.8600, 1.0000, 1.0400, 1.3000, 1.4400, 1.7800, 2.1800, 2.2800, 2.5000, 2.9200, 3.0000, 3.3800, 3.5000, 3.6000, 3.8400, 4.1000, 4.4000, 4.6800, 5.1400, 5.3600, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200], + [0.0000, 0.0000, 0.6000, 0.9200, 1.2200, 1.3400, 1.4200, 1.5400, 1.5800, 1.7400, 2.0600, 2.3800, 3.0400, 3.3800, 3.6400, 4.1200, 4.3600, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800], + [0.0000, 0.0000, 0.5400, 0.8200, 1.1600, 1.4600, 1.7400, 1.8800, 2.3400, 2.7400, 3.1400, 3.2200, 3.5400, 4.2800, 4.5600, 4.8200, 5.0600, 5.3200, 5.6600, 5.9600, 6.1400, 6.4000, 6.8400, 7.8800, 8.0200, 8.3600, 8.7000, 9.0200, 9.3200, 9.5000, 9.8400, 10.3000, 10.6600, 11.0800, 
11.3600, 11.4600, 11.8000], + [0.0000, 0.0000, 0.5600, 0.7600, 1.0600, 1.4000, 1.8800, 2.2600, 2.6200, 2.8000, 2.9600, 3.0000, 3.2000, 3.4400, 3.6800, 4.0000, 4.6000, 5.0000, 5.3200, 5.4800, 6.0600, 6.0600, 6.1000, 6.3200, 6.7400, 7.0000, 7.2200, 7.4000, 7.7600, 8.0600, 8.5600, 8.8600, 8.9400, 9.1000, 9.3400, 9.8800, 9.8800] ]) # fmt: on @@ -2314,16 +2284,16 @@ def test_tiny_token_timestamp_generation_longform(self): # fmt: off EXPECTED_OUTPUT = [ torch.tensor([0.0000, 0.4200, 0.8200, 0.9400, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0400, 2.3400, 2.5200, 2.6600, 3.2000, 3.4400, 3.5600, 3.6800, 3.8200, 4.1000, 4.3000, 4.5800, 4.9400, 5.4000, 6.3600]), - torch.tensor([ 6.5400, 6.5400, 6.7400, 6.9600, 7.2600, 7.3400, 7.5800, 7.5800, 7.6400, 7.8400, 8.1000, 8.5000, 9.0000, 9.4800, 9.7200, 10.2600, 11.1000]), + torch.tensor([6.5400, 6.5400, 6.7400, 6.9600, 7.2600, 7.3400, 7.5800, 7.5800, 7.6400, 7.8400, 8.1000, 8.5000, 9.0000, 9.4800, 9.7200, 10.2600, 11.1000]), torch.tensor([11.2200, 11.2200, 11.4200, 11.6600, 12.0800, 12.4400, 12.5800, 12.8400, 13.1800, 13.6800, 14.0000, 14.2200, 14.6200, 14.9800, 15.2200, 15.6000, 15.9400, 16.2000, 16.5600, 16.8400, 16.9800]), torch.tensor([16.9800, 16.9800, 17.3200, 18.1600, 18.6400, 18.8600, 19.2800, 19.5600, 19.8800, 20.1800, 20.3800, 20.7200, 21.1600, 21.5400, 21.9000, 22.2000, 22.4200, 22.8600, 23.7000]), - torch.tensor([23.7000, 23.7000, 23.9400, 24.1800, 24.3800, 24.8400, 25.2800, 25.6600, 25.9200, 26.2600, 26.4000, 26.5800, 26.7600, 27.1400, 27.3800, 28.0400, 28.3800, 28.8200, 29.3400, 29.5200]), + torch.tensor([23.7000, 23.7000, 23.9400, 24.1800, 24.3800, 24.8400, 25.2800, 25.6600, 25.9200, 26.2600, 26.4000, 26.5800, 26.7600, 27.1400, 27.3800, 28.0400, 28.3800, 28.8200, 29.3400, 29.5200, 29.9800]), torch.tensor([29.4400, 29.4400, 29.7000, 30.0800, 30.3800, 30.5400, 30.8200, 31.0600, 31.6600, 31.9200, 32.3000, 32.4800, 32.6200, 33.6800]), torch.tensor([33.8000, 33.8000, 33.9800, 33.9800, 34.1800, 34.4400, 34.6200, 35.0000, 
35.2200, 35.3200, 35.5600, 35.9200, 36.3800, 36.6200, 36.6600, 36.9600, 37.3400, 37.9800, 38.5800, 38.7200, 38.9800, 39.4400, 39.5800, 39.8000, 40.1200, 40.2600]), torch.tensor([40.5200, 40.5200, 40.6200, 41.1000, 41.5400, 41.9200, 42.1000, 42.3200, 42.3200, 43.0600, 44.6000]), torch.tensor([44.7000, 44.7000, 44.8600, 44.9400, 45.1400, 45.1400, 45.2800, 45.6200, 45.9000, 46.2600, 47.1600, 47.4800, 47.7400, 48.1000, 48.2800, 48.4000, 48.6200, 48.8400, 49.0400, 49.2800, 49.4800, 49.6600, 49.9400, 50.5400]), torch.tensor([50.5400, 50.5400, 50.6600, 50.8800, 51.2400, 51.7200, 52.8400]), - torch.tensor([52.9600, 52.9600, 53.0400, 53.2600, 53.4200, 53.5800, 53.9200, 54.1200, 54.7200, 54.9400, 55.2600, 55.6200, 55.9800, 56.5600, 56.8000, 56.9200, 57.3600, 57.9200, 58.1800, 58.5000, 58.6400, 58.8200]), + torch.tensor([52.9600, 52.9600, 53.0400, 53.2600, 53.4200, 53.5800, 53.9200, 54.1200, 54.7200, 54.9400, 55.2600, 55.6200, 55.9800, 56.5600, 56.8000, 56.9200, 57.3600, 57.9200, 58.1800, 58.5000, 58.6400, 58.8200, 59.4200]), torch.tensor([58.6800, 58.6800, 59.1400, 59.5400, 59.9200, 60.1600, 60.3800, 60.8200, 61.6200, 62.2600, 75.2000]), ] # fmt: on @@ -2379,8 +2349,8 @@ def test_generate_with_prompt_ids(self): prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="pt").to(torch_device) output_with_prompt = model.generate(input_features, prompt_ids=prompt_ids) - expected_without_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" - expected_with_prompt = "<|startofprev|> Leighton<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Leighton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" + expected_without_prompt = " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can 
discover in it but little of Rocky Ithaca." + expected_with_prompt = " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all and can discover in it but little of Rocky Ithaca." output_without_prompt = processor.decode(output_without_prompt[0]) output_with_prompt = processor.decode(output_with_prompt[0]) @@ -2388,6 +2358,62 @@ def test_generate_with_prompt_ids(self): self.assertEqual(output_without_prompt, expected_without_prompt) self.assertEqual(output_with_prompt, expected_with_prompt) + @slow + def test_generate_with_forced_decoder_ids(self): + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + model.to(torch_device) + + ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True) + ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + + input_speech = next(iter(ds))["audio"]["array"] + input_features = processor(input_speech, return_tensors="pt", sampling_rate=16_000).input_features + input_features = input_features.to(torch_device) + + forced_decoder_ids = processor.get_decoder_prompt_ids( + task="transcribe", + language="german", + ) + + generated_ids = model.generate(input_features, do_sample=False, language="<|de|>", task="transcribe") + generated_ids_forced = model.generate(input_features, do_sample=False, forced_decoder_ids=forced_decoder_ids) + + self.assertListEqual(generated_ids.tolist()[0], generated_ids_forced.tolist()[0]) + + @slow + def test_generate_with_prompt_ids_task_language(self): + EXPECTED_TEXT = " Mr. Kilter is the apostle of the middle classes and we are glad to welcome his gospel." + + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + model = model.to(torch_device) + + prompt = "Mr. Kilter, Brionno." 
# let's force Quilter -> Kilter, Brion -> Brionno + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]") + ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + input_speech = ds[0]["audio"]["array"] + + input_features = processor( + input_speech, return_tensors="pt", truncation=False, padding="longest", sampling_rate=16_000 + )["input_features"] + input_features = input_features.to(device=torch_device) + + gen_kwargs = { + "do_sample": False, + "return_timestamps": True, + "language": "english", + "task": "transcribe", + "prompt_ids": prompt_ids, + } + + generated_ids = model.generate(input_features, **gen_kwargs) + transcription = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] + + self.assertEqual(transcription, EXPECTED_TEXT) + @slow def test_language_detection(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") @@ -2401,7 +2427,7 @@ def test_language_detection(self): ids_to_lang = {v: k for k, v in model.generation_config.lang_to_id.items()} - assert ids_to_lang[lang_id] == "<|en|>" + self.assertEqual(ids_to_lang[lang_id], "<|en|>") audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") @@ -2413,7 +2439,7 @@ def test_language_detection(self): lang_id = model.detect_language(input_features)[0].item() - assert ids_to_lang[lang_id] == "<|hi|>" + self.assertEqual(ids_to_lang[lang_id], "<|hi|>") @slow def test_default_multilingual_transcription_short_form(self): @@ -2434,19 +2460,13 @@ def test_default_multilingual_transcription_short_form(self): transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - assert ( - transcription - == "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>" - ) + self.assertEqual(transcription, " Mirchi mein ki tene vibinda prajatiya 
hai") # set task to translate sequences = model.generate(input_features, task="translate") transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - assert ( - transcription - == "<|startoftranscript|><|hi|><|translate|><|notimestamps|> How much is the difference between the girls?<|endoftext|>" - ) + self.assertEqual(transcription, " How much is the difference between the girls?") @slow def test_default_multilingual_transcription_long_form(self): @@ -2467,58 +2487,19 @@ def test_default_multilingual_transcription_long_form(self): # task defaults to transcribe sequences = model.generate(input_features, return_timestamps=True) - transcription = processor.batch_decode(sequences)[0] + transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?" + self.assertEqual(transcription, " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?") # set task to translate sequences = model.generate(input_features, task="translate", return_timestamps=True) - transcription = processor.batch_decode(sequences)[0] + transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - assert ( - transcription - == " How many different species are there in the chilli? How many different species are there in the chilli?" + self.assertEqual( + transcription, + " How many different species are there in the chilli? 
How many different species are there in the chilli?", ) - @slow - def test_generate_with_prompt_ids_and_forced_decoder_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.to(torch_device) - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="pt", sampling_rate=16_000).input_features - input_features = input_features.to(torch_device) - task = "translate" - language = "de" - expected_tokens = [f"<|{task}|>", f"<|{language}|>"] - prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) - - output = model.generate(input_features, task=task, language=language, prompt_ids=prompt_ids) - text = processor.decode(output[0]) - - self.assertTrue(prompt in text) - self.assertTrue(all(token in text for token in expected_tokens)) - - @slow - def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model.to(torch_device) - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="pt", sampling_rate=16_000).input_features - input_features = input_features.to(torch_device) - prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) - - model.generation_config.forced_decoder_ids = None - model.config.forced_decoder_ids = None - - output = model.generate(input_features, prompt_ids=prompt_ids, return_timestamps=True) - text = processor.decode(output[0]) - - self.assertTrue(prompt in text) - @require_non_xpu @slow @require_torch_gpu @@ -2563,11 +2544,12 @@ def test_speculative_decoding_distil(self): transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True) - assert transcription_ass 
== transcription_non_ass - assert transcription_ass == [ - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." - ] - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + self.assertEqual(transcription_ass, transcription_non_ass) + self.assertEqual( + transcription_ass, + [" Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."], + ) + self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") @slow @require_torch_gpu @@ -2612,11 +2594,12 @@ def test_speculative_decoding_non_distil(self): transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True) - assert transcription_ass == transcription_non_ass - assert transcription_ass == [ - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." - ] - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + self.assertEqual(transcription_ass, transcription_non_ass) + self.assertEqual( + transcription_ass, + [" Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel."], + ) + self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") @slow def test_whisper_longform_single_batch(self): @@ -2639,13 +2622,13 @@ def test_whisper_longform_single_batch(self): result = model.generate(input_features, return_timestamps=True) decoded = processor.batch_decode(result, skip_special_tokens=True) - assert decoded == EXPECTED_TEXT + self.assertEqual(decoded, EXPECTED_TEXT) decoded_with_timestamps = processor.batch_decode(result, skip_special_tokens=True, decode_with_timestamps=True) no_timestamp_matches = re.split(r"<\|[\d\.]+\|>", decoded_with_timestamps[0]) - assert ["".join(no_timestamp_matches)] == EXPECTED_TEXT + self.assertEqual(["".join(no_timestamp_matches)], EXPECTED_TEXT) timestamp_matches = re.findall(r"<\|[\d\.]+\|>", decoded_with_timestamps[0]) @@ -2653,7 +2636,7 @@ def test_whisper_longform_single_batch(self): is_increasing = all(timestamp_floats[i] <= timestamp_floats[i + 1] for i in range(len(timestamp_floats) - 1)) - assert is_increasing + self.assertTrue(is_increasing) @slow def test_whisper_longform_prompt_ids(self): @@ -2694,16 +2677,16 @@ def test_whisper_longform_prompt_ids(self): decoded_all_segments = processor.batch_decode(result, skip_special_tokens=True) # show that first segment has quilter and last segment has brion - assert "quilter" in first_text - assert "brion" in last_text + self.assertIn("quilter", first_text) + self.assertIn("brion", last_text) # condition on first segment correctly changes to kilter in first segment, but does not transcribe "brianno" correctly - assert "kilter" in decoded_first_segment[0][: len(first_text)].lower() - assert "brionno" not in decoded_first_segment[0][-len(last_text) :].lower() + self.assertIn("kilter", decoded_first_segment[0][: len(first_text)].lower()) + self.assertNotIn("brionno", decoded_first_segment[0][-len(last_text) :].lower()) # condition 
on all-segment correctly changes to kilter in first segment and correctly transcribes "brianno" - assert "kilter" in decoded_all_segments[0][: len(first_text)].lower() - assert "brionno" in decoded_all_segments[0][-len(last_text) :].lower() + self.assertIn("kilter", decoded_all_segments[0][: len(first_text)].lower()) + self.assertIn("brionno", decoded_all_segments[0][-len(last_text) :].lower()) @slow def test_whisper_longform_single_batch_prev_cond(self): @@ -2736,13 +2719,13 @@ def test_whisper_longform_single_batch_prev_cond(self): result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - assert decoded == EXPECTED_TEXT + self.assertEqual(decoded, EXPECTED_TEXT) @slow def test_whisper_shortform_single_batch_prev_cond(self): # fmt: off - EXPECTED_TEXT = [" Folks, I spend a lot of time right over there, night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing and the most topical antilock breaks and power steering pain, Stakingly stitching, leather seating so soft, it would make JD power and her associate blush. If you were to create the luxury sedan that is my nightly model, but sometimes— you're sometimes, folks— I lurched the consciousness and the back of an abandoned school bus"] - EXPECTED_TEXT1 = [" Folks, I spend a lot of time right over there night after night after, actually. Carefully selecting for you the day's noisiest, most aerodynamic headlines, stress testing, and the most topical, anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school"] + EXPECTED_TEXT = [" Folks, I spend a lot of time right over there, night after night after night, actually. 
Carefully selecting for you the day's noosiest, most aerodynamic headlines, stress testing, and those topical anti-lock breaks and power steering, painstakingly stitching, leather seating so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school bus and slap myself awake."] + EXPECTED_TEXT1 = [" Folks, I spend a lot of time right over there, night after night after night, actually. Carefully selecting for you the day's noosiest, most aerodynamic headlines, stress testing, and those topical anti-lock breaks and power steering, painstakingly stitching, leather seating so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school bus and slap myself a wig."] # fmt: on processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -2769,8 +2752,7 @@ def test_whisper_shortform_single_batch_prev_cond(self): torch.manual_seed(0) result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT + self.assertEqual(decoded, EXPECTED_TEXT) gen_kwargs = { "return_timestamps": True, @@ -2785,12 +2767,12 @@ def test_whisper_shortform_single_batch_prev_cond(self): result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - assert decoded == EXPECTED_TEXT1 + self.assertEqual(decoded, EXPECTED_TEXT1) @slow def test_whisper_longform_single_batch_beam(self): # fmt: off - EXPECTED_TEXT = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. 
He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art with Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the topper of painting. By Harry Quilter, M.A., because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. 
He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded this grace, and your friends are asking for you. I begged Ruggado long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest in all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, a great Calico. Calico went to the big gong and pounded on it, just as Ruggado used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggado's discarded ruby crown, and holding in his hand to scepter which Ruggado had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-laying cloth that was the only german who wore. The cut on his chest was still dripping blood. 
The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small, sharp, blow high on his chest. One minute, a voice said, and a time buzzer sounded, a minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oli's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Breon's head died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. 
When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. In the powerful twist that's rest of the side, in and under the guard."] + EXPECTED_TEXT = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art with Michelangelo was the furnishing upholsterer. 
Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the topper of painting. By Harry Quilter, M.A. Because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced, and your friends are asking for you. I begged Ruggado long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest in all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, a great Calico. 
Calico went to the big gong and pounded on it, just as Ruggado used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggado's discarded ruby crown, and holding in his hand to scepter which Ruggado had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-laying cloth that was the only germany war. The cut on his chest, still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oli's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest, therefore nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The 20s, he must have drawn his gun because the intruder said quickly, but that away, there'd be no fool. Out, there was silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. 
There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. This had died before during the 20s and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Brienne sensed it and knew the fifth point was his. In the powerful twist that's rest of the side, in and under the guard."] # fmt: on processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -2813,10 +2795,11 @@ def test_whisper_longform_single_batch_beam(self): "compression_ratio_threshold": 1.35, "condition_on_prev_tokens": True, "logprob_threshold": -1.0, + "renormalize_logits": True, # necessary to match OAI beam search implementation } def check_gen_kwargs(inputs, generation_config, *args, **kwargs): - assert generation_config.num_beams == gen_kwargs["num_beams"] + self.assertEqual(generation_config.num_beams, gen_kwargs["num_beams"]) self._patch_generation_mixin_generate(check_args_fn=check_gen_kwargs) @@ -2824,7 +2807,7 @@ def check_gen_kwargs(inputs, generation_config, *args, **kwargs): result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - assert decoded == EXPECTED_TEXT + self.assertEqual(decoded, EXPECTED_TEXT) @slow def test_whisper_longform_multi_batch(self): @@ -2869,16 +2852,16 @@ def 
test_whisper_longform_multi_batch(self): decoded_all = processor.batch_decode(result, skip_special_tokens=True) # make sure single & batch is exactly the same - assert decoded_all[0:1] == decoded_single[0] - assert decoded_all[1:2] == decoded_single[1] - assert decoded_all[2:3] == decoded_single[2] - assert decoded_all[3:4] == decoded_single[3] + self.assertEqual(decoded_all[0:1], decoded_single[0]) + self.assertEqual(decoded_all[1:2], decoded_single[1]) + self.assertEqual(decoded_all[2:3], decoded_single[2]) + self.assertEqual(decoded_all[3:4], decoded_single[3]) # exact match - assert decoded_all[0:1] == EXPECTED_TEXT_1 - assert decoded_all[1:2] == EXPECTED_TEXT_2 - assert decoded_all[2:3] == EXPECTED_TEXT_3 - assert decoded_all[3:4] == EXPECTED_TEXT_4 + self.assertEqual(decoded_all[0:1], EXPECTED_TEXT_1) + self.assertEqual(decoded_all[1:2], EXPECTED_TEXT_2) + self.assertEqual(decoded_all[2:3], EXPECTED_TEXT_3) + self.assertEqual(decoded_all[3:4], EXPECTED_TEXT_4) @slow def test_whisper_longform_multi_batch_prev_cond(self): @@ -2919,10 +2902,10 @@ def test_whisper_longform_multi_batch_prev_cond(self): decoded_single.append(processor.batch_decode(result, skip_special_tokens=True)) # exact match - assert decoded_single[0] == EXPECTED_TEXT_1 - assert decoded_single[1] == EXPECTED_TEXT_2 - assert decoded_single[2] == EXPECTED_TEXT_3 - assert decoded_single[3] == EXPECTED_TEXT_4 + self.assertEqual(decoded_single[0], EXPECTED_TEXT_1) + self.assertEqual(decoded_single[1], EXPECTED_TEXT_2) + self.assertEqual(decoded_single[2], EXPECTED_TEXT_3) + self.assertEqual(decoded_single[3], EXPECTED_TEXT_4) @slow def test_whisper_longform_multi_batch_hard(self): @@ -2951,12 +2934,21 @@ def test_whisper_longform_multi_batch_hard(self): audio = ds[:num_samples]["audio"] audios = [x["array"] for x in audio] + gen_kwargs = { + "return_timestamps": True, + "no_speech_threshold": 0.6, # necessary to trigger no speech detection + "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), +
"compression_ratio_threshold": 1.35, + "condition_on_prev_tokens": False, + "logprob_threshold": -2.0, # necessary to avoid triggering temp fallback that will introduce randomness since we are comparing to openai EXPECTED_TEXT + } + decoded_single = [] for audio in audios: inputs = processor(audio, return_tensors="pt", truncation=False, sampling_rate=16_000) inputs = inputs.to(device=torch_device) - result = model.generate(**inputs, return_timestamps=True) + result = model.generate(**inputs, **gen_kwargs) decoded_single += processor.batch_decode(result, skip_special_tokens=True) inputs = processor( @@ -2972,28 +2964,21 @@ def test_whisper_longform_multi_batch_hard(self): result = model.generate(**inputs, return_timestamps=True) decoded_all = processor.batch_decode(result, skip_special_tokens=True) - for i in range(num_samples): - assert decoded_all[i] == decoded_single[i] - assert decoded_all[i] == EXPECTED_TEXT[i] + self.assertListEqual(decoded_all, decoded_single) + self.assertListEqual(decoded_all, EXPECTED_TEXT) @slow def test_whisper_longform_multi_batch_hard_prev_cond(self): - # Without this set here, this test may fail if it is run with other tests (say, `test_tiny_*`). It's unclear - # why other tests may affect this tests: it seems some random operations are beyond the scene. - set_seed(0) # fmt: off EXPECTED_TEXT = [ - " Folks, if you watch the show, you know I spent a lot of time right over there.
Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories, developing the central headline pawns, definitely maneuvering an oh-so-topical night to F6, faming of classic Sicilian, named or variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a Fisher shows in lip-nitsky attack that culminates in the elegant lethal slow-played, all-pass on checkmate that is my nightly monologue, but sometimes sometimes folks I sometimes I start to the wake-up side down in the monkey bars of a condemned playground on a super fun site, get all hepped up on goofballs, rummage that would discard a tag bag of defective toys, yank out a fistball of disembodied doll limbs, toss them on a stain kid's place mad from a defunct denies, set up a table inside a rusty cargo container down by the warf and challenge toothless drifters to the godless bughouse blitz of tournament that is my segment, meanwhile.", - " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing on those topical anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush. To create the luxury sedan that is my nightly monologue, but sometimes I just sometimes focus. I lurched to consciousness in the back of an abandoned school bus and slapped myself awake with a crusty floor mat. 
Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen-moon render a gas tank out of an empty big gulp, filled with white claw and de-natured alcohol, then light a match and let her rip in the dis-mented one man, soapbox derby of news that is my segment.", - " Ladies and gentlemen, you know, I spent a lot of time right over there, raising the finest hosting news cattle firmly, yet tenderly milking the latest headlines from their jokes, swollen teats, churning the daily stories into the decadent Provincil style triple cream-breed. It is my nightly monologue, but sometimes sometimes I stagger home hungry after being released by the police and root around in the neighbor's trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-drawn street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire than a hunker down in hallucinate while eating the Listeria latent demon custard of news that is my segment.", - " Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle, and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ickel Greg Waferandi, who carefully died them in a pallet of bright, zesty shades, and adorn them in the finest most topical inlay work, using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddle stitching, and line it with bees, wax, coated linen, and finally attach a mallet hammered strap, purled hardware, and close-shet to create for you the one of a kind hope kutur, Ernme, is burkin bag that is my monologue, but sometimes, sometimes folks, sometimes. 
Sometimes I wake up in the last car of an abandoned rollercoaster at Coney Island where I'm hiding from the triads, I have some engine lubricants out of a safe way bag and staggered down the shore to tear the sail off a beach skoener, then I ripped the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel, lovely folks, and use it to stitch the sail into a loose pouch-like rock sack, and I stow in the back of a garbage truck to the junkyard, where I pick through to the debris for only the broken toys that make me the saddest, until I have loaded for you, the hobo fugitives bug out bindle of news that", - " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui, to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue, but sometimes just sometimes, I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself and use fry oil, wrap my hands and some old duct tape I stole from a broken car window, pound a six pack of blueberry hard-seller and a second pill, as I stole from a parked ambulance, then arm wrestle a raccoon in the back alley vision quest of news that is my segment.", - " You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. 
Then, using the Germanic tradition press, black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards and a face plate, and finally using fluted strips of white, alloyed molding, I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating, Anglo-Saxon battle helm that is my nightly monologue. But sometimes, sometimes, folks. Sometimes, just sometimes, I come to my senses fully naked on the deck of a pirate-be-seed, melee, container ship that picked me up floating on the detached door of a porta-potty in the Indian Ocean. Then, after a sunstroke induced realization of the crew of this ship plans to sell me an exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe and a pool chain that accepting my new role as captain and declaring myself King of the Windark Seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create these shopping wet pirate crown of news that is my segment. Me wild!", - " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks, I wake up in the baggage hole of Greyhound bus, it's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted up in bags of starfruit candies and expired eggs. 
Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants and as ovenmets to extract and serve the demented transients pound cake of news that is my segment.", - ( - " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cohen-Extra. Slurring the concoction with caffeine pills and a microwave bread bowl, I scream sing a prayer to Janice initiator of human life and God of Transformation as a half horse, half man freak ceases to life before me and the hideous collection of loose animal parts and corrupted men tissue that is my segment. Meanwhile!", - " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cohen-Extra. 
Slurring the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of Transformation as a half horse, half man freak ceases to life before me and the hideous collection of loose animal parts and corrupted men tissue that is my segment. Meanwhile!", - ) + " Folks, if you watch the show, you know I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories, developing the central headline pawns, definitely maneuvering an oh-so-topical night to F6, faming of classic Sicilian, named or variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a Fisher shows in lip-nitsky attack that culminates in the elegant lethal slow-played all-pass on checkmate that is my nightly monologue, but sometimes sometimes, sometimes folks I sometimes I start a little wake-up side down in the monkey bars of a condemned playground on a super fun site, get all hept up on goofballs, rummage that would discard a tag bag of defective toys, yank out a fistball of disembodied doll limbs, toss them on a stain kid's place mad from a defunct denies, set up a table inside a rusty cargo container down by the warf and challenge toothless drifters to the godless bughouse blitz of tournament that is my segment.", + " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing on those topical anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush. To create the luxury sedan that is my nightly monologue, but sometimes I just sometimes folks, I lurched to consciousness in the back of an abandoned school bus and slapped myself awake with a crusty floor mat. 
Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen-moon render a gas tank out of an empty big gulp, filled with white claw and de-natured alcohol, then light a match, letter-rip, and the dis-mented one-man soapbox derby of news that is my segment. Meanwhile.", + " Ladies and gentlemen, you know, I spent a lot of time right over there, raising the finest hosting news cattle firmly, yet tenderly milking the latest headlines from their jokes, swollen teats, churning the daily stories into the decadent Provincil style triple cream-breed. It is my nightly monologue, but sometimes sometimes I stagger home hungry after being released by the police and root around in the neighbor's trash can for an old milk carton scraped out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-dawn street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire than a hunker down in hallucinate while eating the Listeria latent demon custard of news that is my segment.", + " Folks, you watched this show. You know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Icol Greg Waferandi, who carefully die them in a pallet of bright, zesty shades, and adorn them in the finest, most topical inlay work, using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddle stitching, and line it with bees, wax, coated linen, finally attach a mallet hammered strap, pearl hardware, and close-shet to create for you the one-of-a-kind, hout-cout-tour, earned me his burkin bag that is my monologue, but sometimes, sometimes, folks. 
Sometimes, sometimes, sometimes, sometimes I wake up in the last car of an abandoned roller coaster at Coney Island, where I'm hiding from the triads, I huff some engine lubricants out of a safe way bag, and staggered down the shore to tear the sail off a beach skoener, then I ripped the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel, lovely folks. And use it to stitch the sail into a loose pouch-like rock sack, and I stow in the back of a garbage truck to the junkyard, where I pick through to the debris for only the broken toys that make me the saddest, until I have loaded for you. The hobo fugitives bug out bindle of news that is my segment. Meanwhile!", + " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui, to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue, but sometimes just sometimes, I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself and use fry oil, wrap my hands and some old duct tape I stole from a broken car window, pound a six pack of blueberry hardcelser and a sack of pills I stole from a parked ambulance, then arm wrestle a raccoon in the back alley vision quest of news that is my segment.", + " You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. 
Then, using the Germanic tradition press, black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards and a face plate, and finally using fluted strips of white alloyed molding I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes, folks. Sometimes, just sometimes, I come to my senses fully naked on the deck of a pirate, besieged, melee, container ship that picked me up floating on the detached door of a port of potty in the Indian Ocean. Then, after a sunstroke induced realization of the crew of this ship plans to sell me and exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe and a pool chain that accepting my new role as captain and declaring myself King of the Windark Seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create these shopping wet pirate crown of news that is my segment. Meanwhile!", + " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks, I wake up in the baggage hole of Greyhound bus. It's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted open bags of starfruit candies and expired eggs. 
Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants. As ovenmets to extract and serve the demented transience pound cake of news that is my segment.", + " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from it a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cole and extra. Slering the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of transformation as a half horse, half man freak, seizes to life before me and the hideous collection of loose animal parts and corrupted men tissue that is my segment. 
Meanwhile.", ] # fmt: on @@ -3025,18 +3010,15 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), "compression_ratio_threshold": 1.35, "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, + "logprob_threshold": -2.0, # necessary to avoid triggering temp fallback that will introduce randomness since we are comparing to openai EXTECTED_TEXT "num_beams": 5, + "renormalize_logits": True, # necessary to match OAI beam search implementation } result = model.generate(**inputs, **gen_kwargs) decoded_all = processor.batch_decode(result, skip_special_tokens=True) - for i in range(num_samples): - if isinstance(EXPECTED_TEXT[i], str): - assert decoded_all[i] == EXPECTED_TEXT[i] - elif isinstance(EXPECTED_TEXT[i], tuple): - assert decoded_all[i] in EXPECTED_TEXT[i] + self.assertListEqual(decoded_all, EXPECTED_TEXT) @slow def test_whisper_shortform_multi_batch_hard_prev_cond(self): @@ -3045,14 +3027,14 @@ def test_whisper_shortform_multi_batch_hard_prev_cond(self): set_seed(0) # fmt: off EXPECTED_TEXT = [ - ' Mr. Kfilter is the apostle of the Middle Classes and we are glad to welcome his gospel.', - " Nor is Mr. Qilter's manner less interesting than his matter.", - ' He tells us that at this festive season of the year, with Christmas and roce beef, looming before us, similarly drawn from eating and its results occur most readily to the mind.', - ' He has grabbed those with her surfered trigger late and his work is really a great after all, and can discover it in it but little of Rocky Ithaka.', - " L'Neile's pictures are a sort of upguards and add-um paintings, and Maessin's exquisite Itals are a national as a jingo poem. Mr. Birkett Foster's landscapes smiled at one much in the same way that Mr. Carcher used to flash his teeth. And Mr. 
John Collier gives his sitter a cheerful slapper in the back, before he says,", - ' It is obviously unnecessary for us, to point out how luminous these criticisms are, how delicate and expression.', - ' On the general principles of art and Mr. Kriltor rights with equal lucidity.', - ' Painting, he tells us is of a different quality to mathematics and finish in art is adding more effect.', + " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", + " Nor is Mr. Quilters' manner less interesting than his matter.", + " He tells us that at this festive season of the year with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind.", + " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.", + " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Colier gives his visitor a cheerful slap on the back before he says like a shampoo or a turkish bath. Next man!", + " It is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression.", + " On the general principles of art and Mr. 
Quilter writes with equal lucidity.", + " Painting he tells us is of a different quality to mathematics and finish in art is adding more effect.", ] # fmt: on @@ -3085,9 +3067,7 @@ def test_whisper_shortform_multi_batch_hard_prev_cond(self): result = model.generate(**inputs, **gen_kwargs) decoded_all = processor.batch_decode(result, skip_special_tokens=True) - for i in range(num_samples): - if isinstance(EXPECTED_TEXT[i], str): - assert decoded_all[i] == EXPECTED_TEXT[i] + self.assertListEqual(decoded_all, EXPECTED_TEXT) @slow def test_whisper_longform_no_speech_detection(self): @@ -3144,8 +3124,7 @@ def test_whisper_longform_no_speech_detection(self): result = model.generate(**inputs, **gen_kwargs) decoded_all = processor.batch_decode(result, skip_special_tokens=True) - for i in range(num_samples): - assert decoded_all[i] == EXPECTED_TEXT[i] + self.assertListEqual(decoded_all, EXPECTED_TEXT) @require_torch_accelerator @slow @@ -3218,6 +3197,7 @@ def test_whisper_empty_longform_multi_gpu(self): "num_beams": 5, "language": "fr", "task": "transcribe", + "return_timestamps": True, } torch.manual_seed(0) @@ -3239,7 +3219,7 @@ def test_tiny_static_generation(self): # compile the forward pass and assert equivalence static_generated_ids = model.generate(input_features, max_new_tokens=64) - assert (eager_generated_ids == static_generated_ids).all() + self.assertTrue((eager_generated_ids == static_generated_ids).all()) # check the compiled graph can be re-used and that the cache is correctly reset # reverse the ordering of the input features @@ -3249,7 +3229,7 @@ def test_tiny_static_generation(self): input_features = input_features[permutation_idx, ...] 
static_generated_ids = model.generate(input_features, max_new_tokens=64) # assert re-ordered generations match those from eager - assert (eager_generated_ids[permutation_idx, :] == static_generated_ids).all() + self.assertTrue((eager_generated_ids[permutation_idx, :] == static_generated_ids).all()) @slow def test_tiny_static_generation_long_form(self): @@ -3295,7 +3275,7 @@ def test_tiny_static_generation_long_form(self): set_seed(42) static_generated_ids = model.generate(**inputs, **gen_kwargs) - assert (eager_generated_ids == static_generated_ids).all() + self.assertTrue((eager_generated_ids == static_generated_ids).all()) # check the compiled graph can be re-used and that the cache is correctly reset # reverse the ordering of the input features @@ -3309,7 +3289,7 @@ def test_tiny_static_generation_long_form(self): set_seed(42) static_generated_ids = model.generate(input_features, attention_mask=attention_mask, **gen_kwargs) # assert re-ordered generations match those from eager - assert (eager_generated_ids[permutation_idx, :] == static_generated_ids).all() + self.assertTrue((eager_generated_ids[permutation_idx, :] == static_generated_ids).all()) def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None): @@ -3825,7 +3805,7 @@ def create_and_check_decoder_model_past(self, config, input_ids): output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) def create_and_check_decoder_model_attention_mask_past(self, config, input_ids): model = WhisperDecoder(config=config).to(torch_device).eval() @@ -3866,7 +3846,7 @@ def create_and_check_decoder_model_attention_mask_past(self, config, input_ids): output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for 
slice - assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) @require_torch