diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index c5c3b202846705..6aa967416d5477 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -293,6 +293,8 @@ def format_speech_generation_kwargs(kwargs):
         elif key.startswith("speech_"):
             key = key[len("speech_") :]
             kwargs_speech[key] = value
+        elif key == "generation_config":
+            kwargs_text[key] = value
         else:
             # If the key is already in a specific config, then it's been set with a
             # submodules specific value and we don't override
diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
index a8068eb0ad01ea..978000086e2c3b 100644
--- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -421,6 +421,8 @@ def format_speech_generation_kwargs(kwargs):
         elif key.startswith("speech_"):
             key = key[len("speech_") :]
             kwargs_speech[key] = value
+        elif key == "generation_config":
+            kwargs_text[key] = value
         else:
             # If the key is already in a specific config, then it's been set with a
             # submodules specific value and we don't override
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 451fff0b35fb8c..15f1219556cd0f 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -589,6 +589,11 @@ def test_attention_outputs(self):
                 [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
             )
 
+    # TODO: @ydshieh: refer to #34968
+    @unittest.skip(reason="Failing on multi-gpu runner")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
 
 @require_torch
 class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py
index dac2ce6b30ec22..e07e2ad392a3e6 100644
--- a/tests/pipelines/test_pipelines_text_to_audio.py
+++ b/tests/pipelines/test_pipelines_text_to_audio.py
@@ -27,7 +27,6 @@
     require_torch,
     require_torch_accelerator,
     require_torch_or_tf,
-    run_test_using_subprocess,
     slow,
     torch_device,
 )
@@ -67,10 +66,8 @@ def test_small_musicgen_pt(self):
         audio = [output["audio"] for output in outputs]
         self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
 
-    # TODO: @ylacombe: `SeamlessM4TForTextToSpeech.generate` has issue with `generation_config`. See issue #34811
     @slow
     @require_torch
-    @run_test_using_subprocess
     def test_medium_seamless_m4t_pt(self):
         speech_generator = pipeline(task="text-to-audio", model="facebook/hf-seamless-m4t-medium", framework="pt")