From c1926cef6b2c880766db3581ed6035c99005f00e Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 4 Dec 2024 15:58:36 +0530
Subject: [PATCH] [tests] refactor vae tests (#9808)

* add: autoencoderkl tests

* autoencodertiny.

* fix

* asymmetric autoencoder.

* more

* integration tests for stable audio decoder.

* consistency decoder vae tests

* remove grad check from consistency decoder.

* cog

* bye test_models_vae.py

* fix

* fix

* remove allegro

* fixes

* fixes

* fixes

---------

Co-authored-by: Dhruv Nair
---
 .../autoencoders/autoencoder_kl_cogvideox.py  |   20 +-
 .../autoencoder_kl_temporal_decoder.py        |    8 -
 .../models/autoencoders/autoencoder_tiny.py   |    6 +-
 .../test_models_asymmetric_autoencoder_kl.py  |  261 ++++
 .../test_models_autoencoder_kl.py             |  468 ++++++
 .../test_models_autoencoder_kl_cogvideox.py   |  179 +++
 ..._models_autoencoder_kl_temporal_decoder.py |   73 +
 .../test_models_autoencoder_oobleck.py        |  228 +++
 .../test_models_autoencoder_tiny.py           |  251 ++++
 .../test_models_consistency_decoder_vae.py    |  300 ++++
 tests/models/autoencoders/test_models_vae.py  | 1249 -----------------
 tests/models/autoencoders/vae.py              |   86 ++
 tests/models/test_modeling_common.py          |    5 -
 .../controlnet_xs/test_controlnetxs.py        |    2 +-
 .../controlnet_xs/test_controlnetxs_sdxl.py   |    2 +-
 tests/pipelines/test_pipelines_common.py      |    2 +-
 16 files changed, 1863 insertions(+), 1277 deletions(-)
 create mode 100644 tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py
 create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl.py
 create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py
 create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py
 create mode 100644 tests/models/autoencoders/test_models_autoencoder_oobleck.py
 create mode 100644 tests/models/autoencoders/test_models_autoencoder_tiny.py
 create mode 100644 tests/models/autoencoders/test_models_consistency_decoder_vae.py
 delete mode 100644 tests/models/autoencoders/test_models_vae.py
 create mode 100644 tests/models/autoencoders/vae.py

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index fbcb964392f9..941b3eb07f10 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -433,7 +433,7 @@ def create_forward(*inputs):
                     hidden_states,
                     temb,
                     zq,
-                    conv_cache=conv_cache.get(conv_cache_key),
+                    conv_cache.get(conv_cache_key),
                 )
             else:
                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -531,7 +531,7 @@ def create_forward(*inputs):
                     return create_forward

                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet), hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
+                    create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
                 )
             else:
                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -649,7 +649,7 @@ def create_forward(*inputs):
                     hidden_states,
                     temb,
                     zq,
-                    conv_cache=conv_cache.get(conv_cache_key),
+                    conv_cache.get(conv_cache_key),
                 )
             else:
                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -789,7 +789,7 @@ def custom_forward(*inputs):
                     hidden_states,
                     temb,
                     None,
-                    conv_cache=conv_cache.get(conv_cache_key),
+                    conv_cache.get(conv_cache_key),
                 )

             # 2. Mid
@@ -798,14 +798,14 @@ def custom_forward(*inputs):
                 hidden_states,
                 temb,
                 None,
-                conv_cache=conv_cache.get("mid_block"),
+                conv_cache.get("mid_block"),
             )
         else:
             # 1. Down
             for i, down_block in enumerate(self.down_blocks):
                 conv_cache_key = f"down_block_{i}"
                 hidden_states, new_conv_cache[conv_cache_key] = down_block(
-                    hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key)
+                    hidden_states, temb, None, conv_cache.get(conv_cache_key)
                 )

             # 2. Mid
@@ -953,7 +953,7 @@ def custom_forward(*inputs):
                 hidden_states,
                 temb,
                 sample,
-                conv_cache=conv_cache.get("mid_block"),
+                conv_cache.get("mid_block"),
             )

             # 2. Up
@@ -964,7 +964,7 @@ def custom_forward(*inputs):
                     hidden_states,
                     temb,
                     sample,
-                    conv_cache=conv_cache.get(conv_cache_key),
+                    conv_cache.get(conv_cache_key),
                 )
         else:
             # 1. Mid
@@ -1476,7 +1476,7 @@ def forward(
             z = posterior.sample(generator=generator)
         else:
             z = posterior.mode()
-        dec = self.decode(z)
+        dec = self.decode(z).sample
         if not return_dict:
             return (dec,)
-        return dec
+        return DecoderOutput(sample=dec)
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index f25430050ce5..38ad78c0707b 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -229,14 +229,6 @@ def __init__(

         self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)

-        sample_size = (
-            self.config.sample_size[0]
-            if isinstance(self.config.sample_size, (list, tuple))
-            else self.config.sample_size
-        )
-        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
-        self.tile_overlap_factor = 0.25
-
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (Encoder, TemporalDecoder)):
             module.gradient_checkpointing = value
diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py
index 6e503478fe2b..35081c22dfc4 100644
--- a/src/diffusers/models/autoencoders/autoencoder_tiny.py
+++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py
@@ -310,7 +310,9 @@ def decode(
         self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
     ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         if self.use_slicing and x.shape[0] > 1:
-            output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
+            output = [
+                self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x_slice) for x_slice in x.split(1)
+            ]
             output = torch.cat(output)
         else:
             output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
@@ -341,7 +343,7 @@ def forward(
         # as if we were loading the latents from an RGBA uint8 image.
         unscaled_enc = self.unscale_latents(scaled_enc / 255.0)

-        dec = self.decode(unscaled_enc)
+        dec = self.decode(unscaled_enc).sample

         if not return_dict:
             return (dec,)
diff --git a/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py b/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py
new file mode 100644
index 000000000000..11b93ac2fb45
--- /dev/null
+++ b/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py
@@ -0,0 +1,261 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AsymmetricAutoencoderKL +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + require_torch_accelerator, + require_torch_gpu, + skip_mps, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AsymmetricAutoencoderKL + main_input_name = "sample" + base_precision = 1e-2 + + def get_asym_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "down_block_out_channels": block_out_channels, + "layers_per_down_block": 1, + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "up_block_out_channels": block_out_channels, + "layers_per_up_block": 1, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + "sample_size": 32, + "scaling_factor": 0.18215, + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + mask = torch.ones((batch_size, 1) + sizes).to(torch_device) + + return {"sample": image, "mask": mask} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_asym_autoencoder_kl_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + @unittest.skip("Unsupported test.") + def test_forward_with_norm_groups(self): + pass + + +@slow +class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase): + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x-1-5", fp16=False): + revision = "main" + torch_dtype = torch.float32 + + model = AsymmetricAutoencoderKL.from_pretrained( + model_id, + torch_dtype=torch_dtype, + revision=revision, + ) + model.to(torch_device).eval() + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return 
torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205], + [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], + ], + [ + 47, + [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529], + [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], + ], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.0340, 0.2870, 0.1698, -0.0105, -0.3448, 0.3529, -0.1321, 0.1097], + [-0.0344, 0.2912, 0.1687, -0.0137, -0.3462, 0.3552, -0.1337, 0.1078], + ], + [ + 47, + [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], + [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], + ], + # fmt: on + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [13, [-0.0521, -0.2939, 0.1540, -0.1855, -0.5936, -0.3138, -0.4579, -0.2275]], + [37, [-0.1820, -0.4345, -0.0455, -0.2923, -0.8035, -0.5089, -0.4795, -0.3106]], + # fmt: on + ] + ) + @require_torch_accelerator + @skip_mps + def test_stable_diffusion_decode(self, seed, expected_slice): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=2e-3) + + @parameterized.expand([(13,), (16,), (37,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=5e-2) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], + [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], + # fmt: on + ] + ) + def 
test_stable_diffusion_encode_sample(self, seed, expected_slice): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + dist = model.encode(image).latent_dist + sample = dist.sample(generator=generator) + + assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] + + output_slice = sample[0, -1, -3:, -3:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + tolerance = 3e-3 if torch_device != "mps" else 1e-2 + assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/tests/models/autoencoders/test_models_autoencoder_kl.py b/tests/models/autoencoders/test_models_autoencoder_kl.py new file mode 100644 index 000000000000..52bf5aba204b --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl.py @@ -0,0 +1,468 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AutoencoderKL +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + require_torch_accelerator, + require_torch_accelerator_with_fp16, + require_torch_gpu, + skip_mps, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKL + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "block_out_channels": block_out_channels, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_kl_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, 
generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"Decoder", "Encoder"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_from_pretrained_hub(self): + model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = model(**self.dummy_input) + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained(self): + model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") + model = model.to(torch_device) + model.eval() + + # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + generator = torch.Generator(device=generator_device).manual_seed(0) + else: + generator = torch.manual_seed(0) + + image = torch.randn( + 1, + model.config.in_channels, + model.config.sample_size, + model.config.sample_size, + generator=torch.manual_seed(0), + ) + image = image.to(torch_device) + with torch.no_grad(): + output = model(image, sample_posterior=True, generator=generator).sample + + output_slice = output[0, -1, -3:, -3:].flatten().cpu() + + # Since the VAE Gaussian prior's generator is seeded on the appropriate device, + # the expected output slices are not the same for CPU and GPU. 
+ if torch_device == "mps": + expected_output_slice = torch.tensor( + [ + -4.0078e-01, + -3.8323e-04, + -1.2681e-01, + -1.1462e-01, + 2.0095e-01, + 1.0893e-01, + -8.8247e-02, + -3.0361e-01, + -9.8644e-03, + ] + ) + elif generator_device == "cpu": + expected_output_slice = torch.tensor( + [ + -0.1352, + 0.0878, + 0.0419, + -0.0818, + -0.1069, + 0.0688, + -0.1458, + -0.4446, + -0.0026, + ] + ) + else: + expected_output_slice = torch.tensor( + [ + -0.2421, + 0.4642, + 0.2507, + -0.0438, + 0.0682, + 0.3160, + -0.2018, + -0.0727, + 0.2485, + ] + ) + + self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) + + +@slow +class AutoencoderKLIntegrationTests(unittest.TestCase): + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): + revision = "fp16" if fp16 else None + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderKL.from_pretrained( + model_id, + subfolder="vae", + torch_dtype=torch_dtype, + revision=revision, + ) + model.to(torch_device) + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.1556, 0.9848, -0.0410, -0.0642, -0.2685, 0.8381, -0.2004, -0.0700], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], + ], + [ + 47, + [-0.2376, 0.1200, 0.1337, -0.4830, -0.2504, -0.0759, -0.0486, -0.4077], + [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], + ], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], + [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], + # fmt: on + ] + ) + @require_torch_accelerator_with_fp16 + def test_stable_diffusion_fp16(self, seed, expected_slice): + model = self.get_sd_vae_model(fp16=True) + image = self.get_sd_image(seed, fp16=True) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) + + 
@parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], + ], + [ + 47, + [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], + [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], + ], + # fmt: on + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], + [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], + # fmt: on + ] + ) + @require_torch_accelerator + @skip_mps + def test_stable_diffusion_decode(self, seed, expected_slice): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) + + @parameterized.expand( + [ + # fmt: off + [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], + [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], + # fmt: on + ] + ) + @require_torch_accelerator_with_fp16 + def test_stable_diffusion_decode_fp16(self, seed, expected_slice): + model = self.get_sd_vae_model(fp16=True) + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) + + @parameterized.expand([(13,), (16,), (27,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed): + model = self.get_sd_vae_model(fp16=True) + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=1e-1) + + @parameterized.expand([(13,), (16,), (37,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = 
model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=1e-2) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], + [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], + # fmt: on + ] + ) + def test_stable_diffusion_encode_sample(self, seed, expected_slice): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + dist = model.encode(image).latent_dist + sample = dist.sample(generator=generator) + + assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] + + output_slice = sample[0, -1, -3:, -3:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + tolerance = 3e-3 if torch_device != "mps" else 1e-2 + assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py b/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py new file mode 100644 index 000000000000..7336bb3d3e97 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch + +from diffusers import AutoencoderKLCogVideoX +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLCogVideoXTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKLCogVideoX + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_kl_cogvideox_config(self): + return { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ( + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ), + "up_block_types": ( + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ), + "block_out_channels": (8, 8, 8, 8), + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 2, + "temporal_compression_ratio": 4, + } + + @property + def dummy_input(self): + batch_size = 4 + num_frames = 8 + num_channels = 3 + sizes = (16, 16) + + image = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 8, 16, 16) + + @property + def output_shape(self): + return (3, 8, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_kl_cogvideox_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the 
outputs when slicing is manually disabled.", + ) + + def test_gradient_checkpointing_is_applied(self): + expected_set = { + "CogVideoXDownBlock3D", + "CogVideoXDecoder3D", + "CogVideoXEncoder3D", + "CogVideoXUpBlock3D", + "CogVideoXMidBlock3D", + } + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_forward_with_norm_groups(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["norm_num_groups"] = 16 + init_dict["block_out_channels"] = (16, 32, 32, 32) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.to_tuple()[0] + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + @unittest.skip("Unsupported test.") + def test_outputs_equivalence(self): + pass diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py new file mode 100644 index 000000000000..4308cb64896e --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py @@ -0,0 +1,73 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from diffusers import AutoencoderKLTemporalDecoder +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTemporalDecoderTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKLTemporalDecoder + main_input_name = "sample" + base_precision = 1e-2 + + @property + def dummy_input(self): + batch_size = 3 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + num_frames = 3 + + return {"sample": image, "num_frames": num_frames} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "block_out_channels": [32, 64], + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], + "latent_channels": 4, + "layers_per_block": 2, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"Encoder", "TemporalDecoder"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @unittest.skip("Test unsupported.") + def test_forward_with_norm_groups(self): + pass diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py new file mode 100644 index 000000000000..4807fa298344 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_oobleck.py @@ -0,0 +1,228 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import torch +from datasets import load_dataset +from parameterized import parameterized + +from diffusers import AutoencoderOobleck +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderOobleck + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_oobleck_config(self, block_out_channels=None): + init_dict = { + "encoder_hidden_size": 12, + "decoder_channels": 12, + "decoder_input_channels": 6, + "audio_channels": 2, + "downsampling_ratios": [2, 4], + "channel_multiples": [1, 2], + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 2 + seq_len = 24 + + waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device) + + return {"sample": waveform, "sample_posterior": False} + + @property + def input_shape(self): + return (2, 24) + + @property + def output_shape(self): + return (2, 24) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_oobleck_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + @unittest.skip("Test unsupported.") + def test_forward_with_norm_groups(self): + pass + + @unittest.skip("No attention module used in this model") + def test_set_attn_processor_for_determinism(self): + return + + +@slow +class AutoencoderOobleckIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def _load_datasamples(self, num_samples): + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return torch.nn.utils.rnn.pad_sequence( + [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True + ) + + def get_audio(self, audio_sample_size=2097152, fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + audio = self._load_datasamples(2).to(torch_device).to(dtype) + + # pad / crop to audio_sample_size + audio = torch.nn.functional.pad(audio[:, 
:audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1])) + + # todo channel + audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device) + + return audio + + def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False): + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderOobleck.from_pretrained( + model_id, + subfolder="vae", + torch_dtype=torch_dtype, + ) + model.to(torch_device) + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], + [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(audio, generator=generator, sample_posterior=True).sample + + assert sample.shape == audio.shape + assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 + + output_slice = sample[-1, 1, 5:10].cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) + + def test_stable_diffusion_mode(self): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + + with torch.no_grad(): + sample = model(audio, sample_posterior=False).sample + + assert sample.shape == audio.shape + + @parameterized.expand( + [ + # fmt: off + [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], + [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], + # fmt: on + ] + ) + def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + generator = self.get_generator(seed) + + with torch.no_grad(): + x = audio + posterior = model.encode(x).latent_dist + z = posterior.sample(generator=generator) + sample = model.decode(z).sample + + # (batch_size, latent_dim, sequence_length) + assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024) + + assert sample.shape == audio.shape + assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 + + output_slice = sample[-1, 1, 5:10].cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py new file mode 100644 index 000000000000..4de3822fa835 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_tiny.py @@ -0,0 +1,251 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AutoencoderTiny +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderTinyTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderTiny + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_tiny_config(self, block_out_channels=None): + block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] + init_dict = { + "in_channels": 3, + "out_channels": 3, + "encoder_block_out_channels": block_out_channels, + "decoder_block_out_channels": block_out_channels, + "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], + "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)], + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_tiny_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + @unittest.skip("Model doesn't yet support smaller resolution.") + def test_enable_disable_tiling(self): + pass + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict)[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict)[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict)[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + @unittest.skip("Test not supported.") + def test_outputs_equivalence(self): + pass + + @unittest.skip("Test not supported.") + def test_forward_with_norm_groups(self): + pass + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"DecoderTiny", "EncoderTiny"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_effective_gradient_checkpointing(self): + if not self.model_class._supports_gradient_checkpointing: + return # Skip test if model does not support gradient checkpointing + + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + inputs_dict_copy = copy.deepcopy(inputs_dict) + torch.manual_seed(0) + model = 
self.model_class(**init_dict) + model.to(torch_device) + + assert not model.is_gradient_checkpointing and model.training + + out = model(**inputs_dict).sample + # run the backwards pass on the model. For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model.zero_grad() + + labels = torch.randn_like(out) + loss = (out - labels).mean() + loss.backward() + + # re-instantiate the model now enabling gradient checkpointing + torch.manual_seed(0) + model_2 = self.model_class(**init_dict) + # clone model + model_2.load_state_dict(model.state_dict()) + model_2.to(torch_device) + model_2.enable_gradient_checkpointing() + + assert model_2.is_gradient_checkpointing and model_2.training + + out_2 = model_2(**inputs_dict_copy).sample + # run the backwards pass on the model. For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model_2.zero_grad() + loss_2 = (out_2 - labels).mean() + loss_2.backward() + + # compare the output and parameters gradients + self.assertTrue((loss - loss_2).abs() < 1e-3) + named_params = dict(model.named_parameters()) + named_params_2 = dict(model_2.named_parameters()) + + for name, param in named_params.items(): + if "encoder.layers" in name: + continue + self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=3e-2)) + + +@slow +class AutoencoderTinyIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False): + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype) + model.to(torch_device).eval() + return model + + @parameterized.expand( + [ + [(1, 4, 73, 97), (1, 3, 584, 776)], + [(1, 4, 97, 73), (1, 3, 776, 584)], + [(1, 4, 49, 65), (1, 3, 392, 520)], + [(1, 4, 65, 49), (1, 3, 520, 392)], + [(1, 4, 49, 49), (1, 3, 392, 392)], + ] + ) + def test_tae_tiling(self, in_shape, out_shape): + model = self.get_sd_vae_model() + model.enable_tiling() + with torch.no_grad(): + zeros = torch.zeros(in_shape).to(torch_device) + dec = model.decode(zeros).sample + assert dec.shape == out_shape + + def test_stable_diffusion(self): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed=33) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382]) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand([(True,), (False,)]) + def test_tae_roundtrip(self, enable_tiling): + # load the autoencoder + model = self.get_sd_vae_model() + if enable_tiling: + model.enable_tiling() + + # make a black image with a white square in the middle, + # which is large enough to split across multiple tiles + image = -torch.ones(1, 3, 1024, 1024, device=torch_device) + 
        image[..., 256:768, 256:768] = 1.0
+
+        # round-trip the image through the autoencoder
+        with torch.no_grad():
+            sample = model(image).sample
+
+        # the autoencoder reconstruction should match original image, sorta
+        def downscale(x):
+            return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor)
+
+        assert torch_all_close(downscale(sample), downscale(image), atol=0.125)
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
new file mode 100644
index 000000000000..77977a78d83b
--- /dev/null
+++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -0,0 +1,300 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_image,
+    slow,
+    torch_all_close,
+    torch_device,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase):
+    model_class = ConsistencyDecoderVAE
+    main_input_name = "sample"
+    base_precision = 1e-2
+    forward_requires_fresh_args = True
+
+    def get_consistency_vae_config(self, block_out_channels=None, norm_num_groups=None):
+        block_out_channels = block_out_channels or [2, 4]
+        norm_num_groups = norm_num_groups or 2
+        return {
+            "encoder_block_out_channels": block_out_channels,
+            "encoder_in_channels": 3,
+            "encoder_out_channels": 4,
+            "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
+            "decoder_add_attention": False,
+            "decoder_block_out_channels": block_out_channels,
+            "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels),
+            "decoder_downsample_padding": 1,
+            "decoder_in_channels": 7,
+            "decoder_layers_per_block": 1,
+            "decoder_norm_eps": 1e-05,
+            "decoder_norm_num_groups": norm_num_groups,
+            "encoder_norm_num_groups": norm_num_groups,
+            "decoder_num_train_timesteps": 1024,
+            "decoder_out_channels": 6,
+            "decoder_resnet_time_scale_shift": "scale_shift",
+            "decoder_time_embedding_type": "learned",
+            "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels),
+            "scaling_factor": 1,
+            "latent_channels": 4,
+        }
+
+    def inputs_dict(self, seed=None):
+        if seed is None:
+            generator = torch.Generator("cpu").manual_seed(0)
+        else:
+            generator = torch.Generator("cpu").manual_seed(seed)
+        image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device))
+
+        return {"sample": image, "generator": generator}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def init_dict(self):
+        return self.get_consistency_vae_config()
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return self.init_dict, self.inputs_dict()
+
+    def test_enable_disable_tiling(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict).to(torch_device)
+
+        inputs_dict.update({"return_dict": False})
+        _ = inputs_dict.pop("generator")
+
+        torch.manual_seed(0)
+        output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(0)
+        model.enable_tiling()
+        output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertLess(
+            (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(),
+            0.5,
+            "VAE tiling should not affect the inference results",
+        )
+
+        torch.manual_seed(0)
+        model.disable_tiling()
+        output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertEqual(
+            output_without_tiling.detach().cpu().numpy().all(),
+            output_without_tiling_2.detach().cpu().numpy().all(),
+            "Without tiling outputs should match with the outputs when tiling is manually disabled.",
+        )
+
+    def test_enable_disable_slicing(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict).to(torch_device)
+
+        inputs_dict.update({"return_dict": False})
+        _ = inputs_dict.pop("generator")
+
+        torch.manual_seed(0)
+        output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(0)
+        model.enable_slicing()
+        output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertLess(
+            (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(),
+            0.5,
+            "VAE slicing should not affect the inference results",
+        )
+
+        torch.manual_seed(0)
+        model.disable_slicing()
+        output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertEqual(
+            output_without_slicing.detach().cpu().numpy().all(),
+            output_without_slicing_2.detach().cpu().numpy().all(),
+            "Without slicing outputs should match with the outputs when slicing is manually disabled.",
+        )
+
+
+@slow
+class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @torch.no_grad()
+    def test_encode_decode(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        vae.to(torch_device)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        ).resize((256, 256))
+        image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to(
+            torch_device
+        )
+
+        latent = vae.encode(image).latent_dist.mean
+
+        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
+
+        actual_output = sample[0, :2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024])
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_sd(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None
+        )
+        pipe.to(torch_device)
+
+        out = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        actual_output = out[:2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759])
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_encode_decode_f16(self):
+        vae = ConsistencyDecoderVAE.from_pretrained(
+            "openai/consistency-decoder", torch_dtype=torch.float16
+        )  # TODO - update
+        vae.to(torch_device)
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        ).resize((256, 256))
+        image = (
+            torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :]
+            .half()
+            .to(torch_device)
+        )
+
+        latent = vae.encode(image).latent_dist.mean
+
+        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
+
+        actual_output = sample[0, :2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor(
+            [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471],
+            dtype=torch.float16,
+        )
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_sd_f16(self):
+        vae = ConsistencyDecoderVAE.from_pretrained(
+            "openai/consistency-decoder", torch_dtype=torch.float16
+        )  # TODO - update
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            torch_dtype=torch.float16,
+            vae=vae,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+
+        out = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        actual_output = out[:2, :2, :2].flatten().cpu()
+        expected_output = torch.tensor(
+            [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035],
+            dtype=torch.float16,
+        )
+
+        assert torch_all_close(actual_output, expected_output, atol=5e-3)
+
+    def test_vae_tiling(self):
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out_1 = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        # make sure tiled vae decode yields the same result
+        pipe.enable_vae_tiling()
+        out_2 = pipe(
+            "horse",
+            num_inference_steps=2,
+            output_type="pt",
+            generator=torch.Generator("cpu").manual_seed(0),
+        ).images[0]
+
+        assert torch_all_close(out_1, out_2, atol=5e-3)
+
+        # test that tiled decode works with various shapes
+        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
+        with torch.no_grad():
+            for shape in shapes:
+                image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype)
+                pipe.vae.decode(image)
diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py
deleted file mode 100644
index d475160cc796..000000000000
--- a/tests/models/autoencoders/test_models_vae.py
+++ /dev/null
@@ -1,1249 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from datasets import load_dataset -from parameterized import parameterized - -from diffusers import ( - AsymmetricAutoencoderKL, - AutoencoderKL, - AutoencoderKLTemporalDecoder, - AutoencoderOobleck, - AutoencoderTiny, - ConsistencyDecoderVAE, - StableDiffusionPipeline, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.loading_utils import load_image -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - is_peft_available, - load_hf_numpy, - require_peft_backend, - require_torch_accelerator, - require_torch_accelerator_with_fp16, - require_torch_gpu, - skip_mps, - slow, - torch_all_close, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin - - -if is_peft_available(): - from peft import LoraConfig - - -enable_full_determinism() - - -def get_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - init_dict = { - "block_out_channels": block_out_channels, - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), - "latent_channels": 4, - "norm_num_groups": norm_num_groups, - } - return init_dict - - -def get_asym_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - init_dict = { - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "down_block_out_channels": block_out_channels, - "layers_per_down_block": 1, - "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), - "up_block_out_channels": block_out_channels, - "layers_per_up_block": 1, - "act_fn": "silu", - "latent_channels": 4, - "norm_num_groups": norm_num_groups, - "sample_size": 32, - "scaling_factor": 0.18215, - } - return init_dict - - -def get_autoencoder_tiny_config(block_out_channels=None): - block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] - init_dict = { - "in_channels": 3, - "out_channels": 3, - "encoder_block_out_channels": block_out_channels, - "decoder_block_out_channels": block_out_channels, - "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], - "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)], - } - return init_dict - - -def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - return { - "encoder_block_out_channels": block_out_channels, - "encoder_in_channels": 3, - "encoder_out_channels": 4, - "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "decoder_add_attention": False, - 
"decoder_block_out_channels": block_out_channels, - "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels), - "decoder_downsample_padding": 1, - "decoder_in_channels": 7, - "decoder_layers_per_block": 1, - "decoder_norm_eps": 1e-05, - "decoder_norm_num_groups": norm_num_groups, - "encoder_norm_num_groups": norm_num_groups, - "decoder_num_train_timesteps": 1024, - "decoder_out_channels": 6, - "decoder_resnet_time_scale_shift": "scale_shift", - "decoder_time_embedding_type": "learned", - "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels), - "scaling_factor": 1, - "latent_channels": 4, - } - - -def get_autoencoder_oobleck_config(block_out_channels=None): - init_dict = { - "encoder_hidden_size": 12, - "decoder_channels": 12, - "decoder_input_channels": 6, - "audio_channels": 2, - "downsampling_ratios": [2, 4], - "channel_multiples": [1, 2], - } - return init_dict - - -class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_kl_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_training(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"Decoder", "Encoder"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") - model = model.to(torch_device) - model.eval() - - # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - generator = torch.Generator(device=generator_device).manual_seed(0) - else: - generator = torch.manual_seed(0) - - image = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image, sample_posterior=True, generator=generator).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - - # Since the VAE Gaussian prior's generator is seeded on the appropriate device, - # the expected output slices are not the same for CPU and GPU. 
- if torch_device == "mps": - expected_output_slice = torch.tensor( - [ - -4.0078e-01, - -3.8323e-04, - -1.2681e-01, - -1.1462e-01, - 2.0095e-01, - 1.0893e-01, - -8.8247e-02, - -3.0361e-01, - -9.8644e-03, - ] - ) - elif generator_device == "cpu": - expected_output_slice = torch.tensor( - [ - -0.1352, - 0.0878, - 0.0419, - -0.0818, - -0.1069, - 0.0688, - -0.1458, - -0.4446, - -0.0026, - ] - ) - else: - expected_output_slice = torch.tensor( - [ - -0.2421, - 0.4642, - 0.2507, - -0.0438, - 0.0682, - 0.3160, - -0.2018, - -0.0727, - 0.2485, - ] - ) - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - @require_peft_backend - def test_lora_adapter(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - vae = self.model_class(**init_dict) - - target_modules_vae = [ - "conv1", - "conv2", - "conv_in", - "conv_shortcut", - "conv", - "conv_out", - "skip_conv_1", - "skip_conv_2", - "skip_conv_3", - "skip_conv_4", - "to_k", - "to_q", - "to_v", - "to_out.0", - ] - vae_lora_config = LoraConfig( - r=16, - init_lora_weights="gaussian", - target_modules=target_modules_vae, - ) - - vae.add_adapter(vae_lora_config, adapter_name="vae_lora") - active_lora = vae.active_adapters() - self.assertTrue(len(active_lora) == 1) - self.assertTrue(active_lora[0] == "vae_lora") - - -class AsymmetricAutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AsymmetricAutoencoderKL - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - mask = torch.ones((batch_size, 1) + sizes).to(torch_device) - - return {"sample": image, "mask": mask} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_asym_autoencoder_kl_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_forward_with_norm_groups(self): - pass - - -class AutoencoderTinyTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderTiny - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_tiny_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_outputs_equivalence(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"DecoderTiny", "EncoderTiny"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @unittest.skip( - "Gradient checkpointing is supported but this test doesn't apply to this class because it's forward is a bit different from the rest." 
- ) - def test_effective_gradient_checkpointing(self): - pass - - -class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase): - model_class = ConsistencyDecoderVAE - main_input_name = "sample" - base_precision = 1e-2 - forward_requires_fresh_args = True - - def inputs_dict(self, seed=None): - if seed is None: - generator = torch.Generator("cpu").manual_seed(0) - else: - generator = torch.Generator("cpu").manual_seed(seed) - image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device)) - - return {"sample": image, "generator": generator} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - @property - def init_dict(self): - return get_consistency_vae_config() - - def prepare_init_args_and_inputs_for_common(self): - return self.init_dict, self.inputs_dict() - - @unittest.skip - def test_training(self): - ... - - @unittest.skip - def test_ema_training(self): - ... - - -class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKLTemporalDecoder - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 3 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - num_frames = 3 - - return {"sample": image, "num_frames": num_frames} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "latent_channels": 4, - "layers_per_block": 2, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_training(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"Encoder", "TemporalDecoder"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - -class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AutoencoderOobleck - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 2 - seq_len = 24 - - waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device) - - return {"sample": waveform, "sample_posterior": False} - - @property - def input_shape(self): - return (2, 24) - - @property - def output_shape(self): - return (2, 24) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_oobleck_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_forward_with_norm_groups(self): - pass - - @unittest.skip("No attention module used in this model") - def test_set_attn_processor_for_determinism(self): - return - - -@slow -class AutoencoderTinyIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 
512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False): - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype) - model.to(torch_device).eval() - return model - - @parameterized.expand( - [ - [(1, 4, 73, 97), (1, 3, 584, 776)], - [(1, 4, 97, 73), (1, 3, 776, 584)], - [(1, 4, 49, 65), (1, 3, 392, 520)], - [(1, 4, 65, 49), (1, 3, 520, 392)], - [(1, 4, 49, 49), (1, 3, 392, 392)], - ] - ) - def test_tae_tiling(self, in_shape, out_shape): - model = self.get_sd_vae_model() - model.enable_tiling() - with torch.no_grad(): - zeros = torch.zeros(in_shape).to(torch_device) - dec = model.decode(zeros).sample - assert dec.shape == out_shape - - def test_stable_diffusion(self): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed=33) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382]) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand([(True,), (False,)]) - def test_tae_roundtrip(self, enable_tiling): - # load the autoencoder - model = self.get_sd_vae_model() - if enable_tiling: - model.enable_tiling() - - # make a black image with a white square in the middle, - # which is large enough to split across multiple tiles - image = -torch.ones(1, 3, 1024, 1024, device=torch_device) - image[..., 256:768, 256:768] = 1.0 - - # round-trip the image through the autoencoder - with torch.no_grad(): - sample = model(image).sample - - # the autoencoder reconstruction should match original image, sorta - def downscale(x): - return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor) - - assert torch_all_close(downscale(sample), downscale(image), atol=0.125) - - -@slow -class AutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device) - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.1556, 0.9848, -0.0410, -0.0642, -0.2685, 0.8381, -0.2004, -0.0700], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2376, 0.1200, 
0.1337, -0.4830, -0.2504, -0.0759, -0.0486, -0.4077], - [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], - [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], - # fmt: on - ] - ) - @require_torch_accelerator_with_fp16 - def test_stable_diffusion_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - image = self.get_sd_image(seed, fp16=True) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], - [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], - [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], - # fmt: on - ] - ) - @require_torch_accelerator - @skip_mps - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], - [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], - # fmt: on - ] - ) - @require_torch_accelerator_with_fp16 - def test_stable_diffusion_decode_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 
64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand([(13,), (16,), (27,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=1e-1) - - @parameterized.expand([(13,), (16,), (37,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 3e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) - - -@slow -class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x-1-5", fp16=False): - revision = "main" - torch_dtype = torch.float32 - - model = AsymmetricAutoencoderKL.from_pretrained( - model_id, - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device).eval() - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return 
torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205], - [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], - ], - [ - 47, - [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529], - [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], - ], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.0340, 0.2870, 0.1698, -0.0105, -0.3448, 0.3529, -0.1321, 0.1097], - [-0.0344, 0.2912, 0.1687, -0.0137, -0.3462, 0.3552, -0.1337, 0.1078], - ], - [ - 47, - [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], - [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], - ], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.0521, -0.2939, 0.1540, -0.1855, -0.5936, -0.3138, -0.4579, -0.2275]], - [37, [-0.1820, -0.4345, -0.0455, -0.2923, -0.8035, -0.5089, -0.4795, -0.3106]], - # fmt: on - ] - ) - @require_torch_accelerator - @skip_mps - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=2e-3) - - @parameterized.expand([(13,), (16,), (37,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=5e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def 
test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 3e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) - - -@slow -class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @torch.no_grad() - def test_encode_decode(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update - vae.to(torch_device) - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ).resize((256, 256)) - image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to( - torch_device - ) - - latent = vae.encode(image).latent_dist.mean - - sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample - - actual_output = sample[0, :2, :2, :2].flatten().cpu() - expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024]) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_sd(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update - pipe = StableDiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None - ) - pipe.to(torch_device) - - out = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - actual_output = out[:2, :2, :2].flatten().cpu() - expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759]) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_encode_decode_f16(self): - vae = ConsistencyDecoderVAE.from_pretrained( - "openai/consistency-decoder", torch_dtype=torch.float16 - ) # TODO - update - vae.to(torch_device) - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ).resize((256, 256)) - image = ( - torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :] - .half() - .to(torch_device) - ) - - latent = vae.encode(image).latent_dist.mean - - sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample - - actual_output = sample[0, :2, :2, :2].flatten().cpu() - expected_output = torch.tensor( - [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471], - dtype=torch.float16, - ) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_sd_f16(self): - vae = ConsistencyDecoderVAE.from_pretrained( - "openai/consistency-decoder", torch_dtype=torch.float16 - ) # TODO - update - pipe = StableDiffusionPipeline.from_pretrained( - 
"stable-diffusion-v1-5/stable-diffusion-v1-5", - torch_dtype=torch.float16, - vae=vae, - safety_checker=None, - ) - pipe.to(torch_device) - - out = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - actual_output = out[:2, :2, :2].flatten().cpu() - expected_output = torch.tensor( - [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035], - dtype=torch.float16, - ) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_vae_tiling(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) - pipe = StableDiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - out_1 = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - # make sure tiled vae decode yields the same result - pipe.enable_vae_tiling() - out_2 = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - assert torch_all_close(out_1, out_2, atol=5e-3) - - # test that tiled decode works with various shapes - shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)] - with torch.no_grad(): - for shape in shapes: - image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype) - pipe.vae.decode(image) - - -@slow -class AutoencoderOobleckIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return torch.nn.utils.rnn.pad_sequence( - [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True - ) - - def get_audio(self, audio_sample_size=2097152, fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - audio = self._load_datasamples(2).to(torch_device).to(dtype) - - # pad / crop to audio_sample_size - audio = torch.nn.functional.pad(audio[:, :audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1])) - - # todo channel - audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device) - - return audio - - def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False): - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderOobleck.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - ) - model.to(torch_device) - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], - [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - generator = 
self.get_generator(seed) - - with torch.no_grad(): - sample = model(audio, generator=generator, sample_posterior=True).sample - - assert sample.shape == audio.shape - assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 - - output_slice = sample[-1, 1, 5:10].cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) - - def test_stable_diffusion_mode(self): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - - with torch.no_grad(): - sample = model(audio, sample_posterior=False).sample - - assert sample.shape == audio.shape - - @parameterized.expand( - [ - # fmt: off - [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], - [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], - # fmt: on - ] - ) - def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - generator = self.get_generator(seed) - - with torch.no_grad(): - x = audio - posterior = model.encode(x).latent_dist - z = posterior.sample(generator=generator) - sample = model.decode(z).sample - - # (batch_size, latent_dim, sequence_length) - assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024) - - assert sample.shape == audio.shape - assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 - - output_slice = sample[-1, 1, 5:10].cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) diff --git a/tests/models/autoencoders/vae.py b/tests/models/autoencoders/vae.py new file mode 100644 index 000000000000..f8055f1c1cb0 --- /dev/null +++ b/tests/models/autoencoders/vae.py @@ -0,0 +1,86 @@ +def get_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "block_out_channels": block_out_channels, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + } + return init_dict + + +def get_asym_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "down_block_out_channels": block_out_channels, + "layers_per_down_block": 1, + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "up_block_out_channels": block_out_channels, + "layers_per_up_block": 1, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + "sample_size": 32, + "scaling_factor": 0.18215, + } + return init_dict + + +def get_autoencoder_tiny_config(block_out_channels=None): + block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] + init_dict = { + "in_channels": 3, + "out_channels": 3, + "encoder_block_out_channels": block_out_channels, + "decoder_block_out_channels": block_out_channels, + "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], + "num_decoder_blocks": [b // min(block_out_channels) for b in 
reversed(block_out_channels)], + } + return init_dict + + +def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + return { + "encoder_block_out_channels": block_out_channels, + "encoder_in_channels": 3, + "encoder_out_channels": 4, + "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "decoder_add_attention": False, + "decoder_block_out_channels": block_out_channels, + "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels), + "decoder_downsample_padding": 1, + "decoder_in_channels": 7, + "decoder_layers_per_block": 1, + "decoder_norm_eps": 1e-05, + "decoder_norm_num_groups": norm_num_groups, + "encoder_norm_num_groups": norm_num_groups, + "decoder_num_train_timesteps": 1024, + "decoder_out_channels": 6, + "decoder_resnet_time_scale_shift": "scale_shift", + "decoder_time_embedding_type": "learned", + "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels), + "scaling_factor": 1, + "latent_channels": 4, + } + + +def get_autoencoder_oobleck_config(block_out_channels=None): + init_dict = { + "encoder_hidden_size": 12, + "decoder_channels": 12, + "decoder_input_channels": 6, + "audio_channels": 2, + "downsampling_ratios": [2, 4], + "channel_multiples": [1, 2], + } + return init_dict diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index f6ce6bda7381..a7594f2ea13f 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -858,11 +858,6 @@ def test_gradient_checkpointing_is_applied( ): if not self.model_class._supports_gradient_checkpointing: return # Skip test if model does not support gradient checkpointing - if self.model_class.__name__ in [ - "UNetSpatioTemporalConditionModel", - "AutoencoderKLTemporalDecoder", - ]: - return init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 007a2b0e46d7..508e5008a786 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -47,7 +47,7 @@ ) from diffusers.utils.torch_utils import randn_tensor -from ...models.autoencoders.test_models_vae import ( +from ...models.autoencoders.vae import ( get_asym_autoencoder_kl_config, get_autoencoder_kl_config, get_autoencoder_tiny_config, diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index c940504d6c3e..53cb070c9be4 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -34,7 +34,7 @@ from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device from diffusers.utils.torch_utils import randn_tensor -from ...models.autoencoders.test_models_vae import ( +from ...models.autoencoders.vae import ( get_asym_autoencoder_kl_config, get_autoencoder_kl_config, get_autoencoder_tiny_config, diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 7ec677558059..4d2b534c9a28 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -48,7 +48,7 @@ torch_device, ) -from ..models.autoencoders.test_models_vae import ( +from ..models.autoencoders.vae import ( 
     get_asym_autoencoder_kl_config,
     get_autoencoder_kl_config,
     get_autoencoder_tiny_config,