From cfe0c018c116544dae6b8ca5e980b7d38a711d4b Mon Sep 17 00:00:00 2001 From: Evan Smal Date: Thu, 5 Dec 2024 16:15:39 +0000 Subject: [PATCH] Clean up Stable Diffusion tests and CI jobs Refactor the Stable Diffusion test suite and CI jobs to improve test stability and maintainability. - Move all Stable Diffusion tests into the model directory for better organization. - Remove unnecessary suffixes from test filenames. - Delete dead test code, including tests unrelated to SD modules and code for deprecated SD variants using 256x256 input shapes. - Re-enable previously broken tests in CI. - Update the CODEOWNERS file to reflect moved/deleted directories and files. --- ...atch-full-regressions-and-models-impl.yaml | 4 +- CODEOWNERS | 4 - .../test_multiple_iterations.py | 236 ----- .../tests}/test_basic_transformer_block.py | 0 .../tests}/test_cross_attention.py | 0 .../tests/test_cross_attn_up_block_2d.py | 0 .../stable_diffusion/tests}/test_demo.py | 0 .../stable_diffusion/tests}/test_embedding.py | 0 .../tests}/test_feedforward.py | 0 .../stable_diffusion/tests}/test_geglu.py | 0 .../stable_diffusion/tests/test_perf.py | 0 .../tests/test_resnet_block_2d.py | 84 -- .../tests}/test_sharded_matmuls.py | 0 .../tests/test_transformer_2d_model.py | 0 .../tests/test_unet_2d_condition_model.py | 3 +- .../stable_diffusion/tests/test_upblock_2d.py | 0 .../tests/test_upsample_2d.py | 0 .../tests}/test_upsample_nearest_2d.py | 0 .../test_basic_transformer_block.py | 1 + .../stable_diffusion/test_cross_attention.py | 1 + .../test_cross_attn_up_block_2d.py | 1 + .../single_card/stable_diffusion/test_demo.py | 1 + .../stable_diffusion/test_embedding.py | 1 + .../stable_diffusion/test_feedforward.py | 1 + .../stable_diffusion/test_geglu.py | 1 + .../stable_diffusion/test_resnet_block_2d.py | 1 + .../stable_diffusion/test_sharded_matmuls.py | 1 + .../test_transformer_2d_model.py | 1 + .../test_unet_2d_condition_model.py | 1 + .../stable_diffusion/test_upblock_2d.py | 1 + .../stable_diffusion/test_upsample_2d.py | 1 + .../test_upsample_nearest_2d.py | 1 + .../ttnn/integration_tests/stable_diffusion | 1 - tests/scripts/run_performance.sh | 4 +- tests/scripts/run_python_model_tests.sh | 3 + .../test_sharded_attention.py | 966 ------------------ 36 files changed, 22 insertions(+), 1297 deletions(-) delete mode 100644 models/demos/wormhole/stable_diffusion/test_multiple_iterations.py rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_basic_transformer_block.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_cross_attention.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_demo.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_embedding.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_feedforward.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_geglu.py (100%) rename tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py => models/demos/wormhole/stable_diffusion/tests/test_perf.py (100%) rename
tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py (66%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_sharded_matmuls.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py (98%) rename tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_upsample_nearest_2d.py (100%) create mode 120000 tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attention.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_demo.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_embedding.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_feedforward.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_geglu.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_upblock_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_upsample_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py delete mode 120000 tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion delete mode 100644 tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 762324fb3a34..0af646345b18 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -149,8 +149,8 @@ jobs: fail-fast: false matrix: test-config: - - model: "wh_b0_unstable" - cmd: ./tests/scripts/single_card/nightly/run_wh_b0_unstable.sh + - model: "stable_diffusion" + cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - model: "mamba 1" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - model: "mamba 2" diff --git a/CODEOWNERS b/CODEOWNERS index aa80b7671c43..3b74d00a0470 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -173,10 +173,6 @@ tests/**/dtx/ @mywoodstock @sankarmanoj-tt tests/**/*test*conv*.py @mywoodstock @sankarmanoj-tt tests/python_api_testing/conv/ @mywoodstock @sankarmanoj-tt 
tests/python_api_testing/unit_testing/fallback_ops @tt-aho -tests/ttnn/integration_tests/stable_diffusion @esmalTT @uaydonat @mywoodstock -tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py @esmalTT @uaydonat @mywoodstock -tests/ttnn/integration_tests/unet @esmalTT @uaydonat @mywoodstock -tests/nightly/wh_b0_only_eth/experimental/functional_unet @esmalTT @uaydonat @mywoodstock scripts/profiler/ @mo-tenstorrent scripts/docker @tenstorrent/metalium-developers-infra diff --git a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py b/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py deleted file mode 100644 index 8db6aee6f39e..000000000000 --- a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py +++ /dev/null @@ -1,236 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import ttnn -import json -import torch -import pytest -import numpy as np -from PIL import Image -from loguru import logger -from tqdm.auto import tqdm -from datasets import load_dataset - -from transformers import CLIPTextModel, CLIPTokenizer -from diffusers import ( - AutoencoderKL, - UNet2DConditionModel, - LMSDiscreteScheduler, -) -from models.utility_functions import ( - comp_allclose_and_pcc, - enable_persistent_kernel_cache, - disable_persistent_kernel_cache, -) -from models.utility_functions import skip_for_wormhole_b0 -from ttnn.model_preprocessing import preprocess_model_parameters -from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_unet_2d_condition_model import ( - UNet2DConditionModel as UNet2D, -) - -from torchvision.transforms import ToTensor - - -def load_inputs(input_path): - with open(input_path) as f: - input_data = json.load(f) - assert input_data, "Input data is empty." 
- prompt = [item["prompt"] for item in input_data] - return prompt - - -def constant_prop_time_embeddings(timesteps, sample, time_proj): - timesteps = timesteps[None] - timesteps = timesteps.expand(sample.shape[0]) - t_emb = time_proj(timesteps) - return t_emb - - -def save_image_and_latents(latents, iter, vae, pre_fix="", pre_fix2=""): - pre_fix = "" if pre_fix == "" else f"{pre_fix}_" - pre_fix2 = "" if pre_fix2 == "" else f"{pre_fix2}_" - _latents = 1 / 0.18215 * latents - - with torch.no_grad(): - image = vae.decode(_latents).sample - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - pil_images.save(f"{pre_fix}{pre_fix2}image_iter_{iter}.png") - - -def guide(noise_pred, guidance_scale, t): # will return latents - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - return noise_pred - - -def latent_expansion(latents, scheduler, t): - latent_model_input = torch.cat([latents] * 2, dim=0) - latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t) - return latent_model_input - - -def calculate_fid_score(imgs_path1, imgs_path2): - fid = FrechetInceptionDistance(normalize=True) - fid.update(imgs_path1, real=False) - fid.update(imgs_path2, real=True) - return fid.compute() - - -def preprocess_images(image_paths): - images = [] - for image_path in image_paths: - image = Image.open(image_path) - image = image.convert("RGB") - image = image.resize((299, 299)) - image = ToTensor()(image) - images.append(image) - return torch.stack(images) - - -def run_demo_inference_diffusiondb(device, reset_seeds, input_path, num_inference_steps, image_size): - disable_persistent_kernel_cache() - - height, width = image_size - - experiment_name = f"diffusiondb_{height}x{width}" - input_prompt = [ - "oil painting frame of Breathtaking mountain range with a clear river running through it, surrounded by tall trees and misty clouds, serene, peaceful, mountain landscape, high detail" - ] - logger.info(f"input_prompts: {input_prompt}") - - # 1. Load the autoencoder model which will be used to decode the latents into image space. - vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") - - # 2. Load the tokenizer and text encoder to tokenize and encode the text. - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - - # 3. The UNet model for generating the latents. - unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet") - - # 4. load the K-LMS scheduler with some fitting parameters. - ttnn_scheduler = LMSDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) - - torch_device = "cpu" - vae.to(torch_device) - text_encoder.to(torch_device) - unet.to(torch_device) - - guidance_scale = 7.5 # Scale for classifier-free guidance - generator = torch.manual_seed(174) # 10233 Seed generator to create the inital latent noise - batch_size = len(input_prompt) - - ## First, we get the text_embeddings for the prompt. These embeddings will be used to condition the UNet model. 
- # Tokenizer and Text Encoder - text_input = tokenizer( - input_prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0] - max_length = text_input.input_ids.shape[-1] - uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt") - uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0] - - # For classifier-free guidance, we need to do two forward passes: one with the conditioned input (text_embeddings), - # and another with the unconditional embeddings (uncond_embeddings). - # In practice, we can concatenate both into a single batch to avoid doing two forward passes. - text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) - ttnn_text_embeddings = ttnn.from_torch(text_embeddings, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) - # Initial random noise - latents = torch.randn( - (batch_size, unet.config.in_channels, height // vae_scale_factor, width // vae_scale_factor), - generator=generator, - ) - latents = latents.to(torch_device) - - ttnn_scheduler.set_timesteps(num_inference_steps) - - latents = latents * ttnn_scheduler.init_noise_sigma - ttnn_latents = torch.tensor(latents) - - iter = 0 - config = unet.config - - parameters = preprocess_model_parameters( - initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device - ) - input_height = 64 - input_width = 64 - reader_patterns_cache = {} if height == 512 and width == 512 else None - - model = UNet2D(device, parameters, 2, input_height, input_width, reader_patterns_cache) - # # Denoising loop - for t in tqdm(ttnn_scheduler.timesteps): - # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. 
- ttnn_latent_model_input = latent_expansion(ttnn_latents, ttnn_scheduler, t) - ttnn_latent_model_input = ttnn.from_torch( - ttnn_latent_model_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device - ) - - _t = constant_prop_time_embeddings(t, ttnn_latent_model_input, unet.time_proj) - _t = _t.unsqueeze(0).unsqueeze(0) - _t = ttnn.from_torch(_t, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - # predict the noise residual - with torch.no_grad(): - ttnn_output = model( - ttnn_latent_model_input, # input - timestep=_t, - encoder_hidden_states=ttnn_text_embeddings, - class_labels=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict=True, - config=config, - ) - noise_pred = ttnn.to_torch(ttnn_output) - - # perform guidance - noise_pred = guide(noise_pred, guidance_scale, t) - - ttnn_latents = ttnn_scheduler.step(noise_pred, t, ttnn_latents).prev_sample - save_image_and_latents(ttnn_latents, iter, vae, pre_fix=f"{experiment_name}_tt", pre_fix2="") - - iter += 1 - enable_persistent_kernel_cache() - - latents = ttnn_latents - # scale and decode the image latents with vae - latents = 1 / 0.18215 * latents - with torch.no_grad(): - image = vae.decode(latents).sample - - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - ttnn_output_path = f"{experiment_name}_ttnn.png" - pil_images.save(ttnn_output_path) - - ref_paths = [ref_img_path, ref_img_path] - ttnn_paths = [ttnn_output_path, ttnn_output_path] - - ref_images = preprocess_images(ref_paths) - ttnn_images = preprocess_images(ttnn_paths) - - -def test_tt2_multiple_iteration(device, reset_seeds, input_path): - # 30 iterations, generate 512x512 image - return run_demo_inference_diffusiondb(device, reset_seeds, input_path, 30, (512, 512)) diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py b/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py rename to models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py b/models/demos/wormhole/stable_diffusion/tests/test_demo.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_demo.py rename to models/demos/wormhole/stable_diffusion/tests/test_demo.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_embedding.py b/models/demos/wormhole/stable_diffusion/tests/test_embedding.py similarity index 100% rename from 
tests/ttnn/integration_tests/stable_diffusion/test_embedding.py rename to models/demos/wormhole/stable_diffusion/tests/test_embedding.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py b/models/demos/wormhole/stable_diffusion/tests/test_feedforward.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py rename to models/demos/wormhole/stable_diffusion/tests/test_feedforward.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_geglu.py b/models/demos/wormhole/stable_diffusion/tests/test_geglu.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_geglu.py rename to models/demos/wormhole/stable_diffusion/tests/test_geglu.py diff --git a/tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py b/models/demos/wormhole/stable_diffusion/tests/test_perf.py similarity index 100% rename from tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py rename to models/demos/wormhole/stable_diffusion/tests/test_perf.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py similarity index 66% rename from tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py index 51afb5afd0d2..91a0f3755e51 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py @@ -25,90 +25,6 @@ def ttnn_to_torch(input): return input -@skip_for_grayskull() -@pytest.mark.parametrize( - "batch_size, in_channels, input_height, input_width, index1,index2,block_name,out_channels", - [ - (2, 320, 32, 32, 0, 0, "down", None), - (2, 320, 16, 16, 0, 0, "down", None), - (2, 640, 16, 16, 1, 1, "down", None), - (2, 640, 8, 8, 1, 1, "down", None), - (2, 1280, 8, 8, 2, 1, "down", None), - (2, 1280, 4, 4, 2, 1, "down", None), - (2, 2560, 4, 4, 0, 0, "up", 1280), - (2, 2560, 8, 8, 0, 0, "up", 1280), - (2, 1920, 8, 8, 2, 0, "up", 640), - (2, 1920, 16, 16, 2, 0, "up", 640), - (2, 1280, 16, 16, 3, 0, "down", None), - (2, 960, 16, 16, 3, 0, "up", 320), - (2, 960, 32, 32, 3, 0, "up", 320), - (2, 640, 32, 32, 3, 1, "up", 320), - ], -) -def test_resnet_block_2d_256x256( - device, batch_size, in_channels, input_height, input_width, index1, index2, block_name, out_channels -): - pytest.skip() - # setup pytorch model - model_name = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float32) - - model = pipe.unet - model.eval() - - parameters = preprocess_model_parameters( - model_name=model_name, initialize_model=lambda: model, custom_preprocessor=custom_preprocessor, device=device - ) - - if block_name == "up": - parameters = parameters.up_blocks[index1].resnets[index2] - resnet = pipe.unet.up_blocks[index1].resnets[index2] - elif block_name == "down": - parameters = parameters.down_blocks[index1].resnets[index2] - resnet = pipe.unet.down_blocks[index1].resnets[index2] - else: - parameters = parameters.mid_block.resnets[index2] - resnet = pipe.unet.mid_block.resnets[index2] - - ############ start of residual block ############# - temb_channels = 1280 - groups = 32 - time_embedding_norm = "default" - output_scale_factor = 1 - use_in_shortcut = None - ########## end of residual block ############# - hidden_states_shape = [batch_size, 
in_channels, input_height, input_width] - temb_shape = [1, 1, 2, 1280] - - input = torch.randn(hidden_states_shape) - temb = torch.randn(temb_shape) - - torch_output = resnet(input, temb.squeeze(0).squeeze(0)) - - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, ttnn.TILE_LAYOUT) - input = ttnn.to_device(input, device, memory_config=ttnn.L1_MEMORY_CONFIG) - - temb = ttnn.from_torch(temb, ttnn.bfloat16) - temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT) - temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn_output = resnetBlock2D( - input, - temb=temb, - temb_channels=temb_channels, - time_embedding_norm=time_embedding_norm, - in_channels=in_channels, - out_channels=out_channels, - use_in_shortcut=use_in_shortcut, - groups=groups, - output_scale_factor=output_scale_factor, - parameters=parameters, - device=device, - ) - ttnn_output = ttnn_to_torch(ttnn_output) - assert_with_pcc(torch_output, ttnn_output, pcc=0.99) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) @pytest.mark.parametrize( diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py b/models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py rename to models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py similarity index 98% rename from tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py index 35b1253ea540..72efdb4e178e 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py @@ -63,7 +63,6 @@ def unsqueeze_all_params_to_4d(params): @skip_for_grayskull() -@pytest.mark.skipif(is_wormhole_b0() or is_blackhole(), reason="#10923: CB / L1 buffer clash") @pytest.mark.parametrize( "device_params", [{"l1_small_size": 32768}], ids=["device_params=l1_small_size_24576"], indirect=True ) @@ -204,7 +203,7 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ # print(iter) # print(f"Time taken for 50 iterations: {total_time}") # print(f"Samples per second: {50 / total_time}") - passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) + passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.981) print(output) assert passing diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py 
b/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py diff --git a/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py new file mode 120000 index 000000000000..61408ffa9e72 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attention.py b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py new file mode 120000 index 000000000000..c161012b8867 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py new file mode 120000 index 000000000000..8fce2d91ed28 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_demo.py b/tests/nightly/single_card/stable_diffusion/test_demo.py new file mode 120000 index 000000000000..c375047f6338 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_demo.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_demo.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_embedding.py b/tests/nightly/single_card/stable_diffusion/test_embedding.py new file mode 120000 index 000000000000..3e89c1284247 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_embedding.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_embedding.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_feedforward.py b/tests/nightly/single_card/stable_diffusion/test_feedforward.py new file mode 120000 index 000000000000..915332488d58 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_feedforward.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_feedforward.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_geglu.py b/tests/nightly/single_card/stable_diffusion/test_geglu.py new file mode 120000 index 000000000000..5880ea6e17d9 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_geglu.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_geglu.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py 
b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py new file mode 120000 index 000000000000..1b6513e5b502 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py new file mode 120000 index 000000000000..d5d12d47849c --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py new file mode 120000 index 000000000000..d82d4a899f64 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py new file mode 120000 index 000000000000..c25a861ed357 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py new file mode 120000 index 000000000000..3997b30be69c --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py new file mode 120000 index 000000000000..88a986498448 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py new file mode 120000 index 000000000000..815ccb622b42 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion b/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion deleted file mode 120000 index 608e08f48e29..000000000000 --- a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion +++ /dev/null @@ -1 +0,0 @@ -../../../../../../../tests/ttnn/integration_tests/stable_diffusion \ No newline at end of file diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 7956d1c7b034..af74c2d032b2 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -71,7 +71,7 @@ 
run_perf_models_cnn_javelin() { # Run tests env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -81,7 +81,7 @@ run_device_perf_models() { set -eo pipefail local test_marker=$1 - env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 + env pytest models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=600 env pytest models/demos/distilbert/tests -m $test_marker diff --git a/tests/scripts/run_python_model_tests.sh b/tests/scripts/run_python_model_tests.sh index cd0913470769..31e14bfaf66a 100755 --- a/tests/scripts/run_python_model_tests.sh +++ b/tests/scripts/run_python_model_tests.sh @@ -40,6 +40,9 @@ run_python_model_tests_wormhole_b0() { # Unet Shallow WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -svv models/experimental/functional_unet/tests/test_unet_model.py + # Stable Diffusion + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py + # Mamba WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -svv models/demos/wormhole/mamba/tests/test_residual_block.py -k "pretrained_weight_false" diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py b/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py deleted file mode 100644 index 1b45761e11c8..000000000000 --- a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py +++ /dev/null @@ -1,966 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import torch -import math -import pytest -import ttnn - -from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import ( - comp_pcc, - tt2torch_tensor, - torch2tt_tensor, - is_wormhole_b0, - skip_for_grayskull, -) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - determine_largest_subblock_size, - determine_blocking, -) - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention_hwb( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - M = seq_len - K = 64 - N = seq_len - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - - height_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - block_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttnn.BufferType.L1, - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - attn_weights_qkt = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch_sm = torch.nn.functional.softmax(attn_weights_qkt, dim=-1) - attn_weights_torch = attn_weights_torch_sm @ torch_value_layer - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_block_shard_spec = [seq_len // 8, seq_len // 8] - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - q_slice = 
ttnn.interleaved_to_sharded_partial( - reference_query_layer, - ttnn.CoreCoord(1, grid_size[0]), - [M // grid_size[0], K], - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - k_slice = ttnn.interleaved_to_sharded_partial( - reference_key_layer_transposed, - ttnn.CoreCoord(grid_size[1], 1), - [K, N // grid_size[1]], - num_slices, - i, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // (32 * grid_size[0]), - per_core_N=N // (32 * grid_size[1]), - transpose_mcast=False, - fused_activation=None, - ) - - mm_slice = ttnn.matmul( - q_slice, - k_slice, - program_config=program_config, - memory_config=block_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_qkt[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - k_slice.deallocate() - q_slice.deallocate() - - height_per_core = seq_len // 64 - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.ROW_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - mm_slice = ttnn.move(mm_slice) - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - # print(program_config) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_torch_sm[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=dram_interleaved_memory_config, - ) - - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - mm_out_torch = tt2torch_tensor(mm_out) - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) 
-@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() # ND hang on CI - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=True, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_activations_height_shard_spec = [tiles_per_shard * 32, 2 * 32] - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - grid_size, - mm_activations_height_shard_spec, - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - per_core_M=tiles_per_shard, - per_core_N=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - - k_slice = ttnn.slice( - reference_key_layer_transposed, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), 64, seq_len), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - slice, - k_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - k_slice.deallocate() - slice.deallocate() - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - 
block_w=mm_output_height_shard_spec[1] // 32, - ) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - return - - mm_out_torch = tt2torch_tensor(mm_out) - - attn_weights = ttnn.matmul( - reference_query_layer, reference_key_layer_transposed, memory_config=dram_interleaved_memory_config - ) - attn_weights = ttnn.softmax_in_place(attn_weights) - attn_weights = ttnn.matmul(attn_weights, reference_value_layer, memory_config=dram_interleaved_memory_config) - - attn_weights_torch = tt2torch_tensor(attn_weights) - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("kv_len", [96]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_cross_attnention( - device, - seq_len, - kv_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if seq_len == 64 and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (8, 2) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, kv_len] - value_layer_shape = [1, num_heads, kv_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=l1_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - 
torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=kv_len // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, dram_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, kv_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, kv_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=32, - block_w=3, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=kv_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * kv_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=kv_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = 
torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [1024, 256, 64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_attention( - device, - seq_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if (seq_len == 64 or seq_len == 1024) and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (2, 8) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M = num_heads * seq_len - K = 64 - N = seq_len - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // num_cores // 32, - per_core_N=N // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - 
height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 2), - [num_heads * seq_len // 16, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=seq_len // 32, - ) - print(softmax_program_config) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - print(program_config) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -@skip_for_grayskull() -@pytest.mark.parametrize("size", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("is_qkv", [1, 2, 3]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_q_and_kv( - device, - size, - data_format, - is_qkv, - function_level_defaults, -): - # Test matmul attention sequence with 
InterleavedToShardedPartialOp - sizes = {4096: [1, 8192, 320, 512], 1024: [1, 2048, 640, 768], 256: [1, 512, 1280, 1280], 64: [1, 128, 1280, 1280]} - grid_sizes = {4096: (5, 8), 1024: (5, 8), 256: (8, 8), 64: (8, 4)} - B, M, K, N = sizes[size] - N = N * is_qkv - grid_size = grid_sizes[size] - compute_grid_size = device.compute_with_storage_grid_size() - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - in_0_shape = [1, B, M, K] - in_1_shape = [1, B, K, N] - in_2_shape = [1, B, 192, K] - in_3_shape = [1, B, K, 2 * N] - - in_0_torch = torch.randn(in_0_shape).bfloat16().float() - in_1_torch = torch.randn(in_1_shape).bfloat16().float() - in_2_torch = torch.randn(in_2_shape).bfloat16().float() - in_3_torch = torch.randn(in_3_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - block_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - in_0 = torch2tt_tensor( - in_0_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_1 = torch2tt_tensor( - in_1_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_2 = torch2tt_tensor( - in_2_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_3 = torch2tt_tensor( - in_3_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - in_0_sharded = ttnn.interleaved_to_sharded( - in_0, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M, K = in_0.shape[-2], in_0.shape[-1] - N = in_1.shape[-1] - in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w = determine_blocking( - M, K, N, grid_size - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - mm = ttnn.matmul( - in_0_sharded if size != 4096 else in_0, - in_1, - program_config=program_config, - memory_config=block_sharded_memory_config, - dtype=ttnn.bfloat8_b, - compute_kernel_config=compute_kernel_config, - ) - in_0_sharded.deallocate() - - M, K, N = in_2.shape[-2], in_2.shape[-1], in_3.shape[-1] - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = math.ceil(M / grid_size[1] / 32) - out_block_w = math.ceil(N / grid_size[0] / 32) - out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w) - program_config 
= ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_out_torch = tt2torch_tensor(mm) - - out_torch = in_0_torch @ in_1_torch - - passing, output = comp_pcc(mm_out_torch, out_torch) - - print(output) - assert passing