diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 762324fb3a3..0af646345b1 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -149,8 +149,8 @@ jobs: fail-fast: false matrix: test-config: - - model: "wh_b0_unstable" - cmd: ./tests/scripts/single_card/nightly/run_wh_b0_unstable.sh + - model: "stable_diffusion" + cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - model: "mamba 1" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - model: "mamba 2" diff --git a/CODEOWNERS b/CODEOWNERS index aa80b7671c4..3b74d00a047 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -173,10 +173,6 @@ tests/**/dtx/ @mywoodstock @sankarmanoj-tt tests/**/*test*conv*.py @mywoodstock @sankarmanoj-tt tests/python_api_testing/conv/ @mywoodstock @sankarmanoj-tt tests/python_api_testing/unit_testing/fallback_ops @tt-aho -tests/ttnn/integration_tests/stable_diffusion @esmalTT @uaydonat @mywoodstock -tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py @esmalTT @uaydonat @mywoodstock -tests/ttnn/integration_tests/unet @esmalTT @uaydonat @mywoodstock -tests/nightly/wh_b0_only_eth/experimental/functional_unet @esmalTT @uaydonat @mywoodstock scripts/profiler/ @mo-tenstorrent scripts/docker @tenstorrent/metalium-developers-infra diff --git a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py b/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py deleted file mode 100644 index 8db6aee6f39..00000000000 --- a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py +++ /dev/null @@ -1,236 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import ttnn -import json -import torch -import pytest -import numpy as np -from PIL import Image -from loguru import logger -from tqdm.auto import tqdm -from datasets import load_dataset - -from transformers import CLIPTextModel, CLIPTokenizer -from diffusers import ( - AutoencoderKL, - UNet2DConditionModel, - LMSDiscreteScheduler, -) -from models.utility_functions import ( - comp_allclose_and_pcc, - enable_persistent_kernel_cache, - disable_persistent_kernel_cache, -) -from models.utility_functions import skip_for_wormhole_b0 -from ttnn.model_preprocessing import preprocess_model_parameters -from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_unet_2d_condition_model import ( - UNet2DConditionModel as UNet2D, -) - -from torchvision.transforms import ToTensor - - -def load_inputs(input_path): - with open(input_path) as f: - input_data = json.load(f) - assert input_data, "Input data is empty." - prompt = [item["prompt"] for item in input_data] - return prompt - - -def constant_prop_time_embeddings(timesteps, sample, time_proj): - timesteps = timesteps[None] - timesteps = timesteps.expand(sample.shape[0]) - t_emb = time_proj(timesteps) - return t_emb - - -def save_image_and_latents(latents, iter, vae, pre_fix="", pre_fix2=""): - pre_fix = "" if pre_fix == "" else f"{pre_fix}_" - pre_fix2 = "" if pre_fix2 == "" else f"{pre_fix2}_" - _latents = 1 / 0.18215 * latents - - with torch.no_grad(): - image = vae.decode(_latents).sample - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - pil_images.save(f"{pre_fix}{pre_fix2}image_iter_{iter}.png") - - -def guide(noise_pred, guidance_scale, t): # will return latents - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - return noise_pred - - -def latent_expansion(latents, scheduler, t): - latent_model_input = torch.cat([latents] * 2, dim=0) - latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t) - return latent_model_input - - -def calculate_fid_score(imgs_path1, imgs_path2): - fid = FrechetInceptionDistance(normalize=True) - fid.update(imgs_path1, real=False) - fid.update(imgs_path2, real=True) - return fid.compute() - - -def preprocess_images(image_paths): - images = [] - for image_path in image_paths: - image = Image.open(image_path) - image = image.convert("RGB") - image = image.resize((299, 299)) - image = ToTensor()(image) - images.append(image) - return torch.stack(images) - - -def run_demo_inference_diffusiondb(device, reset_seeds, input_path, num_inference_steps, image_size): - disable_persistent_kernel_cache() - - height, width = image_size - - experiment_name = f"diffusiondb_{height}x{width}" - input_prompt = [ - "oil painting frame of Breathtaking mountain range with a clear river running through it, surrounded by tall trees and misty clouds, serene, peaceful, mountain landscape, high detail" - ] - logger.info(f"input_prompts: {input_prompt}") - - # 1. Load the autoencoder model which will be used to decode the latents into image space. - vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") - - # 2. Load the tokenizer and text encoder to tokenize and encode the text. - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - - # 3. The UNet model for generating the latents. - unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet") - - # 4. load the K-LMS scheduler with some fitting parameters. - ttnn_scheduler = LMSDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) - - torch_device = "cpu" - vae.to(torch_device) - text_encoder.to(torch_device) - unet.to(torch_device) - - guidance_scale = 7.5 # Scale for classifier-free guidance - generator = torch.manual_seed(174) # 10233 Seed generator to create the inital latent noise - batch_size = len(input_prompt) - - ## First, we get the text_embeddings for the prompt. These embeddings will be used to condition the UNet model. - # Tokenizer and Text Encoder - text_input = tokenizer( - input_prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0] - max_length = text_input.input_ids.shape[-1] - uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt") - uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0] - - # For classifier-free guidance, we need to do two forward passes: one with the conditioned input (text_embeddings), - # and another with the unconditional embeddings (uncond_embeddings). - # In practice, we can concatenate both into a single batch to avoid doing two forward passes. - text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) - ttnn_text_embeddings = ttnn.from_torch(text_embeddings, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) - # Initial random noise - latents = torch.randn( - (batch_size, unet.config.in_channels, height // vae_scale_factor, width // vae_scale_factor), - generator=generator, - ) - latents = latents.to(torch_device) - - ttnn_scheduler.set_timesteps(num_inference_steps) - - latents = latents * ttnn_scheduler.init_noise_sigma - ttnn_latents = torch.tensor(latents) - - iter = 0 - config = unet.config - - parameters = preprocess_model_parameters( - initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device - ) - input_height = 64 - input_width = 64 - reader_patterns_cache = {} if height == 512 and width == 512 else None - - model = UNet2D(device, parameters, 2, input_height, input_width, reader_patterns_cache) - # # Denoising loop - for t in tqdm(ttnn_scheduler.timesteps): - # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. - ttnn_latent_model_input = latent_expansion(ttnn_latents, ttnn_scheduler, t) - ttnn_latent_model_input = ttnn.from_torch( - ttnn_latent_model_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device - ) - - _t = constant_prop_time_embeddings(t, ttnn_latent_model_input, unet.time_proj) - _t = _t.unsqueeze(0).unsqueeze(0) - _t = ttnn.from_torch(_t, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - # predict the noise residual - with torch.no_grad(): - ttnn_output = model( - ttnn_latent_model_input, # input - timestep=_t, - encoder_hidden_states=ttnn_text_embeddings, - class_labels=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict=True, - config=config, - ) - noise_pred = ttnn.to_torch(ttnn_output) - - # perform guidance - noise_pred = guide(noise_pred, guidance_scale, t) - - ttnn_latents = ttnn_scheduler.step(noise_pred, t, ttnn_latents).prev_sample - save_image_and_latents(ttnn_latents, iter, vae, pre_fix=f"{experiment_name}_tt", pre_fix2="") - - iter += 1 - enable_persistent_kernel_cache() - - latents = ttnn_latents - # scale and decode the image latents with vae - latents = 1 / 0.18215 * latents - with torch.no_grad(): - image = vae.decode(latents).sample - - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - ttnn_output_path = f"{experiment_name}_ttnn.png" - pil_images.save(ttnn_output_path) - - ref_paths = [ref_img_path, ref_img_path] - ttnn_paths = [ttnn_output_path, ttnn_output_path] - - ref_images = preprocess_images(ref_paths) - ttnn_images = preprocess_images(ttnn_paths) - - -def test_tt2_multiple_iteration(device, reset_seeds, input_path): - # 30 iterations, generate 512x512 image - return run_demo_inference_diffusiondb(device, reset_seeds, input_path, 30, (512, 512)) diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py b/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py rename to models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py b/models/demos/wormhole/stable_diffusion/tests/test_demo.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_demo.py rename to models/demos/wormhole/stable_diffusion/tests/test_demo.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_embedding.py b/models/demos/wormhole/stable_diffusion/tests/test_embedding.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_embedding.py rename to models/demos/wormhole/stable_diffusion/tests/test_embedding.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py b/models/demos/wormhole/stable_diffusion/tests/test_feedforward.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py rename to models/demos/wormhole/stable_diffusion/tests/test_feedforward.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_geglu.py b/models/demos/wormhole/stable_diffusion/tests/test_geglu.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_geglu.py rename to models/demos/wormhole/stable_diffusion/tests/test_geglu.py diff --git a/tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py b/models/demos/wormhole/stable_diffusion/tests/test_perf.py similarity index 100% rename from tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py rename to models/demos/wormhole/stable_diffusion/tests/test_perf.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py similarity index 66% rename from tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py index 51afb5afd0d..91a0f3755e5 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py @@ -25,90 +25,6 @@ def ttnn_to_torch(input): return input -@skip_for_grayskull() -@pytest.mark.parametrize( - "batch_size, in_channels, input_height, input_width, index1,index2,block_name,out_channels", - [ - (2, 320, 32, 32, 0, 0, "down", None), - (2, 320, 16, 16, 0, 0, "down", None), - (2, 640, 16, 16, 1, 1, "down", None), - (2, 640, 8, 8, 1, 1, "down", None), - (2, 1280, 8, 8, 2, 1, "down", None), - (2, 1280, 4, 4, 2, 1, "down", None), - (2, 2560, 4, 4, 0, 0, "up", 1280), - (2, 2560, 8, 8, 0, 0, "up", 1280), - (2, 1920, 8, 8, 2, 0, "up", 640), - (2, 1920, 16, 16, 2, 0, "up", 640), - (2, 1280, 16, 16, 3, 0, "down", None), - (2, 960, 16, 16, 3, 0, "up", 320), - (2, 960, 32, 32, 3, 0, "up", 320), - (2, 640, 32, 32, 3, 1, "up", 320), - ], -) -def test_resnet_block_2d_256x256( - device, batch_size, in_channels, input_height, input_width, index1, index2, block_name, out_channels -): - pytest.skip() - # setup pytorch model - model_name = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float32) - - model = pipe.unet - model.eval() - - parameters = preprocess_model_parameters( - model_name=model_name, initialize_model=lambda: model, custom_preprocessor=custom_preprocessor, device=device - ) - - if block_name == "up": - parameters = parameters.up_blocks[index1].resnets[index2] - resnet = pipe.unet.up_blocks[index1].resnets[index2] - elif block_name == "down": - parameters = parameters.down_blocks[index1].resnets[index2] - resnet = pipe.unet.down_blocks[index1].resnets[index2] - else: - parameters = parameters.mid_block.resnets[index2] - resnet = pipe.unet.mid_block.resnets[index2] - - ############ start of residual block ############# - temb_channels = 1280 - groups = 32 - time_embedding_norm = "default" - output_scale_factor = 1 - use_in_shortcut = None - ########## end of residual block ############# - hidden_states_shape = [batch_size, in_channels, input_height, input_width] - temb_shape = [1, 1, 2, 1280] - - input = torch.randn(hidden_states_shape) - temb = torch.randn(temb_shape) - - torch_output = resnet(input, temb.squeeze(0).squeeze(0)) - - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, ttnn.TILE_LAYOUT) - input = ttnn.to_device(input, device, memory_config=ttnn.L1_MEMORY_CONFIG) - - temb = ttnn.from_torch(temb, ttnn.bfloat16) - temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT) - temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn_output = resnetBlock2D( - input, - temb=temb, - temb_channels=temb_channels, - time_embedding_norm=time_embedding_norm, - in_channels=in_channels, - out_channels=out_channels, - use_in_shortcut=use_in_shortcut, - groups=groups, - output_scale_factor=output_scale_factor, - parameters=parameters, - device=device, - ) - ttnn_output = ttnn_to_torch(ttnn_output) - assert_with_pcc(torch_output, ttnn_output, pcc=0.99) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) @pytest.mark.parametrize( diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py b/models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py rename to models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py similarity index 98% rename from tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py index 35b1253ea54..72efdb4e178 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py @@ -63,7 +63,6 @@ def unsqueeze_all_params_to_4d(params): @skip_for_grayskull() -@pytest.mark.skipif(is_wormhole_b0() or is_blackhole(), reason="#10923: CB / L1 buffer clash") @pytest.mark.parametrize( "device_params", [{"l1_small_size": 32768}], ids=["device_params=l1_small_size_24576"], indirect=True ) @@ -204,7 +203,7 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ # print(iter) # print(f"Time taken for 50 iterations: {total_time}") # print(f"Samples per second: {50 / total_time}") - passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) + passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.981) print(output) assert passing diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py diff --git a/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py new file mode 120000 index 00000000000..61408ffa9e7 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attention.py b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py new file mode 120000 index 00000000000..c161012b886 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py new file mode 120000 index 00000000000..8fce2d91ed2 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_demo.py b/tests/nightly/single_card/stable_diffusion/test_demo.py new file mode 120000 index 00000000000..c375047f633 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_demo.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_demo.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_embedding.py b/tests/nightly/single_card/stable_diffusion/test_embedding.py new file mode 120000 index 00000000000..3e89c128424 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_embedding.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_embedding.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_feedforward.py b/tests/nightly/single_card/stable_diffusion/test_feedforward.py new file mode 120000 index 00000000000..915332488d5 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_feedforward.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_feedforward.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_geglu.py b/tests/nightly/single_card/stable_diffusion/test_geglu.py new file mode 120000 index 00000000000..5880ea6e17d --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_geglu.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_geglu.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py new file mode 120000 index 00000000000..1b6513e5b50 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py new file mode 120000 index 00000000000..d5d12d47849 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py new file mode 120000 index 00000000000..d82d4a899f6 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py new file mode 120000 index 00000000000..c25a861ed35 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py new file mode 120000 index 00000000000..3997b30be69 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py new file mode 120000 index 00000000000..88a98649844 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py new file mode 120000 index 00000000000..815ccb622b4 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion b/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion deleted file mode 120000 index 608e08f48e2..00000000000 --- a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion +++ /dev/null @@ -1 +0,0 @@ -../../../../../../../tests/ttnn/integration_tests/stable_diffusion \ No newline at end of file diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 93f22682e18..7c42512474d 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -73,7 +73,7 @@ run_perf_models_cnn_javelin() { # Run tests env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -83,7 +83,7 @@ run_device_perf_models() { set -eo pipefail local test_marker=$1 - env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 + env pytest models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=600 env pytest models/demos/distilbert/tests -m $test_marker diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py b/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py deleted file mode 100644 index 1b45761e11c..00000000000 --- a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py +++ /dev/null @@ -1,966 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import math -import pytest -import ttnn - -from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import ( - comp_pcc, - tt2torch_tensor, - torch2tt_tensor, - is_wormhole_b0, - skip_for_grayskull, -) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - determine_largest_subblock_size, - determine_blocking, -) - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention_hwb( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - M = seq_len - K = 64 - N = seq_len - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - - height_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - block_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttnn.BufferType.L1, - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - attn_weights_qkt = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch_sm = torch.nn.functional.softmax(attn_weights_qkt, dim=-1) - attn_weights_torch = attn_weights_torch_sm @ torch_value_layer - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_block_shard_spec = [seq_len // 8, seq_len // 8] - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - q_slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - ttnn.CoreCoord(1, grid_size[0]), - [M // grid_size[0], K], - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - k_slice = ttnn.interleaved_to_sharded_partial( - reference_key_layer_transposed, - ttnn.CoreCoord(grid_size[1], 1), - [K, N // grid_size[1]], - num_slices, - i, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // (32 * grid_size[0]), - per_core_N=N // (32 * grid_size[1]), - transpose_mcast=False, - fused_activation=None, - ) - - mm_slice = ttnn.matmul( - q_slice, - k_slice, - program_config=program_config, - memory_config=block_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_qkt[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - k_slice.deallocate() - q_slice.deallocate() - - height_per_core = seq_len // 64 - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.ROW_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - mm_slice = ttnn.move(mm_slice) - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - # print(program_config) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_torch_sm[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=dram_interleaved_memory_config, - ) - - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - mm_out_torch = tt2torch_tensor(mm_out) - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() # ND hang on CI - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=True, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_activations_height_shard_spec = [tiles_per_shard * 32, 2 * 32] - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - grid_size, - mm_activations_height_shard_spec, - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - per_core_M=tiles_per_shard, - per_core_N=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - - k_slice = ttnn.slice( - reference_key_layer_transposed, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), 64, seq_len), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - slice, - k_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - k_slice.deallocate() - slice.deallocate() - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - return - - mm_out_torch = tt2torch_tensor(mm_out) - - attn_weights = ttnn.matmul( - reference_query_layer, reference_key_layer_transposed, memory_config=dram_interleaved_memory_config - ) - attn_weights = ttnn.softmax_in_place(attn_weights) - attn_weights = ttnn.matmul(attn_weights, reference_value_layer, memory_config=dram_interleaved_memory_config) - - attn_weights_torch = tt2torch_tensor(attn_weights) - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("kv_len", [96]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_cross_attnention( - device, - seq_len, - kv_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if seq_len == 64 and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (8, 2) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, kv_len] - value_layer_shape = [1, num_heads, kv_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=l1_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=kv_len // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, dram_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, kv_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, kv_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=32, - block_w=3, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=kv_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * kv_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=kv_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [1024, 256, 64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_attention( - device, - seq_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if (seq_len == 64 or seq_len == 1024) and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (2, 8) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M = num_heads * seq_len - K = 64 - N = seq_len - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // num_cores // 32, - per_core_N=N // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 2), - [num_heads * seq_len // 16, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=seq_len // 32, - ) - print(softmax_program_config) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - print(program_config) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -@skip_for_grayskull() -@pytest.mark.parametrize("size", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("is_qkv", [1, 2, 3]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_q_and_kv( - device, - size, - data_format, - is_qkv, - function_level_defaults, -): - # Test matmul attention sequence with InterleavedToShardedPartialOp - sizes = {4096: [1, 8192, 320, 512], 1024: [1, 2048, 640, 768], 256: [1, 512, 1280, 1280], 64: [1, 128, 1280, 1280]} - grid_sizes = {4096: (5, 8), 1024: (5, 8), 256: (8, 8), 64: (8, 4)} - B, M, K, N = sizes[size] - N = N * is_qkv - grid_size = grid_sizes[size] - compute_grid_size = device.compute_with_storage_grid_size() - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - in_0_shape = [1, B, M, K] - in_1_shape = [1, B, K, N] - in_2_shape = [1, B, 192, K] - in_3_shape = [1, B, K, 2 * N] - - in_0_torch = torch.randn(in_0_shape).bfloat16().float() - in_1_torch = torch.randn(in_1_shape).bfloat16().float() - in_2_torch = torch.randn(in_2_shape).bfloat16().float() - in_3_torch = torch.randn(in_3_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - block_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - in_0 = torch2tt_tensor( - in_0_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_1 = torch2tt_tensor( - in_1_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_2 = torch2tt_tensor( - in_2_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_3 = torch2tt_tensor( - in_3_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - in_0_sharded = ttnn.interleaved_to_sharded( - in_0, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M, K = in_0.shape[-2], in_0.shape[-1] - N = in_1.shape[-1] - in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w = determine_blocking( - M, K, N, grid_size - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - mm = ttnn.matmul( - in_0_sharded if size != 4096 else in_0, - in_1, - program_config=program_config, - memory_config=block_sharded_memory_config, - dtype=ttnn.bfloat8_b, - compute_kernel_config=compute_kernel_config, - ) - in_0_sharded.deallocate() - - M, K, N = in_2.shape[-2], in_2.shape[-1], in_3.shape[-1] - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = math.ceil(M / grid_size[1] / 32) - out_block_w = math.ceil(N / grid_size[0] / 32) - out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_out_torch = tt2torch_tensor(mm) - - out_torch = in_0_torch @ in_1_torch - - passing, output = comp_pcc(mm_out_torch, out_torch) - - print(output) - assert passing