From 90e0d755d8b841b5b5df0d8c81ebfd4d4b50a017 Mon Sep 17 00:00:00 2001 From: Evan Smal Date: Thu, 5 Dec 2024 21:59:48 +0000 Subject: [PATCH] Remove dead test code Deleting tests that don't seem to test any SD modules. Also deleted code for old variant of SD that used 224x224 input shapes. --- .../tests/test_resnet_block_2d_new_conv.py | 84 -- .../tests/test_sharded_attention.py | 966 ------------------ .../test_sharded_attention.py | 1 - 3 files changed, 1051 deletions(-) delete mode 100644 models/demos/wormhole/stable_diffusion/tests/test_sharded_attention.py delete mode 120000 tests/nightly/single_card/stable_diffusion/test_sharded_attention.py diff --git a/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d_new_conv.py index 51afb5afd0d2..91a0f3755e51 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d_new_conv.py @@ -25,90 +25,6 @@ def ttnn_to_torch(input): return input -@skip_for_grayskull() -@pytest.mark.parametrize( - "batch_size, in_channels, input_height, input_width, index1,index2,block_name,out_channels", - [ - (2, 320, 32, 32, 0, 0, "down", None), - (2, 320, 16, 16, 0, 0, "down", None), - (2, 640, 16, 16, 1, 1, "down", None), - (2, 640, 8, 8, 1, 1, "down", None), - (2, 1280, 8, 8, 2, 1, "down", None), - (2, 1280, 4, 4, 2, 1, "down", None), - (2, 2560, 4, 4, 0, 0, "up", 1280), - (2, 2560, 8, 8, 0, 0, "up", 1280), - (2, 1920, 8, 8, 2, 0, "up", 640), - (2, 1920, 16, 16, 2, 0, "up", 640), - (2, 1280, 16, 16, 3, 0, "down", None), - (2, 960, 16, 16, 3, 0, "up", 320), - (2, 960, 32, 32, 3, 0, "up", 320), - (2, 640, 32, 32, 3, 1, "up", 320), - ], -) -def test_resnet_block_2d_256x256( - device, batch_size, in_channels, input_height, input_width, index1, index2, block_name, out_channels -): - pytest.skip() - # setup pytorch model - model_name = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float32) - - model = pipe.unet - model.eval() - - parameters = preprocess_model_parameters( - model_name=model_name, initialize_model=lambda: model, custom_preprocessor=custom_preprocessor, device=device - ) - - if block_name == "up": - parameters = parameters.up_blocks[index1].resnets[index2] - resnet = pipe.unet.up_blocks[index1].resnets[index2] - elif block_name == "down": - parameters = parameters.down_blocks[index1].resnets[index2] - resnet = pipe.unet.down_blocks[index1].resnets[index2] - else: - parameters = parameters.mid_block.resnets[index2] - resnet = pipe.unet.mid_block.resnets[index2] - - ############ start of residual block ############# - temb_channels = 1280 - groups = 32 - time_embedding_norm = "default" - output_scale_factor = 1 - use_in_shortcut = None - ########## end of residual block ############# - hidden_states_shape = [batch_size, in_channels, input_height, input_width] - temb_shape = [1, 1, 2, 1280] - - input = torch.randn(hidden_states_shape) - temb = torch.randn(temb_shape) - - torch_output = resnet(input, temb.squeeze(0).squeeze(0)) - - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, ttnn.TILE_LAYOUT) - input = ttnn.to_device(input, device, memory_config=ttnn.L1_MEMORY_CONFIG) - - temb = ttnn.from_torch(temb, ttnn.bfloat16) - temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT) - temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn_output = resnetBlock2D( - input, - temb=temb, - temb_channels=temb_channels, - time_embedding_norm=time_embedding_norm, - in_channels=in_channels, - out_channels=out_channels, - use_in_shortcut=use_in_shortcut, - groups=groups, - output_scale_factor=output_scale_factor, - parameters=parameters, - device=device, - ) - ttnn_output = ttnn_to_torch(ttnn_output) - assert_with_pcc(torch_output, ttnn_output, pcc=0.99) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) @pytest.mark.parametrize( diff --git a/models/demos/wormhole/stable_diffusion/tests/test_sharded_attention.py b/models/demos/wormhole/stable_diffusion/tests/test_sharded_attention.py deleted file mode 100644 index 1b45761e11c8..000000000000 --- a/models/demos/wormhole/stable_diffusion/tests/test_sharded_attention.py +++ /dev/null @@ -1,966 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import math -import pytest -import ttnn - -from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import ( - comp_pcc, - tt2torch_tensor, - torch2tt_tensor, - is_wormhole_b0, - skip_for_grayskull, -) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - determine_largest_subblock_size, - determine_blocking, -) - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention_hwb( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - M = seq_len - K = 64 - N = seq_len - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - - height_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - block_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttnn.BufferType.L1, - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - attn_weights_qkt = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch_sm = torch.nn.functional.softmax(attn_weights_qkt, dim=-1) - attn_weights_torch = attn_weights_torch_sm @ torch_value_layer - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_block_shard_spec = [seq_len // 8, seq_len // 8] - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - q_slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - ttnn.CoreCoord(1, grid_size[0]), - [M // grid_size[0], K], - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - k_slice = ttnn.interleaved_to_sharded_partial( - reference_key_layer_transposed, - ttnn.CoreCoord(grid_size[1], 1), - [K, N // grid_size[1]], - num_slices, - i, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // (32 * grid_size[0]), - per_core_N=N // (32 * grid_size[1]), - transpose_mcast=False, - fused_activation=None, - ) - - mm_slice = ttnn.matmul( - q_slice, - k_slice, - program_config=program_config, - memory_config=block_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_qkt[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - k_slice.deallocate() - q_slice.deallocate() - - height_per_core = seq_len // 64 - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.ROW_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - mm_slice = ttnn.move(mm_slice) - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - # print(program_config) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_torch_sm[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=dram_interleaved_memory_config, - ) - - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - mm_out_torch = tt2torch_tensor(mm_out) - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() # ND hang on CI - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=True, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_activations_height_shard_spec = [tiles_per_shard * 32, 2 * 32] - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - grid_size, - mm_activations_height_shard_spec, - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - per_core_M=tiles_per_shard, - per_core_N=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - - k_slice = ttnn.slice( - reference_key_layer_transposed, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), 64, seq_len), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - slice, - k_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - k_slice.deallocate() - slice.deallocate() - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - return - - mm_out_torch = tt2torch_tensor(mm_out) - - attn_weights = ttnn.matmul( - reference_query_layer, reference_key_layer_transposed, memory_config=dram_interleaved_memory_config - ) - attn_weights = ttnn.softmax_in_place(attn_weights) - attn_weights = ttnn.matmul(attn_weights, reference_value_layer, memory_config=dram_interleaved_memory_config) - - attn_weights_torch = tt2torch_tensor(attn_weights) - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("kv_len", [96]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_cross_attnention( - device, - seq_len, - kv_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if seq_len == 64 and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (8, 2) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, kv_len] - value_layer_shape = [1, num_heads, kv_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=l1_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=kv_len // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, dram_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, kv_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, kv_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=32, - block_w=3, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=kv_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * kv_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=kv_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [1024, 256, 64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_attention( - device, - seq_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if (seq_len == 64 or seq_len == 1024) and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (2, 8) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M = num_heads * seq_len - K = 64 - N = seq_len - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // num_cores // 32, - per_core_N=N // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 2), - [num_heads * seq_len // 16, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=seq_len // 32, - ) - print(softmax_program_config) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - print(program_config) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -@skip_for_grayskull() -@pytest.mark.parametrize("size", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("is_qkv", [1, 2, 3]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_q_and_kv( - device, - size, - data_format, - is_qkv, - function_level_defaults, -): - # Test matmul attention sequence with InterleavedToShardedPartialOp - sizes = {4096: [1, 8192, 320, 512], 1024: [1, 2048, 640, 768], 256: [1, 512, 1280, 1280], 64: [1, 128, 1280, 1280]} - grid_sizes = {4096: (5, 8), 1024: (5, 8), 256: (8, 8), 64: (8, 4)} - B, M, K, N = sizes[size] - N = N * is_qkv - grid_size = grid_sizes[size] - compute_grid_size = device.compute_with_storage_grid_size() - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - in_0_shape = [1, B, M, K] - in_1_shape = [1, B, K, N] - in_2_shape = [1, B, 192, K] - in_3_shape = [1, B, K, 2 * N] - - in_0_torch = torch.randn(in_0_shape).bfloat16().float() - in_1_torch = torch.randn(in_1_shape).bfloat16().float() - in_2_torch = torch.randn(in_2_shape).bfloat16().float() - in_3_torch = torch.randn(in_3_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - block_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - in_0 = torch2tt_tensor( - in_0_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_1 = torch2tt_tensor( - in_1_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_2 = torch2tt_tensor( - in_2_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_3 = torch2tt_tensor( - in_3_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - in_0_sharded = ttnn.interleaved_to_sharded( - in_0, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M, K = in_0.shape[-2], in_0.shape[-1] - N = in_1.shape[-1] - in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w = determine_blocking( - M, K, N, grid_size - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - mm = ttnn.matmul( - in_0_sharded if size != 4096 else in_0, - in_1, - program_config=program_config, - memory_config=block_sharded_memory_config, - dtype=ttnn.bfloat8_b, - compute_kernel_config=compute_kernel_config, - ) - in_0_sharded.deallocate() - - M, K, N = in_2.shape[-2], in_2.shape[-1], in_3.shape[-1] - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = math.ceil(M / grid_size[1] / 32) - out_block_w = math.ceil(N / grid_size[0] / 32) - out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_out_torch = tt2torch_tensor(mm) - - out_torch = in_0_torch @ in_1_torch - - passing, output = comp_pcc(mm_out_torch, out_torch) - - print(output) - assert passing diff --git a/tests/nightly/single_card/stable_diffusion/test_sharded_attention.py b/tests/nightly/single_card/stable_diffusion/test_sharded_attention.py deleted file mode 120000 index 233fe5f9ddd0..000000000000 --- a/tests/nightly/single_card/stable_diffusion/test_sharded_attention.py +++ /dev/null @@ -1 +0,0 @@ -../../../../models/demos/wormhole/stable_diffusion/tests/test_sharded_attention.py \ No newline at end of file