Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#4686: add infra for sharing global struct among ops #5456

Merged
merged 7 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
matmul_block
============

.. doxygenfunction:: mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: mm_block_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: mm_block_init_short_with_dt(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t old_in1_cb_id=2, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: mm_block_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: mm_block_init_short_with_dt(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t old_in1_cb_id=2, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1)
.. doxygenfunction:: matmul_block(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile_index, uint32_t in1_tile_index, uint32_t idst, const uint32_t transpose, uint32_t ct_dim, uint32_t rt_dim, uint32_t kt_dim)
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ move_copy_tile
==============


.. doxygenfunction:: copy_tile_to_dst_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid) {
.. doxygenfunction:: copy_tile_to_dst_init_short()
.. doxygenfunction:: copy_tile_to_dst_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid, uint transpose = 0)
.. doxygenfunction:: copy_tile_to_dst_init_short(uint cbid = 0, uint transpose = 0)
.. doxygenfunction:: copy_tile_init()
76 changes: 71 additions & 5 deletions models/demos/resnet/tt/metalResnetBlock50.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from models.utility_functions import tt2torch_tensor
from tt_lib.utils import pad_weight

from models.utility_functions import is_wormhole_b0, is_grayskull
from tt_lib.fused_ops.average_pool import run_avg_pool_on_device_wrapper as TtAvgPool
from tt_lib.fused_ops.max_pool import run_max_pool_on_device_wrapper as TtMaxPool
from tt_lib.fused_ops.max_pool import compute_max_pool_shape
Expand Down Expand Up @@ -107,6 +108,19 @@ def ResnetLinear(
matmul_config = hardcoded_matmul_config_linear[batch_size]

def linear_(act):
if is_grayskull():
compute_kernel_config = tt_lib.tensor.GrayskullComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
)
else:
compute_kernel_config = tt_lib.tensor.WormholeComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
fp32_dest_acc_en=False,
packer_l1_acc=False,
)

## this uses the systolic 1d matmul with bias fused
if matmul_config is None:
output = tt_lib.tensor.resnet_matmul(act, weight_T, bias, output_mem_config)
Expand All @@ -118,7 +132,7 @@ def linear_(act):
program_config=matmul_config,
output_mem_config=output_mem_config,
output_dtype=model_config["ACTIVATIONS_DTYPE"],
math_fidelity=model_config["MATH_FIDELITY"],
compute_kernel_config=compute_kernel_config,
)
return output

Expand Down Expand Up @@ -1089,6 +1103,19 @@ def downsample_conv_op_with_formatting(x):
assert (conv1_as_mm_padded_act_height, inplanes, width) in hardcoded_matmul_config_conv[batch_size]
matmul_config = hardcoded_matmul_config_conv[batch_size][(conv1_as_mm_padded_act_height, inplanes, width)]
# 1x1 conv with stride 1 padding 0 is run using regular matmul
if is_grayskull():
compute_kernel_config = tt_lib.tensor.GrayskullComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
)
else:
compute_kernel_config = tt_lib.tensor.WormholeComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
fp32_dest_acc_en=False,
packer_l1_acc=False,
)

self.conv1 = resnet50_1x1_conv_as_matmul(
conv1_weight.reshape(-1).tolist(),
self.conv1_params,
Expand All @@ -1099,7 +1126,7 @@ def downsample_conv_op_with_formatting(x):
output_mem_config=self.sharded_memory_config,
weights_dtype=model_config["WEIGHTS_DTYPE"],
output_dtype=model_config["ACTIVATIONS_DTYPE"],
math_fidelity=model_config["MATH_FIDELITY"],
compute_kernel_config=compute_kernel_config,
)

self.conv2_params = [width, width, 3, 3, stride, stride, 1, 1, dilation, groups]
Expand Down Expand Up @@ -1228,6 +1255,19 @@ def downsample_conv_op_with_formatting(x):
matmul_config = hardcoded_matmul_config_conv[batch_size][
(conv3_as_mm_padded_act_height, width, planes * self.expansion)
]
if is_grayskull():
compute_kernel_config = tt_lib.tensor.GrayskullComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
)
else:
compute_kernel_config = tt_lib.tensor.WormholeComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
fp32_dest_acc_en=False,
packer_l1_acc=False,
)

# 1x1 conv with stride 1 padding 0 is run using regular matmul
self.conv3 = resnet50_1x1_conv_as_matmul(
conv3_weight.reshape(-1).tolist(),
Expand All @@ -1238,7 +1278,7 @@ def downsample_conv_op_with_formatting(x):
output_mem_config=self.sharded_memory_config,
weights_dtype=model_config["WEIGHTS_DTYPE"],
output_dtype=model_config["ACTIVATIONS_DTYPE"],
math_fidelity=model_config["MATH_FIDELITY"],
compute_kernel_config=compute_kernel_config,
)
self.conv3_output_shape = compute_conv_output_shape(self.conv3_params, self.conv2_output_shape)

Expand Down Expand Up @@ -1781,6 +1821,19 @@ def _make_layer(
matmul_config = hardcoded_matmul_config_conv[batch_size][
(downsample_output_padded_face_size, self.inplanes, downsample_output_channels)
]
if is_grayskull():
compute_kernel_config = tt_lib.tensor.GrayskullComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
)
else:
compute_kernel_config = tt_lib.tensor.WormholeComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
fp32_dest_acc_en=False,
packer_l1_acc=False,
)

self.downsample_conv_on_tt = resnet50_1x1_conv_as_matmul(
downsample_conv_weight.reshape(-1).tolist(),
self.downsample_params,
Expand All @@ -1790,7 +1843,7 @@ def _make_layer(
output_mem_config=self.ds_conv_output_memory_config,
weights_dtype=model_config["WEIGHTS_DTYPE"],
output_dtype=model_config["ACTIVATIONS_DTYPE"],
math_fidelity=model_config["MATH_FIDELITY"],
compute_kernel_config=compute_kernel_config,
)
elif use_downsample_op_and_mm_for_conv1x1_s2:
assert (
Expand All @@ -1804,6 +1857,19 @@ def _make_layer(
assert stride == 2
downsample_op_params = [batch_size, layer_input_shape[1], layer_input_shape[2], stride, stride]
# logger.info("Calling ds op and matmul op, input shape - ", layer_input_shape)
if is_grayskull():
compute_kernel_config = tt_lib.tensor.GrayskullComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
)
else:
compute_kernel_config = tt_lib.tensor.WormholeComputeKernelConfig(
math_fidelity=model_config["MATH_FIDELITY"],
math_approx_mode=True,
fp32_dest_acc_en=False,
packer_l1_acc=False,
)

self.downsample_conv_on_tt = resnet50_1x1_conv_s2_as_downsample_and_matmul(
downsample_conv_weight.reshape(-1).tolist(),
self.downsample_params,
Expand All @@ -1814,7 +1880,7 @@ def _make_layer(
self.ds_conv_output_memory_config,
weights_dtype=model_config["WEIGHTS_DTYPE"],
output_dtype=model_config["ACTIVATIONS_DTYPE"],
math_fidelity=model_config["MATH_FIDELITY"],
compute_kernel_config=compute_kernel_config,
)
else:
assert (
Expand Down
154 changes: 154 additions & 0 deletions tests/tt_eager/python_api_testing/unit_testing/test_attn_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import tt_lib as ttl
from models.utility_functions import comp_pcc
from models.utility_functions import is_grayskull


def generate_input_shapes():
Expand Down Expand Up @@ -59,6 +60,47 @@ def test_attn_matmul(in0_dtype, in1_dtype, out_dtype, device):
assert allclose, f"FAILED: {output}"


@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
@pytest.mark.parametrize(
    "in_dtype", [ttl.tensor.DataType.FLOAT32, ttl.tensor.DataType.BFLOAT16, ttl.tensor.DataType.BFLOAT8_B]
)
def test_attn_matmul_fp32(in_dtype, device):
    """Run ``attn_matmul`` with fp32 dest accumulation and compare against torch.

    For every shape pair yielded by ``generate_input_shapes()``, random
    bfloat16 torch inputs are converted to ``in_dtype`` tiled device tensors,
    multiplied on device with ``fp32_dest_acc_en=True``, and the result is
    checked against a torch matmul reference using ``comp_pcc``.

    Args:
        in_dtype: Device data type used for both inputs and for the output.
        device: Device handle; presumably supplied by a pytest fixture.
    """
    torch.manual_seed(0)

    # Everything below depends only on the device and on constants, so it is
    # built once instead of being re-created for every shape pair.
    compute_grid_size = device.compute_with_storage_grid_size()
    core_grid = ttl.tensor.CoreCoord(compute_grid_size.x, compute_grid_size.y)
    output_mem_config = ttl.tensor.MemoryConfig(
        ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1
    )
    compute_kernel_config = ttl.tensor.WormholeComputeKernelConfig(
        math_fidelity=ttl.tensor.MathFidelity.LoFi,
        math_approx_mode=True,
        fp32_dest_acc_en=True,
        packer_l1_acc=False,
    )

    for input_shape_a, input_shape_b in generate_input_shapes():
        input_tensor_a = torch.randn(input_shape_a).bfloat16()
        input_tensor_b = torch.randn(input_shape_b).bfloat16()

        tt_input_tensor_a = ttl.tensor.Tensor(input_tensor_a, in_dtype).to(ttl.tensor.Layout.TILE).to(device)
        tt_input_tensor_b = ttl.tensor.Tensor(input_tensor_b, in_dtype).to(ttl.tensor.Layout.TILE).to(device)

        tt_output_tensor_on_device = ttl.operations.primary.transformers.attn_matmul(
            tt_input_tensor_a,
            tt_input_tensor_b,
            compute_with_storage_grid_size=core_grid,
            output_mem_config=output_mem_config,
            output_dtype=in_dtype,
            compute_kernel_config=compute_kernel_config,
        )
        tt_output_tensor = tt_output_tensor_on_device.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch()

        # Reference: swap dims 0 and 2 of A, matmul with B, then swap back.
        golden_output_tensor = (input_tensor_a.transpose(0, 2) @ input_tensor_b).transpose(0, 2)

        allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
        assert allclose, f"FAILED: {output}"


@pytest.mark.parametrize("in0_dtype", [ttl.tensor.DataType.BFLOAT16, ttl.tensor.DataType.BFLOAT8_B])
@pytest.mark.parametrize("in1_dtype", [ttl.tensor.DataType.BFLOAT16, ttl.tensor.DataType.BFLOAT8_B])
@pytest.mark.parametrize("out_dtype", [ttl.tensor.DataType.BFLOAT16, ttl.tensor.DataType.BFLOAT8_B])
Expand Down Expand Up @@ -276,3 +318,115 @@ def test_group_attn_matmul_with_program_cache(in0_dtype, in1_dtype, output_dtype
assert allclose, f"FAILED: {output}"

assert num_cache_entries == 1


@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
@pytest.mark.parametrize("in_dtype", [ttl.tensor.DataType.FLOAT32, ttl.tensor.DataType.BFLOAT16])
@pytest.mark.parametrize("shard_orientation", (ttl.tensor.ShardOrientation.ROW_MAJOR,))
@pytest.mark.parametrize("output_sharded", (True,))
@pytest.mark.parametrize("in1_sharded", (True,))
@pytest.mark.parametrize("in0_sharded", (True,))
@pytest.mark.parametrize(
    "batch, K, seq_len, q_heads, kv_heads",
    (
        (32, 64, 512 + 96, 32, 2),
        (32, 64 + 32, 64, 32, 2),
        (32, 32, 32, 2, 1),
    ),
)
def test_group_attn_matmul_fp32(
    batch, K, seq_len, q_heads, kv_heads, in0_sharded, in1_sharded, output_sharded, shard_orientation, in_dtype, device
):
    """Run ``group_attn_matmul`` with fp32 dest accumulation and compare against torch.

    Random bfloat16 inputs of shape ``[1, q_heads, batch, K]`` and
    ``[batch, kv_heads, K, seq_len]`` are uploaded as tiled DRAM-interleaved
    tensors of ``in_dtype``, optionally height-sharded, and multiplied on
    device. The reference repeats the second input ``q_heads // kv_heads``
    times along dim 1 and performs the matmul in float32 on the host; the two
    results are compared with ``comp_pcc``.

    Args:
        batch, K, seq_len, q_heads, kv_heads: Sizes used to build the input shapes.
        in0_sharded, in1_sharded, output_sharded: Whether the respective tensor
            is height-sharded rather than DRAM-interleaved.
        shard_orientation: Shard orientation passed to ``interleaved_to_sharded``.
        in_dtype: Device data type used for both inputs and the output.
        device: Device handle; presumably supplied by a pytest fixture.
    """
    torch.manual_seed(0)

    grid = device.compute_with_storage_grid_size()
    dram_interleaved = ttl.tensor.MemoryConfig(
        ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM
    )

    def upload(host_tensor):
        # Convert to the dtype under test, tilize, and place in interleaved DRAM.
        return ttl.tensor.Tensor(host_tensor, in_dtype).to(ttl.tensor.Layout.TILE).to(device, dram_interleaved)

    def height_shard(dev_tensor, shard_shape):
        return ttl.tensor.interleaved_to_sharded(
            dev_tensor,
            grid,
            shard_shape,
            ttl.tensor.TensorMemoryLayout.HEIGHT_SHARDED,
            shard_orientation,
        )

    # NOTE: mixed precision is supported too, but a single dtype is used for
    # both inputs and the output here; larger seq_len with BFLOAT16 might not
    # have enough space.
    q_len = 1
    torch_a = torch.randn([q_len, q_heads, batch, K]).bfloat16()
    torch_b = torch.randn([batch, kv_heads, K, seq_len]).bfloat16()

    tt_a = upload(torch_a)
    tt_b = upload(torch_b)

    if in0_sharded:
        tt_a = height_shard(tt_a, [q_len * batch, K])
    if in1_sharded:
        tt_b = height_shard(tt_b, [kv_heads * K, seq_len])

    result_mem_config = (
        ttl.tensor.MemoryConfig(
            memory_layout=ttl.tensor.TensorMemoryLayout.HEIGHT_SHARDED,
            buffer_type=ttl.tensor.BufferType.L1,
        )
        if output_sharded
        else dram_interleaved
    )

    kernel_config = ttl.tensor.WormholeComputeKernelConfig(
        math_fidelity=ttl.tensor.MathFidelity.LoFi,
        math_approx_mode=True,
        fp32_dest_acc_en=True,
        packer_l1_acc=False,
    )

    tt_result = ttl.operations.primary.transformers.group_attn_matmul(
        tt_a,
        tt_b,
        compute_with_storage_grid_size=grid,
        output_mem_config=result_mem_config,
        output_dtype=in_dtype,
        compute_kernel_config=kernel_config,
    )
    if output_sharded:
        # Bring the sharded result back to interleaved DRAM before reading it out.
        tt_result = ttl.tensor.sharded_to_interleaved(tt_result, dram_interleaved)

    actual = tt_result.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch()

    # Host reference in float32: expand the KV heads so that dim 1 matches
    # q_heads, then matmul with dims 0 and 2 of A swapped, and swap back.
    ref_a = torch_a.to(torch.float)
    ref_b = torch.repeat_interleave(torch_b.to(torch.float), q_heads // kv_heads, dim=1)
    expected = (ref_a.transpose(0, 2) @ ref_b).transpose(0, 2)

    passed, pcc_report = comp_pcc(actual, expected)
    assert passed, f"FAILED: {pcc_report}"
Loading