Commit c9556f4
#0: Clean up
TT-BrianLiu committed May 17, 2024
1 parent a72a6ac commit c9556f4
Showing 3 changed files with 249 additions and 95 deletions.
@@ -14,95 +14,259 @@
from models.utility_functions import torch_random


core_grid = ttnn.CoreCoord(8, 7)
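# Grid from which num_cores_to_core_range_set carves the row-wise in0 shard grids below.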
parameters = {
"matmul_specs": [
# Matmul 1D mcast in0: in0 grid == output grid
# loop along in0 shard width
(
(1,),
(64, 32 * 64, 32 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
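# With 32x32 tiles: N = 32 * 96 = 96 tiles at per_core_N=3 fills the same 32-core
# (8, 4) grid that holds the in0 shards. Each (64, 64) shard is 2 tiles wide, so
# in0_block_w=1 walks it in two steps; the in0_block_w=2 variant below reads it in one.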
# no looping along in0 shard width
(
(1,),
(64, 32 * 64, 32 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid < output grid
# loop along in0 shard width
(
(1,),
(64, 28 * 64, 35 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
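# K = 28 * 64 puts the 64-wide in0 shards on 28 cores, while N = 35 * 96 = 105 tiles
# at per_core_N=3 needs 35 output cores, giving in0 grid < output grid.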
# no looping along in0 shard width
(
(1,),
(64, 28 * 64, 35 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid > output grid
# loop along in0 shard width
(
(1,),
(64, 35 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(35, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
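# Mirror of the previous case: K = 35 * 64 shards in0 over 35 cores, while
# N = 28 * 96 = 84 tiles at per_core_N=3 needs only 28 output cores.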
# no looping along in0 shard width
(
(1,),
(64, 35 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(35, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid.y == output grid.y but in0 grid.x < output grid.x and output grid.x isn't full row; tests mcast logic for num_active_cores
# loop along in0 shard width
(
(1,),
(64, 28 * 64, 30 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
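# Row-wise on the 8-wide grid, 28 in0 cores span 3 full rows plus 4 cores and
# 30 output cores (N = 90 tiles at per_core_N=3) span 3 full rows plus 6: equal
# grid heights, but the output grid's partial last row is wider.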
# no looping along in0 shard width
(
(1,),
(64, 28 * 64, 30 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid.y == output grid.y but in0 grid.x > output grid.x and in0 grid.x isn't full row; tests mcast logic for num_active_cores
# loop along in0 shard width
(
(1,),
(64, 30 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(30, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
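# The reverse arrangement: 30 in0 cores (3 full rows plus 6) against 28 output
# cores (3 full rows plus 4), so here the in0 grid has the wider partial row.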
# no looping along in0 shard width
(
(1,),
(64, 30 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
@@ -111,8 +275,8 @@
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(30, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
@@ -121,14 +285,10 @@
],
"batch_matrix_multiply": [False],
"input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG],
# "output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG],
"output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG],
"output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG],
"input_a_dtype": [ttnn.bfloat16],
"input_b_dtype": [ttnn.bfloat8_b],
"output_dtype": [ttnn.bfloat16],
# "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
# "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
# "output_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
"input_layout": [ttnn.TILE_LAYOUT],
"compute_kernel_config": [None],
}
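A quick standalone sanity check for specs like the ones above (an illustrative sketch; check_1d_mcast_spec and TILE are invented names, not part of this commit or the ttnn API):

TILE = 32  # tile edge the inline "// 32" comments assume

def check_1d_mcast_spec(shape, grid_size, per_core_M, per_core_N, in0_block_w, in0_shard_shape):
    M, K, N = shape
    num_cores = grid_size[0] * grid_size[1]
    # fuse_batch=True with 1D mcast in0: every core sees all of M.
    assert M == per_core_M * TILE
    # Output is width-sharded: N tiles split evenly across the output cores.
    num_output_cores, rem = divmod(N // TILE, per_core_N)
    assert rem == 0 and num_output_cores <= num_cores
    # in0 is width-sharded along K on its own set of cores.
    num_in0_cores = K // in0_shard_shape[1]
    assert num_in0_cores <= num_cores
    # The shard width must split evenly into in0 blocks.
    shard_w_tiles = in0_shard_shape[1] // TILE
    assert shard_w_tiles % in0_block_w == 0
    # A result > 1 means the kernel loops along the in0 shard width.
    return shard_w_tiles // in0_block_w

# The first "in0 grid == output grid" case: in0_block_w=1 loops twice per shard.
assert check_1d_mcast_spec((64, 32 * 64, 32 * 96), (8, 4), 2, 3, 1, (64, 64)) == 2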
@@ -107,8 +107,6 @@ void kernel_main() {
local_read_addr = get_read_ptr(cb_id_in2);
}

for (uint32_t b = 0; b < batch; ++b) {
for (uint32_t block = 0; block < num_blocks; ++block) {
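// block_id groups consecutive K-blocks onto one in0 shard; num_blocks_per_shard
// is greater than 1 when in0_block_w is smaller than the in0 shard width (the
// "loop along in0 shard width" the updated sweep tests exercise).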
const uint32_t block_id = block / num_blocks_per_shard;