Commit c9556f4
#0: Clean up
TT-BrianLiu committed May 17, 2024
1 parent a72a6ac commit c9556f4
Showing 3 changed files with 249 additions and 95 deletions.
@@ -14,95 +14,259 @@
from models.utility_functions import torch_random


core_grid = ttnn.CoreCoord(8, 7)
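# Grid from which num_cores_to_core_range_set carves the row-wise in0 shard grids below.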
parameters = {
"matmul_specs": [
# Matmul 1D mcast in0: in0 grid == output grid
# loop along in0 shard width
(
(1,),
(64, 32 * 64, 32 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
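# With 32x32 tiles: N = 32 * 96 = 96 tiles at per_core_N=3 fills the same 32-core
# (8, 4) grid that holds the in0 shards. Each (64, 64) shard is 2 tiles wide, so
# in0_block_w=1 walks it in two steps; the in0_block_w=2 variant below reads it in one.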
# no looping along in0 shard width
(
(1,),
(64, 32 * 64, 32 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid < output grid
# loop along in0 shard width
(
(1,),
(64, 28 * 64, 35 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
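# K = 28 * 64 puts the 64-wide in0 shards on 28 cores, while N = 35 * 96 = 105 tiles
# at per_core_N=3 needs 35 output cores, giving in0 grid < output grid.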
# no looping along in0 shard width
(
(1,),
(64, 28 * 64, 35 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid > output grid
# loop along in0 shard width
(
(1,),
(64, 35 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(35, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
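# Mirror of the previous case: K = 35 * 64 shards in0 over 35 cores, while
# N = 28 * 96 = 84 tiles at per_core_N=3 needs only 28 output cores.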
# no looping along in0 shard width
(
(1,),
(64, 35 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 5),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=3,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(35, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid.y == output grid.y but in0 grid.x < output grid.x and output grid.x isn't full row; tests mcast logic for num_active_cores
# loop along in0 shard width
(
(1,),
(64, 28 * 64, 30 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
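# Row-wise on the 8-wide grid, 28 in0 cores span 3 full rows plus 4 cores and
# 30 output cores (N = 90 tiles at per_core_N=3) span 3 full rows plus 6: equal
# grid heights, but the output grid's partial last row is wider.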
# no looping along in0 shard width
(
(1,),
(64, 28 * 64, 30 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(28, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
# Matmul 1D mcast in0: in0 grid.y == output grid.y but in0 grid.x > output grid.x and in0 grid.x isn't full row; tests mcast logic for num_active_cores
# loop along in0 shard width
(
(1,),
(64, 30 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=1,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
),
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(30, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
),
),
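# The reverse arrangement: 30 in0 cores (3 full rows plus 6) against 28 output
# cores (3 full rows plus 4), so here the in0 grid has the wider partial row.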
# no looping along in0 shard width
(
(1,),
(64, 30 * 64, 28 * 96),
ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
in0_block_w=2,
out_subblock_h=1,
out_subblock_w=1,
per_core_M=2,
per_core_N=3,
fuse_batch=True,
fused_activation=None,
mcast_in0=True,
@@ -111,8 +275,8 @@
memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
buffer_type=ttnn.BufferType.L1,
shard_spec=ttnn.ShardSpec(
ttnn.experimental.tensor.num_cores_to_core_range_set(30, core_grid, row_wise=True),
(64, 64),
ttnn.ShardOrientation.ROW_MAJOR,
False,
),
@@ -121,14 +285,10 @@
],
"batch_matrix_multiply": [False],
"input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG],
# "output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG],
"output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG],
"output_memory_config": [ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG],
"input_a_dtype": [ttnn.bfloat16],
"input_b_dtype": [ttnn.bfloat8_b],
"output_dtype": [ttnn.bfloat16],
# "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
# "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
# "output_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
"input_layout": [ttnn.TILE_LAYOUT],
"compute_kernel_config": [None],
}
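A quick standalone sanity check for specs like the ones above (an illustrative sketch; check_1d_mcast_spec and TILE are invented names, not part of this commit or the ttnn API):

TILE = 32  # tile edge the inline "// 32" comments assume

def check_1d_mcast_spec(shape, grid_size, per_core_M, per_core_N, in0_block_w, in0_shard_shape):
    M, K, N = shape
    num_cores = grid_size[0] * grid_size[1]
    # fuse_batch=True with 1D mcast in0: every core sees all of M.
    assert M == per_core_M * TILE
    # Output is width-sharded: N tiles split evenly across the output cores.
    num_output_cores, rem = divmod(N // TILE, per_core_N)
    assert rem == 0 and num_output_cores <= num_cores
    # in0 is width-sharded along K on its own set of cores.
    num_in0_cores = K // in0_shard_shape[1]
    assert num_in0_cores <= num_cores
    # The shard width must split evenly into in0 blocks.
    shard_w_tiles = in0_shard_shape[1] // TILE
    assert shard_w_tiles % in0_block_w == 0
    # A result > 1 means the kernel loops along the in0 shard width.
    return shard_w_tiles // in0_block_w

# The first "in0 grid == output grid" case: in0_block_w=1 loops twice per shard.
assert check_1d_mcast_spec((64, 32 * 64, 32 * 96), (8, 4), 2, 3, 1, (64, 64)) == 2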
@@ -107,8 +107,6 @@ void kernel_main() {
local_read_addr = get_read_ptr(cb_id_in2);
}

for (uint32_t b = 0; b < batch; ++b) {
for (uint32_t block = 0; block < num_blocks; ++block) {
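// block_id groups consecutive K-blocks onto one in0 shard; num_blocks_per_shard
// is greater than 1 when in0_block_w is smaller than the in0 shard width (the
// "loop along in0 shard width" the updated sweep tests exercise).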
const uint32_t block_id = block / num_blocks_per_shard;