diff --git a/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp index 0413f01f6a2..b647008c843 100644 --- a/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp @@ -200,7 +200,7 @@ int main(int argc, char **argv) { auto generic_binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/generic_binary_reader_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -218,7 +218,7 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -251,7 +251,7 @@ int main(int argc, char **argv) { auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_add_two_ints.cpp b/tests/tt_metal/tt_metal/test_add_two_ints.cpp index b4524d82c47..6da67a50792 100644 --- a/tests/tt_metal/tt_metal/test_add_two_ints.cpp +++ b/tests/tt_metal/tt_metal/test_add_two_ints.cpp @@ -40,7 +40,7 @@ int main(int argc, char **argv) { std::vector second_runtime_args = {303, 606}; tt_metal::KernelID add_two_ints_kernel = tt_metal::CreateDataMovementKernel( - program, "tt_metal/kernels/riscv_draft/add_two_ints.cpp", core, + program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index f905e891c87..eb5cadd3ff9 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -28,16 +28,16 @@ const char* get_reader_name(bool multibank, BcastDim::Enum bcast_dim) { TT_ASSERT(multibank && "Only multibank is supported correctly."); if (bcast_dim == BcastDim::H) { return multibank ? - "tt_metal/kernels/dataflow/reader_bcast_h_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_bcast_h.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp"; } else if (bcast_dim == BcastDim::W) { return multibank ? - "tt_metal/kernels/dataflow/reader_bcast_w_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_bcast_w.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp"; } if (bcast_dim == BcastDim::HW) { return multibank ? 
- "tt_metal/kernels/dataflow/reader_bcast_hw_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp"; } TT_ASSERT(false && "Unexpected bcast_dim!"); return ""; @@ -45,9 +45,9 @@ const char* get_reader_name(bool multibank, BcastDim::Enum bcast_dim) { const char* get_compute_name(BcastDim::Enum bcast_dim) { switch (bcast_dim) { - case BcastDim::H: return "tt_metal/kernels/compute/bcast_h.cpp"; - case BcastDim::W: return "tt_metal/kernels/compute/bcast_w.cpp"; - case BcastDim::HW: return "tt_metal/kernels/compute/bcast_hw.cpp"; + case BcastDim::H: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp"; + case BcastDim::W: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp"; + case BcastDim::HW: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp"; default: TT_ASSERT(false && "Unexpected bcast_dim!"); } return ""; @@ -214,8 +214,8 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" - : "tt_metal/kernels/dataflow/writer_unary.cpp", + multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_bmm.cpp b/tests/tt_metal/tt_metal/test_bmm.cpp index f4d42cca314..effdf8c57fd 100644 --- a/tests/tt_metal/tt_metal/test_bmm.cpp +++ b/tests/tt_metal/tt_metal/test_bmm.cpp @@ -77,13 +77,13 @@ int main(int argc, char **argv) { std::vector writer_compile_time_args = {(uint32_t)dst_is_dram}; auto reader = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_bmm_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bmm_8bank.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader_compile_time_args}); auto writer = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_bmm_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args}); @@ -96,7 +96,7 @@ int main(int argc, char **argv) { auto eltwise_binary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/bmm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index 096f4171cea..b842da46505 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -35,13 +35,13 @@ bool test_compile_args(std::vector compile_args_vec, int device_id) { tt_metal::KernelID unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/test_compile_args.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = compile_args_vec}); 
tt_metal::KernelID unary_writer_kernel = tt_metal::CreateDataMovementKernel( - program, "tt_metal/kernels/dataflow/blank.cpp", + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -50,7 +50,7 @@ bool test_compile_args(std::vector compile_args_vec, int device_id) { }; auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( - program, "tt_metal/kernels/compute/blank.cpp", + program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_args}); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index cd658ef13ee..e0f43f8f2d0 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -114,13 +114,13 @@ Program create_program(Device *device, const ProgramAttributes &program_attribut auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = program_attributes.reader_processor, .noc = program_attributes.reader_noc}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = program_attributes.writer_processor, .noc = program_attributes.writer_noc}); @@ -130,7 +130,7 @@ Program create_program(Device *device, const ProgramAttributes &program_attribut auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{ .math_fidelity = program_attributes.math_fidelity, diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 0dbed1a3a72..c674bf1936e 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -86,13 +86,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); @@ -102,7 +102,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index a02ba4f6555..7b44f1ea4ba 100644 --- 
a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -105,13 +105,13 @@ bool test_program_specified_with_core_range_set(tt_metal::Device *device, tt_met auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core_range_set, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core_range_set, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -122,7 +122,7 @@ bool test_program_specified_with_core_range_set(tt_metal::Device *device, tt_met auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core_range_set, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index ba077f7475a..003e4b0f157 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -77,13 +77,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -93,7 +93,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 001519e5b4d..e784f731266 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -69,13 +69,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -85,7 +85,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = 
tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp index 8da5cd41a4d..89d1380ef96 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp @@ -141,13 +141,13 @@ std::tuple create_pro auto reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_copy_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_copy_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_copy_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_copy_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -158,7 +158,7 @@ std::tuple create_pro auto compute_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_block.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index f5b2c12a7c6..4ab7de7cc0a 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -75,13 +75,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -91,7 +91,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp index 494e24bb9f4..4aaf60897e7 100644 --- a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp +++ b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp @@ -80,13 +80,13 @@ int main(int argc, char **argv) { auto reader_cb_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_cb_test.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_cb_kernel_args}); auto 
writer_cb_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_cb_test.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_cb_kernel_args}); diff --git a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp index b6d6d0bb3bf..7ceaa114d74 100644 --- a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp @@ -74,7 +74,7 @@ int main(int argc, char **argv) { } auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy_sticks.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp index 790678fd0a3..93bd8b33719 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp @@ -85,14 +85,14 @@ int main(int argc, char **argv) { // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_loader_sync.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp", loader_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); // Writer (consumer kernel) running on NCRISC on logical core {0, 1} auto consumer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/remote_read_remote_write_sync.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp", writer_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp index ebd1d23e235..2a961b09686 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp @@ -99,14 +99,14 @@ int main(int argc, char **argv) { // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_loader_sync_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp", loader_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); // Writer (consumer kernel) running on NCRISC on logical core {0, 1} auto consumer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/remote_read_remote_write_sync_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp", writer_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp 
b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp index 1a1b61b5954..a040fe95937 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp @@ -57,7 +57,7 @@ int main(int argc, char **argv) { auto dram_copy_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp index 0059141c2ef..a0581c2b352 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp @@ -59,7 +59,7 @@ int main(int argc, char **argv) { auto dram_copy_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp index 34d7e41e02b..727fbb6c5f3 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp @@ -77,7 +77,7 @@ int main(int argc, char **argv) { log_info(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y); auto mcast_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_to_l1_multicast.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp index fb0f3b0fbad..f4a0ba6d9c3 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp @@ -73,7 +73,7 @@ int main(int argc, char **argv) { log_info(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y); auto mcast_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_to_l1_multicast_include_src.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 93512dcc988..36129982b4d 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -107,15 +107,15 @@ int main(int argc, char** argv) { auto binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/reader_dual_8bank.cpp" - : "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", + multibank ? 
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" - : "tt_metal/kernels/dataflow/writer_unary.cpp", + multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -130,7 +130,7 @@ int main(int argc, char** argv) { }; auto eltwise_binary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_binary.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index c8bcfc46c61..461095cfac2 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -122,13 +122,13 @@ int main(int argc, char **argv) { auto flatten_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/flatten.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -138,7 +138,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index 1f142a68ed4..facce24c0dd 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -238,7 +238,7 @@ int main(int argc, char **argv) { auto generic_binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/generic_binary_reader_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -256,7 +256,7 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -289,7 +289,7 @@ int main(int argc, char **argv) { auto mm_kernel = 
tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_graph_interpreter.cpp b/tests/tt_metal/tt_metal/test_graph_interpreter.cpp index 87a0441b58d..e22bb7ae1e7 100644 --- a/tests/tt_metal/tt_metal/test_graph_interpreter.cpp +++ b/tests/tt_metal/tt_metal/test_graph_interpreter.cpp @@ -36,9 +36,9 @@ void run_compile_blank(tt_metal::Device *device) { .dummy = 0, }; build_kernel_for_riscv_options.set_hlk_args_all_cores(hlk_args, sizeof(blank::hlk_args_t)); - build_kernel_for_riscv_options.set_hlk_file_name_all_cores("tt_metal/kernels/compute/blank.cpp"); - build_kernel_for_riscv_options.ncrisc_kernel_file_name = "tt_metal/kernels/dataflow/blank.cpp"; - build_kernel_for_riscv_options.brisc_kernel_file_name = "tt_metal/kernels/dataflow/blank.cpp"; + build_kernel_for_riscv_options.set_hlk_file_name_all_cores("tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp"); + build_kernel_for_riscv_options.ncrisc_kernel_file_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp"; + build_kernel_for_riscv_options.brisc_kernel_file_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp"; generate_binaries_params_t params; tt_metal::detail::GenerateDeviceHeaders(device, &build_kernel_for_riscv_options, build_kernel_for_riscv_options.name); @@ -189,13 +189,13 @@ bool run_chained_sfpu_test(int chain_length) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -206,7 +206,7 @@ bool run_chained_sfpu_test(int chain_length) { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -403,13 +403,13 @@ bool run_binary_add_and_then_eltwise_gelu_test() { auto binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_binary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -420,7 +420,7 @@ bool run_binary_add_and_then_eltwise_gelu_test() { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, 
tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -642,13 +642,13 @@ bool run_forked_binary_test() { auto nary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_nary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -659,7 +659,7 @@ bool run_forked_binary_test() { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index ab0841f70a4..96b5c38b244 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -125,13 +125,13 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -141,7 +141,7 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -291,13 +291,13 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -307,7 +307,7 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", 
+ "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -410,13 +410,13 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {not src_is_in_l1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = {not dst_is_in_l1}}); @@ -424,7 +424,7 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { vector compute_kernel_args = { num_pages }; auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp new file mode 100644 index 00000000000..c68f206eb31 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_matmul.h" +namespace NAMESPACE +{ + +inline void tilize_activation( + uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks) { + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + for (uint32_t j = 0U; j < in0_block_w; j++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + } + } +} + +inline void reblock_and_untilize_output(uint32_t out_subblock_h, uint32_t out_block_w) { + llk_math_eltwise_unary_datacopy_init(); + + for (uint32_t i = 0; i < out_subblock_h; i++) { + for (int j = 0; j < 2; j++) { + for (uint32_t k = 0; k < out_block_w; k++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + } + } +} + +void math_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_math_pack_sync_init(); + +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); + +uint32_t in0_subblock_h = get_compile_time_arg_val(4); + +// out_subblock_h*in0_block_w +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +// outer column block size (in inner column blocks) +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +//out_subblock_w*in0_block_w* in1_num_subblocks; 
+uint32_t in1_per_core_w = get_compile_time_arg_val(7); +// out_subblock_w*in1_num_subblocks +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +// outer inner dim (in inner dim blocks) +uint32_t out_subblock_h = get_compile_time_arg_val(9); +// inner row block size in tiles +uint32_t out_subblock_w = get_compile_time_arg_val(10); +// inner column block size in tiles +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; + +for (uint32_t block = 0U; block < num_blocks; block++) { + bool last_out = block == num_blocks - 1U; + + if constexpr (tilize_in) { + tilize_activation(in0_subblock_h, in0_block_w, in0_num_subblocks); + } + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + + llk_math_wait_for_dest_available(); + if (enable_reload) { + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_math_eltwise_unary_datacopy(i); + } + } + llk_math_matmul_init(0); + + int dst_index = 0; + for (uint32_t h = 0U; h < out_subblock_h; h++) { + for (uint32_t w = 0U; w < out_subblock_w; w++) { + for (uint32_t inner_dim = 0U; inner_dim < in0_block_w; inner_dim++) { + llk_math_matmul(dst_index); + } + dst_index++; + } + } + + llk_math_dest_section_done(); + } + if constexpr (untilize_out) { + if (last_out) { + reblock_and_untilize_output(out_subblock_h, out_block_w); + } + } + + } + if constexpr (spill) { + enable_reload = true; + } +} +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp new file mode 100644 index 00000000000..53de202b66c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp @@ -0,0 +1,179 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_pack_common.h" +#include "llk_pack.h" +namespace NAMESPACE +{ + +inline void tilize_activation( + uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t in0_block_num_tiles, uint32_t matmul_act_cb_id) { + llk_wait_for_free_tiles(matmul_act_cb_id, in0_block_num_tiles); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + for (uint32_t j = 0U; j < in0_block_w; j++) { + llk_packer_wait_for_math_done(); + llk_pack(0, matmul_act_cb_id); + llk_pack_dest_section_done(); + llk_push_tiles(matmul_act_cb_id, 1); + } + } + } +} + +inline void pack_row(uint32_t num_tiles_to_pack, uint32_t cb_id) { + /* + Used either for packing reblocked tiles for untilized tiles + */ + llk_wait_for_free_tiles(cb_id, num_tiles_to_pack); + for (uint32_t i = 0; i < num_tiles_to_pack; i++) { + llk_packer_wait_for_math_done(); + llk_pack(0, cb_id); + llk_pack_dest_section_done(); + } + llk_push_tiles(cb_id, num_tiles_to_pack); +} + +inline void reblock_and_untilize_output(uint32_t out_subblock_h, uint32_t out_block_w, uint32_t reblock_cb_id, uint32_t untilize_cb_id) { + for (uint32_t h = 0; h < out_subblock_h; h++) { + // Can only push row because the CB can only fit + // one row + pack_row(out_block_w, reblock_cb_id); + pack_row(out_block_w, untilize_cb_id); + } +} + +inline void pack_block_and_untilize( + uint32_t in0_num_subblocks, uint32_t in1_num_subblocks, + uint32_t out_subblock_num_tiles, uint32_t out_subblock_h, uint32_t out_block_w, + uint32_t interm_cb_id, uint32_t reblock_cb_id) { + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + llk_packer_wait_for_math_done(); + + llk_wait_for_free_tiles(interm_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_pack(i, interm_cb_id); + } + llk_push_tiles(interm_cb_id, out_subblock_num_tiles); + llk_pack_dest_section_done(); + } + reblock_and_untilize_output(out_subblock_h, out_block_w, reblock_cb_id, 16); + } +} + +inline void pack_block(uint32_t in0_num_subblocks, uint32_t in1_num_subblocks, uint32_t out_subblock_num_tiles, uint32_t cb_id) { + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + llk_packer_wait_for_math_done(); + + llk_wait_for_free_tiles(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_pack(i, cb_id); + } + llk_push_tiles(cb_id, out_subblock_num_tiles); + llk_pack_dest_section_done(); + } + } +} + + +void pack_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_pack_init(); +llk_setup_outputs(); +llk_pack_dest_init(); +llk_init_packer_dest_offset_registers(); +llk_pack_hw_configure_disaggregated(16); +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); +uint32_t in0_subblock_h = get_compile_time_arg_val(4); +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +uint32_t in1_per_core_w = 
get_compile_time_arg_val(7); +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +uint32_t out_subblock_h = get_compile_time_arg_val(9); +uint32_t out_subblock_w = get_compile_time_arg_val(10); +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; + +// These are required depending on tilize/untilize +uint32_t matmul_act_cb_id = 0; +uint32_t matmul_out_intermediate_cb_id = 24; +if constexpr (tilize_in) { + // If we tilize, matmul doesn't consume original input, + // it consumes what is produced by tilize + matmul_act_cb_id = 24; + matmul_out_intermediate_cb_id = 25; // Given 24 is no longer available, we use 25 instead +} + +uint32_t reblock_cb_id = 26; // Only used if untilize is required +uint32_t matmul_out_cb_id = 16; + +for (uint32_t block = 0U; block < num_blocks - 1; block++) { + if constexpr (tilize_in) { + tilize_activation( + in0_subblock_h, + in0_block_w, + in0_num_subblocks, + in0_block_num_tiles, + matmul_act_cb_id); + } + + pack_block( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + matmul_out_intermediate_cb_id); +} + +// Last block +if constexpr (tilize_in) { + tilize_activation( + in0_subblock_h, + in0_block_w, + in0_num_subblocks, + in0_block_num_tiles, + matmul_act_cb_id); +} + +if constexpr (untilize_out) { + pack_block_and_untilize( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_block_w, + matmul_out_intermediate_cb_id, + reblock_cb_id + ); +} else { + pack_block( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + matmul_out_cb_id); +} + + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp new file mode 100644 index 00000000000..7f6b7684c68 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp @@ -0,0 +1,228 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_unpack_common.h" +#include "llk_unpack_tilize.h" +#include "llk_unpack_untilize.h" +#include "llk_unpack_A.h" +#include "llk_unpack_AB_matmul.h" +namespace NAMESPACE +{ + +inline void tilize_activation(uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks) { + // Tilize block code + llk_unpack_tilize_init(0, in0_block_w); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + llk_wait_tiles(0, in0_block_w); // These "tiles" are actually not real tiles + llk_unpack_tilize_(0,in0_block_w); + llk_pop_tiles(0,in0_block_w); // Pop the original untilized inputs + } + } + llk_unpack_tilize_uninit(); +} + + +inline __attribute__((always_inline)) +void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id) { + + // Wait for a row of subblocks such that the total width matches + // the out block width. 
Must wait for a whole row of subblocks to arrive + // before we can proceed. + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + llk_wait_tiles(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + llk_unpack_A_init(); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + llk_unpack_A(interm_cb_id, tile_index); + } + block_offset += out_subblock_num_tiles; + } + + // Since our reblock CB can only fit one row of + // tiles, we need to immediately untilize to + // consume this row + llk_wait_tiles(reblock_cb_id, out_block_w); + /* + for (uint32_t i = 0; i < out_block_w; i++) { + llk_unpack_A(reblock_cb_id, i); + } + */ + + llk_unpack_untilize_init(reblock_cb_id); + llk_unpack_untilize_(reblock_cb_id, out_block_w); + llk_unpack_untilize_(reblock_cb_id, out_block_w); + llk_unpack_untilize_uninit(reblock_cb_id); + + llk_pop_tiles(reblock_cb_id, out_block_w); + + within_block_index += out_subblock_w; + } + llk_pop_tiles(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void unpack_for_matmul_output_row( + uint32_t in1_num_subblocks, + bool enable_reload, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t in0_block_w, + uint32_t in0_index_subblock_offset, + uint32_t in1_per_core_w, + uint32_t matmul_act_cb_id, + uint32_t matmul_out_intermediate_cb_id) { + + uint32_t in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + if (enable_reload) { + llk_unpack_A_init(); + llk_wait_tiles(matmul_out_intermediate_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_unpack_A(matmul_out_intermediate_cb_id, i); + } + llk_pop_tiles(matmul_out_intermediate_cb_id, out_subblock_num_tiles); + } + + llk_unpack_AB_matmul_init(0); + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0U; h < out_subblock_h; h++) { + for (uint32_t w = 0U; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0U; inner_dim < in0_block_w; inner_dim++) { + int in0_index = ((in0_index_subblock_offset + in0_index_h_offset) + inner_dim); + int in1_index = ((in1_index_subblock_offset + in1_index_inner_dim_offset) + w); + llk_unpack_AB_matmul(matmul_act_cb_id, 1, in0_index, in1_index); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + in1_index_subblock_offset += out_subblock_w; + } +} + +void unpack_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_setup_operands(); +llk_unpack_AB_matmul_init(0); +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); + +uint32_t in0_subblock_h = get_compile_time_arg_val(4); + +// out_subblock_h*in0_block_w +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +// outer column block size (in inner column blocks) +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +//out_subblock_w*in0_block_w* in1_num_subblocks; +uint32_t in1_per_core_w = get_compile_time_arg_val(7); +// 
out_subblock_w*in1_num_subblocks +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +// outer inner dim (in inner dim blocks) +uint32_t out_subblock_h = get_compile_time_arg_val(9); +// inner row block size in tiles +uint32_t out_subblock_w = get_compile_time_arg_val(10); +// inner column block size in tiles +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + + +// These are required depending on tilize/untilize +uint32_t matmul_act_cb_id = 0; +uint32_t matmul_out_intermediate_cb_id = 24; +if constexpr (tilize_in) { + // If we tilize, matmul doesn't consume original input, + // it consumes what is produced by tilize + matmul_act_cb_id = 24; + + matmul_out_intermediate_cb_id = 25; // Given 24 is no longer available, we use 25 instead +} + +llk_unpack_AB_matmul_hw_configure_disaggregated(0,1,0); + +uint32_t reblock_cb_id = 26; + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; +for (uint32_t block = 0U; block < num_blocks; block++) { + bool last_out = block == num_blocks - 1U; + + if constexpr (tilize_in) { + tilize_activation(in0_subblock_h, in0_block_w, in0_num_subblocks); + } else { + llk_wait_tiles(matmul_act_cb_id, in0_block_num_tiles); + } + + // Wait on weight tiles + llk_wait_tiles(1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + unpack_for_matmul_output_row( + in1_num_subblocks, + enable_reload, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + in0_block_w, + in0_index_subblock_offset, + in1_per_core_w, + matmul_act_cb_id, + matmul_out_intermediate_cb_id); + + if constexpr (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + matmul_out_intermediate_cb_id, + reblock_cb_id); + } + } + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + // Need to do a reblock datacopy + if constexpr (spill) { + enable_reload = true; + } + + llk_pop_tiles(matmul_act_cb_id, in0_block_num_tiles); + llk_pop_tiles(1, in1_block_num_tiles); +} + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp new file mode 100644 index 00000000000..bdc0507c5ce --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_binary.h" +#include "llk_math_eltwise_unary_datacopy.h" + +namespace NAMESPACE +{ + +void math_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + +llk_math_pack_sync_init(); +for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + // Untilize + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + + llk_math_eltwise_binary_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_binary(0); + llk_math_dest_section_done(); + } + } +} +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp new file mode 100644 index 00000000000..ef6afbc0113 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_pack_common.h" +#include "llk_pack.h" +namespace NAMESPACE +{ + +void pack_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); +llk_pack_init(); +llk_pack_hw_configure_disaggregated(16); +llk_setup_outputs(); +llk_pack_dest_init(); + +for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_wait_for_free_tiles(24, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,24); + llk_pack_dest_section_done(); + } + llk_push_tiles(24, per_core_block_c_tiles); + + llk_wait_for_free_tiles(16, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,16); + llk_pack_dest_section_done(); + } + llk_push_tiles(16, per_core_block_c_tiles); + } +} + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp new file mode 100644 index 00000000000..7f1e967ac54 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "llk_unpack_common.h"
+#include "llk_unpack_AB.h"
+#include "llk_unpack_untilize.h"
+
+namespace NAMESPACE
+{
+
+void unpack_main()
+{
+uint32_t per_core_num_blocks = get_compile_time_arg_val(0);
+uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1);
+uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2);
+
+llk_setup_operands();
+llk_unpack_AB_hw_configure_disaggregated(0,1);
+// llk_unpack_untilize_hw_configure_disaggregated(0);
+
+// llk_unpack_untilize_init(0);
+for (uint32_t block = 0U; block < per_core_num_blocks; ++block) {
+    for (uint32_t r = 0; r < per_core_block_r_tiles; r++) {
+        llk_unpack_untilize_init(0);
+        llk_wait_tiles(0, per_core_block_c_tiles);
+        llk_unpack_untilize_(0, per_core_block_c_tiles);
+        llk_unpack_untilize_(0, per_core_block_c_tiles);
+        llk_unpack_untilize_uninit(0);
+        llk_pop_tiles(0, per_core_block_c_tiles);
+        llk_pop_tiles(1, per_core_block_c_tiles);
+
+        llk_unpack_AB_init();
+        for (uint32_t c = 0; c < per_core_block_c_tiles; c++) {
+            llk_wait_tiles(24, 1);
+            llk_wait_tiles(1, 1);
+            llk_unpack_AB(24, 1, 0, 0);
+            llk_pop_tiles(24, 1);
+            llk_pop_tiles(1, 1);
+        }
+    }
+}
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
new file mode 100644
index 00000000000..747765489ac
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
@@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "compute_kernel_api/bcast.h"
+
+
+namespace NAMESPACE {
+void MAIN {
+    constexpr uint32_t onetile = 1;
+    uint32_t B = get_arg_val<uint32_t>(0);
+    uint32_t Ht = get_arg_val<uint32_t>(1);
+    uint32_t Wt = get_arg_val<uint32_t>(2);
+    init_bcast(tt::CB::c_in0, tt::CB::c_in1);
+
+    for (uint32_t b = 0; b < B; b++) {
+    for (uint32_t h = 0; h < Ht; h++) {
+    for (uint32_t w = 0; w < Wt; w++) {
+        // For this bcast-h op the reader will wrap the RHS source tile around at Wt,
+        // so here we just linearly read two parallel arrays and apply the bcast op per tile
+        // (bcast_h propagates the op down the H dimension, so it can be thought of as bcast to H)
+        cb_wait_front(tt::CB::c_in1, onetile);
+
+        cb_reserve_back(tt::CB::c_out0, onetile);
+
+        acquire_dst(tt::DstMode::Half);
+
+        cb_wait_front(tt::CB::c_in0, onetile);
+
+        BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0);
+        pack_tile(0, tt::CB::c_out0);
+
+        cb_pop_front(tt::CB::c_in0, onetile);
+
+        release_dst(tt::DstMode::Half);
+
+        cb_push_back(tt::CB::c_out0, onetile);
+        cb_pop_front(tt::CB::c_in1, onetile);
+    } } }
+}
+} // NAMESPACE
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
new file mode 100644
index 00000000000..230ee8b9c36
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + init_bcast(tt::CB::c_in0, tt::CB::c_in1); + + #ifdef BCAST_SCALAR + cb_wait_front(tt::CB::c_in1, onetile); + #endif + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + for (uint32_t w = 0; w < Wt; w++) { + #ifndef BCAST_SCALAR + cb_wait_front(tt::CB::c_in1, onetile); + #endif + cb_reserve_back(tt::CB::c_out0, onetile); + + acquire_dst(tt::DstMode::Half); + + cb_wait_front(tt::CB::c_in0, onetile); + + BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + pack_tile(0, tt::CB::c_out0); + + cb_pop_front(tt::CB::c_in0, onetile); + #ifndef BCAST_SCALAR + cb_pop_front(tt::CB::c_in1, onetile); + #endif + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, onetile); + } } } + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp new file mode 100644 index 00000000000..0de0e2f82c0 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + uint32_t w = 0; + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + + init_bcast(tt::CB::c_in0, tt::CB::c_in1); + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + cb_wait_front(tt::CB::c_in1, onetile); + for (uint32_t w = 0; w < Wt; w++) { + + cb_reserve_back(tt::CB::c_out0, onetile); + + acquire_dst(tt::DstMode::Half); + + cb_wait_front(tt::CB::c_in0, onetile); + BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + pack_tile(0, tt::CB::c_out0); + cb_pop_front(tt::CB::c_in0, onetile); + + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, onetile); + + } + cb_pop_front(tt::CB::c_in1, onetile); + }} +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp new file mode 100644 index 00000000000..accc0b59fc2 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "compute_kernel_api/blank.h" + +namespace NAMESPACE { +void MAIN { +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp new file mode 100644 index 00000000000..6e42eb29d49 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +using std::uint32_t; + +// matmul C=A*B using dims MK*KN = MN (row major order) +// +namespace NAMESPACE { +void MAIN { + + constexpr int onetile = 1; + + int dst_tile_index = 0; + int in0_block_tile_index = 0; + + uint32_t batch = get_compile_time_arg_val(0); + uint32_t Mt = get_compile_time_arg_val(1); + uint32_t Kt = get_compile_time_arg_val(2); + uint32_t Nt = get_compile_time_arg_val(3); + + mm_init(); + + // the simplest possible version of outer product blocked matmul + // the reader is expected to read the A's and B's tile rows and tile columns for each output tile + for (uint32_t nb = 0; nb < batch; nb++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) // output tile index of C + { + acquire_dst(tt::DstMode::Full); + for (uint32_t kt = 0; kt < Kt; kt++) { + cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CB::c_in1, onetile); + + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + + cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CB::c_in1, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(0, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + + release_dst(tt::DstMode::Full); + } + + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp new file mode 100644 index 00000000000..ec293c8c7bb --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + mm_init(); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 
0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(tt::CB::c_intermed0, i, i); + } + cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + } else { + // Wait for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(tt::CB::c_out0, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_intermed0); + } + cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp new file mode 100644 index 00000000000..8f61fca907f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ -0,0 +1,167 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +#ifdef FUSE_BIAS +#include "compute_kernel_api/bcast.h" +#endif + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + uint32_t in0_cb_id = tt::CB::c_in0; + uint32_t in1_cb_id = tt::CB::c_in1; + uint32_t out_cb_id = tt::CB::c_out0; + uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + uint32_t mm_bias_intermediate_cb_id = tt::CB::c_intermed1; + uint32_t bias_cb_id = tt::CB::c_in3; + + #ifdef FUSE_BIAS + init_bcast(mm_bias_intermediate_cb_id, bias_cb_id); + #endif + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(in0_cb_id, in0_block_num_tiles); + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(mm_partials_cb_id); + cb_wait_front(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(mm_partials_cb_id, i, i); + } + cb_pop_front(mm_partials_cb_id, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(mm_partials_cb_id); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(in0_cb_id, in1_cb_id, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + 
dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + + #ifdef FUSE_BIAS + // Move matmul result to interm buffer + cb_reserve_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_bias_intermediate_cb_id); + } + cb_push_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + + // Redundant wait since we know data was just pushed + cb_wait_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + cb_wait_front(bias_cb_id, in1_per_core_w); + add_bcast_rows_init_short(); + // reconfigure unpacker df for src B + unpack_reconfig_data_format(mm_bias_intermediate_cb_id, bias_cb_id); + // reconfigure packer df for out + pack_reconfig_data_format(out_cb_id); + acquire_dst(tt::DstMode::Half); + for (uint32_t i = 0, j = 0; j < out_subblock_h; j++) { + uint32_t bcast_tile_idx = in1_index_subblock_offset; + for (uint32_t k = 0; k < out_subblock_w; k++, i++) { + add_tiles_bcast_rows(mm_bias_intermediate_cb_id, bias_cb_id, i, bcast_tile_idx, i); + bcast_tile_idx++; + } + } + cb_pop_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + // reconfigure init for matmul + mm_init_short(); + // reconfigure unpacker df for src B + unpack_reconfig_data_format(in1_cb_id, in0_cb_id); + #endif + + // sfpu activation + #ifdef SFPU_OP_INIT_ACTIVATION + SFPU_OP_INIT_ACTIVATION + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + SFPU_OP_FUNC_ACTIVATION + } + #endif + // Pack out to output buffer + cb_reserve_back(out_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, out_cb_id); + } + cb_push_back(out_cb_id, out_subblock_num_tiles); + } else { + // Wait for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(out_cb_id, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_partials_cb_id); + } + cb_push_back(mm_partials_cb_id, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp new file mode 100644 index 00000000000..b802f2303c4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + uint32_t in0_cb_id = tt::CB::c_in0; + uint32_t in1_cb_id = tt::CB::c_in1; + uint32_t out_cb_id = tt::CB::c_out0; + uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(in0_cb_id, in0_block_num_tiles); + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(mm_partials_cb_id); + cb_wait_front(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(mm_partials_cb_id, i, i); + } + cb_pop_front(mm_partials_cb_id, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(mm_partials_cb_id); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(in0_cb_id, in1_cb_id, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(out_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, out_cb_id); + } + cb_push_back(out_cb_id, out_subblock_num_tiles); + } else { + // Wait 
for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(out_cb_id, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_partials_cb_id); + } + cb_push_back(mm_partials_cb_id, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp new file mode 100644 index 00000000000..a91141fb45e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp @@ -0,0 +1,279 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +#ifdef FUSE_BIAS +#include "compute_kernel_api/bcast.h" +#endif + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#define DEBUG_PRINT 0 + +// #include "debug_macros.h" + +// SliceRange srt = SliceRange{.h0 = 0, .h1 = 4, .hs = 1, .w0 = 0, .w1 = 8, .ws = 1}; +// SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; +// SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; +// SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + +inline void tilize_in( + uint32_t in_cb_id, + uint32_t in_subblock_h, + uint32_t in_block_w, + uint32_t in_num_subblocks, + uint32_t out_cb_id) { + + tilize_init_short(in_cb_id, in_block_w); + for (uint32_t in_subblock = 0; in_subblock < in_num_subblocks; ++in_subblock) { + for (uint32_t h = 0; h < in_subblock_h; ++h) { + cb_wait_front(in_cb_id, in_block_w); + cb_reserve_back(out_cb_id, in_block_w);; + tilize_block(in_cb_id, in_block_w, out_cb_id); + cb_push_back(out_cb_id, in_block_w); + cb_pop_front(in_cb_id, in_block_w); + } + } + tilize_uninit(); +} // tilize_in() + +// NOTE: Bias is not supported with the untilize option +#ifndef FUSE_BIAS + + inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) { + + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += 
out_subblock_num_tiles; + } + cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); + } // reblock_and_untilize() + +#endif + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; ++i) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_block_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + // if these are not defined as volatile, it causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t in0_num_blocks_h = get_compile_time_arg_val(8); + volatile uint32_t in0_num_blocks_w = get_compile_time_arg_val(9); + volatile uint32_t in1_num_blocks_w = get_compile_time_arg_val(10); + uint32_t out_subblock_h = get_compile_time_arg_val(11); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(12); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(13); // out_subblock_h * out_subblock_w; + bool tilize_in0 = get_compile_time_arg_val(14); + bool untilize_out = get_compile_time_arg_val(15); + + uint32_t out_block_w = in1_block_w; + bool spill = in0_num_blocks_w > 1; + + // CB indices + constexpr uint32_t in0_cb_id = tt::CB::c_in0; + constexpr uint32_t in1_cb_id = tt::CB::c_in1; + constexpr uint32_t matmul_partials_cb = tt::CB::c_intermed0; + constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; + constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + constexpr uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + #ifdef FUSE_BIAS + uint32_t bias_ntiles_w = get_compile_time_arg_val(16); + constexpr uint32_t bias_cb_id = tt::CB::c_in2; + constexpr uint32_t out_for_bias_cb_id = tt::CB::c_intermed4; + init_bcast(out_for_bias_cb_id, bias_cb_id, out_cb_id); + #endif + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) { + #ifdef FUSE_BIAS + uint32_t bias_block_offset = 0; + #endif + for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) { + bool enable_reload = false; + for(uint32_t in0_block_w_i = 0; in0_block_w_i < in0_num_blocks_w; ++in0_block_w_i) { + bool last_out = (in0_block_w_i == in0_num_blocks_w - 1); + if 
(tilize_in0) { + tilize_in(in0_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id); + mm_init_short(); + cb_wait_front(tilized_in0_cb_id, in0_block_num_tiles); + } else { + cb_wait_front(in0_cb_id, in0_block_num_tiles); + } + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock_i = 0; in0_subblock_i < in0_num_subblocks; ++in0_subblock_i) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock_i = 0; in1_subblock_i < in1_num_subblocks; ++in1_subblock_i) { + acquire_dst(tt::DstMode::Half); + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(matmul_partials_cb); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; ++i) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(matmul_partials_cb); + } // enable_reload + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; ++h) { + for (uint32_t w = 0; w < out_subblock_w; ++w) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; ++inner_dim) { + matmul_tiles(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, // in0_cb + in1_cb_id, // in1_cb + in0_index_subblock_offset + in0_index_h_offset + inner_dim, // in0 tile + in1_index_subblock_offset + in1_index_inner_dim_offset + w, // in1 tile + dst_index, // dst + false); + in1_index_inner_dim_offset += in1_block_w; + } // for in0_block_w + ++dst_index; + } // for out_subblock_w + in0_index_h_offset += in0_block_w; + } // for out_subblock_h + #ifdef FUSE_BIAS + // if bias is to be added, add it to the data in dst before packing into the out cb + if (last_out) { + // first move the current result from dst to interim CB + pack_matmul_subblock(out_for_bias_cb_id, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + // reconfig unpacker df for src B + // unpack_reconfig_data_format(out_for_bias_cb_id, bias_cb_id); + // bcast add data from bias_cb_id + cb_wait_front(bias_cb_id, bias_ntiles_w); + cb_wait_front(out_for_bias_cb_id, out_subblock_num_tiles); + add_bcast_rows_init_short(); + // reconfig packer df for out + // pack_reconfig_data_format(out_cb_id); + acquire_dst(tt::DstMode::Half); + uint32_t i = 0; + for (uint32_t h = 0; h < out_subblock_h; ++ h) { + uint32_t bcast_tile_i = bias_block_offset + in1_index_subblock_offset; + for (uint32_t w = 0; w < out_subblock_w; ++ w) { + add_tiles_bcast_rows(out_for_bias_cb_id, bias_cb_id, i, bcast_tile_i, i); + ++ bcast_tile_i; + ++ i; + } + } + // do not pop front bias as it may be used again for subsequent blocks + cb_pop_front(out_for_bias_cb_id, out_subblock_num_tiles); + // reconfig for matmul + mm_init_short(); + // reconfig unpacker df for srcB + // unpack_reconfig_data_format(in1_cb_id, in0_cb_id); + } + #endif + + #ifdef SFPU_OP_INIT_ACTIVATION + if (last_out) { + SFPU_OP_INIT_ACTIVATION + for (uint32_t i = 0; i < out_subblock_num_tiles; ++ i) { + SFPU_OP_FUNC_ACTIVATION + } + } + #endif + + auto curr_matmul_out_cb = last_out + ? (untilize_out + ? 
untilize_mode_final_matmul_partials_cb + : out_cb_id) + : matmul_partials_cb; + pack_matmul_subblock(curr_matmul_out_cb, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } // for in1_num_subblocks + #ifndef FUSE_BIAS + // untilizing is only supported if there is no bias + if (last_out && untilize_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out_cb_id); + mm_init_short(); + } // last_out + #endif + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + } // for in0_num_blocks_w + #ifdef FUSE_BIAS + bias_block_offset += in1_block_w; + #endif + } // for in1_num_blocks_w + } // for in0_num_blocks_h +} // MAIN +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp new file mode 100644 index 00000000000..cf04922122a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +#define PRE_SCALE defined SFPU_OP_INIT_PRE_IN0_0 || defined SFPU_OP_INIT_PRE_IN1_0 + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_arg_val(0); + uint32_t per_core_block_size = get_arg_val(1); + + #ifdef SFPU_OP_INIT_PRE_IN0_0 + constexpr auto cb_inp0 = tt::CB::c_intermed0; + #else + constexpr auto cb_inp0 = tt::CB::c_in0; + #endif + + #ifdef SFPU_OP_INIT_PRE_IN1_0 + constexpr auto cb_inp1 = tt::CB::c_intermed1; + #else + constexpr auto cb_inp1 = tt::CB::c_in1; + #endif + + binary_op_init_common(cb_inp0, cb_inp1); + + #if not PRE_SCALE + binary_op_specific_init(ELTWISE_OP_CODE); + #endif + + for(uint32_t block = 0; block < per_core_block_cnt; ++block) { + + cb_reserve_back(tt::CB::c_out0, per_core_block_size); + + #ifdef SFPU_OP_INIT_PRE_IN0_0 + cb_wait_front(tt::CB::c_in0, per_core_block_size); + cb_reserve_back(cb_inp0, per_core_block_size); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + ACQ(); + SFPU_OP_INIT_PRE_IN0_0 + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + copy_tile(tt::CB::c_in0, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN0_0 + pack_tile(i, cb_inp0); // DST[0]->cb + } + REL(); + cb_pop_front(tt::CB::c_in0, per_core_block_size); + cb_push_back(cb_inp0, per_core_block_size); + #endif + + #ifdef SFPU_OP_INIT_PRE_IN1_0 + cb_wait_front(tt::CB::c_in1, per_core_block_size); + cb_reserve_back(cb_inp1, per_core_block_size); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + ACQ(); + SFPU_OP_INIT_PRE_IN1_0 + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + copy_tile(tt::CB::c_in1, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN1_0 + pack_tile(i, cb_inp1); // DST[0]->cb + } + REL(); + cb_pop_front(tt::CB::c_in1, per_core_block_size); + cb_push_back(cb_inp1, per_core_block_size); + #endif + 
+ cb_wait_front(cb_inp0, per_core_block_size); + cb_wait_front(cb_inp1, per_core_block_size); + + #if PRE_SCALE + binary_op_specific_init(ELTWISE_OP_CODE); + #endif + ACQ(); + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + ELTWISE_OP(cb_inp0, cb_inp1, i, i, i); + + #ifdef SFPU_OP_INIT_0 + SFPU_OP_INIT_0 + SFPU_OP_FUNC_0 + #endif + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + pack_tile(i, tt::CB::c_out0); + } + REL(); + cb_pop_front(cb_inp0, per_core_block_size); + cb_pop_front(cb_inp1, per_core_block_size); + cb_push_back(tt::CB::c_out0, per_core_block_size); + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp new file mode 100644 index 00000000000..1e7c029d9a3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + + unary_op_init_common(tt::CB::c_in0); + for(uint32_t b=0;b + + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { + +#ifdef TRISC_MATH +#include "llk_math_common.h" +#include "llk_math_eltwise_unary_datacopy.h" + +void math_main() +{ + int __outer_loop_iter; + #ifdef ARCH_GRAYSKULL + llk_math_eltwise_unary_datacopy_init(); + #else + MATH(( llk_math_eltwise_unary_datacopy_init(0, 0, 0) )); + #endif + llk_math_pack_sync_init(); + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } +} +#endif + +#ifdef TRISC_PACK +#include "llk_pack_common.h" +#include "llk_pack.h" + +void pack_main() +{ + int __outer_loop_iter; + llk_pack_init(); + llk_pack_hw_configure_disaggregated(16); + llk_setup_outputs(); + llk_pack_dest_init(); + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_packer_wait_for_math_done(); + llk_wait_for_free_tiles(16,1); + llk_pack(0,16); + llk_push_tiles(16,1); + llk_pack_dest_section_done(); + } +} +#endif + +#ifdef TRISC_UNPACK +void unpack_main() +{ + int __outer_loop_iter; + llk_setup_operands(); + #ifdef ARCH_GRAYSKULL + llk_unpack_A_init(); + llk_unpack_A_hw_configure_disaggregated(0); + #else + UNPACK(( llk_unpack_A_init() )); + UNPACK(( llk_unpack_A_hw_configure_disaggregated<>(0) )); + #endif + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_wait_tiles(0,1); + llk_unpack_A(0,0); + llk_pop_tiles(0,1); + } +} +#endif + +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp new file mode 100644 index 00000000000..7d0df464e7a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "compute_kernel_api/common.h"
+
+namespace NAMESPACE {
+void MAIN {
+
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(0);
+    constexpr uint32_t num_blocks = get_compile_time_arg_val(1);
+
+    for(uint32_t block = 0; block < num_blocks; ++block) {
+        acquire_dst(tt::DstMode::Half);
+
+        // Wait for tiles on the input / copy to dst / pop from input
+        cb_wait_front(tt::CB::c_in0, block_num_tiles);
+        for(uint32_t t = 0; t < block_num_tiles; ++t) {
+            copy_tile(tt::CB::c_in0, t, t);
+        }
+        cb_pop_front(tt::CB::c_in0, block_num_tiles);
+
+        // Reserve space in output / pack / push to output
+        cb_reserve_back(tt::CB::c_out0, block_num_tiles);
+        for(uint32_t t = 0; t < block_num_tiles; ++t) {
+            pack_tile(t, tt::CB::c_out0);
+        }
+        cb_push_back(tt::CB::c_out0, block_num_tiles);
+
+        release_dst(tt::DstMode::Half);
+    }
+
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
new file mode 100644
index 00000000000..51b21cff002
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+
+#include "llk_3c.h"
+
+namespace NAMESPACE {
+void MAIN {
+    // expands to hlk_relu_config(nullptr, 1); for relu only
+
+    uint32_t per_core_block_cnt = get_compile_time_arg_val(0);
+    uint32_t per_core_block_dim = get_compile_time_arg_val(1);
+
+    INIT_RELU
+    for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) {
+        cb_reserve_back(CB::c_out0, per_core_block_dim);
+        for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) {
+            acquire_dst(DstMode::Full);
+
+            // Pop tile after tile, copy to DST and pack
+            cb_wait_front(CB::c_in0, 1);
+
+            copy_tile(CB::c_in0, 0, 0);
+            // SFPU_OP is expected to be defined via add_define as one of
+            // exp_tile, gelu_tile, recip_tile, etc., followed by pack_tile
+            // (except for relu, because the llk is fused for relu)
+            // "sfpu_gelu(0); pack_tile(0, CB::c_out0);"
+
+            SFPI_OP_AND_PACK
+            // comes from add_define in the kernel config
+            // It is also expected to include pack_tile(0, CB::c_out0); for non-relu
+            // For relu it expects the hlk_pack_relu variant
+
+            cb_pop_front(CB::c_in0, 1);
+
+            release_dst(DstMode::Full);
+        }
+        cb_push_back(CB::c_out0, per_core_block_dim);
+    }
+    DEINIT_RELU
+    // expands to hlk_relu_config(nullptr, 0); for relu only
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp
new file mode 100644
index 00000000000..e1dd04fad5d
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_dim = get_compile_time_arg_val(1); + + kernel_profiler::mark_time(9997); + + init_sfpu(tt::CB::c_in0); + for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { + cb_reserve_back(tt::CB::c_out0, per_core_block_dim); + for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { + acquire_dst(tt::DstMode::Half); + + // Pop tile after tile, copy to DST and pack + cb_wait_front(tt::CB::c_in0, 1); + + copy_tile(tt::CB::c_in0, 0, 0); + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + pack_tile(0, tt::CB::c_out0); + + cb_pop_front(tt::CB::c_in0, 1); + + release_dst(tt::DstMode::Half); + } + cb_push_back(tt::CB::c_out0, per_core_block_dim); + } + + kernel_profiler::mark_time(9998); + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp new file mode 100644 index 00000000000..9cf8fca349a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" + + +#include "compute_kernel_api/eltwise_unary/exp.h" +#include "compute_kernel_api/eltwise_unary/gelu.h" +#include "compute_kernel_api/eltwise_unary/recip.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + uint32_t num_ops = get_compile_time_arg_val(1); + + // Need to pre-initialize an op_info struct and pass into get_next_op_info and modify in that func, since hlkc doesn't support funcs returning vals yet + tt::op_info_t op_info = {0, 0, 0, 0, 0, 0, 0}; + graph_interpreter_init(); + + for (uint32_t op_idx = 0; op_idx < num_ops; op_idx++) { + get_next_op_info(op_info); + + for (uint32_t idx = 0; idx < per_core_tile_cnt; idx++) { + cb_reserve_back(op_info.cb_out_id, 1); + acquire_dst(tt::DstMode::Half); + cb_wait_front(op_info.cb_in0_id, 1); + + + if (op_info.unary) { + copy_tile_init(); + copy_tile(op_info.cb_in0_id, 0, 0); + } else { + cb_wait_front(op_info.cb_in1_id, 1); + } + + if (op_info.op_code == (int)tt::OpCode::Exponential) { // 0 + exp_tile_init(); + exp_tile(0); + } else if (op_info.op_code == (int)tt::OpCode::Reciprocal) { // 1 + recip_tile_init(); + recip_tile(0); + } else if (op_info.op_code == (int)tt::OpCode::Gelu) { // 2 + gelu_tile_init(); + gelu_tile(0, false); + } else if (op_info.op_code == (int)tt::OpCode::Add) { // 3 + add_tiles_init(); + add_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } else if (op_info.op_code == (int)tt::OpCode::Subtract) { // 4 + sub_tiles_init(); + sub_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } else if (op_info.op_code == (int)tt::OpCode::Multiply) { // 5 + mul_tiles_init(); + mul_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } + + pack_tile(0, op_info.cb_out_id); + + if (op_info.pop0) { + cb_pop_front(op_info.cb_in0_id, 1); // Don't always pop, may need the input 
for later + } + + if (not op_info.unary and op_info.pop1) { + cb_pop_front(op_info.cb_in1_id, 1); + } + + release_dst(tt::DstMode::Half); + cb_push_back(op_info.cb_out_id, 1); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp rename to tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp new file mode 100644 index 00000000000..77eb18beda7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#define BCAST_LLKOP EltwiseBinaryType::ELWMUL +#define BCAST_DIM BroadcastType::COL + + +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/layernorm.h" + + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +namespace NAMESPACE { +void MAIN { + uint32_t NCHt = get_arg_val(0); + constexpr uint32_t Wt = get_compile_time_arg_val(0); + constexpr uint32_t blk = get_compile_time_arg_val(1); + constexpr uint32_t do_gamma = get_compile_time_arg_val(2); + constexpr uint32_t do_beta = get_compile_time_arg_val(3); + + + #ifdef FUSE_PRE_ADD + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + #else + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + #endif + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes + + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader + constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader + constexpr auto cb_xmm = tt::CB::c_intermed0; // x minus mean, this is a large buffer, see setup code in layernorm_op.cpp + constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] + constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] + constexpr auto cb_xmm2 = tt::CB::c_intermed3; // xmm^2 + constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add + constexpr auto cb_out = tt::CB::c_out0; // output + constexpr auto cb_gamma = tt::CB::c_in5; + constexpr auto cb_beta = tt::CB::c_in6; + constexpr auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto scaler0 = 0; + #ifdef FUSE_PRE_ADD + constexpr auto cb_x = tt::CB::c_intermed6; + #else + constexpr auto cb_x = tt::CB::c_in0; + #endif + + cb_wait_front(cb_scaler, 1); // comes from the reader + cb_wait_front(cb_eps, 1); // comes from the reader + + + constexpr int cb_im_or_out = (do_gamma|do_beta) ? 
cb_fusion : tt::CB::c_out0; + + + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + + constexpr int onetile = 1; + constexpr int dst0 = 0; + + /* + * X + Y + */ + #ifdef FUSE_PRE_ADD + add_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + //UNPACK(( { DPRINT << "Waiting on cb_x" << ENDL(); } )); + cb_wait_front(cb_in, blk); + //UNPACK(( { DPRINT << "Waiting on cb_inb" << ENDL(); } )); + cb_wait_front(cb_inb, blk); + //UNPACK(( { DPRINT << "Done Waiting on cb_inb" << ENDL(); } )); + cb_reserve_back(cb_x, blk); + for (uint32_t j = 0; j < blk; j++) { + add_tiles(cb_in, cb_inb, j, j, j); + pack_tile(j, cb_x); + } + REL(); + cb_push_back(cb_x, blk); // push the sum into the same buffer + cb_pop_front(cb_in, blk); + cb_pop_front(cb_inb, blk); + } + // by the end of this loop we should end up with Wt tiles in cb_x + #endif + + /* + * E[x] + * means = tensor.reduce(x, RSUM, RW, 1.0/W) # -> NCH1 + */ + ACQ(); + cb_reserve_back(cb_ex, 1*onetile); + reduce_init_delta(REDUCE_OP, REDUCE_DIM); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + cb_wait_front(cb_x, wt+blk); + for (uint32_t j = 0; j < blk; j++) { + reduce_tile(REDUCE_OP, REDUCE_DIM, cb_x, cb_scaler, wt+j, scaler0, dst0); + } + // we don't pop cb_x until we compute Ex + } + pack_tile(dst0, cb_ex); + reduce_revert_delta(); + REL(); + + cb_push_back(cb_ex, 1); + + /* + * x - E[x] + * compute xmm=x-mean. Reuse cb_x since we didn't pop anything from it + */ + cb_wait_front(cb_ex, 1); // should have 1 tile + cb_reserve_back(cb_xmm, Wt); + sub_bcast_cols_init_short(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + for (uint32_t wtr = 0; wtr(REDUCE_OP, REDUCE_DIM); + ACQ(); + cb_wait_front(cb_xmm2, Wt); + //cb_wait_front(cb_xmm, Wt); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + // reduce + for (uint32_t wtr = 0; wtr + +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + + + +inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb) +{ + tilize_init_short(in0_cb, in0_block_w); + + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t h = 0; h < in0_subblock_h; h++) { + cb_wait_front(in0_cb, in0_block_w); + cb_reserve_back(out_cb, in0_block_w); + tilize_block(in0_cb, in0_block_w, out_cb); + cb_push_back(out_cb, in0_block_w); + cb_pop_front(in0_cb, in0_block_w); + } + } + + tilize_uninit(); + +} + +inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) +{ + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += out_subblock_num_tiles; + } + 
cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + + // If I don't make this volatile, causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t num_blocks = get_compile_time_arg_val(8); // outer inner dim (in inner dim blocks) + + uint32_t out_subblock_h = get_compile_time_arg_val(9); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(10); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); // out_subblock_h * out_subblock_w; + + uint32_t out_block_w = in1_per_core_w; + + // If true, this assumes data coming in RM + bool tilize_in = get_compile_time_arg_val(12); + + // If true, this assumes consumer wants data RM + bool untilize_out = get_compile_time_arg_val(13); + + bool spill = num_blocks > 1; + + bool enable_reload = false; + + // CB mapping of in0, union of all possible variants (with + // and without fusing combinations of tilize/untilize) + // in0: + // input 0 + // in1: + // input 1 + // interm0: + // If under tilized mode, this is CB in which we write the tilized + // input 0 + // interm1: + // intermediate CB we write to so that we store partial matmul results + // interm2: + // if under untilize mode, this is the CB we write to so that we store + // the final matmul result + // interm3: + // if under untilize mode, this is the CB we write to so that we can + // reblock the output + uint32_t in0_cb = tt::CB::c_in0; + uint32_t tilize_mode_tilized_in0_cb = tt::CB::c_intermed0; + uint32_t matmul_partials_cb = tt::CB::c_intermed1; + uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + uint32_t out0_cb = tt::CB::c_out0; + + mm_init(); + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + if (tilize_in) { + tilize_activation(in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); + mm_init_short(); + cb_wait_front(tilize_mode_tilized_in0_cb, 
in0_block_num_tiles); + } else { + cb_wait_front(in0_cb, in0_block_num_tiles); + } + + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + if (tilize_in) { + matmul_tiles(tilize_mode_tilized_in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } else { + matmul_tiles(in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + if (not untilize_out) { + pack_matmul_subblock(out0_cb, out_subblock_num_tiles); + } else { + pack_matmul_subblock(untilize_mode_final_matmul_partials_cb, out_subblock_num_tiles); + } + } else { + pack_matmul_subblock(matmul_partials_cb, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + + in1_index_subblock_offset += out_subblock_w; + } + + if (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out0_cb + ); + mm_init_short(); + } + } + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + if (tilize_in) { + cb_pop_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + } else { + cb_pop_front(in0_cb, in0_block_num_tiles); + } + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp new file mode 100644 index 00000000000..1158d26f61e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp @@ -0,0 +1,245 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize_untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + + +inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb) +{ + tilize_init_short(in0_cb, in0_block_w); + + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t h = 0; h < in0_subblock_h; h++) { + cb_wait_front(in0_cb, in0_block_w); + cb_reserve_back(out_cb, in0_block_w); + tilize_block(in0_cb, in0_block_w, out_cb); + cb_push_back(out_cb, in0_block_w); + cb_pop_front(in0_cb, in0_block_w); + } + } + + tilize_uninit(); + +} + +inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) +{ + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += out_subblock_num_tiles; + } + cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + // If I don't make this volatile, causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t num_blocks_in0_h = get_compile_time_arg_val(8); // outer inner dim (in inner dim blocks) + volatile uint32_t num_blocks_in0_w = 
get_compile_time_arg_val(9); // outer inner dim (in inner dim blocks) + volatile uint32_t num_blocks_in1_w = get_compile_time_arg_val(10); + + uint32_t out_subblock_h = get_compile_time_arg_val(11); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(12); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(13); // out_subblock_h * out_subblock_w; + + uint32_t out_block_w = in1_per_core_w; + + // If true, this assumes data coming in RM + bool tilize_in = get_compile_time_arg_val(13); + + // If true, this assumes consumer wants data RM + bool untilize_out = get_compile_time_arg_val(14); + + bool spill = num_blocks_in0_w > 1; + + bool enable_reload = false; + + // CB mapping of in0, union of all possible variants (with + // and without fusing combinations of tilize/untilize) + // in0: + // input 0 + // in1: + // input 1 + // interm0: + // If under tilized mode, this is CB in which we write the tilized + // input 0 + // interm1: + // intermediate CB we write to so that we store partial matmul results + // interm2: + // if under untilize mode, this is the CB we write to so that we store + // the final matmul result + // interm3: + // if under untilize mode, this is the CB we write to so that we can + // reblock the output + uint32_t in0_cb = tt::CB::c_in0; + uint32_t tilize_mode_tilized_in0_cb = tt::CB::c_intermed0; + uint32_t matmul_partials_cb = tt::CB::c_intermed1; + uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + uint32_t out0_cb = tt::CB::c_out0; + mm_init(); + for(uint32_t block_in0_h = 0; block_in0_h < num_blocks_in0_h; block_in0_h++) { + for(uint32_t block_in1_w = 0; block_in1_w < num_blocks_in1_w; block_in1_w++) { + enable_reload = false; + //DPRINT << 'B' << ENDL(); + for(uint32_t block_in0_w = 0; block_in0_w < num_blocks_in0_w; block_in0_w++) + { + + bool last_out = block_in0_w == (num_blocks_in0_w-1); + if (tilize_in) { + tilize_activation(in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); + mm_init_short(); + cb_wait_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + + } else { + cb_wait_front(in0_cb, in0_block_num_tiles); + } + + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + if (tilize_in) { + matmul_tiles(tilize_mode_tilized_in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } else { + 
matmul_tiles(in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + if (last_out) { + if (not untilize_out) { + pack_matmul_subblock(out0_cb, out_subblock_num_tiles); + } else { + pack_matmul_subblock(untilize_mode_final_matmul_partials_cb, out_subblock_num_tiles); + } + } else { + pack_matmul_subblock(matmul_partials_cb, out_subblock_num_tiles); + } + release_dst(tt::DstMode::Half); + + in1_index_subblock_offset += out_subblock_w; + } + + if (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out0_cb + ); + mm_init_short(); + } + } + + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + if (tilize_in) { + cb_pop_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + } else { + cb_pop_front(in0_cb, in0_block_num_tiles); + } + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + } + } + + } + + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp new file mode 100644 index 00000000000..c1fd470c266 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +// #include "tools/profiler/kernel_profiler.hpp" +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + + bool spill = num_blocks > uint32_t(1); + + mm_init(); + bool enable_reload = false; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + // kernel_profiler::mark_time(6); + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + 
cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(tt::CB::c_intermed0, i, i); + } + cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + } else { + // Move partial result to interm buffer + cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_intermed0); + } + cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp new file mode 100644 index 00000000000..560aba53a77 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
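The matmul_large_block_zm kernel above shows the bare spill/reload pattern: after every inner-dim block except the last, the accumulated sub-block is packed to c_intermed0 and copied back into DST before the next block's matmul_tiles calls. A small host-side model of that control flow (scalars stand in for tiles, names and sizes are hypothetical, and none of this is the device API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t num_blocks = 3;                 // inner-dim blocks (hypothetical)
    const uint32_t out_subblock_num_tiles = 4;
    std::vector<float> dst(out_subblock_num_tiles);        // models DST registers
    std::vector<float> intermed(out_subblock_num_tiles);   // models c_intermed0
    std::vector<float> out(out_subblock_num_tiles);        // models c_out0
    bool enable_reload = false;

    for (uint32_t block = 0; block < num_blocks; ++block) {
        const bool last_out = (block == num_blocks - 1);
        std::fill(dst.begin(), dst.end(), 0.0f);    // acquire_dst: clean accumulator
        if (enable_reload)
            dst = intermed;                         // copy_tile: c_intermed0 -> DST
        for (uint32_t i = 0; i < out_subblock_num_tiles; ++i)
            dst[i] += 1.0f;                         // stands in for one block of matmul_tiles
        if (last_out) out = dst;                    // pack to c_out0
        else          intermed = dst;               // pack partials to c_intermed0
        enable_reload = true;                       // spill => reload from now on
    }
    for (float v : out) assert(v == static_cast<float>(num_blocks));
    return 0;
}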
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define BCAST_LLKOP ELWADD +#define BCAST_DIM BroadcastType::ROW + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + + + uint32_t block_tile_dim = get_compile_time_arg_val(0); + uint32_t dst_tile_rows = get_compile_time_arg_val(1); + uint32_t dst_tile_cols = get_compile_time_arg_val(2); + uint32_t block_cnt = get_compile_time_arg_val(3); + uint32_t in0_block_tile_cnt = get_compile_time_arg_val(4); + uint32_t in1_block_tile_cnt = get_compile_time_arg_val(5); + uint32_t out_block_tile_cnt = get_compile_time_arg_val(6); + uint32_t with_bias = get_compile_time_arg_val(7); + + + acquire_dst(tt::DstMode::Full); + + mm_init(); + for(uint32_t b=0;b(tt::HlkOperand::intermed0, tt::HlkOperand::in2, dst_tile_index, c, dst_tile_index); + dst_tile_index++; + } + } + cb_pop_front(tt::CB::c_in2, dst_tile_cols); + } + + // Pack to c_out0 + cb_reserve_back(tt::CB::c_out0, out_block_tile_cnt); + for(uint32_t i=0;i + +// #include "compute_kernel_api.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/reduce.h" +// #include "tools/profiler/kernel_profiler.hpp" + +#define DEBUG_PRINT 0 + +#if DEBUG_PRINT == 1 + #include "debug_macros.h" + + SliceRange srt = SliceRange{.h0 = 0, .h1 = 32, .hs = 8, .w0 = 0, .w1 = 32, .ws = 4}; + SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + inline void print_full_tile(uint32_t cb_id, uint32_t tile_id = 0, bool untilize = false) { + PDPRINT("======"); + for (int32_t r = 0; r < 32; ++ r) { + SliceRange sr = SliceRange{.h0 = r, .h1 = r+1, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1}; + PDPRINT((uint)r << TileSlice(cb_id, tile_id, sr, true, untilize)); + } + PDPRINT("++++++"); + } + + inline void print_cb_details(uint32_t cb_id) { + PDPRINT("cb_id " << cb_id << ": { " + << "size: " << cb_interface[cb_id].fifo_size << ", " + << "limit: " << cb_interface[cb_id].fifo_limit << ", " + << "page_size: " << cb_interface[cb_id].fifo_page_size << ", " + << "num_pages: " << cb_interface[cb_id].fifo_num_pages << ", " + << "rd_ptr: " << cb_interface[cb_id].fifo_rd_ptr << ", " + << "wr_ptr: " << cb_interface[cb_id].fifo_wr_ptr << ", " + << "wr_tile_ptr: " << cb_interface[cb_id].fifo_wr_tile_ptr << " }"); + } +#endif + +inline void tilize(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t window_hw_padded, + uint32_t out_cb_id) { + tilize_init_short(in_cb_id, in_ntiles_hwc); + for (uint32_t out_elem_i = 0; out_elem_i < out_nelems; ++ out_elem_i) { + cb_wait_front(in_cb_id, 1); + cb_reserve_back(out_cb_id, in_ntiles_hwc); + tilize_block(in_cb_id, in_ntiles_hwc, out_cb_id); // TODO: need to ensure the ordering for reduction when in_ntiles_hw > 1 + // print_full_tile(in_cb_id, 0, false); + // PDPRINT("OUT TILE :: " << TileSlice(out_cb_id, 0, srr, true, true)); + // print_cb_details(in_cb_id); + cb_push_back(out_cb_id, in_ntiles_hwc); + cb_pop_front(in_cb_id, 1); + } + tilize_uninit(); +} + +inline void reduce_h(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_scalar_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t out_ntiles_c, + uint32_t out_cb_id) { + cb_wait_front(in_cb_id, in_ntiles_hwc * out_nelems); + 
cb_reserve_back(out_cb_id, out_ntiles_c * out_nelems); + reduce_init_delta(PoolType::MAX, ReduceDim::REDUCE_COL, out_cb_id); + uint32_t base_tile_id = 0; + for (uint32_t c_i = 0; c_i < in_ntiles_c * out_nelems; ++c_i) { + // add to accumulator all the in_ntiles_hw in a column of tiles + acquire_dst(tt::DstMode::Half); + uint32_t dst_i = 0; // TODO [AS]: Use more than one dst tile at a time + for (uint32_t hw_i = 0; hw_i < in_ntiles_hw; ++hw_i) { + uint32_t tile_i = base_tile_id + hw_i; + reduce_tile(PoolType::MAX, ReduceDim::REDUCE_COL, in_cb_id, in_scalar_cb_id, tile_i, 0, dst_i); + } + pack_tile(dst_i, out_cb_id); + release_dst(tt::DstMode::Half); + base_tile_id += in_ntiles_hw; + } + reduce_revert_delta(out_cb_id); + cb_push_back(out_cb_id, out_ntiles_c * out_nelems); + cb_pop_front(in_cb_id, in_ntiles_hwc * out_nelems); +} + +namespace NAMESPACE { + +void MAIN { + constexpr uint32_t in_cb_id = tt::CB::c_in0; + constexpr uint32_t in_scalar_cb_id = tt::CB::c_in1; + constexpr uint32_t in_tiled_cb_id = tt::CB::c_intermed0; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + const uint32_t in_ntiles_hw = get_compile_time_arg_val(0); + const uint32_t in_ntiles_c = get_compile_time_arg_val(1); + const uint32_t in_ntiles_hwc = get_compile_time_arg_val(2); + const uint32_t window_hw_padded = get_compile_time_arg_val(3); + const uint32_t out_h = get_compile_time_arg_val(4); + const uint32_t out_w = get_compile_time_arg_val(5); + const uint32_t out_ntiles_c = get_compile_time_arg_val(7); + const uint32_t out_nelems = get_compile_time_arg_val(8); + const uint32_t out_w_loop_count = get_compile_time_arg_val(9); + const uint32_t nbatch = get_compile_time_arg_val(10); + const uint32_t out_h_per_core = get_compile_time_arg_val(11); + + tilize_init(in_cb_id, in_ntiles_hwc, in_tiled_cb_id); + + #if DEBUG_PRINT == 1 + print_cb_details(in_cb_id); + print_cb_details(in_scalar_cb_id); + print_cb_details(in_tiled_cb_id); + print_cb_details(out_cb_id); + #endif + + cb_wait_front(in_scalar_cb_id, 1); + for (uint32_t batch = 0; batch < nbatch; ++ batch) { + for (uint32_t out_h_i = 0; out_h_i < out_h_per_core; ++out_h_i) { + for (uint32_t out_w_i = 0; out_w_i < out_w_loop_count; ++out_w_i) { + // NOTE: Assuming in_ntiles_hw < 8 for now. + // TODO: subblocking to support this. + // kernel_profiler::mark_time(11); + // UDPRINT('T' << out_w_i); + // tilize + tilize(out_nelems, in_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, window_hw_padded, in_tiled_cb_id); + // UDPRINT('R' << out_w_i); + // Reduce H + reduce_h(out_nelems, in_tiled_cb_id, in_scalar_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, out_ntiles_c, out_cb_id); + // kernel_profiler::mark_time(12); + } + } + } + cb_pop_front(in_scalar_cb_id, 1); +} + +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp new file mode 100644 index 00000000000..d06c4094d50 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp @@ -0,0 +1,138 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
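The max_pool kernel above stages each (padded) pooling window as row-major sticks, tilizes them, and then reduces MAX down the resulting column of tiles, one channel tile at a time. A plain-loop host reference of that net effect, with made-up sizes and no claim about the exact on-device data layout:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    const int window_hw = 9;     // e.g. a 3x3 kernel, flattened (window_hw_padded in the kernel)
    const int channels  = 4;
    // Input staged as [window_hw][channels], as the reader lays out one output element's window.
    std::vector<float> win(window_hw * channels);
    for (int i = 0; i < window_hw * channels; ++i) win[i] = static_cast<float>(i % 7);

    std::vector<float> out(channels, -1e30f);
    for (int c = 0; c < channels; ++c)            // one pooled value per channel
        for (int hw = 0; hw < window_hw; ++hw)    // MAX down the window (the REDUCE_COL direction)
            out[c] = std::max(out[c], win[hw * channels + c]);

    assert(out[0] >= 0.0f);
    return 0;
}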
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +// #include "compute_kernel_api.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/reduce.h" +// #include "tools/profiler/kernel_profiler.hpp" + +#define DEBUG_PRINT 0 + +#if DEBUG_PRINT == 1 + #include "debug_macros.h" + + SliceRange srt = SliceRange{.h0 = 0, .h1 = 32, .hs = 8, .w0 = 0, .w1 = 32, .ws = 4}; + SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + inline void print_full_tile(uint32_t cb_id, uint32_t tile_id = 0, bool untilize = false) { + PDPRINT("======"); + for (int32_t r = 0; r < 32; ++ r) { + SliceRange sr = SliceRange{.h0 = r, .h1 = r+1, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1}; + PDPRINT((uint)r << TileSlice(cb_id, tile_id, sr, true, untilize)); + } + PDPRINT("++++++"); + } + + inline void print_cb_details(uint32_t cb_id) { + PDPRINT("cb_id " << cb_id << ": { " + << "size: " << cb_interface[cb_id].fifo_size << ", " + << "limit: " << cb_interface[cb_id].fifo_limit << ", " + << "page_size: " << cb_interface[cb_id].fifo_page_size << ", " + << "num_pages: " << cb_interface[cb_id].fifo_num_pages << ", " + << "rd_ptr: " << cb_interface[cb_id].fifo_rd_ptr << ", " + << "wr_ptr: " << cb_interface[cb_id].fifo_wr_ptr << ", " + << "wr_tile_ptr: " << cb_interface[cb_id].fifo_wr_tile_ptr << " }"); + } +#endif + +inline void tilize(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t window_hw_padded, + uint32_t out_cb_id) { + tilize_init_short(in_cb_id, in_ntiles_hwc); + for (uint32_t out_elem_i = 0; out_elem_i < out_nelems; ++ out_elem_i) { + cb_wait_front(in_cb_id, 1); + cb_reserve_back(out_cb_id, in_ntiles_hwc); + tilize_block(in_cb_id, in_ntiles_hwc, out_cb_id); // TODO: need to ensure the ordering for reduction when in_ntiles_hw > 1 + // print_full_tile(in_cb_id, 0, false); + // PDPRINT("OUT TILE :: " << TileSlice(out_cb_id, 0, srr, true, true)); + // print_cb_details(in_cb_id); + cb_push_back(out_cb_id, in_ntiles_hwc); + cb_pop_front(in_cb_id, 1); + } + tilize_uninit(); +} + +inline void reduce_h(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_scalar_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t out_ntiles_c, + uint32_t out_cb_id) { + cb_wait_front(in_cb_id, in_ntiles_hwc * out_nelems); + cb_reserve_back(out_cb_id, out_ntiles_c * out_nelems); + reduce_init_delta(PoolType::MAX, ReduceDim::REDUCE_COL, out_cb_id); + uint32_t base_tile_id = 0; + for (uint32_t c_i = 0; c_i < in_ntiles_c * out_nelems; ++c_i) { + // add to accumulator all the in_ntiles_hw in a column of tiles + acquire_dst(tt::DstMode::Half); + uint32_t dst_i = 0; // TODO [AS]: Use more than one dst tile at a time + for (uint32_t hw_i = 0; hw_i < in_ntiles_hw; ++hw_i) { + uint32_t tile_i = base_tile_id + hw_i; + reduce_tile(PoolType::MAX, ReduceDim::REDUCE_COL, in_cb_id, in_scalar_cb_id, tile_i, 0, dst_i); + } + pack_tile(dst_i, out_cb_id); + release_dst(tt::DstMode::Half); + base_tile_id += in_ntiles_hw; + } + reduce_revert_delta(out_cb_id); + cb_push_back(out_cb_id, out_ntiles_c * out_nelems); + cb_pop_front(in_cb_id, in_ntiles_hwc * out_nelems); +} + +namespace NAMESPACE { + +void MAIN { + constexpr uint32_t in_cb_id = tt::CB::c_in0; + constexpr uint32_t in_scalar_cb_id = tt::CB::c_in1; + constexpr uint32_t 
in_tiled_cb_id = tt::CB::c_intermed0; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + const uint32_t in_ntiles_hw = get_compile_time_arg_val(0); + const uint32_t in_ntiles_c = get_compile_time_arg_val(1); + const uint32_t in_ntiles_hwc = get_compile_time_arg_val(2); + const uint32_t window_hw_padded = get_compile_time_arg_val(3); + const uint32_t out_h = get_compile_time_arg_val(4); + const uint32_t out_w = get_compile_time_arg_val(5); + const uint32_t out_ntiles_c = get_compile_time_arg_val(7); + const uint32_t out_nelems = get_compile_time_arg_val(8); + const uint32_t out_w_loop_count = get_compile_time_arg_val(9); + const uint32_t nbatch = get_compile_time_arg_val(10); + const uint32_t out_h_per_core = get_compile_time_arg_val(11); + const uint32_t nsticks_per_core = get_compile_time_arg_val(12); + const uint32_t nsticks_per_core_by_nblocks = get_compile_time_arg_val(13); + + tilize_init(in_cb_id, in_ntiles_hwc, in_tiled_cb_id); + + #if DEBUG_PRINT == 1 + print_cb_details(in_cb_id); + print_cb_details(in_scalar_cb_id); + print_cb_details(in_tiled_cb_id); + print_cb_details(out_cb_id); + #endif + + cb_wait_front(in_scalar_cb_id, 1); + for (uint32_t i = 0; i < nsticks_per_core_by_nblocks; ++ i) { + // NOTE: Assuming in_ntiles_hw < 8 for now. + // TODO: subblocking to support this. + // kernel_profiler::mark_time(11); + // tilize + tilize(out_nelems, in_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, window_hw_padded, in_tiled_cb_id); + // Reduce H + reduce_h(out_nelems, in_tiled_cb_id, in_scalar_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, out_ntiles_c, out_cb_id); + // kernel_profiler::mark_time(12); + } + cb_pop_front(in_scalar_cb_id, 1); +} + +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp new file mode 100644 index 00000000000..267233258a1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + + for (uint32_t nc = 0; nc < NC; nc++) { + + constexpr int onetile = 1; + int reduce_dst_idx = 0; + for(uint32_t wt = 0; wt < Wt; ++wt) { + // tiles are expected to be coming in in NCWH order (H-contiguous) + // reducing in W means out[0][w] = sum(h=0..H-1, in[h][w]) + // in this case we just sequentially add to accumulator all the H-tiles in a column + acquire_dst(tt::DstMode::Half); + for(uint32_t ht = 0; ht < Ht; ++ht) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp new file mode 100644 index 00000000000..5c956a4804a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + for (uint32_t nc = 0; nc < NC; nc++) { + constexpr int onetile = 1; + int reduce_dst_idx = 0; + acquire_dst(tt::DstMode::Half); + for(uint32_t ht = 0; ht < Ht; ++ht) { + // tiles are expected to be coming in in NCHW order (W-contiguous) + // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) + // in this case we just sequentially add to accumulator all the W-tiles in a row + for(uint32_t wt = 0; wt < Wt; ++wt) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP/DIM is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + } + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp new file mode 100644 index 00000000000..64da8ad468d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
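The reduce kernels above (reduce_h, reduce_hw, and the reduce_w variant that follows) all accumulate tiles along the reduced dimension and fold in the scaler tile supplied by the reader, which is typically 1.0 for a plain sum or 1/N for a mean. A host-side reference with one float standing in for each tile (illustrative sizes only, not the device API):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t Ht = 3, Wt = 4;            // hypothetical tile grid for one NC slice
    const float scaler = 1.0f;                // the tile from c_in2 in the kernels above
    std::vector<float> in(Ht * Wt, 2.0f);     // one float per tile, NCHW order

    // reduce_h: out[w] = scaler * sum over h of in[h][w]
    std::vector<float> out_h(Wt, 0.0f);
    for (uint32_t w = 0; w < Wt; ++w)
        for (uint32_t h = 0; h < Ht; ++h)
            out_h[w] += scaler * in[h * Wt + w];

    // reduce_hw: a single value per NC slice
    float out_hw = 0.0f;
    for (float v : in) out_hw += scaler * v;

    assert(out_h[0] == 2.0f * Ht && out_hw == 2.0f * Ht * Wt);
    return 0;
}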
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + for (uint32_t nc = 0; nc < NC; nc++) { + + constexpr int onetile = 1; + int reduce_dst_idx = 0; + for(uint32_t ht = 0; ht < Ht; ++ht) { + // tiles are expected to be coming in in NCHW order (W-contiguous) + // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) + // in this case we just sequentially add to accumulator all the W-tiles in a row + acquire_dst(tt::DstMode::Half); + for(uint32_t wt = 0; wt < Wt; ++wt) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp new file mode 100644 index 00000000000..04e21805ff9 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp @@ -0,0 +1,217 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#define BCAST_LLKOP EltwiseBinaryType::ELWMUL +#define BCAST_DIM BroadcastType::COL + +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/layernorm.h" + + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +namespace NAMESPACE { +void MAIN { + const uint32_t NCHt = get_arg_val(0); + constexpr uint32_t Wt = get_compile_time_arg_val(0); + constexpr uint32_t blk = get_compile_time_arg_val(1); + constexpr uint32_t do_gamma = get_compile_time_arg_val(2); + constexpr uint32_t do_beta = get_compile_time_arg_val(3); + + + #ifdef FUSE_PRE_ADD + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + #else + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + #endif + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes + + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader + constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader + constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] + constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] + constexpr auto cb_x2 = tt::CB::c_intermed3; // x^2 + constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add + constexpr auto cb_out = tt::CB::c_out0; // output + constexpr auto cb_gamma = tt::CB::c_in5; + constexpr auto cb_beta = tt::CB::c_in6; + constexpr 
auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto scaler0 = 0; + #ifdef FUSE_PRE_ADD + constexpr auto cb_x = tt::CB::c_intermed6; + #else + constexpr auto cb_x = tt::CB::c_in0; + #endif + + cb_wait_front(cb_scaler, 1); // comes from the reader + cb_wait_front(cb_eps, 1); // comes from the reader + + + constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CB::c_out0; + + + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + + constexpr int onetile = 1; + constexpr int dst0 = 0; + + /* + * X + Y + */ + #ifdef FUSE_PRE_ADD + add_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + //UNPACK(( { DPRINT << "Waiting on cb_x" << ENDL(); } )); + cb_wait_front(cb_in, blk); + //UNPACK(( { DPRINT << "Waiting on cb_inb" << ENDL(); } )); + cb_wait_front(cb_inb, blk); + //UNPACK(( { DPRINT << "Done Waiting on cb_inb" << ENDL(); } )); + cb_reserve_back(cb_x, blk); + for (uint32_t j = 0; j < blk; j++) { + add_tiles(cb_in, cb_inb, j, j, j); + pack_tile(j, cb_x); + } + REL(); + cb_push_back(cb_x, blk); // push the sum into the same buffer + cb_pop_front(cb_in, blk); + cb_pop_front(cb_inb, blk); + } + // by the end of this loop we should end up with Wt tiles in cb_x + #endif + + /* (x)^2 + * compute temp = x^2 + */ + mul_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + cb_wait_front(cb_x, wt+blk); + cb_reserve_back(cb_x2, blk); // can probably use less space for this if we block + ACQ(); + for (uint32_t wtr = 0; wtr(REDUCE_OP, REDUCE_DIM); + ACQ(); + cb_wait_front(cb_x2, Wt); + //cb_wait_front(cb_xmm, Wt); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + // reduce + for (uint32_t wtr = 0; wtr + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/softmax.h" +#include "compute_kernel_api/reduce.h" + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + +// for scale+mask+softmax: +// bcast HW (mul by 1 tile) example: ( [2,1,1024,64] * [1,1,32,32] ) +// bcast add H example: ( [2,1,1024,64] + [2,1,32,64] ) (bcast W -> H) +// Note that the attention mask will not fit in L1 for the entire tensor +// The buffer for the att mask is currently sized as (1t,Wt) so we only reuse it for one HtWt-sized batch of x +// then read another Wt tiles of mask for the next batch + +namespace NAMESPACE { +void MAIN { + + const uint32_t NCHt = get_arg_val(0); + const uint32_t Ht = get_arg_val(1); + const uint32_t Wt = get_arg_val(2); + const uint32_t ndst = get_arg_val(3); + const uint32_t start_ht = get_arg_val(4); + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2); + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // We only do the reserve for the intermediates once and use pack_tile + // So effectively these are used as pre-allocated arrays + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_bcast_scaler = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CB::c_in3; + constexpr auto cb_fused_attn = tt::CB::c_in4; + constexpr auto cb_exps = tt::CB::c_intermed0; + constexpr auto cb_scale_mask = tt::CB::c_intermed3; + constexpr auto cb_recipsumexps = tt::CB::c_intermed1; + constexpr auto cb_in0 = tt::CB::c_in0; + constexpr auto cb_out0 = tt::CB::c_out0; + + + cb_wait_front(cb_bcast_scaler, 1); // comes from the reader + + #if 
FUSED_SCALE_MASK + cb_wait_front(cb_fused_scale, 1); + #endif + + constexpr int dst0 = 0; + uint32_t ht = start_ht; + bool wait_mask = true; + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + #if FUSED_SCALE_MASK + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + // apply fused scale [*= 1/sqrt(...)] + ACQ(); + mul_tiles_bcast_scalar_init_short(); + cb_wait_front(cb_in0, ndst); + cb_reserve_back(cb_scale_mask, ndst); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + mul_tiles_bcast_scalar(cb_in0, cb_fused_scale, wt8, 0, wt8); // mul bcast-HW -> DST[wt8] + pack_tile(wt8, cb_scale_mask); // reuse exps buffer + } + cb_push_back(cb_scale_mask, ndst); + cb_pop_front(cb_in0, ndst); + REL(); + } + + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + ACQ(); + if (wait_mask) { + cb_wait_front(cb_fused_attn, wt+ndst); // cumulative wait for up to Wt tiles, only at first ht + } + cb_wait_front(cb_scale_mask, ndst); + add_bcast_rows_init_short(); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + add_tiles_bcast_rows(cb_scale_mask, cb_fused_attn, wt8, wt+wt8, wt8); // tile *= 1/(sum(exp(x))) + } + cb_pop_front(cb_scale_mask, ndst); + cb_reserve_back(cb_exps, ndst); + exp_tile_init(true); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + exp_tile(wt8,true); // exp on DST[0] + pack_tile(wt8, cb_exps); // reuse the exps buffer again, this time in a circular manner + } + cb_push_back(cb_exps, ndst); + REL(); + } + if (wait_mask) { + wait_mask = false; + } + ht++; + if (ht == Ht) { + cb_pop_front(cb_fused_attn, Wt); + ht = 0; + wait_mask = true; + } + #else + + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + + ACQ(); + cb_wait_front(cb_in0, ndst); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + for (uint32_t wt8 = 0; wt8 < ndst; ++wt8) { + copy_tile(cb_in0, wt8, wt8); // copy from c_in[0] to DST[0] + } + cb_pop_front(cb_in0, ndst); + + cb_reserve_back(cb_exps, ndst); + exp_tile_init(true); + for (uint32_t wt8 = 0; wt8 < ndst; ++wt8) { + exp_tile(wt8, true); // exp on DST[0] + pack_tile(wt8, cb_exps); // DST[0]->cb_id[wt] + } + cb_push_back(cb_exps, ndst); + REL(); + } + #endif + + ACQ(); + cb_reserve_back(cb_recipsumexps, onetile); + reduce_init_delta(REDUCE_OP, REDUCE_DIM); + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_wait_front(cb_exps, wt+1); // must be a cumulative wait for correctness + constexpr uint32_t bcast_scaler0 = 0; // 0th index from bcast_scaler CB + reduce_tile(REDUCE_OP, REDUCE_DIM, cb_exps, cb_bcast_scaler, wt, bcast_scaler0, dst0); + } + reduce_revert_delta(); + recip_tile_init(); + recip_tile(dst0); // DST[0] = 1/sum(exp(x)) + pack_tile(dst0, cb_recipsumexps); + cb_push_back(cb_recipsumexps, 1); + + REL(); + + + cb_wait_front(cb_recipsumexps, 1); // will reuse Wt times for bcast + + // now cb_sumexps has exp tiles, need to multiply by our DST[2] + // by now we already did a umulative wait for Wt tiles in cb_exps + mul_bcast_cols_init_short(); + for (uint32_t wt = 0; wt < Wt; wt += ndst) { + ACQ(); + cb_reserve_back(tt::CB::c_out0, ndst); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + // wt+wt8 since we pop Wt after the entire loop + mul_tiles_bcast(cb_exps, cb_recipsumexps, wt+wt8, 0, wt8); // tile *= 1/(sum(exp(x))) + pack_tile(wt8, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, ndst); + REL(); + } + cb_pop_front(cb_recipsumexps, 1); + cb_pop_front(cb_exps, Wt); + } // NCHt loop + //cb_pop_front(cb_bcast_scaler, 1); // we don't actually have to do this + //cb_pop_front(cb_fused_scale, 1); // we don't actually have to do this +} +} diff --git 
a/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp new file mode 100644 index 00000000000..2218d5393f5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize.h" + +//#include "debug_print.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); + //UNPACK(( DPRINT << "Block count=" << uint32_t(per_core_block_cnt) << " tile count=" << per_core_block_tile_cnt << ENDL() )); + tilize_init(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + + for(uint32_t b=0;b +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" + +using std::uint32_t; + +// matmul C=A*B using dims MK*KN = MN (row major order) +// +namespace NAMESPACE { +void MAIN { + + constexpr uint32_t onetile = 1; + + constexpr uint32_t transpose_hw = get_compile_time_arg_val(0); + uint32_t batch = get_arg_val(0); + uint32_t Mt = get_arg_val(1); + uint32_t Kt = get_arg_val(2); + uint32_t Nt = get_arg_val(3); + + constexpr uint32_t cb_intermed0 = 24; + constexpr uint32_t cb_intermed1 = 25; + constexpr uint32_t cb_intermed2 = 26; + constexpr uint32_t out_cb_id = 16; + + constexpr uint32_t num_rows_in_one_tile = 32; + + mm_init(tt::CB::c_in0, tt::CB::c_in1, out_cb_id, transpose_hw); + + for (uint32_t nb = 0; nb < batch; nb++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) // output tile index of C + { + for (uint32_t tile_row_id = 0; tile_row_id < num_rows_in_one_tile; tile_row_id++) { + acquire_dst(tt::DstMode::Half); + for (uint32_t kt = 0; kt < Kt; kt++) { + if (tile_row_id == 0) { + cb_wait_front(tt::CB::c_in0, kt+1); + } + cb_wait_front(tt::CB::c_in1, onetile); + + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, kt, 0, 0, transpose_hw); + + cb_pop_front(tt::CB::c_in1, onetile); + } + + cb_reserve_back(cb_intermed0, onetile); + pack_tile(0, cb_intermed0); + release_dst(tt::DstMode::Half); + cb_push_back(cb_intermed0, onetile); + + // untilize tile and write to CB::c_intermed1 + cb_wait_front(cb_intermed0, onetile); + untilize_init_short(cb_intermed0); + cb_reserve_back(cb_intermed1, 1); + untilize_block(cb_intermed0, 1, cb_intermed1); + cb_push_back(cb_intermed1, 1); + + cb_pop_front(cb_intermed0, 1); + untilize_uninit(cb_intermed0); + + mm_init_short(transpose_hw); + } + cb_pop_front(tt::CB::c_in0, Kt); + + // cb_intermed2 comes from reader; untilized row-major tile + cb_wait_front(cb_intermed2, 1); + cb_reserve_back(tt::CB::c_out0, onetile); + + // tilize CB::intermed2 and write to CB::c_out0 + tilize_init_short(cb_intermed2, 1); + tilize_block(cb_intermed2, 1, out_cb_id); + cb_push_back(out_cb_id, 1); + + cb_pop_front(cb_intermed2, 1); + tilize_uninit(); + + mm_init_short(transpose_hw); + } + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp new file mode 100644 index 00000000000..42452f19fd4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
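tilize_block above converts a row-major block into 32x32 tiles before the math engine consumes it, and untilize_block performs the inverse on the way out. A conceptual host-side sketch of the row-major-to-tiled index mapping; the real hardware layout additionally orders 16x16 faces inside each tile, which this sketch deliberately ignores, so treat it as a model rather than a bit-exact reference:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t TILE = 32;

std::vector<float> tilize(const std::vector<float>& rm, uint32_t rows, uint32_t cols) {
    std::vector<float> tiled(rm.size());
    const uint32_t tiles_per_row = cols / TILE;
    for (uint32_t r = 0; r < rows; ++r)
        for (uint32_t c = 0; c < cols; ++c) {
            const uint32_t tile_id = (r / TILE) * tiles_per_row + (c / TILE);
            const uint32_t in_tile = (r % TILE) * TILE + (c % TILE);
            tiled[tile_id * TILE * TILE + in_tile] = rm[r * cols + c];
        }
    return tiled;
}

int main() {
    const uint32_t rows = 32, cols = 64;              // one tile row, two tiles wide
    std::vector<float> rm(rows * cols);
    for (uint32_t i = 0; i < rm.size(); ++i) rm[i] = static_cast<float>(i);
    const auto t = tilize(rm, rows, cols);
    // Element (0, 32) lands at the start of the second tile.
    assert(t[1 * TILE * TILE + 0] == rm[32]);
    return 0;
}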
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/transpose_wh.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t NHtWt = get_compile_time_arg_val(0); + transpose_wh_init(tt::CB::c_in0); + + // transpose a row-major block: + // - assumes the tiles come in in column major order from reader + // - uses reader_unary_transpose_wh + // - transpose_wh each tile + for (uint32_t n = 0; n < NHtWt; n++) { + cb_wait_front(tt::CB::c_in0, 1); + cb_reserve_back(tt::CB::c_out0, 1); + + acquire_dst(tt::DstMode::Half); + transpose_wh_tile(tt::CB::c_in0, 0, 0); + pack_tile(0, tt::CB::c_out0); + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, 1); + cb_pop_front(tt::CB::c_in0, 1); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp new file mode 100644 index 00000000000..2b31e7b49dd --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t partials_cb = get_compile_time_arg_val(3); + const uint32_t in0_block_num_tiles = get_compile_time_arg_val(4); + const uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); + const uint32_t out_block_num_tiles = get_compile_time_arg_val(6); + const uint32_t out_r = get_compile_time_arg_val(7); + const uint32_t out_c = get_compile_time_arg_val(8); + const uint32_t in0_k = get_compile_time_arg_val(9); + const uint32_t num_blocks = get_compile_time_arg_val(10); + const bool transpose = false; + const uint32_t last_block_id = num_blocks - 1; + + // we are looking at block + // out = in0[r x k]*in1[k x c] + mm_init(); + for (uint32_t block_id = 0; block_id < num_blocks; block_id++) { + acquire_dst(tt::DstMode::Half); + if (block_id > 0) { + copy_tile_to_dst_init_short(); + cb_wait_front(partials_cb, out_block_num_tiles); + for (uint32_t i = 0; i < out_block_num_tiles; i++) { + copy_tile(partials_cb, i, i); + } + cb_pop_front(partials_cb, out_block_num_tiles); + mm_init_short(); + } + uint32_t out_tile_index = 0; + uint32_t in0_index_r_offset = 0; + cb_wait_front(in0_cb, in0_block_num_tiles); + cb_wait_front(in1_cb, in1_block_num_tiles); + for (uint32_t r = 0; r < out_r; r++) { + for (uint32_t c = 0; c < out_c; c++) { + uint32_t in1_index_c_offset = 0; + for (uint32_t k = 0; k < in0_k; k++) { + int in0_tile_index = in0_index_r_offset + k; + int in1_tile_index = in1_index_c_offset + c; + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + in1_index_c_offset += k; + } + out_tile_index++; + } + in0_index_r_offset += in0_k; + } + cb_pop_front(in0_cb, in0_block_num_tiles); + cb_pop_front(in1_cb, in1_block_num_tiles); + + for (uint32_t tile_index = 0; tile_index < out_block_num_tiles; tile_index++) { + if (block_id == last_block_id) { + cb_reserve_back(out_cb, out_block_num_tiles); + pack_tile(tile_index, out_cb); + cb_push_back(out_cb, out_block_num_tiles); + } else { + cb_reserve_back(partials_cb, out_block_num_tiles); + pack_tile(tile_index, partials_cb); + 
cb_push_back(partials_cb, out_block_num_tiles); + } + } + release_dst(tt::DstMode::Half); + } +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp new file mode 100644 index 00000000000..6190640c8bd --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t in0_num_tiles = get_compile_time_arg_val(3); + const uint32_t in1_num_tiles = get_compile_time_arg_val(4); + const uint32_t out_num_tiles = get_compile_time_arg_val(5); + const uint32_t out_r = get_compile_time_arg_val(6); + const uint32_t out_c = get_compile_time_arg_val(7); + const uint32_t in0_k = get_compile_time_arg_val(8); + const bool transpose = false; + + // we are looking at block + // out = in0[r x k]*in1[k x c] + mm_init(); + acquire_dst(tt::DstMode::Half); + + uint32_t out_tile_index = 0; + uint32_t in0_index_r_offset = 0; + cb_wait_front(in0_cb, in0_num_tiles); + cb_wait_front(in1_cb, in1_num_tiles); + for (uint32_t r = 0; r < out_r; r++) { + for (uint32_t c = 0; c < out_c; c++) { + uint32_t in1_index_c_offset = 0; + for (uint32_t k = 0; k < in0_k; k++) { + int in0_tile_index = in0_index_r_offset + k; + int in1_tile_index = in1_index_c_offset + c; + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + in1_index_c_offset += k; + } + out_tile_index++; + } + in0_index_r_offset += in0_k; + } + cb_pop_front(in0_cb, in0_num_tiles); + cb_pop_front(in1_cb, in1_num_tiles); + + cb_reserve_back(out_cb, out_num_tiles); + for (uint32_t tile_index = 0; tile_index < out_num_tiles; tile_index++) { + pack_tile(tile_index, out_cb); + } + cb_push_back(out_cb, out_num_tiles); + release_dst(tt::DstMode::Half); +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp new file mode 100644 index 00000000000..60105fd134c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
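The unit-test compute kernels above iterate out = in0[out_r x in0_k] * in1[in0_k x out_c] one destination tile at a time. The host-side check below spells out the conventional row-major tile-index mapping (in0: r*in0_k + k, in1: k*out_c + c, out: r*out_c + c) so the kernels' offset bookkeeping can be compared against it; sizes are arbitrary and scalars stand in for tiles:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t out_r = 2, out_c = 3, in0_k = 4;   // hypothetical block shape, in tiles
    std::vector<float> in0(out_r * in0_k, 1.0f);
    std::vector<float> in1(in0_k * out_c, 1.0f);
    std::vector<float> out(out_r * out_c, 0.0f);

    for (uint32_t r = 0; r < out_r; ++r)
        for (uint32_t c = 0; c < out_c; ++c)
            for (uint32_t k = 0; k < in0_k; ++k) {
                const uint32_t in0_tile = r * in0_k + k;     // row-major over [out_r x in0_k]
                const uint32_t in1_tile = k * out_c + c;     // row-major over [in0_k x out_c]
                out[r * out_c + c] += in0[in0_tile] * in1[in1_tile];   // matmul_tiles accumulate
            }

    for (float v : out) assert(v == static_cast<float>(in0_k));
    return 0;
}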
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_tiles = get_arg_val(6); + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(in0_cb); + uint32_t ublock_size_bytes_1 = get_tile_size(in1_cb); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(in0_cb, ublock_size_tiles); + cb_reserve_back(in1_cb, ublock_size_tiles); + + l1_write_addr_in0 = get_write_ptr(in0_cb); + l1_write_addr_in1 = get_write_ptr(in1_cb); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, ublock_size_bytes_0); + noc_async_read(src1_noc_addr, l1_write_addr_in1, ublock_size_bytes_1); + + noc_async_read_barrier(); + + cb_push_back(in0_cb, ublock_size_tiles); + cb_push_back(in1_cb, ublock_size_tiles); + + src0_addr += ublock_size_bytes_0; + src1_addr += ublock_size_bytes_1; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp new file mode 100644 index 00000000000..69f230a6001 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
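The reader above leans entirely on the circular-buffer handshake: cb_reserve_back blocks until pages are free, cb_push_back hands them to the consumer, and cb_wait_front/cb_pop_front mirror that on the compute side. A single-threaded toy model of that bookkeeping (the real calls block and synchronize across RISC-V cores; here reserve/wait merely report availability):

#include <cassert>
#include <cstdint>
#include <vector>

struct ToyCB {
    uint32_t capacity;                            // pages
    uint32_t used = 0, wr = 0, rd = 0;
    std::vector<int> pages;
    explicit ToyCB(uint32_t cap) : capacity(cap), pages(cap, 0) {}
    // Reports whether n pages are free (the real cb_reserve_back blocks until they are).
    bool reserve_back(uint32_t n) const { return capacity - used >= n; }
    void push_back(uint32_t n, int value) {       // producer side: publish n pages
        for (uint32_t i = 0; i < n; ++i) pages[(wr + i) % capacity] = value;
        wr = (wr + n) % capacity; used += n;
    }
    // Reports whether n pages are ready (the real cb_wait_front blocks until they are).
    bool wait_front(uint32_t n) const { return used >= n; }
    void pop_front(uint32_t n) { rd = (rd + n) % capacity; used -= n; }
};

int main() {
    ToyCB cb(2);                                   // double-buffered: 2 pages
    uint32_t produced = 0, consumed = 0;
    const uint32_t total = 8;
    while (consumed < total) {
        if (produced < total && cb.reserve_back(1)) { cb.push_back(1, 42); ++produced; }
        if (cb.wait_front(1)) { assert(cb.pages[cb.rd] == 42); cb.pop_front(1); ++consumed; }
    }
    return 0;
}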
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + for (uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(in0_cb, in0_block_tile_cnt); + cb_reserve_back(in1_cb, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(in0_cb); + l1_write_addr_in1 = get_write_ptr(in1_cb); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + auto ptr0 = reinterpret_cast (l1_write_addr_in0); + auto ptr1 = reinterpret_cast (l1_write_addr_in1); + + cb_push_back(in0_cb, in0_block_tile_cnt); + cb_push_back(in1_cb, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp new file mode 100644 index 00000000000..8792a0af75e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t num_in0_tiles = 1; + const uint32_t num_in1_tiles = 1; + const uint32_t num_out_tiles = 1; + const uint32_t in0_tile_index = 0; + const uint32_t in1_tile_index = 0; + const uint32_t out_tile_index = 0; + const bool transpose = false; + mm_init(); + cb_reserve_back(out_cb, num_out_tiles); + acquire_dst(tt::DstMode::Half); + cb_wait_front(in0_cb, num_in0_tiles); + cb_wait_front(in1_cb, num_in1_tiles); + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + pack_tile(0, out_cb); + cb_pop_front(in0_cb, num_in0_tiles); + cb_pop_front(in1_cb, num_in1_tiles); + release_dst(tt::DstMode::Half); + cb_push_back(out_cb, num_out_tiles); +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp new file mode 100644 index 00000000000..3caec0ae567 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t out_cb = get_compile_time_arg_val(0); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(out_cb); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(out_cb, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(out_cb); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(out_cb, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp new file mode 100644 index 00000000000..56be069bed3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define ELTWISE_OP_CODE 0 // TODO(AP): temporary - refactor + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { + +#ifdef TRISC_MATH +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_binary.h" +#include "llk_math_eltwise_unary_datacopy.h" + +void math_main() +{ + uint32_t per_core_num_blocks = get_compile_time_arg_val(0); + uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); + uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + + llk_math_pack_sync_init(); + for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + // Untilize + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + + llk_math_eltwise_binary_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_binary(0); + llk_math_dest_section_done(); + } + } + } +} +#endif + +#ifdef TRISC_UNPACK +#include +#include "llk_unpack_common.h" +#include "llk_unpack_AB.h" +#include "llk_unpack_untilize.h" + +void unpack_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + +llk_setup_operands(); +llk_unpack_AB_hw_configure_disaggregated(0,1); +// llk_unpack_untilize_hw_configure_disaggregated(0); + +// llk_unpack_untilize_init(0); +for (uint32_t block = 0U; block < per_core_num_blocks; ++block) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_unpack_untilize_init(0); + llk_wait_tiles(0, per_core_block_c_tiles); + llk_unpack_untilize(0, per_core_block_c_tiles); + llk_unpack_untilize_uninit(0); + llk_pop_tiles(0, per_core_block_c_tiles); + llk_pop_tiles(1, per_core_block_c_tiles); + + llk_unpack_AB_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_wait_tiles(24, 1); + llk_wait_tiles(1, 1); + llk_unpack_AB(24, 1, 0, 0); + llk_pop_tiles(24, 1); + llk_pop_tiles(1, 1); + } + } +} +} +#endif 
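The TRISC_MATH and TRISC_UNPACK sections above, together with the TRISC_PACK section that follows, hand-build a fused op whose net effect is: untilize operand A block by block, then apply an elementwise binary op (ELWADD here) against operand B. A scalar host reference of that net effect, with all layout detail omitted and purely for illustration:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<float> untilA_elwadd_reference(const std::vector<float>& a_row_major,
                                           const std::vector<float>& b_row_major) {
    // Once A has been untilized to row-major, the fused op reduces to a plain elementwise add.
    std::vector<float> out(a_row_major.size());
    for (std::size_t i = 0; i < out.size(); ++i) out[i] = a_row_major[i] + b_row_major[i];
    return out;
}

int main() {
    std::vector<float> a(32 * 32, 1.0f), b(32 * 32, 2.0f);   // one tile's worth of data
    const auto out = untilA_elwadd_reference(a, b);
    assert(out[0] == 3.0f);
    return 0;
}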
+ + +#ifdef TRISC_PACK +#include +#include "llk_pack_common.h" +#include "llk_pack.h" + +void pack_main() +{ + uint32_t per_core_num_blocks = get_compile_time_arg_val(0); + uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); + uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + llk_pack_init(); + llk_pack_hw_configure_disaggregated(16); + llk_setup_outputs(); + llk_pack_dest_init(); + + for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_wait_for_free_tiles(24, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,24); + llk_pack_dest_section_done(); + } + llk_push_tiles(24, per_core_block_c_tiles); + + llk_wait_for_free_tiles(16, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,16); + llk_pack_dest_section_done(); + } + llk_push_tiles(16, per_core_block_c_tiles); + } + } +} +#endif + +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp new file mode 100644 index 00000000000..df416389dfc --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/untilize.h" +//#include "debug_print.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); + untilize_init(tt::CB::c_in0); + + //UNPACK(( DPRINT << "Block count=" << uint32_t(per_core_block_cnt) << " tile count=" << per_core_block_tile_cnt << ENDL() )); + + for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { + cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + + untilize_block(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + + cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp new file mode 100644 index 00000000000..007f4ae618f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tilize.h" + + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + + constexpr uint32_t cache_cb = get_compile_time_arg_val(0); + constexpr uint32_t in_cb = get_compile_time_arg_val(1); + constexpr uint32_t untilized_cache_cb = get_compile_time_arg_val(2); + constexpr uint32_t untilized_cache2_cb = get_compile_time_arg_val(3); + constexpr uint32_t untilized_in_cb = get_compile_time_arg_val(4); + constexpr uint32_t out_cb = get_compile_time_arg_val(5); + constexpr uint32_t B = get_compile_time_arg_val(6); + constexpr uint32_t Wt = get_compile_time_arg_val(7); + + untilize_init(in_cb, untilized_in_cb); + + for (uint32_t b = 0; b < B / 32; b++) { + untilize_init_short(in_cb); + + cb_wait_front(in_cb, Wt); + cb_reserve_back(untilized_in_cb, Wt); + untilize_block(in_cb, Wt, untilized_in_cb); + cb_push_back(untilized_in_cb, Wt); + cb_pop_front(in_cb, Wt); + untilize_uninit(in_cb); + + for(uint32_t u = 0; u < 32; u++) { + untilize_init_short(cache_cb); + cb_wait_front(cache_cb, Wt); + cb_reserve_back(untilized_cache_cb, Wt); + untilize_block(cache_cb, Wt, untilized_cache_cb); + cb_push_back(untilized_cache_cb, Wt); + cb_pop_front(cache_cb, Wt); + untilize_uninit(cache_cb); + + tilize_init_short(untilized_cache2_cb, Wt); + cb_wait_front(untilized_cache2_cb, Wt); + cb_reserve_back(out_cb, Wt); + tilize_block(untilized_cache2_cb, Wt, out_cb); + cb_push_back(out_cb, Wt); + // Untilized cache CBs share same address space + // Compute pops both + cb_pop_front(untilized_cache2_cb, Wt); + cb_pop_front(untilized_cache_cb, Wt); + tilize_uninit(); + } + } +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp new file mode 100644 index 00000000000..04ba7e3c561 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp @@ -0,0 +1,8 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
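update_cache.cpp above round-trips the cache block through an untilized layout so that a single row (one user) can be overwritten in place before the block is re-tilized. As a reference for what that layout change does to one 32x32 tile, here is a standalone sketch, under the assumption that a tile is stored as four 16x16 faces, face-by-face and row-major within each face:

#include <array>
#include <cassert>
#include <cstdint>

constexpr int TILE = 32, FACE = 16;

// row-major (untilized) position -> position inside the tilized tile
constexpr int tilized_index(int r, int c) {
    int face = (r / FACE) * 2 + (c / FACE);   // f0 top-left, f1 top-right, f2 bottom-left, f3 bottom-right
    return face * FACE * FACE + (r % FACE) * FACE + (c % FACE);
}

int main() {
    std::array<uint16_t, TILE * TILE> row_major{}, tilized{}, back{};
    for (int i = 0; i < TILE * TILE; i++) row_major[i] = static_cast<uint16_t>(i);

    // tilize: gather row-major data into face order
    for (int r = 0; r < TILE; r++)
        for (int c = 0; c < TILE; c++)
            tilized[tilized_index(r, c)] = row_major[r * TILE + c];

    // untilize: scatter face-ordered data back to row-major
    for (int r = 0; r < TILE; r++)
        for (int c = 0; c < TILE; c++)
            back[r * TILE + c] = tilized[tilized_index(r, c)];

    assert(back == row_major);  // the two transforms are exact inverses
    return 0;
}

Because the two transforms are inverses, the kernel can untilize, let the writer patch individual rows, and tilize again without losing anything.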
+// +// SPDX-License-Identifier: Apache-2.0 + + +void kernel_main() { + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp new file mode 100644 index 00000000000..c32eedaece1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t l1_buffer_addr = get_arg_val(0); + + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_noc_x = get_arg_val(2); + std::uint32_t dram_src_noc_y = get_arg_val(3); + + std::uint32_t dram_buffer_dst_addr = get_arg_val(4); + std::uint32_t dram_dst_noc_x = get_arg_val(5); + std::uint32_t dram_dst_noc_y = get_arg_val(6); + + std::uint32_t dram_buffer_size = get_arg_val(7); + + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp new file mode 100644 index 00000000000..411fc6a494d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
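dram_copy.cpp above stages the whole transfer through a single L1 buffer. The dram_copy_db.cpp kernel that follows splits the L1 buffer into two halves and ping-pongs between them, so the read of the next chunk can be issued before the previous half is written back out. A host-side analogue of that schedule (standalone C++, plain memcpy standing in for the NOC transactions) is:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    constexpr size_t chunk = 512;                      // bytes per staging half
    std::vector<uint8_t> src(8 * chunk), dst(src.size());
    for (size_t i = 0; i < src.size(); i++) src[i] = static_cast<uint8_t>(i);

    std::vector<uint8_t> l1(2 * chunk);                // the "L1 buffer", split into two halves
    uint8_t* half_a = l1.data();
    uint8_t* half_b = l1.data() + chunk;

    size_t rd = 0, wr = 0;
    std::memcpy(half_a, src.data() + rd, chunk);       // prime the pipeline: first read
    rd += chunk;

    while (rd < src.size()) {
        std::memcpy(half_b, src.data() + rd, chunk);   // "read" the next chunk into the other half
        rd += chunk;
        std::memcpy(dst.data() + wr, half_a, chunk);   // "write" the previously filled half
        wr += chunk;
        std::swap(half_a, half_b);                     // ping-pong
    }
    std::memcpy(dst.data() + wr, half_a, chunk);       // drain the last chunk
    assert(dst == src);
    return 0;
}

The two halves are what allow the kernel to issue the read for chunk i+1 before it writes chunk i back to DRAM.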
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(3); + std::uint32_t dram_dst_noc_x = get_arg_val(4); + std::uint32_t dram_dst_noc_y = get_arg_val(5); + + std::uint32_t dram_buffer_size = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + + std::uint32_t l1_buffer_addr = get_arg_val(8); + std::uint32_t l1_buffer_size_tiles = get_arg_val(9); + std::uint32_t l1_buffer_size_bytes = get_arg_val(10); + + std::uint32_t rd_wr_l1_buffer_size_tiles = l1_buffer_size_tiles / 2; + std::uint32_t rd_wr_l1_buffer_size_bytes = l1_buffer_size_bytes / 2; + + // Keeps track of how many tiles we copied so far + std::uint32_t num_tiles_read = 0; + + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; + std::uint64_t dram_buffer_src_noc_addr; + std::uint64_t dram_buffer_dst_noc_addr; + + std::uint32_t l1_addr1 = l1_buffer_addr; + std::uint32_t l1_addr2 = l1_buffer_addr + rd_wr_l1_buffer_size_bytes; + + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + + // Copy data from DRAM into destination L1 buffer + noc_async_read( + dram_buffer_src_noc_addr, + l1_addr1, + rd_wr_l1_buffer_size_bytes + ); + dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; + num_tiles_read += rd_wr_l1_buffer_size_tiles; + + while (num_tiles_read < num_tiles) { + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // DRAM NOC dst address + dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + noc_async_read( + dram_buffer_src_noc_addr, + l1_addr2, + rd_wr_l1_buffer_size_bytes + ); + dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; + num_tiles_read += rd_wr_l1_buffer_size_tiles; + + // Wait all reads flushed (ie received) + noc_async_read_barrier(); + + noc_async_write( + l1_addr1, + dram_buffer_dst_noc_addr, + rd_wr_l1_buffer_size_bytes + ); + + dram_buffer_dst_addr += rd_wr_l1_buffer_size_bytes; + + // Wait for all the writes to complete (ie acked) + noc_async_write_barrier(); + + // Swap L1 addr locations + if (num_tiles_read < num_tiles) { + std::uint32_t temp_l1_addr = l1_addr1; + l1_addr1 = l1_addr2; + l1_addr2 = temp_l1_addr; + } + } + + // DRAM NOC dst address + dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + noc_async_write( + l1_addr2, + dram_buffer_dst_noc_addr, + rd_wr_l1_buffer_size_bytes + ); + // Wait for all the writes to complete (ie acked) + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp new file mode 100644 index 00000000000..91cf24d0ca5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 
Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t l1_buffer_addr = get_arg_val(0); + + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_noc_x = get_arg_val(2); + std::uint32_t dram_src_noc_y = get_arg_val(3); + + std::uint32_t num_sticks = get_arg_val(4); + std::uint32_t stick_size = get_arg_val(5); + for(uint32_t i = 0; i < 1; i++) { + for(uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, stick_size); + noc_async_read_barrier(); + l1_buffer_addr += stick_size; + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp new file mode 100644 index 00000000000..c4817644167 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t local_buffer_addr = get_arg_val(3); + std::uint32_t consumer_core_noc_x = get_arg_val(4); + std::uint32_t consumer_core_noc_y = get_arg_val(5); + std::uint32_t stream_register_address = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_bytes = get_arg_val(9); + + // Scratch address in L1, to write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = VALID_VAL; + // Local and remote register addresses (used for sync) + std::uint64_t local = get_noc_addr(stream_register_address); + std::uint64_t remote = get_noc_addr(consumer_core_noc_x, consumer_core_noc_y, stream_register_address); + + // keeps track of how many tiles we moved so far + std::uint32_t counter = 0; + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + while(counter < num_tiles) { + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) + wait_for_sync_register_value(stream_register_address, INVALID_VAL); + // Copy data from dram into destination buffer + 
noc_async_read(dram_buffer_src_noc_addr, local_buffer_addr, transient_buffer_size_bytes); + dram_buffer_src_addr += transient_buffer_size_bytes; + // wait all reads flushed (ie received) + noc_async_read_barrier(); + + // Write VALID_VAL into local register + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + + // Write VALID_VAL into remote register + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp new file mode 100644 index 00000000000..dfe6fc7ede0 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; + +inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, std::uint32_t index) { + if((index & 0x1) == 0) { + return addr1; + } else { + return addr2; + } +} +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t local_buffer_addr1 = get_arg_val(3); + std::uint32_t local_buffer_addr2 = get_arg_val(4); + std::uint32_t consumer_core_noc_x = get_arg_val(5); + std::uint32_t consumer_core_noc_y = get_arg_val(6); + std::uint32_t stream_register_address1 = get_arg_val(7); + std::uint32_t stream_register_address2 = get_arg_val(8); + std::uint32_t num_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_tiles = get_arg_val(10); + std::uint32_t transient_buffer_size_bytes = get_arg_val(11); + + // Scratch address in L1, to write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = VALID_VAL; + + // keeps track of how many tiles we moved so far + std::uint32_t counter = 0; + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + std::uint64_t dram_buffer_src_noc_addr; + while(counter < num_tiles) { + std::uint32_t reg_addr = ping_pong_address(stream_register_address1, stream_register_address2, counter); + std::uint64_t local = get_noc_addr(reg_addr); + std::uint64_t remote = get_noc_addr(consumer_core_noc_x, consumer_core_noc_y, reg_addr); + std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); + + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) + wait_for_sync_register_value(reg_addr, INVALID_VAL); + // Copy data from dram into destination buffer + noc_async_read(dram_buffer_src_noc_addr, local_buffer_address, transient_buffer_size_bytes); + dram_buffer_src_addr += 
transient_buffer_size_bytes; + // wait all reads flushed (ie received) + noc_async_read_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + // Write VALID_VAL into remote register + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp new file mode 100644 index 00000000000..f2e2fad994a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t src_buffer_size = get_arg_val(3); + + uint32_t local_addr = get_arg_val(4); + + uint32_t dst_addr = get_arg_val(5); + uint32_t dst_noc_x_start = get_arg_val(6); + uint32_t dst_noc_y_start = get_arg_val(7); + uint32_t dst_noc_x_end = get_arg_val(8); + uint32_t dst_noc_y_end = get_arg_val(9); + uint32_t num_dests = get_arg_val(10); + + + // Read src buffer into local L1 buffer + uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); + noc_async_read_barrier(); + + // multicast local L1 buffer to all destination cores + uint64_t dst_noc_multicast_addr = get_noc_multicast_addr( + dst_noc_x_start, + dst_noc_y_start, + dst_noc_x_end, + dst_noc_y_end, + dst_addr); + noc_async_write_multicast(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp new file mode 100644 index 00000000000..c1b390934d8 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
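dram_to_l1_multicast.cpp above writes one L1 buffer to an inclusive rectangle of cores in a single NOC multicast, and the variant whose hunk starts here differs only in using the loopback_src call so the sender's own L1 also receives a copy. The num_dests runtime argument has to match the number of cores expected to ack the write; the sketch below shows how the host would typically size it, under the assumption (not spelled out in this diff) that the rectangle is inclusive and that the plain multicast excludes the sender while the loopback variant includes it:

#include <cstdint>
#include <cstdio>

// Cores covered by an inclusive multicast rectangle.
uint32_t mcast_grid_size(uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) {
    uint32_t dx = x1 > x0 ? x1 - x0 : x0 - x1;
    uint32_t dy = y1 > y0 ? y1 - y0 : y0 - y1;
    return (dx + 1) * (dy + 1);
}

int main() {
    uint32_t grid = mcast_grid_size(1, 1, 4, 3);              // 4 x 3 = 12 cores
    bool sender_in_grid = true;                               // hypothetical placement
    uint32_t dests_excl = sender_in_grid ? grid - 1 : grid;   // noc_async_write_multicast
    uint32_t dests_incl = grid;                               // noc_async_write_multicast_loopback_src
    std::printf("num_dests excluding sender=%u, including sender=%u\n", dests_excl, dests_incl);
    return 0;
}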
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t src_buffer_size = get_arg_val(3); + + uint32_t local_addr = get_arg_val(4); + + uint32_t dst_addr = get_arg_val(5); + uint32_t dst_noc_x_start = get_arg_val(6); + uint32_t dst_noc_y_start = get_arg_val(7); + uint32_t dst_noc_x_end = get_arg_val(8); + uint32_t dst_noc_y_end = get_arg_val(9); + uint32_t num_dests = get_arg_val(10); + + + // Read src buffer into local L1 buffer + uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); + noc_async_read_barrier(); + + // multicast local L1 buffer to all destination cores + uint64_t dst_noc_multicast_addr = get_noc_multicast_addr( + dst_noc_x_start, + dst_noc_y_start, + dst_noc_x_end, + dst_noc_y_end, + dst_addr); + noc_async_write_multicast_loopback_src(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp new file mode 100644 index 00000000000..5181692863e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + // Kernel args + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles_r = get_arg_val(3); + uint32_t num_tiles_c = get_arg_val(4); + + // How many bytes along a row in the original tensor + uint32_t num_bytes_per_tensor_row = get_arg_val(5); + + /* + Constants + Since I am 'constexpr'ing here, I can multiply + */ + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t num_bytes_per_tile_row = 64; // 32 bfloat16, each 2 bytes + constexpr uint32_t num_bytes_for_sending_eight_tile_rows = num_bytes_per_tile_row * 8; + constexpr uint32_t num_bytes_for_sending_seven_tile_rows = num_bytes_per_tile_row * 7; + constexpr uint32_t num_bytes_for_sending_twenty_four_tile_rows = num_bytes_per_tile_row * 24; + uint32_t num_bytes_per_tile = get_tile_size(cb_id_in0); + + // Variables + uint64_t replicate_dest_addr; + uint32_t start_dram_addr_offset_for_tensor_row = 0; + + constexpr uint32_t num_elements_in_zeros_buffer = MEM_ZEROS_SIZE / sizeof(uint32_t); + volatile tt_l1_ptr uint32_t* zero_base_ptr = reinterpret_cast(MEM_ZEROS_BASE); + for (uint32_t zero_base_offset = 0; zero_base_offset < num_elements_in_zeros_buffer; zero_base_offset++) { + *(zero_base_ptr + zero_base_offset) = 0; + } + + uint64_t zeros_base_noc_addr = get_noc_addr(MEM_ZEROS_BASE); + for (uint32_t i = 0; i < num_tiles_r; i++) { + for (uint32_t j = 0; j < 32; j++) { + uint32_t src_addr_ = src_addr + start_dram_addr_offset_for_tensor_row; + for (uint32_t k = 0; k < num_tiles_c; k++) { + cb_reserve_back(cb_id_in0, 1); + uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr_); + + // Read one row of data + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + noc_async_read(src_noc_addr, l1_write_addr, num_bytes_per_tile_row); + + // We move one row down + l1_write_addr += num_bytes_per_tile_row; + + /* + Move 31 rows of zeros behind the row that we just moved. 
We send + 8 rows three times, then we send 7 rows + */ + for (uint32_t z = 0; z < 3; z++) { + noc_async_read(zeros_base_noc_addr, l1_write_addr, num_bytes_for_sending_eight_tile_rows); + l1_write_addr += num_bytes_for_sending_eight_tile_rows; + } + + noc_async_read(zeros_base_noc_addr, l1_write_addr, num_bytes_for_sending_seven_tile_rows); + + src_addr_ += num_bytes_per_tile; + noc_async_read_barrier(); + cb_push_back(cb_id_in0, 1); + + } // End num_tiles_c loop + start_dram_addr_offset_for_tensor_row += num_bytes_per_tile_row; + } // End 32 iter loop + start_dram_addr_offset_for_tensor_row += num_bytes_per_tensor_row; + } // End num_tiles_r loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp new file mode 100644 index 00000000000..0d53bc3cfa3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "debug_print.h" + +// This kernel is used to read untilized src0 data from DRAM and copy it to L1 in tilized layout. +// For layout transformation, it uses a list of source addresses (a vector in L1 written by the host) to perform scattered and multiple reads from DRAM. +// The kernel writes to contiguous location in L1 CB. Therefore, the src addresses must be provided in the order in which tiles are generated. +// It expects src1 data to already be tilized and it simply copies it to L1. +void kernel_main() { + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_noc_x = get_arg_val(1); + std::uint32_t dram_src0_noc_y = get_arg_val(2); + std::uint32_t dram_buffer_src1_addr = get_arg_val(3); + std::uint32_t dram_src1_noc_x = get_arg_val(4); + std::uint32_t dram_src1_noc_y = get_arg_val(5); + std::uint32_t address_map_size = get_arg_val(6); + std::uint32_t address_map_l1_addr = get_arg_val(7); + std::uint32_t num_blocks = get_arg_val(8); + std::uint32_t src0_num_reads_per_block = get_arg_val(9); + std::uint32_t src0_dram_read_size_bytes = get_arg_val(10); + std::uint32_t src1_num_bytes_per_block = get_arg_val(11); + std::uint32_t src0_num_tiles_per_block = get_arg_val(12); + std::uint32_t src1_num_tiles_per_block = get_arg_val(13); + + constexpr uint32_t cb0_id = 0; + constexpr uint32_t cb1_id = 1; + + volatile tt_l1_ptr std::uint32_t* source_addresses = (volatile tt_l1_ptr uint32_t*)(address_map_l1_addr); + + uint32_t source_addresses_list_index = 0; + // We push one block of tiles of src0 and src1. + // src0 and src1 can have different number of tiles per block. + for(uint32_t b = 0; b < num_blocks; b+=1) { + cb_reserve_back(cb0_id, src0_num_tiles_per_block); + cb_reserve_back(cb1_id, src1_num_tiles_per_block); + uint32_t l1_write0_addr = get_write_ptr(cb0_id); + uint32_t l1_write1_addr = get_write_ptr(cb1_id); + std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + // src1 is already tilized in DRAM. Read the whole block of tiles in a single DRAM read access. + noc_async_read(dram_buffer_src1_noc_addr, l1_write1_addr, src1_num_bytes_per_block); + // src0 is not tilized in DRAM. 
+ // For src0, Do multiple DRAM read accesses using addresses provided in "source_addresses" to produce one block of tiles + // The source addresses in the list must be in the order of tiles + for(uint32_t i = 0; i < src0_num_reads_per_block; i++) { + uint32_t src_addr = source_addresses[source_addresses_list_index]; + std::uint64_t dram_buffer_src0_noc_addr = get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr + src_addr); + noc_async_read(dram_buffer_src0_noc_addr, l1_write0_addr, src0_dram_read_size_bytes); + l1_write0_addr += src0_dram_read_size_bytes; + source_addresses_list_index += 1; + } + noc_async_read_barrier(); + dram_buffer_src1_addr += src1_num_bytes_per_block; + cb_push_back(cb0_id, src0_num_tiles_per_block); + cb_push_back(cb1_id, src1_num_tiles_per_block); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp new file mode 100644 index 00000000000..495855e0d5d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +void kernel_main() { + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(4); + std::uint32_t l1_dst_noc_x = get_arg_val(5); + std::uint32_t l1_dst_noc_y = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t single_tile_size_bytes = get_arg_val(8); + std::uint32_t total_tiles_size_bytes = get_arg_val(9); + + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_src_addr_base, total_tiles_size_bytes); + noc_async_read_barrier(); + + for(uint32_t i = 0; i < 1000; i++) { + // L1 NOC dst address + std::uint64_t l1_buffer_dst_noc_addr = get_noc_addr(l1_dst_noc_x, l1_dst_noc_y, l1_buffer_dst_addr_base); + noc_async_write(l1_buffer_src_addr_base, l1_buffer_dst_noc_addr, total_tiles_size_bytes); + noc_async_write_barrier(); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp new file mode 100644 index 00000000000..e79f77ff069 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
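The comment block in generic_binary_reader_blocked.cpp above says the host supplies a list of source byte offsets ordered so that back-to-back reads of src0_dram_read_size_bytes fill the CB in tile order. That host-side construction is not part of this diff; the following is a hypothetical sketch of one way to build such a map for a row-major bfloat16 tensor read out tile by tile (helper name and shape parameters are illustrative only), where each read covers one 32-element tile row, i.e. 64 bytes:

#include <cstdint>
#include <vector>

std::vector<uint32_t> build_address_map(uint32_t rows, uint32_t cols, uint32_t elem_bytes) {
    constexpr uint32_t TILE = 32;
    const uint32_t row_bytes = cols * elem_bytes;
    const uint32_t tiles_r = rows / TILE, tiles_c = cols / TILE;

    std::vector<uint32_t> map;
    map.reserve(static_cast<size_t>(rows) * tiles_c);
    for (uint32_t tr = 0; tr < tiles_r; tr++)            // walk output tiles row-major
        for (uint32_t tc = 0; tc < tiles_c; tc++)
            for (uint32_t r = 0; r < TILE; r++)          // 32 strided reads assemble one tile
                map.push_back((tr * TILE + r) * row_bytes + tc * TILE * elem_bytes);
    return map;                                          // each read is TILE * elem_bytes long
}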
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + // skip arg 7 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i1 = 0; + for (uint32_t i = 0; i < NCHtWt; i += onetile) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + src0_addr += tile_bytes; + + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + src1_addr += tile_bytes; + if (i1 == Wt) { + // wrap around + i1 = 0; + src1_addr = get_arg_val(4); + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp new file mode 100644 index 00000000000..697c9253013 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
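In reader_bcast_h.cpp above, one broadcast tile is pushed for every src0 tile and the src1 cursor wraps every Wt tiles, which is the same as indexing the broadcast row with i % Wt over the whole NC*Ht*Wt loop. A standalone check of that equivalence:

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t NC = 2, Ht = 3, Wt = 4, NCHtWt = NC * Ht * Wt;
    uint32_t i1 = 0;
    for (uint32_t i = 0; i < NCHtWt; i++) {
        assert(i1 == i % Wt);     // tile pulled from the broadcast operand
        i1++;
        if (i1 == Wt) i1 = 0;     // wrap around, mirroring the kernel
    }
    return 0;
}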
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t tile_bytes = get_tile_size(cb_id_in0); + const DataFormat data_format = get_dataformat(cb_id_in0); + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + i ++; // input tile iterates over NC Ht Wt + } + + // bcast tensor should be NC1W (actually NC32W padded with 0s in H) + // wrap W around for each h (broadcast) + i1 -= Wt; + } + // we reused Wt tiles out of NCWt bcast tensor Ht times, now advance for next NC + if (nc1 == 0) // if bcast NC==1 we don't advance but reuse the tensor + i1 += Wt; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp new file mode 100644 index 00000000000..55f2a7154b7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
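The *_8bank readers above address tiles through interleaved address generators instead of explicit NOC coordinates: a flat tile index selects the bank and offset and noc_async_read_tile issues the transfer. In tt-metal these generators are templated on whether the buffer lives in DRAM, and the argument accessors on the value type, so the compile-time src0_is_dram/src1_is_dram flags read in these kernels are presumably those template arguments. A sketch of the presumed instantiation (the template parameters shown here are an assumption about the intended code, not taken verbatim from this diff):

#include <stdint.h>
#include "dataflow_api.h"

void kernel_main() {
    uint32_t src0_addr = get_arg_val<uint32_t>(0);                 // value type assumed as template arg
    constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1;

    constexpr uint32_t cb_id_in0 = 0;
    const uint32_t tile_bytes = get_tile_size(cb_id_in0);
    const DataFormat data_format = get_dataformat(cb_id_in0);

    // DRAM vs. L1 banking is presumed to be selected via the template parameter.
    const InterleavedAddrGenFast<src0_is_dram> s0 = {
        .bank_base_address = src0_addr,
        .page_size = tile_bytes,
        .data_format = data_format
    };

    cb_reserve_back(cb_id_in0, 1);
    noc_async_read_tile(/*tile index*/ 0, s0, get_write_ptr(cb_id_in0));
    noc_async_read_barrier();
    cb_push_back(cb_id_in0, 1);
}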
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + + // for each H,W-tile of the first tensor we push one tile from the second arg tile list + // but we don't advance the second tile index for H,W + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + + i ++; // input tile iterates over NC Ht Wt + } // wt loop + } // ht loop + if (nc1 == 0) + i1 ++; // bcast-HW tile iterates only for nc loop and only if NC>1 + } // nc loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp new file mode 100644 index 00000000000..974820cf28b --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
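reader_bcast_hw_8bank.cpp above keeps the broadcast tile fixed across all of Ht*Wt and only advances it once per batch, and even then only when the broadcast tensor really has NC > 1 (nc1 == 0). A standalone check of that indexing:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    const uint32_t NC = 3, Ht = 2, Wt = 2;
    for (uint32_t nc1 : {0u, 1u}) {                 // nc1 == 1 means the bcast tensor has NC == 1
        uint32_t i = 0, i1 = 0;
        for (uint32_t nc = 0; nc < NC; nc++) {
            for (uint32_t ht = 0; ht < Ht; ht++)
                for (uint32_t wt = 0; wt < Wt; wt++) {
                    assert(i1 == (nc1 ? 0 : nc));   // same bcast tile for the whole batch
                    i++;
                }
            if (nc1 == 0) i1++;                     // advance only when NC > 1 on the bcast side
        }
        assert(i == NC * Ht * Wt);
    }
    return 0;
}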
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + // skip arg 7 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i1 = 0; + for (uint32_t nc = 0; nc < NC; nc ++ ) { + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + src1_addr += tile_bytes; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + src0_addr += tile_bytes; + } // Wt loop + } // Ht loop + src1_addr = get_arg_val(4); // reset the H-tile ptr + } // NC loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp new file mode 100644 index 00000000000..a57865b6016 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i_bcast = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + for (uint32_t nc = 0; nc < NC; nc ++ ) { + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i_bcast, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i_bcast++; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + i++; + } // Wt loop + } // Ht loop + + if (nc1) // if we also bcast from NC=1, go back Ht tiles on bcasted tensor + i_bcast -= Ht; + } // NC loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp new file mode 100644 index 00000000000..e21b62ea57d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_tiles = get_arg_val(6); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + uint32_t src1_num_tiles = get_arg_val(7); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles > src1_num_tiles ? src0_num_tiles : src1_num_tiles; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" + +#include "debug_print.h" + +void kernel_main() { + // same arg indices as in reader_binary_diff_lenghts for compat + uint32_t src0_addr = get_arg_val(0); + uint32_t src1_addr = get_arg_val(1); + uint32_t Mt = get_arg_val(2); + uint32_t Kt = get_arg_val(3); + uint32_t Nt = get_arg_val(4); + uint32_t MtKt = get_arg_val(5); // if 0 + uint32_t KtNt = get_arg_val(6); + uint32_t batch = get_arg_val(7); + uint32_t bcast_B = get_arg_val(8); // if 1 we broadcast B to batch + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + //DPRINT << "Mt=" << Mt << " Kt=" << Kt << " Nt=" << Nt << " MtKt=" << MtKt << "KtNt=" << KtNt << ENDL(); + //DPRINT << "src0=" << src0_addr << " src1=" << src1_addr << ENDL(); + //DPRINT << "batch=" << batch << ENDL(); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + constexpr uint32_t onetile = 1; + const uint32_t src0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat src0_data_format = get_dataformat(cb_id_in0); + const uint32_t src1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat src1_data_format = get_dataformat(cb_id_in1); + + uint32_t itileA_batch = 0; + uint32_t itileB_batch = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = src0_tile_bytes, + .data_format = src0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = src1_tile_bytes, + .data_format = src1_data_format + }; + + for (uint32_t nb = 0; nb < batch; nb++) { + uint32_t itileA = itileA_batch; + for (uint32_t mt = 0; mt < Mt; mt++) { + uint32_t itileB = itileB_batch; + for (uint32_t nt = 0; nt < Nt; nt++) { + for (uint32_t kt = 0; kt < Kt; kt++) { + { // Read A's tile at (mt, kt) + 
cb_reserve_back(cb_id_in0, onetile); + uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(itileA, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + } + + { // Read B's tile at (kt, nt) + cb_reserve_back(cb_id_in1, onetile); + uint32_t l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(itileB, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + } + //DPRINT << "Pushed itileA=" << itileA << " itileB=" << itileB << ENDL(); + + itileA += 1; // A is MK + itileB += Nt; // B is KN, so to get k++ we stride by Nt + } // Kt loop + itileB -= KtNt; // revert B to previous state before the K loop (to avoid multiplies) + itileB += 1; // B is KN, so here in the end of Nt loop we increment N by 1 + itileA -= Kt; // resets tileA to kt=0, keep the same mt + } // Nt loop + itileA += Kt; // A is MK, advance to next M + } // Mt loop + itileA_batch += MtKt; // update batch strides + if (bcast_B == 0) // don't increment batch if we broadcast matrix B + itileB_batch += KtNt; + } // batch loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp new file mode 100644 index 00000000000..ff94c5ec424 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +inline __attribute__((always_inline)) +void read_and_push_to_cb(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t dram_src_noc_x, uint32_t dram_src_noc_y, uint32_t& dram_buffer_src_addr) { + // read a ublock of tiles at the time from DRAM to L1 buffer, and push a ublock at the time to unpacker + for (uint32_t i = 0; i(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t num_tiles_per_cb = get_arg_val(3); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + read_and_push_to_cb(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp new file mode 100644 index 00000000000..b8688948cc1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + // same arg indices as in reader_binary_diff_lenghts for compat + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_num_tiles = get_arg_val(7); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles > src1_num_tiles ? 
src0_num_tiles : src1_num_tiles; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = src0_addr, + + + .log_base_2_of_page_size = 11 + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = src1_addr, + + + .log_base_2_of_page_size = 11 + }; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t num_tiles = get_arg_val(3); + std::uint32_t num_repetitions = get_arg_val(4); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + for (uint32_t j = 0; j < num_repetitions; j++) { + uint32_t src_addr = buffer_src_addr; + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(cb_id_in0, in0_block_tile_cnt); + cb_reserve_back(cb_id_in1, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_tile_cnt); + cb_push_back(cb_id_in1, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp new file mode 100644 index 00000000000..572811cb90d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
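The trickiest index arithmetic in this group of readers is in reader_bmm_8bank.cpp earlier in this diff, where running itileA/itileB counters replace explicit multiplies. A standalone check that the increments and rewinds visit A(mt, kt) = mt*Kt + kt and B(kt, nt) = kt*Nt + nt for every (mt, nt, kt), shown here for a single batch with no broadcast:

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t Mt = 3, Kt = 4, Nt = 2, KtNt = Kt * Nt;

    uint32_t itileA = 0;                              // batch base for A
    for (uint32_t mt = 0; mt < Mt; mt++) {
        uint32_t itileB = 0;                          // B restarts at the batch base every mt
        for (uint32_t nt = 0; nt < Nt; nt++) {
            for (uint32_t kt = 0; kt < Kt; kt++) {
                assert(itileA == mt * Kt + kt);       // A tile pushed to CB0
                assert(itileB == kt * Nt + nt);       // B tile pushed to CB1
                itileA += 1;                          // A is MK: next k
                itileB += Nt;                         // B is KN: next k strides by Nt
            }
            itileB -= KtNt;                           // rewind B to the top of the column
            itileB += 1;                              // and move one column to the right
            itileA -= Kt;                             // reuse the same A row for the next nt
        }
        itileA += Kt;                                 // next row of A tiles
    }
    return 0;
}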
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_noc_x = get_arg_val(1); + std::uint32_t dram_src0_noc_y = get_arg_val(2); + + std::uint32_t dram_buffer_src1_addr = get_arg_val(3); + std::uint32_t dram_src1_noc_x = get_arg_val(4); + std::uint32_t dram_src1_noc_y = get_arg_val(5); + + std::uint32_t num_tiles = get_arg_val(6); + + // single-tile chunks + uint32_t chunk_size_bytes_0 = get_tile_size(0); + uint32_t chunk_size_bytes_1 = get_tile_size(1); + uint32_t chunk_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read a chunk of tiles at the time from DRAM to L1 buffer, and push a chunk at the time to unpacker + for (uint32_t i=0; i +#include "dataflow_api.h" + +void kernel_main() { + + + bool one_time_profile = true; + + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + // kernel_profiler::mark_time(5); + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += 
in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp new file mode 100644 index 00000000000..3c260f77ab8 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp @@ -0,0 +1,113 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(7); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1); + + uint32_t l1_write_addr_in1; + + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + + bool 
one_time_noc_wait = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + + // kernel_profiler::mark_time_once(8, &one_time_noc_wait); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(9, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp new file mode 100644 index 00000000000..85ce6be87ca --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp @@ -0,0 +1,165 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(10); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + bool one_time_multicast = true; + bool one_time_cb_push = true; + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // 
Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + + // kernel_profiler::mark_time_once(11, &one_time_multicast); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + noc_async_write_barrier(); + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
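+        // Multicast the locally staged VALID flag to the in0 receiver semaphore address on every
+        // destination core; each receiver is spinning in noc_semaphore_wait() on that address and is
+        // released once the flag lands. The write barrier above guarantees the data block has already
+        // arrived before this flag is sent.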
+ noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + // Copy in1 block into CB, as the default kernel + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(12, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp new file mode 100644 index 00000000000..7437946d76c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(34); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + 
uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(35, &one_time_noc_wait_0); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in1 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in1 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(36, &one_time_noc_wait_1); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(37, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp new file mode 100644 index 00000000000..3f7b50fd881 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp @@ -0,0 +1,169 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(24); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in1; + + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s1 = { + 
.bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(25, &one_time_noc_wait_0); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in1 block into CB, as the default kernel + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_address = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_address, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(26, &one_time_noc_wait_1); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
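+        // Multicast VALID into every in1 receiver's semaphore slot, releasing the cores blocked in
+        // noc_semaphore_wait(). As noted above, ordering on the shared statically assigned VC ensures
+        // this flag cannot overtake the in1 data block sent just before it, so no extra write barrier
+        // is needed here.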
+ noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(27, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp new file mode 100644 index 00000000000..dd29b87ec0f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp @@ -0,0 +1,169 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(39); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + 
volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(40, &one_time_noc_wait_0); + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
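+        // Multicast VALID to the in0 receiver semaphore on every destination core; once released from
+        // noc_semaphore_wait(), each receiver pushes the block that was deposited directly into its CB.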
+ noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in1 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in1 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(41, &one_time_noc_wait_1); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(42, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp new file mode 100644 index 00000000000..80896331ed9 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp @@ -0,0 +1,226 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(29); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + + // 
const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. 
its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(30, &one_time_noc_wait_0); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in1 block into CB, as the default kernel + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_address = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_address, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(31, &one_time_noc_wait_1); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! 
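+        // Multicast the in1 block staged in the local CB to in1_start_address on every core in the
+        // destination rectangle (this assumes the receivers' in1 CBs are configured at the same L1
+        // address as the sender's, so the data lands directly in their CBs).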
+ noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(32, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp new file mode 100644 index 00000000000..d1ed40fd2f4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp @@ -0,0 +1,110 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(16); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in1_mcast_num_dests = get_arg_val(21); + uint32_t in1_mcast_sender_noc_x = get_arg_val(22); + uint32_t in1_mcast_sender_noc_y = get_arg_val(23); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(25); + + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1); + + uint32_t l1_write_addr_in0; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = 
reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_addr = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_addr, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + noc_async_read_barrier(); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(17, &one_time_noc_wait); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(18, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp new file mode 100644 index 00000000000..e2a14e90e00 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(20); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in1_mcast_num_dests = get_arg_val(21); + uint32_t in1_mcast_sender_noc_x = get_arg_val(22); + uint32_t in1_mcast_sender_noc_y = get_arg_val(23); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(25); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait = true; + bool one_time_cb_push = true; + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t 
in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(21, &one_time_noc_wait); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + noc_async_write_barrier(); + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(22, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp new file mode 100644 index 00000000000..a8861beac39 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + uint32_t with_bias = get_arg_val(11); + uint32_t src2_addr; + uint32_t src2_noc_x; + uint32_t src2_noc_y; + uint32_t in2_block_tile_cnt; + uint32_t in2_block_size_bytes; + + if (with_bias) { + src2_addr = get_arg_val(12); + src2_noc_x = get_arg_val(13); + src2_noc_y = get_arg_val(14); + in2_block_tile_cnt = get_arg_val(15); + in2_block_size_bytes = get_arg_val(16); + } + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t cb_id_in2 = 2; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + uint32_t l1_write_addr_in2; + + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(cb_id_in0, in0_block_tile_cnt); + cb_reserve_back(cb_id_in1, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_tile_cnt); + cb_push_back(cb_id_in1, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } + + if (with_bias) { + uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + l1_write_addr_in2 = get_write_ptr(cb_id_in2); + cb_reserve_back(cb_id_in2, in2_block_tile_cnt); + noc_async_read(src2_noc_addr, l1_write_addr_in2, in2_block_size_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in2, in2_block_tile_cnt); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp new file mode 100644 index 00000000000..b2135f798f6 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +// Make n reads defined by num_reads +// Writes to Specified Circular Buffers in L1 +// Expects n provided src_addr, src_noc_x, src_noc_y, and cb_id_in +void kernel_main() { + uint32_t num_reads = get_arg_val(0); + uint32_t num_tiles_per_read = get_arg_val(1); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i(2 + i * 4); + uint32_t src_noc_x = get_arg_val(3 + i * 4); + uint32_t src_noc_y = get_arg_val(4 + i * 4); + uint32_t cb_id_in = get_arg_val(5 + i * 4); + + uint32_t ublock_size_bytes = get_tile_size(cb_id_in); + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i +#include "dataflow_api.h" + +//#include "debug_print.h" + +void generate_bcast_scaler() { + constexpr uint32_t cb_in_2 = 2; + uint32_t scaler = get_arg_val(8); + union { float f; uint32_t u; } u; u.u = scaler; + //DPRINT << "basic Scaler = " << F32(u.f) << ENDL(); + cb_reserve_back(cb_in_2, 1); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = uint16_t(u.u>>16); + cb_push_back(cb_in_2, 1); +} + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(3); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank + + constexpr uint32_t cb_id_in0 = 0, cb_id_in1 = 1; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + #ifdef KERNEL_COMPILE_TIME_ARG_0 + constexpr bool read_from_dram = get_compile_time_arg_val(0); + #else + constexpr bool read_from_dram = true; + #endif + + const InterleavedPow2AddrGen src_a = { src_addr, 11 }; + + #if GENERATE_BCAST_SCALER + // TODO(AP): cleanup, probably with named args/param pack/reflection. + generate_bcast_scaler(); + constexpr uint32_t blk = BLOCK_SIZE; + #else + constexpr uint32_t blk = 1; // 1 for correctness for unfused kernels + #endif + + #ifdef TILE_OFFSET + uint32_t tile_offset = TILE_OFFSET; + #else + constexpr uint32_t tile_offset = 0; + #endif + //DPRINT << "Reader Tile offset=" << tile_offset << ENDL(); + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + uint32_t i_tile = 0; + for (uint32_t i = 0; i num_tiles) ? 
num_tiles - i : blk; + cb_reserve_back(cb_id_in0, rem); + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + + for (uint32_t r = 0; r +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 4; + uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + + // Constexpr + constexpr uint32_t num_dram_channels = 8; + constexpr uint32_t log_base_2_of_num_dram_channels = 3; + constexpr uint32_t cb_id_in0 = 0; + + const uint32_t src_addr = get_arg_val(0); + const uint32_t num_sticks = get_arg_val(1); + const uint32_t stick_size = get_arg_val(2); + + // TODO(agrebenisan): This isn't good... here we are assuming + // that the stick size dictates tiles c, but stick size + // doesn't necessarily need to be divisible by tiles c... + // this is only the case really for tilize + const uint32_t num_tiles_c = stick_size / 64; // Assuming 2 bytes per datum, there are 64 bytes per tile row + uint32_t stick_id = 0; + + constexpr bool stick_size_is_power_of_two = (get_compile_time_arg_val(0) == 1); + #if (stick_size_is_power_of_two) + const uint32_t log_base_2_of_page_size = get_arg_val(3); + const InterleavedPow2AddrGen s = { + .bank_base_address = src_addr, + + + .log_base_2_of_page_size = log_base_2_of_page_size // TODO(AP): refactor + }; + #else + const InterleavedAddrGen s = { + .bank_base_address = src_addr, + + + .page_size = stick_size + }; + #endif + + for (uint32_t i = 0; i < num_sticks / 32; i++) { + // We reserve back an entire tile row and issue a bunch of reads + cb_reserve_back(cb_id_in0, num_tiles_c); + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + for (uint32_t j = 0; j < 32; j++) { + uint64_t src_noc_addr = get_noc_addr( + stick_id, s); + + uint32_t bank_id = stick_id & (num_dram_channels - 1); + noc_async_read(src_noc_addr, l1_write_addr, stick_size); + l1_write_addr += stick_size; + stick_id++; + } + noc_async_read_barrier(); + cb_push_back(cb_id_in0, num_tiles_c); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp new file mode 100644 index 00000000000..c74fe410677 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + // skip 3 for compat with reader_unary_8bank, reader_unary + uint32_t N = get_arg_val(4); + uint32_t Ht = get_arg_val(5); + uint32_t Wt = get_arg_val(6); + uint32_t HtWt = get_arg_val(7); + uint32_t HtWtTileBytes = HtWt*2048; // TODO(AP): assumed 16-bits + uint32_t WtTileBytes = Wt*2048; // TODO(AP): assumed 16-bits + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t src_addrN = src_addr; + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" + +//#include "debug_print.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + // skip args 1,2,3 for compat with reader_unary, reader_unary_8bank + uint32_t N = get_arg_val(4); // args match the order of reader_unary + uint32_t Ht = get_arg_val(5); + uint32_t Wt = get_arg_val(6); + uint32_t HtWt = get_arg_val(7); + uint32_t scaler = get_arg_val(8); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + if (scaler != 0) { + union { float f; uint32_t u; } u; u.u = scaler; + //DPRINT << "TWH Scaler = " << F32(u.f) << ENDL(); + constexpr uint32_t cb_in_2 = 2; + cb_reserve_back(cb_in_2, 1); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = uint16_t(u.u>>16); + cb_push_back(cb_in_2, 1); + } + + uint32_t i_tile_N = 0; // first tile in current batch + uint32_t i_tile = 0; + + const InterleavedPow2AddrGen s = { + .bank_base_address = src_addr, + + + .log_base_2_of_page_size = 11 + }; + + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t N = get_arg_val(1); + uint32_t Ht = get_arg_val(2); + uint32_t Wt = get_arg_val(3); + uint32_t HtWt = get_arg_val(4); + + constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + const uint32_t tile_bytes = get_tile_size(cb_id_in0); + const DataFormat data_format = get_dataformat(cb_id_in0); + + #ifdef REDUCE_SCALER + constexpr uint32_t cb_in_2 = 2; + constexpr uint32_t scaler = get_compile_time_arg_val(1); + cb_reserve_back(cb_in_2, 1); + if (scaler != 0) { + uint16_t u = uint16_t(scaler>>16); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = u; + + } + cb_push_back(cb_in_2, 1); + #endif + + uint32_t i_tile_N = 0; // first tile in current batch + uint32_t i_tile = 0; + + const InterleavedAddrGenFast s = { + .bank_base_address = src_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + uint32_t sender_noc_x = get_arg_val(0); + uint32_t sender_noc_y = 
get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t sender_semaphore_addr = get_arg_val(3); + uint32_t receiver_semaphore_addr = get_arg_val(4); + uint32_t num_repetitions = get_arg_val(5); + + volatile tt_l1_ptr uint32_t* receiver_semaphore_addr_ptr = reinterpret_cast(receiver_semaphore_addr); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + uint64_t sender_semaphore_noc_addr = get_noc_addr(sender_noc_x, sender_noc_y, sender_semaphore_addr); + + for (uint32_t j = 0; j < num_repetitions; j++) { + for (uint32_t i = 0; i +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; +void kernel_main() { + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t buffer_dst_addr = get_arg_val(3); + std::uint32_t dst_noc_x = get_arg_val(4); + std::uint32_t dst_noc_y = get_arg_val(5); + std::uint32_t l1_buffer_address = get_arg_val(6); + std::uint32_t stream_register_address = get_arg_val(7); + std::uint32_t num_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_bytes = get_arg_val(10); + + // Scratch address in L1, two write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = INVALID_VAL; + + std::uint32_t counter = 0; + // src noc address + std::uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, buffer_src_addr); + // Local and remote register addresses (used for sync) + std::uint64_t local = get_noc_addr(stream_register_address); + std::uint64_t remote= get_noc_addr(src_noc_x, src_noc_y, stream_register_address); + + std::uint32_t dst_buffer_addr = buffer_dst_addr; + while(counter < num_tiles) { + // Wait until sync register is VALID_VAL (means its safe to read data from source buffer into operand buffer) + wait_for_sync_register_value(stream_register_address, VALID_VAL); + noc_async_read(src_noc_addr, l1_buffer_address, transient_buffer_size_bytes); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + noc_async_write(l1_buffer_address, dst_noc_addr, transient_buffer_size_bytes); + + dst_buffer_addr += transient_buffer_size_bytes; + + // Write INVALID_VAL into local register + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp new file mode 100644 index 00000000000..65d0443c414 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp @@ 
-0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; + +inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, std::uint32_t index) { + if((index & 0x1) == 0) { + return addr1; + } else { + return addr2; + } +} + +void kernel_main() { + std::uint32_t buffer_src_addr1 = get_arg_val(0); + std::uint32_t buffer_src_addr2 = get_arg_val(1); + std::uint32_t src_noc_x = get_arg_val(2); + std::uint32_t src_noc_y = get_arg_val(3); + std::uint32_t buffer_dst_addr = get_arg_val(4); + std::uint32_t dst_noc_x = get_arg_val(5); + std::uint32_t dst_noc_y = get_arg_val(6); + std::uint32_t local_buffer_addr1 = get_arg_val(7); + std::uint32_t local_buffer_addr2 = get_arg_val(8); + std::uint32_t stream_register_address1 = get_arg_val(9); + std::uint32_t stream_register_address2 = get_arg_val(10); + std::uint32_t num_tiles = get_arg_val(11); + std::uint32_t transient_buffer_size_tiles = get_arg_val(12); + std::uint32_t transient_buffer_size_bytes = get_arg_val(13); + + // Scratch address in L1, two write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = INVALID_VAL; + + std::uint32_t counter = 0; + std::uint32_t dst_buffer_addr = buffer_dst_addr; + std::uint64_t dst_noc_addr; + while(counter < num_tiles) { + std::uint32_t reg_addr = ping_pong_address(stream_register_address1, stream_register_address2, counter); + std::uint64_t local = get_noc_addr(reg_addr); + std::uint64_t remote = get_noc_addr(src_noc_x, src_noc_y, reg_addr); + std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); + std::uint32_t src_buffer_address = ping_pong_address(buffer_src_addr1, buffer_src_addr2, counter); + std::uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_buffer_address); + + // Wait until sync register is VALID_VAL (means its safe to read data from source buffer into operand buffer) + wait_for_sync_register_value(reg_addr, VALID_VAL); + noc_async_read(src_noc_addr, local_buffer_address, transient_buffer_size_bytes); + noc_async_read_barrier(); + + // DRAM NOC dst address + dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + noc_async_write(local_buffer_address, dst_noc_addr, transient_buffer_size_bytes); + + dst_buffer_addr += transient_buffer_size_bytes; + + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp new file mode 100644 index 00000000000..f4196705869 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + uint32_t receiver_noc_x = get_arg_val(0); + uint32_t receiver_noc_y = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t sender_semaphore_addr = get_arg_val(3); + uint32_t receiver_semaphore_addr = get_arg_val(4); + uint32_t l1_valid_value_addr = get_arg_val(5); + uint32_t num_repetitions = get_arg_val(6); + + // initialized by the host to 0 before program launch + volatile tt_l1_ptr uint32_t* sender_semaphore_addr_ptr = reinterpret_cast(sender_semaphore_addr); + // local valid value in L1 + volatile tt_l1_ptr uint32_t* l1_valid_value_addr_ptr = reinterpret_cast(l1_valid_value_addr); + *(l1_valid_value_addr_ptr) = VALID; + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + uint64_t receiver_semaphore_noc_addr = get_noc_addr(receiver_noc_x, receiver_noc_y, receiver_semaphore_addr); + + for (uint32_t j = 0; j < num_repetitions; j++) { + for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { + + // wait until receiver has set the sender's semaphore_addr value to 1, which means receiver has reserved space in the CB + noc_semaphore_wait(sender_semaphore_addr_ptr, 1); + + if (i > 0) { + cb_pop_front(cb_id, block_size_tiles); + } + cb_wait_front(cb_id, block_size_tiles); + uint32_t l1_addr = get_read_ptr(cb_id); + + // now we have the block in the CB (at l1_addr), we can send to receiver + uint64_t receiver_data_noc_addr = get_noc_addr(receiver_noc_x, receiver_noc_y, l1_addr); + noc_async_write(l1_addr, receiver_data_noc_addr, block_size_bytes); + + // set the sender's semaphore value back to zero for the next block + // we need to reset before we set the receiver's semaphore + noc_semaphore_set(sender_semaphore_addr_ptr, 0); + + // we now set the receiver's semaphore, so that it knows that the data has been written to the CB + // must use noc_semaphore_set_remote and not noc_semaphore_inc in the sender + // because we need to ensure that data is written to the remote CB before we set the semaphore + // noc_async_write and noc_semaphore_set_remote are ordered + noc_semaphore_set_remote(l1_valid_value_addr, receiver_semaphore_noc_addr); + + // this barrier is not needed, sempahore inter-lock already guarantees that we won't overwrite local CB with new data + // ie, it is safe to pop here, because the data in the CB won't actually be overwritten until the receiver has set the semaphore (which means it was received) + // this barrier would hurt performance for smaller transfers (<16KB), but for larger transfers it wouldn't make a difference + // noc_async_write_barrier(); + } + cb_pop_front(cb_id, block_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp new file mode 100644 index 00000000000..8e7b4bce00f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
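sender_intermediate_stage.cpp above, paired with receiver_intermediate_stage.cpp earlier in this diff, moves blocks between cores with two L1 semaphores: the receiver raises the sender's semaphore once it has reserved circular-buffer space; the sender writes the block, resets its own semaphore, and only then marks the receiver's semaphore VALID. A host-side model of that handshake, with std::atomic standing in for the L1 semaphores and a vector standing in for the receiver-side CB block (illustrative only, not device code):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<int> sender_sem{0};    // set to 1 by the receiver once CB space is reserved
std::atomic<int> receiver_sem{0};  // set to VALID by the sender once the data has landed
constexpr int VALID = 1;

std::vector<int> channel(4, 0);    // stands in for the receiver-side CB block

void sender(int num_blocks) {
    for (int i = 0; i < num_blocks; i++) {
        while (sender_sem.load() != 1) {}      // noc_semaphore_wait(sender_semaphore_addr_ptr, 1)
        for (auto& v : channel) v = i;         // stands in for noc_async_write of the block
        sender_sem.store(0);                   // reset before signalling, as the kernel comment requires
        receiver_sem.store(VALID);             // noc_semaphore_set_remote(l1_valid_value_addr, ...)
    }
}

void receiver(int num_blocks) {
    for (int i = 0; i < num_blocks; i++) {
        sender_sem.store(1);                   // receiver signals "CB space reserved" on the sender's semaphore
        while (receiver_sem.load() != VALID) {}
        receiver_sem.store(0);                 // back to INVALID for the next block
        printf("received block %d (payload %d)\n", i, channel[0]);
    }
}

int main() {
    std::thread t1(sender, 3), t2(receiver, 3);
    t1.join(); t2.join();
    return 0;
}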
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "debug_print.h" + +void kernel_main() { + DPRINT<<"Kernel Compile Time Args"< +#include "dataflow_api.h" + +#include "debug_print.h" + +using uint32_t = std::uint32_t; + +// tile index to address +inline uint32_t TADDR(uint32_t ti) { + return ti << 11; +} + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t W = get_arg_val(3); + uint32_t H = get_arg_val(4); + uint32_t C = get_arg_val(5); + uint32_t HW = get_arg_val(6); + uint32_t N = get_arg_val(7); + uint32_t CHW = get_arg_val(8); + + auto WT = (W >> 5); // number of tiles in W + auto HT = (H >> 5); // number of tiles in H + auto CT = (C >> 5); // number of tiles in C + auto HTWT = (HW >> 10); // product of HT*WT + auto HW2 = (HW << 1); // HW stride in bytes + auto CHW2 = (CHW << 1); // batch stride in bytes + constexpr uint32_t SUBTILE_LINE_BYTES = (16<<1); + constexpr uint32_t onetile = 1; + constexpr uint32_t operand0 = 0; + + + // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H + // this will generate a linearly incremented output address in the inner loop + // we then reverse map this linear dest address to src address + uint64_t batch_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t n = 0; n < N; n++) { + uint32_t htWT = 0; + for (uint32_t h = 0; h < H; h++) { + uint32_t ctoffs = 0; + for (uint32_t ct = 0; ct < CT; ct++) { + for (uint32_t wt = 0; wt < WT; wt++) { + // what is the source address for the current tile? + // c32 = intra-C-tile loop + // every 32 C's acquire a new output tile address + // DPRINT << "h=" << h << " ct=" << ct << " wt=" << wt << " W=" << W << " HW2=" << HW2 << ENDL(); + + cb_reserve_back(operand0, onetile); + + uint32_t dest_tr0_l1 = get_write_ptr(operand0); + uint32_t save_dest = dest_tr0_l1; + uint32_t cSubtileOffs = 0; + for (uint32_t sub = 0; sub < 4; sub++) { + uint32_t c16offs = cSubtileOffs; + for (uint32_t c16 = 0; c16 < 16; c16++) { + // In this loop sub, c16 are source subtile, c16 + // dest in this loop is varying h implicitly via dest address increment + + // Dest is HCW + // We are iterating over it as H Ct Wt-tiles + // intra-tile FC16 for F going over 4-subtiles + // the source address is (bytes): + // src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2 + // we have 512 bytes per subtile and 32 bytes per subtile row of 16 elems + // here sub<<9 is multiply by 512 which offset in bytes of a subtile + // note that dest h is decomposed as h = ht+h32 and htWT is incremented by WT in the outer H loop + auto h32 = (h&31); + // TODO(AP): not really trivial need better comments here + auto sub_src_offs = (sub & 1) << 9; // if dest subtile w==16, add 512 to src subtile offset + sub_src_offs += (((h32 >> 4) << 1) << 9); // if intra-tile source h is > 16, add 2*512 to subtile offset + // below we only use the lower 4 bits out of 5-bit range for h, shift by 5 because 2 bytes per element + auto src_offs = ctoffs + c16offs + TADDR(htWT + wt) + sub_src_offs + ((h32&15)<<5); // bytes offset + auto src_addr = batch_addr + src_offs; + + //if (h == 0 && ct == 0 && wt == 0) { + // DPRINT << " Sub=" << sub << " c16=" << c16 << ENDL(); + // DPRINT << " Reading from src_offs=" << src_offs << ENDL(); + // DPRINT << " Writing to dst_offs=" << dest_tr0_l1-save_dest << ENDL(); + //} + + // this starts async NOC dma from DRAM to TR0_L1 buffer + 
noc_async_read(src_addr, dest_tr0_l1, SUBTILE_LINE_BYTES); + + //if (h == 0 && ct == 0 && wt == 0) + // DPRINT << uint32_t( reinterpret_cast( dest_tr0_l1 )[0] ) << ENDL(); + + // the output address is just linearly incremented + dest_tr0_l1 += SUBTILE_LINE_BYTES; + c16offs += HW2; + } + // subtiles are ordered like this: + // 0 1 + // 2 3 + // Here we offset C by 16 starting with subtile=2 + if (sub == 1) // after we are done with subtile 1, increment for sub=2 + cSubtileOffs += (HW2<<4); // 16*HWbytes, which is subtile vertical size + } // sub<4 + + // block on all outstanding noc DMA requests to complete + noc_async_read_barrier(); + + // notifies the unpacker that the buffer is populated + cb_push_back(operand0, onetile); + } + ctoffs += (HW2<<5); // since we increment ct, we need to mlutiply by 32 + } // ct loop + // multiplication-free computation of ht*WT, since ht = h/32 + if ((h&31) == 31) + htWT += WT; + } // h < H loop + batch_addr += CHW2; + } // n +#include "dataflow_api.h" + +#include "debug_print.h" + +using uint32_t = std::uint32_t; + +// tile index to address +inline uint32_t TADDR(uint32_t ti) { + return ti << 11; +} + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t W = get_arg_val(3); + uint32_t H = get_arg_val(4); + uint32_t C = get_arg_val(5); + uint32_t HW = get_arg_val(6); + uint32_t N = get_arg_val(7); + uint32_t CHW = get_arg_val(8); + + auto WT = (W >> 5); // number of tiles in W + auto HT = (H >> 5); // number of tiles in H + auto CT = (C >> 5); // number of tiles in C + auto HTWT = (HW >> 10); // product of HT*WT + auto HW2 = (HW << 1); // HW stride in bytes + auto CHW2 = (CHW << 1); // batch stride in bytes + constexpr uint32_t SUBTILE_LINE_BYTES = (16<<1); + constexpr uint32_t onetile = 1; + constexpr uint32_t operand0 = 0; + + + // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H + // this will generate a linearly incremented output address in the inner loop + // we then reverse map this linear dest address to src address + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = src0_addr, + + + .log_base_2_of_page_size = 11 + }; + + uint64_t batch_addr = src0_addr; + for (uint32_t n = 0; n < N; n++) { + uint32_t htWT = 0; + for (uint32_t h = 0; h < H; h++) { + uint32_t ctoffs = 0; + for (uint32_t ct = 0; ct < CT; ct++) { + for (uint32_t wt = 0; wt < WT; wt++) { + // what is the source address for the current tile? 
+ // c32 = intra-C-tile loop + // every 32 C's acquire a new output tile address + // DPRINT << "8B h=" << h << " ct=" << ct << " wt=" << wt << " W=" << W << " HW2=" << HW2 << ENDL(); + + cb_reserve_back(operand0, onetile); + + uint32_t dest_tr0_l1 = get_write_ptr(operand0); + uint32_t save_dest = dest_tr0_l1; + uint32_t cSubtileOffs = 0; + for (uint32_t sub = 0; sub < 4; sub++) { + uint32_t c16offs = cSubtileOffs; + for (uint32_t c16 = 0; c16 < 16; c16++) { + // In this loop sub, c16 are source subtile, c16 + // dest in this loop is varying h implicitly via dest address increment + + // Dest is HCW + // We are iterating over it as H Ct Wt-tiles + // intra-tile FC16 for F going over 4-subtiles + // the source address is (bytes): + // src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2 + // we have 512 bytes per subtile and 32 bytes per subtile row of 16 elems + // here sub<<9 is multiply by 512 which offset in bytes of a subtile + // note that dest h is decomposed as h = ht+h32 and htWT is incremented by WT in the outer H loop + auto h32 = (h&31); + // TODO(AP): not really trivial need better comments here + auto sub_src_offs = (sub & 1) << 9; // if dest subtile w==16, add 512 to src subtile offset + sub_src_offs += (((h32 >> 4) << 1) << 9); // if intra-tile source h is > 16, add 2*512 to subtile offset + // below we only use the lower 4 bits out of 5-bit range for h, shift by 5 because 2 bytes per element + auto src_offs = ctoffs + c16offs + TADDR(htWT + wt) + sub_src_offs + ((h32&15)<<5); // bytes offset + auto bsrc_offs = (batch_addr + src_offs)-src0_addr; + uint32_t batch_itile = (bsrc_offs >> 11); + uint32_t rem = (bsrc_offs & 2047); + + //if (h == 0 && ct == 0 && wt == 0) { + // DPRINT << " Sub=" << sub << " c16=" << c16 << ENDL(); + // DPRINT << " Reading from src_offs=" << src_offs << ENDL(); + // DPRINT << " Writing to dst_offs=" << dest_tr0_l1-save_dest << ENDL(); + //} + + uint64_t banked_addr = get_noc_addr(batch_itile, s0); + banked_addr += rem; + + // this starts async NOC dma from DRAM to TR0_L1 buffer + noc_async_read(banked_addr, dest_tr0_l1, SUBTILE_LINE_BYTES); + + //if (h == 0 && ct == 0 && wt == 0) + // DPRINT << uint32_t( reinterpret_cast( dest_tr0_l1 )[0] ) << ENDL(); + + // the output address is just linearly incremented + dest_tr0_l1 += SUBTILE_LINE_BYTES; + c16offs += HW2; + } + // subtiles are ordered like this: + // 0 1 + // 2 3 + // Here we offset C by 16 starting with subtile=2 + if (sub == 1) // after we are done with subtile 1, increment for sub=2 + cSubtileOffs += (HW2<<4); // 16*HWbytes, which is subtile vertical size + } // sub<4 + + // block on all outstanding noc DMA requests to complete + noc_async_read_barrier(); + + // notifies the unpacker that the buffer is populated + cb_push_back(operand0, onetile); + } + ctoffs += (HW2<<5); // since we increment ct, we need to mlutiply by 32 + } // ct loop + // multiplication-free computation of ht*WT, since ht = h/32 + if ((h&31) == 31) + htWT += WT; + } // h < H loop + batch_addr += CHW2; + } // n + +#include "dataflow_api.h" + +void kernel_main() { + for (uint32_t i = 0; i < 20; i++) { + uint32_t load = *reinterpret_cast(400 * 1024); + uint32_t local_load1 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load2 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load3 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load4 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load5 = *reinterpret_cast(MEM_LOCAL_BASE); + } +} diff --git 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp new file mode 100644 index 00000000000..c694971cdf7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t src_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = true; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen src_addrgen = { + .bank_base_address = src_addr_base, + .page_size = page_size, + }; + + // read tiles from src to CB + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr(i, src_addrgen); + + cb_reserve_back(cb_id, ublock_size_tiles); + uint32_t l1_write_addr = get_write_ptr(cb_id); + noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp new file mode 100644 index 00000000000..4593299dd01 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t src_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = false; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen src_addrgen = { + .bank_base_address = src_addr_base, + .page_size = page_size, + }; + + // read tiles from src to CB + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr(i, src_addrgen); + + cb_reserve_back(cb_id, ublock_size_tiles); + uint32_t l1_write_addr = get_write_ptr(cb_id); + noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp new file mode 100644 index 00000000000..67105917c1c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
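The transpose_hc readers earlier in this diff reverse-map every destination (h, C-tile, W-tile, face) position back to a byte offset in the tilized source, per the in-code comment src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2. A standalone sketch of that offset computation, folding the kernel's ct/c16/subtile bookkeeping into a single channel index c (shapes and arguments hypothetical):

#include <cstdint>
#include <cstdio>

constexpr uint32_t TADDR(uint32_t ti) { return ti << 11; }  // tile index -> byte offset, 2048 B per 16-bit tile

// Byte offset of one 16-element subtile row in the tilized HW plane of channel c,
// for source row h, W-tile wt and destination face sub (the sub>=2 "+16 channels"
// contribution is assumed folded into c here). Wt = W/32, HW2 = H*W*2 bytes.
uint32_t src_offset(uint32_t c, uint32_t h, uint32_t wt, uint32_t sub, uint32_t Wt, uint32_t HW2) {
    uint32_t htWT = (h >> 5) * Wt;              // ht * Wt, the tile row within the HW plane
    uint32_t h32  = h & 31;                     // row within the source tile
    uint32_t sub_src_offs = (sub & 1) << 9;     // +512 B when the destination face is the right half (w >= 16)
    sub_src_offs += ((h32 >> 4) << 1) << 9;     // +1024 B when the source row falls in the lower faces (h32 >= 16)
    return c * HW2 + TADDR(htWT + wt) + sub_src_offs + ((h32 & 15) << 5);  // 32 B per 16-element row
}

int main() {
    // 64x64 HW plane of 16-bit data: Wt = 2 tiles, HW2 = 64*64*2 = 8192 bytes.
    printf("offset = %u\n", src_offset(/*c=*/0, /*h=*/40, /*wt=*/1, /*sub=*/0, /*Wt=*/2, /*HW2=*/8192));
    return 0;
}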
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t dst_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = true; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen dst_addrgen = { + .bank_base_address = dst_addr_base, + .page_size = page_size, + }; + + // Write tiles from CB to dram(interleaved) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(i, dst_addrgen); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_ptr = get_read_ptr(cb_id); + noc_async_write(l1_read_ptr, dst_noc_addr, tile_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp new file mode 100644 index 00000000000..99caf9fbb62 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t dst_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = false; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen dst_addrgen = { + .bank_base_address = dst_addr_base, + .page_size = page_size, + }; + + // Write tiles from CB to dram(interleaved) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(i, dst_addrgen); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_ptr = get_read_ptr(cb_id); + noc_async_write(l1_read_ptr, dst_noc_addr, tile_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp new file mode 100644 index 00000000000..1d4884ba7f4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
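The banked reader/writer unit-test kernels above address their buffers through InterleavedAddrGen, which spreads consecutive pages (here, tiles) round-robin across the banks; the stick-layout kernels in this diff make the same assumption explicit with bank_id = stick_id & (num_dram_channels - 1). A standalone model of that page-to-(bank, in-bank offset) split (the real generator also applies per-bank base offsets and NOC coordinates; numbers hypothetical):

#include <cstdint>
#include <cstdio>

constexpr uint32_t num_banks = 8;  // matches the "8bank" kernels in this diff

struct PageLocation { uint32_t bank; uint32_t offset; };

PageLocation locate(uint32_t page_id, uint32_t bank_base_address, uint32_t page_size) {
    PageLocation loc;
    loc.bank   = page_id % num_banks;                                    // round-robin bank choice
    loc.offset = bank_base_address + (page_id / num_banks) * page_size;  // position within that bank
    return loc;
}

int main() {
    const uint32_t base = 0x1000, tile_bytes = 2048;  // hypothetical buffer base, 16-bit tile size
    for (uint32_t tile = 0; tile < 4; tile++) {
        PageLocation loc = locate(tile, base, tile_bytes);
        printf("tile %u -> bank %u, offset 0x%x\n", tile, loc.bank, loc.offset);
    }
    return 0;
}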
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t src_addr = get_arg_val<uint32_t>(0); + uint32_t src_noc_x = get_arg_val<uint32_t>(1); + uint32_t src_noc_y = get_arg_val<uint32_t>(2); + uint32_t num_tiles = get_arg_val<uint32_t>(3); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val<uint32_t>(0); + std::uint32_t dram_src_noc_x = get_arg_val<uint32_t>(1); + std::uint32_t dram_src_noc_y = get_arg_val<uint32_t>(2); + + std::uint32_t l1_buffer_dst_addr_base = get_arg_val<uint32_t>(3); + std::uint32_t dram_buffer_size = get_arg_val<uint32_t>(4); + + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_dst_addr_base, dram_buffer_size); + noc_async_read_barrier(); + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp new file mode 100644 index 00000000000..615a4c02274 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on NCRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); + std::uint32_t dram_dst_noc_x = get_arg_val(1); + std::uint32_t dram_dst_noc_y = get_arg_val(2); + + std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); + std::uint32_t dram_buffer_size = get_arg_val(4); + + std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; + + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + noc_async_write(l1_buffer_src_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); + noc_async_write_barrier(); + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp new file mode 100644 index 00000000000..2ed57a38744 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp new file mode 100644 index 00000000000..ea72f5598f7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +//#include "debug_print.h" + +void kernel_main() { + // same arg indices as in reader_bmm_8bank for reuse + uint32_t dst_addr = get_arg_val(0); + uint32_t Mt = get_arg_val(2); + uint32_t Nt = get_arg_val(4); + uint32_t batch = get_arg_val(7); + + constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; + + constexpr int onetile = 1; + constexpr uint32_t cb_id_out0 = 16; + const uint32_t tile_bytes = get_tile_size(cb_id_out0); + uint32_t itileC = 0; + const DataFormat data_format = get_dataformat(cb_id_out0); + + const InterleavedAddrGenFast s = { + .bank_base_address = dst_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + // C is MN so we iterate in tile RM order + for (uint32_t nb = 0; nb < batch; nb ++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) { // output tile index of C + // bmm will generate C's tiles C=A*B, MN=MK*KN, in row major order, we just read them from CB and write out to DRAM + cb_wait_front(cb_id_out0, onetile); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write_tile(itileC, s, l1_read_addr); + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, onetile); + //DPRINT << 'W' << 'C' << itileC << ' ' << 'a' << dst_addr << ENDL(); + //DPRINT << itileC << ' ' << uint32_t(dst_noc_addr) << ENDL(); + itileC ++; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp new file mode 100644 index 00000000000..4d62bbda8b5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
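writer_bmm_8bank.cpp above drains C's tiles from the output CB in row-major (batch, Mt, Nt) order and writes tile itileC, which it simply increments per tile. A standalone check that the running index matches the linear index nb*Mt*Nt + mt*Nt + nt (dimensions hypothetical):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t batch = 2, Mt = 3, Nt = 4;  // hypothetical shape in tiles
    uint32_t itileC = 0;
    bool ok = true;
    for (uint32_t nb = 0; nb < batch; nb++)
        for (uint32_t mt = 0; mt < Mt; mt++)
            for (uint32_t nt = 0; nt < Nt; nt++, itileC++)
                ok = ok && (itileC == nb * Mt * Nt + mt * Nt + nt);
    printf("row-major tile order %s\n", ok ? "matches" : "does not match");
    return 0;
}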
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +inline __attribute__((always_inline)) +void pop_from_cb_and_write(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t dram_dst_noc_x, uint32_t dram_dst_noc_y, uint32_t& dram_buffer_dst_addr) { + for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id); + + noc_async_write(l1_read_addr, dram_buffer_dst_noc_addr, ublock_size_bytes); + noc_async_write_barrier(); + cb_pop_front(cb_id, ublock_size_tiles); + dram_buffer_dst_addr += ublock_size_bytes; + } +} + +void kernel_main() { + std::uint32_t dram_buffer_dst_addr = get_arg_val(0); + std::uint32_t dram_dst_noc_x = get_arg_val(1); + std::uint32_t dram_dst_noc_y = get_arg_val(2); + std::uint32_t num_tiles_per_cb = get_arg_val(3); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + pop_from_cb_and_write(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp new file mode 100644 index 00000000000..4a06fd0043d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + std::uint32_t buffer_dst_addr = get_arg_val(0); + std::uint32_t dst_noc_x = get_arg_val(1); + std::uint32_t dst_noc_y = get_arg_val(2); + std::uint32_t num_tiles = get_arg_val(3); + std::uint32_t num_repetitions = get_arg_val(4); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + for (uint32_t j = 0; j < num_repetitions; j++) { + uint32_t dst_addr = buffer_dst_addr; + for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { + std::uint64_t buffer_dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id, block_size_tiles); + + if (j == 0) { + uint32_t l1_read_addr = get_read_ptr(cb_id); + noc_async_write(l1_read_addr, buffer_dst_noc_addr, block_size_bytes); + noc_async_write_barrier(); + + // some delay to test backpressure + // volatile uint32_t *l1_read_addr_ptr = reinterpret_cast(BRISC_BREAKPOINT); + // for (int delay = 0; delay < 10000; delay++) { + // *l1_read_addr_ptr = 1; + // } + } + + cb_pop_front(cb_id, block_size_tiles); + dst_addr += block_size_bytes; + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp new file mode 100644 index 00000000000..c2a73cf70cb --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + + + // out tensor args + uint32_t out_tensor_addr = get_arg_val(0); + uint32_t out_tensor_start_tile_id = get_arg_val(1); + uint32_t out_tensor_stride_w = get_arg_val(2); + uint32_t out_tensor_stride_h = get_arg_val(3); + uint32_t out_tensor_next_subblock_stride_w = get_arg_val(4); + uint32_t out_tensor_next_subblock_stride_h = get_arg_val(5); + + // out subblock args + uint32_t out_subblock_w = get_arg_val(6); + uint32_t out_subblock_h = get_arg_val(7); + uint32_t out_subblock_tile_count = get_arg_val(8); + uint32_t out_num_subblocks_w = get_arg_val(9); + uint32_t out_num_subblocks_h = get_arg_val(10); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile + uint32_t single_tile_size_bytes = get_tile_size(cb_id_out0); + + const InterleavedPow2AddrGen s = { + .bank_base_address = out_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + bool one_time_profile = true; + uint32_t out_tensor_sbh_start_tile_id = out_tensor_start_tile_id; + for(uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) { + uint32_t out_tensor_sbw_start_tile_id = out_tensor_sbh_start_tile_id; + for(uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) { + uint32_t out_tensor_sb_row_start_tile_id = out_tensor_sbw_start_tile_id; + + cb_wait_front(cb_id_out0, out_subblock_tile_count); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + + for(uint32_t h = 0; h < out_subblock_h; h++) { + uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id; + for(uint32_t w = 0; w < out_subblock_w; w++) { + uint64_t out_tensor_tile_noc_addr = get_noc_addr(out_tensor_tile_id, s); + + // kernel_profiler::mark_time(9); + noc_async_write(l1_read_addr, out_tensor_tile_noc_addr, single_tile_size_bytes); + l1_read_addr+=single_tile_size_bytes; + + out_tensor_tile_id += out_tensor_stride_w; + } + out_tensor_sb_row_start_tile_id += out_tensor_stride_h; + } + + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, out_subblock_tile_count); + out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w; + } + out_tensor_sbh_start_tile_id += out_tensor_next_subblock_stride_h; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp new file mode 100644 index 00000000000..3d6404e8c71 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
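writer_matmul_tile_layout.cpp above emits the output subblock by subblock: within a subblock it steps by out_tensor_stride_w per tile and out_tensor_stride_h per row, and between subblocks by the next_subblock strides. A standalone sketch that prints the resulting tile-id order for a hypothetical 4x4-tile output split into 2x2 subblocks (strides chosen here for a row-major tile layout; the kernel receives them as runtime args):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t Nt = 4;                                // output width in tiles
    const uint32_t out_subblock_w = 2, out_subblock_h = 2;
    const uint32_t out_num_subblocks_w = 2, out_num_subblocks_h = 2;
    const uint32_t stride_w = 1, stride_h = Nt;           // row-major tile ids
    const uint32_t next_sb_stride_w = out_subblock_w * stride_w;
    const uint32_t next_sb_stride_h = out_subblock_h * stride_h;

    uint32_t sbh_start = 0;
    for (uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) {
        uint32_t sbw_start = sbh_start;
        for (uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) {
            uint32_t row_start = sbw_start;
            for (uint32_t h = 0; h < out_subblock_h; h++) {
                uint32_t tile_id = row_start;
                for (uint32_t w = 0; w < out_subblock_w; w++) {
                    printf("%2u ", tile_id);              // order in which tiles leave the CB
                    tile_id += stride_w;
                }
                row_start += stride_h;
            }
            printf("| ");
            sbw_start += next_sb_stride_w;
        }
        printf("\n");
        sbh_start += next_sb_stride_h;
    }
    return 0;
}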
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp new file mode 100644 index 00000000000..c20798b1e6c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(3); // Index 3 to match with regular writer_unary + + constexpr uint32_t cb_id_out0 = 16; + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_out0); + + #ifdef KERNEL_COMPILE_TIME_ARG_0 + constexpr bool write_to_dram = get_compile_time_arg_val(0); + #else + constexpr bool write_to_dram = true; + #endif + + const InterleavedPow2AddrGen s = { dst_addr, 11 }; + + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + + // Constexpr + constexpr uint32_t num_dram_channels = 8; + constexpr uint32_t log_base_2_of_num_dram_channels = 3; + constexpr uint32_t cb_id_out0 = 16; + + uint32_t dst_addr = get_arg_val(0); + uint32_t num_sticks = get_arg_val(1); + uint32_t stick_size = get_arg_val(2); + + // TODO(agrebenisan): This isn't good... here we are assuming + // that the stick size dictates tiles c, but stick size + // doesn't necessarily need to be divisible by tiles c... + // this is only the case really for tilize + const uint32_t num_tiles_c = stick_size / 64; // Assuming 2 bytes per datum, there are 64 bytes per tile row + uint32_t stick_id = 0; + + const InterleavedAddrGen s = { + .bank_base_address = dst_addr, + + + .page_size = stick_size + }; + + for (uint32_t i = 0; i < num_sticks / 32; i++) { + // We reserve back an entire tile row and issue a bunch of reads + cb_wait_front(cb_id_out0, num_tiles_c); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + for (uint32_t j = 0; j < 32; j++) { + uint64_t dst_noc_addr = get_noc_addr( + stick_id, s); + + uint32_t bank_id = stick_id & (num_dram_channels - 1); + noc_async_write(l1_read_addr, dst_noc_addr, stick_size); + l1_read_addr += stick_size; + stick_id++; + } + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, num_tiles_c); + } + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp new file mode 100644 index 00000000000..ffa1bff29b2 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
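Several 8-bank kernels above construct InterleavedPow2AddrGen with a log-base-2 page size of 11. That corresponds to one 32x32 tile of 16-bit data: 32 * 32 * 2 = 2048 = 2^11 bytes, consistent with the 2048-byte tile strides assumed elsewhere in this diff. A one-line check:

#include <cstdio>

int main() {
    constexpr unsigned tile_rows = 32, tile_cols = 32, bytes_per_datum = 2;  // bfloat16
    constexpr unsigned tile_bytes = tile_rows * tile_cols * bytes_per_datum;
    static_assert(tile_bytes == (1u << 11), "log2(page size) for a 16-bit tile is 11");
    printf("tile bytes = %u (2^11)\n", tile_bytes);
    return 0;
}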
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t inner_r = get_arg_val(3); + uint32_t inner_c = get_arg_val(4); + uint32_t num_sub_blocks_m = get_arg_val(5); + uint32_t num_sub_blocks_n = get_arg_val(6); + uint32_t stride_r = get_arg_val(7); + uint32_t stride_subblock_r = get_arg_val(8); + uint32_t stride_subblock_c = get_arg_val(9); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + uint32_t dram_address_block_row_beginning = dst_addr; + for(uint32_t sb_m = 0; sb_m < num_sub_blocks_m; sb_m++) { + uint32_t dram_address_block_beginning = dram_address_block_row_beginning; + for(uint32_t sb_n = 0; sb_n < num_sub_blocks_n; sb_n++) { + uint32_t dram_address_r = dram_address_block_beginning; + for(uint32_t r = 0; r < inner_r; r++) { + uint32_t dram_address_c = dram_address_r; + for(uint32_t c = 0; c < inner_c; c++) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dram_address_c); + + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dram_address_c += ublock_size_bytes; + } + dram_address_r += stride_r; // goto next row within sub-block + } + dram_address_block_beginning += stride_subblock_c; // move to next sub-block on c dim + } + dram_address_block_row_beginning += stride_subblock_r; // move to next sub-block on r dim + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp new file mode 100644 index 00000000000..56ba6456d00 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "debug_print.h" + +/** + * add two ints + * args are in L1 + * result is in L1 +*/ + +void kernel_main() { + + volatile tt_l1_ptr std::uint32_t* arg_a = (volatile tt_l1_ptr uint32_t*)(L1_ARG_BASE); + volatile tt_l1_ptr std::uint32_t* arg_b = (volatile tt_l1_ptr uint32_t*)(L1_ARG_BASE + 4); + volatile tt_l1_ptr std::uint32_t* result = (volatile tt_l1_ptr uint32_t*)(L1_RESULT_BASE); + + //Sample print statement + // DPRINT << 123; + result[0] = arg_a[0] + arg_b[0]; + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp diff --git a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp index 829ba1a4505..b6229646f9b 100644 --- a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp @@ -68,7 +68,7 @@ int main(int argc, char **argv) { auto l1_to_l1_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/l1_to_l1.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp index 0e46656eb07..590b3d75274 100644 --- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp @@ -251,7 +251,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool std::vector writer_rt_args; string writer_kernel; if (output_rm) { - writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; + writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp"; writer_rt_args = { dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, @@ -259,7 +259,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool uint(M * N) }; } else { - writer_kernel = "tt_metal/kernels/dataflow/writer_unswizzle.cpp"; + writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, @@ -276,7 +276,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -326,7 +326,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool uint(output_rm) }; - string compute_kernel = "tt_metal/kernels/compute/matmul_large_block.cpp"; + string compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp"; auto mm_kernel = tt_metal::CreateComputeKernel( program, diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 9f69cc6ed08..3fd64d6ba6e 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ 
b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -141,13 +141,13 @@ std::tuple create_pro auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_matmul_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -181,7 +181,7 @@ std::tuple create_pro auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp index 645bca0b1a2..ab7f83f8b85 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp @@ -154,19 +154,19 @@ std::tuple create_pr auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -244,7 +244,7 @@ std::tuple create_pr auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 72d7d91be91..ff9737f2ccf 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -165,7 +165,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { .set_page_size(intermediate_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, cores, cb_output_config); - string reader_kernel = "tt_metal/kernels/dataflow/reader_matmul_with_bias_blocked.cpp"; + string reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp"; auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, @@ -175,7 +175,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, 
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -191,7 +191,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) {
     };
     string compute_kernel_name;
-    compute_kernel_name = "tt_metal/kernels/compute/matmul_with_bias.cpp";
+    compute_kernel_name = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp";
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
index e54ffc8a0c4..4c756b69a1d 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
@@ -244,13 +244,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unswizzle.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -285,7 +285,7 @@ int main(int argc, char **argv) {
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block_zm.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
index b908c8b9c69..53cd1bda173 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
@@ -246,13 +246,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unswizzle.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -287,7 +287,7 @@ int main(int argc, char **argv) {
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block_zm.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
index 706a0b64737..116de6ce101 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -92,7 +92,7 @@ int main(int argc, char **argv) {
     };
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
index 75809c84dd2..80a267f6225 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -93,7 +93,7 @@ int main(int argc, char **argv) {
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
index 8e6dea68504..40470d5e11f 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -93,7 +93,7 @@ int main(int argc, char **argv) {
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
index 20f7a1e9233..a1cb087f4aa 100644
--- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
+++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
@@ -47,19 +47,19 @@ std::tuple create_pro
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         all_cores,
         tt_metal::ComputeConfig{.compile_args = eltwise_unary_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
index 3ba77e2cc9c..f4387e244cc 100644
--- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp
+++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
@@ -56,13 +56,13 @@ std::tuple setup_prog
     auto binary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -72,7 +72,7 @@ std::tuple setup_prog
     binary_defines["ELTWISE_OP"] = "add_tiles";
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}
     );
@@ -109,13 +109,13 @@ std::tuple setup_prog
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_small_block.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ std::tuple setup_prog
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_h.cpp b/tests/tt_metal/tt_metal/test_reduce_h.cpp
index 187a2c151fb..61114b59fe5 100644
--- a/tests/tt_metal/tt_metal/test_reduce_h.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_h.cpp
@@ -111,15 +111,15 @@ int main(int argc, char **argv) {
     reader_defines["REDUCE_SCALER"] = "1";
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_transpose_wh_interleaved.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary_transpose_wh.cpp", // TODO(AP): not ported for reduce with scaler
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh_interleaved.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp", // TODO(AP): not ported for reduce with scaler
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_args, .defines = reader_defines});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" // no need to transpose the output since output Ht=1
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" // no need to transpose the output since output Ht=1
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -135,7 +135,7 @@ int main(int argc, char **argv) {
     };
     auto reduce_h_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/reduce_h.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_hw.cpp b/tests/tt_metal/tt_metal/test_reduce_hw.cpp
index 6d8a7222b4c..fc66a3b7052 100644
--- a/tests/tt_metal/tt_metal/test_reduce_hw.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_hw.cpp
@@ -106,15 +106,15 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_8bank_reduce.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank_reduce.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ int main(int argc, char **argv) {
     };
     auto reduce_hw_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/reduce_hw.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_w.cpp b/tests/tt_metal/tt_metal/test_reduce_w.cpp
index cda9dc3bf93..5dbb3d3f953 100644
--- a/tests/tt_metal/tt_metal/test_reduce_w.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_w.cpp
@@ -104,15 +104,15 @@ int main(int argc, char **argv) {
    auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
        program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_8bank_reduce.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank_reduce.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
        core,
        tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
    auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
        program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
        core,
        tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
    };
    auto reduce_w_compute_kernel = tt_metal::CreateComputeKernel(
        program,
-        "tt_metal/kernels/compute/reduce_w.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp",
        core,
        tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
    );
diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
index 292f1c9eb83..29fc74eab09 100644
--- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp
+++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
@@ -90,16 +90,16 @@ int main(int argc, char **argv) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
         multibank ?
-            "tt_metal/kernels/dataflow/transpose_hc_8bank.cpp" :
-            "tt_metal/kernels/dataflow/transpose_hc.cpp",
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp" :
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
         multibank ?
-            "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" :
-            "tt_metal/kernels/dataflow/writer_unary.cpp",
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" :
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -109,7 +109,7 @@ int main(int argc, char **argv) {
     auto blank_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_transpose_wh.cpp
index c4a5d91c75e..4bfc3402f93 100644
--- a/tests/tt_metal/tt_metal/test_transpose_wh.cpp
+++ b/tests/tt_metal/tt_metal/test_transpose_wh.cpp
@@ -92,15 +92,15 @@ int main(int argc, char **argv) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        //"tt_metal/kernels/dataflow/reader_unary_transpose_wh.cpp",
-        "tt_metal/kernels/dataflow/reader_unary_transpose_wh_8bank.cpp",
+        //"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        //"tt_metal/kernels/dataflow/writer_unary.cpp",
-        "tt_metal/kernels/dataflow/writer_unary_8bank.cpp",
+        //"tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
     auto reduce_w_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/transpose_wh.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_unpack_tilize.cpp b/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
index 86b811f3361..23b93c13058 100644
--- a/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
+++ b/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
@@ -114,13 +114,13 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ int main(int argc, char **argv) {
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/tilize.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_unpack_untilize.cpp b/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
index dbc37237183..998307d1d92 100644
--- a/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
+++ b/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
@@ -128,13 +128,13 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/untilize.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
index 86dcfed15e8..7fe39006d22 100644
--- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
+++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
@@ -148,15 +148,15 @@ int main(int argc, char **argv) {
     auto binary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_dual_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -168,7 +168,7 @@ int main(int argc, char **argv) {
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/untilA_elwbin_3m.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = {{"ELTWISE_OP", op_id_to_op_define[eltwise_op]}}}
     );
diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
index 986e32b03c2..6eb2e8c3a3a 100644
--- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
+++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
@@ -50,13 +50,13 @@ tt_metal::Program generate_eltwise_unary_program(Device *device) {
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary_8bank.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_8bank.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -67,7 +67,7 @@ tt_metal::Program generate_eltwise_unary_program(Device *device) {
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
index b2d3990fc5b..33c9ac4656c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
@@ -298,7 +298,7 @@ TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) {
     }
     tt_metal::Program program = tt_metal::Program();
-    string kernel_name = "tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp";
+    string kernel_name = "tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp";
     CoreCoord logical_target_core = CoreCoord({.x = 0, .y = 0});
     uint32_t intermediate_l1_addr = L1_UNRESERVED_BASE;
     uint32_t size_bytes = host_input.size() * sizeof(uint32_t);
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
index a98382ba06a..95a0bfb3699 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
@@ -36,14 +36,14 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         core_range,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core_range,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -54,7 +54,7 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy_3m.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
         core_range,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
index 4f4ca69bb88..3987f63408e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
@@ -31,7 +31,7 @@ Program initialize_program_data_movement(Device *device, const CoreRangeSet &cor
     auto add_two_ints_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/riscv_draft/add_two_ints.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/riscv_draft/add_two_ints.cpp",
         core_range_set,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -51,7 +51,7 @@ Program initialize_program_compute(Device *device, const CoreRangeSet &core_rang
     auto compute_kernel_id = tt_metal::CreateComputeKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp",
         core_range_set,
         tt_metal::ComputeConfig{.math_fidelity = MathFidelity::HiFi4, .fp32_dest_acc_en = fp32_dest_acc_en, .math_approx_mode = math_approx_mode, .compile_args = compute_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
index f59910052a5..bfb9953fceb 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
@@ -49,18 +49,18 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked
     std::vector reader_runtime_args = {};
     std::vector writer_runtime_args = {};
     if (banked_reader) {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp";
         input_page_size_bytes = cfg.page_size_bytes;
     } else {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp";
         input_page_size_bytes = cfg.size_bytes;
     }
     if (banked_writer) {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp";
         output_page_size_bytes = cfg.page_size_bytes;
     } else {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp";
         output_page_size_bytes = cfg.size_bytes;
     }
@@ -168,7 +168,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -177,7 +177,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
@@ -189,7 +189,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     };
     auto datacopy_kernel = CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         cfg.logical_core,
         ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
index 615af0937b7..c709abb32d2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
@@ -49,18 +49,18 @@ bool l1_reader_cb_writer_l1(Device* device, const BankedL1Config& cfg, const boo
     std::vector reader_runtime_args = {};
     std::vector writer_runtime_args = {};
     if (banked_reader) {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp";
         input_page_size_bytes = cfg.page_size_bytes;
     } else {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp";
         input_page_size_bytes = cfg.size_bytes;
     }
     if (banked_writer) {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp";
         output_page_size_bytes = cfg.page_size_bytes;
     } else {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp";
         output_page_size_bytes = cfg.size_bytes;
     }
@@ -168,7 +168,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -177,7 +177,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
@@ -189,7 +189,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     };
     auto datacopy_kernel = CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         cfg.logical_core,
         ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
index 0750a69053b..6b2864599c7 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
@@ -64,12 +64,12 @@ namespace tt::test::buffer::detail {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_0, .compile_args = {cb_index}});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::NOC_1, .compile_args = {cb_index}});
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
index 3d9839702a6..49b86d3b56d 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
@@ -22,14 +22,14 @@ struct CBConfig {
 inline void initialize_program(Program& program, const CoreRangeSet& cr_set) {
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     auto dummy_writer_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
 }
 } // end namespace basic_tests::circular_buffer
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
index 3f981af3334..2a33d54a7f6 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
@@ -386,7 +386,7 @@ TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -395,7 +395,7 @@ TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
index a8d5f2e3855..9447c20f9e2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
@@ -79,14 +79,14 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig&
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -98,7 +98,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig&
         {"ELTWISE_OP", binary_op_name_to_op_kernel.at(test_config.binary_op)}};
     auto binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp",
         test_config.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = defines});
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
index 8aba23bcdc6..f982647a7b9 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
@@ -210,14 +210,14 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     std::vector writer_rt_args;
     string writer_kernel_name;
     if (cfg.outputs_rm) {
-        writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp";
         writer_rt_args = {
             (std::uint32_t)output_dram_byte_address,
             (std::uint32_t)output_dram_noc_xy.x,
             (std::uint32_t)output_dram_noc_xy.y,
             uint(cfg.M * cfg.N)};
     } else {
-        writer_kernel_name = "tt_metal/kernels/dataflow/writer_unswizzle.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp";
         writer_rt_args = {
             (std::uint32_t)output_dram_byte_address,
             (std::uint32_t)output_dram_noc_xy.x,
@@ -243,7 +243,7 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         cfg.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -296,7 +296,7 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp",
         cfg.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
@@ -412,7 +412,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -421,7 +421,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -430,7 +430,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/single_tile_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = {in0_cb_index, in1_cb_index, out_cb_index}});
@@ -537,7 +537,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -546,7 +546,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -555,7 +555,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/multi_tile_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp",
         core,
         tt_metal::ComputeConfig{
             .compile_args = {in0_cb_index, in1_cb_index, out_cb_index, M * K, K * N, M * N, M, N, K}});
@@ -682,7 +682,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -691,7 +691,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -700,7 +700,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/multi_block_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp",
         core,
         tt_metal::ComputeConfig{
             .compile_args = {
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
index 859691dd580..aab0bb5c280 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
@@ -165,14 +165,14 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -190,7 +190,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto sfpu_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_sfpu.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp",
         test_config.cores,
         tt_metal::ComputeConfig{
             .math_approx_mode = test_config.approx_mode,
diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
index 36d15dc87e5..7aaa893772e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
@@ -48,7 +48,7 @@ bool reader_only(
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp",
         reader_core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -111,7 +111,7 @@ bool writer_only(
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp",
         writer_core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -181,7 +181,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -190,7 +190,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -276,7 +276,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -285,7 +285,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -297,7 +297,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     };
     auto datacopy_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         test_config.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
index 01378d826ef..64a3e21ecb7 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
@@ -36,14 +36,14 @@ namespace local_test_functions {
 void initialize_dummy_kernels(Program& program, const CoreRangeSet& cr_set) {
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     auto dummy_writer_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
 }
 bool cb_config_successful(Device* device, const DummyProgramMultiCBConfig & program_config){
@@ -203,7 +203,7 @@ bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, CommandQueue& c
     auto dummy_kernel1 = CreateDataMovementKernel(
         program, "tests/tt_metal/tt_metal/gtest_unit_tests/command_queue/test_kernels/runtime_args_kernel1.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
     vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8};
     vector dummy_kernel1_args = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
@@ -297,7 +297,7 @@ TEST_F(CommandQueueFixture, TestArbiterDoesNotHang) {
     // Add an NCRISC blank manually, but in compile program, the BRISC blank will be
     // added separately
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
     Finish(*::detail::GLOBAL_CQ);
@@ -427,7 +427,7 @@ TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode
     // Add an NCRISC blank manually, but in compile program, the BRISC blank will be
     // added separately
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
@@ -443,7 +443,7 @@ TEST_F(CommandQueueFixture, ComputeRuntimeArgs) {
     auto compute_kernel_id = CreateComputeKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp",
         cr_set,
         tt::tt_metal::ComputeConfig{});
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
index a3d052431f7..44884966a4c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
@@ -166,7 +166,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -174,7 +174,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     // Enqueue apis only supported on gs so far
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -192,7 +192,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto sfpu_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_sfpu.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp",
         test_config.cores,
         tt_metal::ComputeConfig{
             .math_approx_mode = test_config.approx_mode,
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
index bd0458f7012..e78c56e36af 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
@@ -102,9 +102,9 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
     for (int core_id = 0; core_id < num_cores; core_id++) {
         string receiver_kernel_name;
         if (core_id == 0) {
-            receiver_kernel_name = "tt_metal/kernels/dataflow/reader_first_stage.cpp";
+            receiver_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp";
         } else {
-            receiver_kernel_name = "tt_metal/kernels/dataflow/receiver_intermediate_stage.cpp";
+            receiver_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/receiver_intermediate_stage.cpp";
         }
         std::vector receiver_kernel_compile_time_args = {cb_index, block_size_tiles};
@@ -119,9 +119,9 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
         string sender_kernel_name;
         if (core_id == num_cores - 1) {
-            sender_kernel_name = "tt_metal/kernels/dataflow/writer_last_stage.cpp";
+            sender_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp";
         } else {
-            sender_kernel_name = "tt_metal/kernels/dataflow/sender_intermediate_stage.cpp";
+            sender_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp";
        }
        std::vector sender_kernel_compile_time_args = {cb_index, block_size_tiles};
        sender_kernels.push_back(tt_metal::CreateDataMovementKernel(
@@ -134,7 +134,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
                 .compile_args = sender_kernel_compile_time_args}));
         // Add blank compute kernel
-        tt_metal::CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cores[core_id]);
+        tt_metal::CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cores[core_id]);
     }
     // TODO(agrebenisan): Once semaphores are properly allocated at 16B-aligned addresses, then