From 6dd5eba613eb5a4e412bc122a972db5449054b41 Mon Sep 17 00:00:00 2001
From: Aleksandar Zecevic
Date: Thu, 14 Nov 2024 10:29:46 +0100
Subject: [PATCH] Matmul1DProgramConfig workaround removal (#1248)

The workaround introduced in https://github.com/tenstorrent/tt-mlir/pull/894
is no longer needed; the underlying issue was fixed in tt-metal by
https://github.com/tenstorrent/tt-metal/pull/13819.

Closes https://github.com/tenstorrent/tt-mlir/issues/891

FYI @odjuricicTT
---
 .../include/tt/runtime/detail/workarounds.h   | 12 +--
 runtime/lib/common/workarounds.cpp            |  4 +-
 runtime/lib/ttnn/operations/matmul/matmul.cpp | 79 +------------------
 runtime/tools/python/test/test_run.py         | 17 ----
 runtime/tools/python/ttrt/common/run.py       |  8 --
 5 files changed, 6 insertions(+), 114 deletions(-)

diff --git a/runtime/include/tt/runtime/detail/workarounds.h b/runtime/include/tt/runtime/detail/workarounds.h
index 69d2b7f7f..38d8c08cf 100644
--- a/runtime/include/tt/runtime/detail/workarounds.h
+++ b/runtime/include/tt/runtime/detail/workarounds.h
@@ -17,12 +17,12 @@ struct Env {
 #endif
   get(bool ignoreTileShape = true, bool emptyOpForceRowMajor = true,
       bool fullOpForceRowMajor = true, bool maxpool2dPreshard = true,
-      bool setMatmul1DProgramConfig = true, bool swapBinaryOperands = true)
+      bool swapBinaryOperands = true)
 #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1
       ;
 #else
   {
-    return Env(true, true, true, true, true, true);
+    return Env(true, true, true, true, true);
   }
 #endif
   // TODO(bug #272), determine correct layout by tile shape in the future
@@ -42,9 +42,6 @@ struct Env {
   // instead of adding a method in runtime
   bool maxpool2dPreshard;
 
-  // TODO(bug #891): ttnn::matmul doesn't chose correct program config.
-  bool setMatmul1DProgramConfig;
-
   // TODO(bug #1124): We're currently swapping the operands for binary ops
   // in runtime if the lhs operand is smaller (and requires broadcast onto the
   // rhs operand). We should add this check in the compiler.
@@ -53,12 +50,11 @@ struct Env {
 private:
   constexpr Env(bool ignoreTileShape, bool emptyOpForceRowMajor,
                 bool fullOpForceRowMajor, bool maxpool2dPreshard,
-                bool setMatmul1DProgramConfig, bool swapBinaryOperands)
+                bool swapBinaryOperands)
       : ignoreTileShape(ignoreTileShape),
        emptyOpForceRowMajor(emptyOpForceRowMajor),
        fullOpForceRowMajor(fullOpForceRowMajor),
        maxpool2dPreshard(maxpool2dPreshard),
-        setMatmul1DProgramConfig(setMatmul1DProgramConfig),
        swapBinaryOperands(swapBinaryOperands) {}
 };
 
@@ -72,8 +68,6 @@ inline std::ostream &operator<<(std::ostream &os, const Env &env) {
      << "fullOpForceRowMajor: " << env.fullOpForceRowMajor << ",\n";
   os << "\t"
      << "maxpool2dPreshard: " << env.maxpool2dPreshard << ",\n";
-  os << "\t"
-     << "setMatmul1DProgramConfig: " << env.setMatmul1DProgramConfig << "\n";
   os << "\t"
      << "swapBinaryOperands: " << env.swapBinaryOperands << "\n";
   os << "}";
diff --git a/runtime/lib/common/workarounds.cpp b/runtime/lib/common/workarounds.cpp
index 40ce96612..cd2795d02 100644
--- a/runtime/lib/common/workarounds.cpp
+++ b/runtime/lib/common/workarounds.cpp
@@ -8,10 +8,10 @@ namespace tt::runtime::workaround {
 #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1
 const Env &Env::get(bool ignoreTileShape, bool emptyOpForceRowMajor,
                     bool fullOpForceRowMajor, bool maxpool2dPreshard,
-                    bool setMatmul1DProgramConfig, bool swapBinaryOperands) {
+                    bool swapBinaryOperands) {
   static const Env config(ignoreTileShape, emptyOpForceRowMajor,
                           fullOpForceRowMajor, maxpool2dPreshard,
-                          setMatmul1DProgramConfig, swapBinaryOperands);
+                          swapBinaryOperands);
   return config;
 }
 #endif
diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp
index 029a134fa..abe71f970 100644
--- a/runtime/lib/ttnn/operations/matmul/matmul.cpp
+++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp
@@ -5,81 +5,11 @@
 #include "matmul.h"
 #include "tt/runtime/detail/logger.h"
 #include "tt/runtime/detail/ttnn.h"
-#include "tt/runtime/detail/workarounds.h"
 #include "tt/runtime/ttnn/operations/utils.h"
 #include <optional>
 
 // ANCHOR: adding_an_op_matmul_runtime_operations
 namespace tt::runtime::ttnn::operations::matmul {
-
-// This is a workaround for the lack of program config selection in ttnn.matmul.
-// The logic here is temporary and totaly incompleate.
-static ::ttnn::operations::matmul::MatmulMultiCoreReuseMultiCast1DProgramConfig
-createProgramConfig(const ::tt::target::ttnn::MatmulOp *op,
-                    ProgramContext &context,
-                    ::tt::tt_metal::MemoryConfig outputMemoryConfig) {
-
-  uint32_t numCores = outputMemoryConfig.shard_spec->grid.num_cores();
-  bool fuseBatch = true; // required for sharded inputs
-
-  ProgramTensorPool &tensorPool = context.getTensorPool();
-  const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id());
-  const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id());
-
-  // note: use ttnn::Shape::value returns a legacy tt::tt_metal::Shape object
-  // which does take padding into account.
-  uint32_t volume = 1;
-  for (size_t i = 0; i < lhs.shape().rank(); i++) {
-    volume *= lhs.shape().value[i];
-  }
-
-  uint32_t M =
-      fuseBatch ? volume / lhs.shape().value[-1] : lhs.shape().value[-2];
-  // uint32_t K = lhs.shape().value[-1];
-  uint32_t N = rhs.shape().value[-1];
-  bool mcastIn0 = N >= M;
-
-  uint32_t perCoreM, perCoreN;
-
-  if (mcastIn0) {
-    perCoreM = M / tt::constants::TILE_HEIGHT;
-    perCoreN = tt::div_up(tt::div_up(N, numCores), tt::constants::TILE_WIDTH);
-  } else {
-    perCoreM = tt::div_up(tt::div_up(M, numCores), tt::constants::TILE_HEIGHT);
-    perCoreN = N / tt::constants::TILE_WIDTH;
-  }
-
-  // uint32_t in0_block_w = (K / tt::constants::TILE_WIDTH) % 2 == 0 ? 2 : 1;
-  uint32_t in0BlockW = 1;
-
-  // These should work in most cases, but there is a logic how we can optimize
-  // this later.
-  uint32_t outSubblockH = 1, outSubblockW = 1;
-
-  LOG_ASSERT(outputMemoryConfig.shard_spec->grid.ranges().size() == 1);
-  CoreCoord computeWithStorageGridSize =
-      outputMemoryConfig.shard_spec->grid.ranges().begin()->grid_size();
-  if (lhs.is_sharded()) {
-    CoreCoord lhs_grid_size =
-        lhs.shard_spec()->grid.ranges().begin()->grid_size();
-    if (computeWithStorageGridSize < lhs_grid_size) {
-      computeWithStorageGridSize = lhs_grid_size;
-    }
-  }
-
-  return ::ttnn::operations::matmul::
-      MatmulMultiCoreReuseMultiCast1DProgramConfig{
-          .compute_with_storage_grid_size = computeWithStorageGridSize,
-          .in0_block_w = in0BlockW,
-          .out_subblock_h = outSubblockH,
-          .out_subblock_w = outSubblockW,
-          .per_core_M = perCoreM,
-          .per_core_N = perCoreN,
-          .fuse_batch = true,
-          .fused_activation = std::nullopt,
-          .mcast_in0 = mcastIn0};
-};
-
 void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) {
   ProgramTensorPool &tensorPool = context.getTensorPool();
   const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id());
@@ -94,13 +24,6 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) {
       ::ttnn::operations::matmul::MatmulMultiCoreReuseMultiCast1DProgramConfig>
       programConfig = std::nullopt;
 
-  // TODO(bug #891): ttnn::matmul doesn't chose correct program config.
-  if (workaround::Env::get().setMatmul1DProgramConfig &&
-      outputMemoryConfig.memory_layout ==
-          ::tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED) {
-    programConfig = createProgramConfig(op, context, outputMemoryConfig);
-  }
-
   const std::optional<::tt::tt_metal::MemoryConfig> memoryConfig =
       std::make_optional(outputMemoryConfig);
 
@@ -109,7 +32,7 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) {
 
   ::ttnn::Tensor out = ::ttnn::matmul(
       lhs, rhs, /*transposeA*/ false, /*transposeB*/ false, memoryConfig, dtype,
-      programConfig, /*activation*/ std::nullopt,
+      /*programConfig*/ std::nullopt, /*activation*/ std::nullopt,
       /*computeKernelConfig*/ std::nullopt, /*coreGrid*/ std::nullopt);
   tensorPool.insert_or_assign(op->out()->global_id(), out);
 
diff --git a/runtime/tools/python/test/test_run.py b/runtime/tools/python/test/test_run.py
index df88ca802..69d5683aa 100644
--- a/runtime/tools/python/test/test_run.py
+++ b/runtime/tools/python/test/test_run.py
@@ -377,20 +377,3 @@ def test_disable_maxpool2d_preshard_run():
 def test_disable_maxpool2d_preshard_cmd_run():
     command = f"ttrt run {BINARY_FILE_PATH} --disable-maxpool2d-preshard --log-file ttrt-results/{inspect.currentframe().f_code.co_name}.log --result-file ttrt-results/{inspect.currentframe().f_code.co_name}.json"
     sub_process_command(command)
-
-
-def test_disable_matmul_1d_program_config_run():
-    API.initialize_apis()
-    custom_args = {}
-    custom_args[
-        "--result-file"
-    ] = f"ttrt-results/{inspect.currentframe().f_code.co_name}.json"
-    custom_args["binary"] = BINARY_FILE_PATH
-    custom_args["--disable-matmul-1d-program-config"] = True
-    run_instance = API.Run(args=custom_args)
-    run_instance()
-
-
-def test_disable_matmul_1d_program_config_cmd_run():
-    command = f"ttrt run {BINARY_FILE_PATH} --disable-matmul-1d-program-config --log-file ttrt-results/{inspect.currentframe().f_code.co_name}.log --result-file ttrt-results/{inspect.currentframe().f_code.co_name}.json"
-    sub_process_command(command)
diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py
index 0fe97f093..24796717a 100644
--- a/runtime/tools/python/ttrt/common/run.py
+++ b/runtime/tools/python/ttrt/common/run.py
@@ -151,13 +151,6 @@ def initialize_api():
             choices=[True, False],
             help="disable maxpool2d preshard workaround",
         )
-        Run.register_arg(
-            name="--disable-matmul-1d-program-config",
-            type=bool,
-            default=False,
-            choices=[True, False],
-            help="disable matmul 1d program config workaround",
-        )
         Run.register_arg(
             name="--disable-swap-binary-operands",
             type=bool,
@@ -370,7 +363,6 @@ def _execute(binaries):
                 not self["--disable-empty-op-row-major"],
                 not self["--disable-full-op-row-major"],
                 not self["--disable-maxpool2d-preshard"],
-                not self["--disable-matmul-1d-program-config"],
                 not self["--disable-swap-binary-operands"],
             )
             self.logging.debug(f"setting tt runtime workaround env={workaround_env}")
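
Note on the remaining workaround surface: after this patch, tt::runtime::workaround::Env::get() takes five flags instead of six, which is exactly what the updated ttrt _execute() wiring above passes through. Below is a minimal standalone sketch (not part of the patch) of how an embedder of the runtime could configure and print the remaining workarounds; the main() wrapper and the all-true flag values are illustrative assumptions, while the Env::get() signature and the stream operator come from workarounds.h as patched.

// Sketch only: exercises the trimmed Env::get() signature introduced above.
#include <iostream>

#include "tt/runtime/detail/workarounds.h"

int main() {
  // Order follows the updated signature: ignoreTileShape, emptyOpForceRowMajor,
  // fullOpForceRowMajor, maxpool2dPreshard, swapBinaryOperands.
  // setMatmul1DProgramConfig is gone; ttnn::matmul now selects its own program
  // config (fixed by tt-metal #13819).
  const auto &env = tt::runtime::workaround::Env::get(
      /*ignoreTileShape=*/true, /*emptyOpForceRowMajor=*/true,
      /*fullOpForceRowMajor=*/true, /*maxpool2dPreshard=*/true,
      /*swapBinaryOperands=*/true);
  std::cout << env << "\n"; // prints the five remaining workaround flags
  return 0;
}

In a TT_RUNTIME_WORKAROUNDS=1 build this links against the runtime library that provides Env::get() (workarounds.cpp); in the non-workaround build the call resolves to the inline definition in the header.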