diff --git a/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py b/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py index 29c6218b64f..b31fd51b723 100644 --- a/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py +++ b/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py @@ -388,13 +388,13 @@ def test_falcon7b_attnention_sliced( ttnn.experimental.tensor.ShardOrientation.ROW_MAJOR, ) - mm_slice = ttnn.add( + mm_slice = ttnn.experimental.operations.primary.add( mm_slice, attn_mask_slice, fused_activations=None, - memory_config=height_sharded_memory_config, + output_mem_config=height_sharded_memory_config, output_dtype=ttnn.experimental.tensor.DataType.BFLOAT16, - output_tensor=mm_slice, + in_place=True, ) attn_mask_slice.deallocate() diff --git a/models/demos/t3000/falcon40b/tt/falcon_decoder.py b/models/demos/t3000/falcon40b/tt/falcon_decoder.py index 9de02f5cd34..0d19dd25e13 100644 --- a/models/demos/t3000/falcon40b/tt/falcon_decoder.py +++ b/models/demos/t3000/falcon40b/tt/falcon_decoder.py @@ -304,11 +304,11 @@ def fwd_prefill( # Note that this is only correct in inference when dropout is disabled for i in range(len(residual)): output.append( - ttnn.add( + ttnn.experimental.operations.primary.add( residual[i], attention_output[i], - memory_config=self.model_config["PARALLEL_ATTN_ADD_OUTPUT_MEMCFG"], - output_tensor=residual[i], + output_mem_config=self.model_config["PARALLEL_ATTN_ADD_OUTPUT_MEMCFG"], + in_place=True, ) ) attention_output[i].deallocate(True) @@ -320,11 +320,11 @@ def fwd_prefill( # dropout_add # For inference, this is just add for i in range(len(output)): - output[i] = ttnn.add( + output[i] = ttnn.experimental.operations.primary.add( output[i], mlp_output[i], - memory_config=self.model_config["DROPOUT_ADD_OUTPUT_MEMCFG"], - output_tensor=output[i], + output_mem_config=self.model_config["DROPOUT_ADD_OUTPUT_MEMCFG"], + in_place=True, ) mlp_output[i].deallocate(True) @@ -421,11 +421,11 @@ def fwd_decode( # Note that this is only correct in inference when dropout is disabled for i in range(len(residual)): output.append( - ttnn.add( + ttnn.experimental.operations.primary.add( residual[i], attention_output[i], - memory_config=self.model_config["PARALLEL_ATTN_ADD_OUTPUT_MEMCFG"], - output_tensor=residual[i], + output_mem_config=self.model_config["PARALLEL_ATTN_ADD_OUTPUT_MEMCFG"], + in_place=True, ) ) attention_output[i].deallocate(True) @@ -437,11 +437,11 @@ def fwd_decode( # dropout_add # For inference, this is just add for i in range(len(output)): - output[i] = ttnn.add( + output[i] = ttnn.experimental.operations.primary.add( output[i], mlp_output[i], - memory_config=self.model_config["DROPOUT_ADD_OUTPUT_MEMCFG"], - output_tensor=output[i], + output_mem_config=self.model_config["DROPOUT_ADD_OUTPUT_MEMCFG"], + in_place=True, ) mlp_output[i].deallocate(True) diff --git a/tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_interleaved.cpp b/tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h.cpp similarity index 100% rename from tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_interleaved.cpp rename to tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h.cpp diff --git a/tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_sharded.cpp b/tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_sharded.cpp deleted file mode 100644 index 1b848fea1bb..00000000000 --- a/tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_sharded.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "compute_kernel_api/bcast.h" - - -namespace NAMESPACE { -void MAIN { - constexpr uint32_t onetile = 1; - uint32_t NC = get_arg_val(0); - uint32_t Ht = get_arg_val(1); - uint32_t Wt = get_arg_val(2); - uint32_t h_blk = get_arg_val(3); - init_bcast(tt::CB::c_in0, tt::CB::c_in1, tt::CB::c_out0); - - // TODO: deal with batched in1 - for (uint32_t nc = 0; nc < NC; nc++) { - cb_wait_front(tt::CB::c_in0, Wt*Ht); - cb_reserve_back(tt::CB::c_out0, Wt*Ht); - for (uint32_t wt = 0; wt < Wt; wt++) { - cb_wait_front(tt::CB::c_in1, onetile); - for (uint32_t ht = 0; ht < Ht; ht+=h_blk) { - acquire_dst(tt::DstMode::Half); - for (uint32_t htr = 0; htr(tt::CB::c_in0, tt::CB::c_in1, current_index, 0, htr); - pack_tile(htr, tt::CB::c_out0, current_index); - } - release_dst(tt::DstMode::Half); - } - cb_pop_front(tt::CB::c_in1, onetile); - } - cb_pop_front(tt::CB::c_in0, Wt*Ht); - cb_push_back(tt::CB::c_out0, Wt*Ht); - } -} -} // NAMESPACE diff --git a/tt_eager/tt_dnn/op_library/bcast/kernels/dataflow/reader_bcast_h_sharded.cpp b/tt_eager/tt_dnn/op_library/bcast/kernels/dataflow/reader_bcast_h_sharded.cpp index db032920cd5..c4540397688 100644 --- a/tt_eager/tt_dnn/op_library/bcast/kernels/dataflow/reader_bcast_h_sharded.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/kernels/dataflow/reader_bcast_h_sharded.cpp @@ -6,13 +6,12 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src1_addr = get_arg_val(0); - uint32_t Ht = get_arg_val(1); - uint32_t Wt = get_arg_val(2); - uint32_t offset = get_arg_val(3); - uint32_t NC = get_arg_val(4); - uint32_t batch_offset = get_arg_val(5); //if weight has multiple batches - uint32_t w_blk = get_arg_val(6); + uint32_t src1_addr = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + uint32_t offset = get_arg_val(3); + uint32_t NC = get_arg_val(4); + uint32_t batch_offset= get_arg_val(5); //if weight has multiple batches //constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; @@ -36,17 +35,26 @@ void kernel_main() { uint32_t l1_write_addr_in0; uint32_t l1_write_addr_in1; - // TODO: do we really need in1 NC != 1 support?! if not supported here need to validate in1 correctly! - + uint32_t i = 0; cb_push_back(cb_id_in0, Ht * Wt); - for (uint32_t wt = 0; wt < Wt; wt += w_blk) { - cb_reserve_back(cb_id_in1, w_blk); - l1_write_addr_in1 = get_write_ptr(cb_id_in1); - for (uint32_t r = 0; r bcast_defines = bcast_op_utils::get_defines(BcastOpDim::H, bcast_math); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_interleaved.cpp", + "tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h.cpp", all_device_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines} ); diff --git a/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_sharded_h.cpp b/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_sharded_h.cpp index f89317ee2fa..06885ce922b 100644 --- a/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_sharded_h.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_sharded_h.cpp @@ -82,10 +82,7 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b .set_globally_allocated_address(*output.buffer()); auto out_cb = tt_metal::CreateCircularBuffer(program, all_cores, output_cb_config); - uint32_t h_blk = std::min(Ht, 8u); - uint32_t w_blk = std::min(Wt, 8u); - - uint32_t num_input_tiles = w_blk; + uint32_t num_input_tiles = (b.get_legacy_shape()[-1] * output.element_size() + TILE_HW - 1)/ TILE_HW; uint32_t src1_cb_index = CB::c_in1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * aligned_input_tile_nbytes, {{src1_cb_index, act_df}}) .set_page_size(src1_cb_index, aligned_input_tile_nbytes); @@ -110,13 +107,13 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b //const char* compute_name = bcast_op_utils::get_compute_name(BcastOpDim::H)); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h_sharded.cpp", + "tt_eager/tt_dnn/op_library/bcast/kernels/compute/bcast_h.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines} ); uint32_t ncores_y = ncores / ncores_x; - log_debug("ncores {}, ncores_x {}, Wt {}, Ht {}, h_blk {}, w_blk {}, src0_cb_index {}, src1_cb_index {}, output_cb_index {}, src1_is_dram {}, dst_is_dram {}", ncores, ncores_x, Wt, Ht, h_blk, w_blk, src0_cb_index, src1_cb_index, output_cb_index, src1_is_dram, dst_is_dram); + log_debug("ncores {}, ncores_x {}, Wt {}, Ht {}, src0_cb_index {}, src1_cb_index {}, output_cb_index {}, src1_is_dram {}, dst_is_dram {}", ncores, ncores_x, Wt, Ht, src0_cb_index, src1_cb_index, output_cb_index, src1_is_dram, dst_is_dram); for (uint32_t i = 0; i < ncores; i++){ CoreCoord core; uint32_t offset = 0; @@ -152,8 +149,7 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b Wt, // 2 offset, // 3 Ht_per_core, // 4 - tile_offset, // 5 - w_blk, // 6 + tile_offset, //5 } ); @@ -165,7 +161,6 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b NC, // B Ht, // Hbatch for block shardeshardedt Wt, // Wt - h_blk, // h block size } ); } @@ -226,9 +221,6 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b } uint32_t tile_offset = Wt * ncores; - uint32_t h_blk = std::min(Ht, 8u); - uint32_t w_blk = std::min(Wt, 8u); - tt_metal::SetRuntimeArgs( program, binary_reader_kernel_id, @@ -240,7 +232,6 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b offset, // 3 Ht_per_core, // 4 tile_offset, //5 - w_blk, // 6 } ); @@ -252,7 +243,6 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b NC, // B Ht, // Ht Wt, // Wt - h_blk, // h block size } ); }