diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml
index 0726f3a252ef..5a7832823ec6 100644
--- a/.github/workflows/ttnn-post-commit.yaml
+++ b/.github/workflows/ttnn-post-commit.yaml
@@ -21,6 +21,7 @@ jobs:
         test-group: [
           {name: ttnn group 1, cmd: pytest $TT_METAL_HOME/tests/ttnn/unit_tests -v --splits 2 --group 1},
           {name: ttnn group 2, cmd: pytest $TT_METAL_HOME/tests/ttnn/unit_tests -v --splits 2 --group 2},
+          {name: ttnn cpp tests, cmd: ./build/test/ttnn/unit_tests},
         ]
     name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }}
diff --git a/tests/ttnn/unit_tests/test_async_runtime.cpp b/tests/ttnn/unit_tests/test_async_runtime.cpp
new file mode 100644
index 000000000000..92398dc6a2c9
--- /dev/null
+++ b/tests/ttnn/unit_tests/test_async_runtime.cpp
@@ -0,0 +1,144 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tensor/tensor.hpp"
+#include "ttnn_multi_command_queue_fixture.hpp"
+#include "tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp"
+#include "tt_dnn/op_library/moreh_sum/moreh_sum_op.hpp"
+#include "common/bfloat16.hpp"
+#include "ttnn/cpp/ttnn/async_runtime.hpp"
+#include "tt_numpy/functions.hpp"
+#include <cmath>
+
+using namespace tt;
+using namespace tt_metal;
+using MultiCommandQueueSingleDeviceFixture = ttnn::MultiCommandQueueSingleDeviceFixture;
+using namespace constants;
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) {
+    Device* device = this->device_;
+    MemoryConfig mem_cfg = MemoryConfig{
+        .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED,
+        .buffer_type = BufferType::DRAM,
+        .shard_spec = std::nullopt};
+
+    uint32_t input_buf_size_datums = 1024 * 1024;
+    uint32_t output_buf_size_datums = 1024 * 32;
+    uint32_t datum_size_bytes = 2;
+    uint32_t io_cq = 1;                 // Data reads and writes done through CQ1
+    uint32_t workload_dispatch_cq = 0;  // Workload dispatched through CQ0
+
+    ttnn::Shape input_shape = ttnn::Shape(Shape({1, 1, 1024, 1024}));
+    auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[input_buf_size_datums]);
+    auto readback_data = std::shared_ptr<bfloat16[]>(new bfloat16[output_buf_size_datums]);
+
+
+    for (int i = 0; i < input_buf_size_datums; i++) {
+        host_data[i] = bfloat16(static_cast<float>(1));
+    }
+    // Create golden data using tt_eager APIs
+    Tensor np_tensor = tt::numpy::full(input_shape.value(), static_cast<float>(1), DataType::BFLOAT16).to(Layout::TILE).to(device);
+    std::vector<int64_t> reduce_dims = {3};
+    Tensor np_out = tt::operations::primary::moreh_sum(np_tensor, reduce_dims);
+    Tensor np_out_host = np_out.cpu();
+    const bfloat16* golden_output = std::get<owned_buffer::Buffer<bfloat16>>(std::get<OwnedStorage>(np_out_host.get_storage()).buffer).begin();
+    // Enable Asynchronous Execution and test ttnn runtime APIs
+    device->set_worker_mode(WorkExecutorMode::ASYNCHRONOUS);
+    // Events for host - device synchronization
+    auto write_event = std::make_shared<Event>();
+    auto workload_event = std::make_shared<Event>();
+    // Running sum-reduce with preallocated output
+    auto op = tt::operations::primary::MorehSum{.dim = 3};
+    // Preallocate Input and Output Tensors on Device
+    auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
+    auto output_buffer = ttnn::allocate_buffer_on_device(output_buf_size_datums * datum_size_bytes, device, np_out.get_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg);
+    auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
+    auto output_storage = tt::tt_metal::DeviceStorage{output_buffer};
+    Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE);
+    Tensor output_tensor = Tensor(output_storage, np_out.get_shape(), DataType::BFLOAT16, Layout::TILE);
+    // Populate input_tensor with data
+    ttnn::write_buffer(io_cq, input_tensor, {host_data});
+    // Record the completion of the write event
+    ttnn::record_event(device->command_queue(io_cq), write_event);
+    // Host stalls until write is completed, before sending workload
+    ttnn::event_synchronize(write_event);
+    // Dispatch workload. Preallocated output_tensor is populated by op.
+    ttnn::run_operation(workload_dispatch_cq, op, {input_tensor}, {}, {output_tensor}).at(0);
+    // Record completion of workload
+    ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event);
+    ttnn::event_synchronize(workload_event);
+    // Read output back, once workload is complete
+    ttnn::read_buffer(io_cq, output_tensor, {readback_data});
+    // Ensure that reference count bookkeeping is done correctly
+    // Tensors only have one reference in the main thread. Ensure this is true.
+    EXPECT_EQ(input_tensor.tensor_attributes->main_thread_ref_count, 1);
+    EXPECT_EQ(output_tensor.tensor_attributes->main_thread_ref_count, 1);
+    // Buffers are currently jointly owned by the original buffer object, the storage object and the tensor (3).
+    EXPECT_EQ(input_buffer.use_count(), 3);
+    EXPECT_EQ(output_buffer.use_count(), 3);
+    // Deallocate tensors (tensor gives up buffer). Done asynchronously, so sync on queue after.
+    input_tensor.deallocate();
+    output_tensor.deallocate();
+    ttnn::queue_synchronize(device->command_queue(io_cq));
+    // Buffer only has 2 owners in main thread.
+    EXPECT_EQ(input_buffer.use_count(), 2);
+    EXPECT_EQ(output_buffer.use_count(), 2);
+    for (int i = 0; i < output_buf_size_datums; i++) {
+        EXPECT_EQ(readback_data[i], golden_output[i]);
+    }
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) {
+    Device* device = this->device_;
+    device->set_worker_mode(WorkExecutorMode::ASYNCHRONOUS);
+    MemoryConfig mem_cfg = MemoryConfig{
+        .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED,
+        .buffer_type = BufferType::DRAM,
+        .shard_spec = std::nullopt};
+
+    uint32_t buf_size_datums = 1024 * 1024;
+    uint32_t datum_size_bytes = 2;
+    std::vector<float> inputs = {4, 9, 16, 25, 36, 64};
+    uint32_t io_cq = 1;
+    uint32_t workload_dispatch_cq = 0;
+    ttnn::Shape shape = ttnn::Shape(Shape({1, 1, 1024, 1024}));
+
+    auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
+    auto readback_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
+    for (int loop = 0; loop < 10; loop++) {
+        log_info(LogTest, "Running outer loop {}", loop);
+        for (auto input_val : inputs) {
+            for (int i = 0; i < buf_size_datums; i++) {
+                host_data[i] = bfloat16(static_cast<float>(input_val));
+            }
+
+            auto write_event = std::make_shared<Event>();
+            auto workload_event = std::make_shared<Event>();
+            auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
+            auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
+            Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
+            ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1
+            ttnn::record_event(device->command_queue(io_cq), write_event); // Record write on cq 1
+            // Wait until cq 1 write is complete
+            ttnn::event_synchronize(write_event);
+            auto op0 = tt::tt_metal::EltwiseUnary{std::vector<tt::tt_metal::UnaryWithParam>{tt::tt_metal::UnaryWithParam{tt::tt_metal::UnaryOpType::SQRT}}};
+            auto op1 = tt::tt_metal::EltwiseUnary{std::vector<tt::tt_metal::UnaryWithParam>{tt::tt_metal::UnaryWithParam{tt::tt_metal::UnaryOpType::NEG}}};
+            // Run operation on cq 0
+            Tensor output_tensor = ttnn::run_operation(workload_dispatch_cq, op0, {input_tensor}).at(0);
+            auto dummy_buffer_0 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
+            output_tensor = ttnn::run_operation(workload_dispatch_cq, op1, {output_tensor}).at(0);
+            // Allocate this buffer to stress test async allocation across op execution and explicit allocation
+            auto dummy_buffer_1 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
+            // Record cq 0 prog execution
+            ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event);
+            // Wait until cq 0 prog execution is done
+            ttnn::event_synchronize(workload_event);
+            // Read using cq 1
+            ttnn::read_buffer(io_cq, output_tensor, {readback_data});
+            for (int i = 0; i < buf_size_datums; i++) {
+                EXPECT_EQ(static_cast<int>(floor(bfloat16(readback_data[i]).to_float())), static_cast<int>(-1 * sqrt(input_val)));
+            }
+        }
+    }
+}
diff --git a/tests/ttnn/unit_tests/ttnn_multi_command_queue_fixture.hpp b/tests/ttnn/unit_tests/ttnn_multi_command_queue_fixture.hpp
new file mode 100644
index 000000000000..d9fbc41fb715
--- /dev/null
+++ b/tests/ttnn/unit_tests/ttnn_multi_command_queue_fixture.hpp
@@ -0,0 +1,38 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gtest/gtest.h"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/test_utils/env_vars.hpp"
+#include "tt_metal/impl/dispatch/command_queue.hpp"
+#include "tt_metal/llrt/rtoptions.hpp"
+
+namespace ttnn {
+
+class MultiCommandQueueSingleDeviceFixture : public ::testing::Test {
+   protected:
+    void SetUp() override {
+        auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE");
+        arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());
+        num_devices_ = tt::tt_metal::GetNumAvailableDevices();
+        if (slow_dispatch) {
+            GTEST_SKIP() << "Skipping Multi CQ test suite, since it can only be run in Fast Dispatch Mode.";
+        }
+
+        if (arch_ == tt::ARCH::WORMHOLE_B0 and num_devices_ != 1) {
+            device_ = tt::tt_metal::CreateDevice(0); // Create device here so teardown can gracefully run
+            GTEST_SKIP() << "Skipping for Multi-Chip Wormhole, since not enough dispatch cores.";
+        }
+        device_ = tt::tt_metal::CreateDevice(0, 2);
+    }
+
+    void TearDown() override {
+        tt::tt_metal::CloseDevice(device_);
+    }
+
+    tt::tt_metal::Device* device_;
+    tt::ARCH arch_;
+    size_t num_devices_;
+};
+}
diff --git a/tt_eager/tensor/tensor.cpp b/tt_eager/tensor/tensor.cpp
index b61d8f117c02..89fb187f5b64 100644
--- a/tt_eager/tensor/tensor.cpp
+++ b/tt_eager/tensor/tensor.cpp
@@ -853,9 +853,7 @@ void* get_raw_host_data_ptr(const Tensor& tensor) {
 }
 
 void memcpy(CommandQueue& queue, void* dst, const Tensor& src, const std::optional<std::size_t> transfer_size) {
-    if (not transfer_size.has_value()) {
-        TT_ASSERT("transfer_size is not supported for memcpy right now!");
-    }
+    TT_ASSERT(not transfer_size.has_value(), "transfer_size is not supported for memcpy right now!");
     if (not is_device_tensor(src)) {
         TT_THROW("memcpy: src tensor must be on device");
     }
@@ -872,9 +870,7 @@ void memcpy(void* dst, const Tensor& src, const std::optional<std::size_t> trans
 }
 
 void memcpy(CommandQueue& queue, Tensor& dst,
const void* src, const std::optional transfer_size) { - if (not transfer_size.has_value()) { - TT_ASSERT("transfer_size is not supported for memcpy right now!"); - } + TT_ASSERT(not transfer_size.has_value(), "transfer_size is not supported for memcpy right now!"); if (not is_device_tensor(dst)) { TT_THROW("memcpy: memcpy to non-device tensor is not supported!"); } diff --git a/tt_eager/tensor/tensor_impl.cpp b/tt_eager/tensor/tensor_impl.cpp index fde448db7cdd..f4f87e033854 100644 --- a/tt_eager/tensor/tensor_impl.cpp +++ b/tt_eager/tensor/tensor_impl.cpp @@ -101,22 +101,7 @@ std::array get_sharded_page_shape(Layout layout, DataType dtype, s return page_shape; } -namespace detail { - -DeviceBuffer allocate_interleaved_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const Shape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config) { - uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, shape); - return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); -} - -DeviceBuffer allocate_contiguous_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const MemoryConfig& memory_config) { - return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); -} - - -DeviceBuffer allocate_sharded_buffer_on_device(uint32_t buffer_size_bytes, Device *device, - const Shape& shape, DataType data_type, Layout layout, - std::optional shard_params, - const MemoryConfig& memory_config) { +void validate_sharded_buffer_allocation(const Shape& shape, Layout layout, std::optional shard_params, const MemoryConfig& memory_config) { TT_ASSERT(shard_params.has_value(), "Shard params are required for sharded buffer and they were not initialized"); auto shard_spec = memory_config.shard_spec.value(); @@ -158,7 +143,25 @@ DeviceBuffer allocate_sharded_buffer_on_device(uint32_t buffer_size_bytes, Devic // Require alignment for now // TT_ASSERT(shard_shape[1] * tensor_impl::element_size_bytes_wrapper(data_type) % ADDRESS_ALIGNMENT == 0); } +} +namespace detail { + +DeviceBuffer allocate_interleaved_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const Shape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config) { + uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, shape); + return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); +} + +DeviceBuffer allocate_contiguous_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const MemoryConfig& memory_config) { + return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); +} + + +DeviceBuffer allocate_sharded_buffer_on_device(uint32_t buffer_size_bytes, Device *device, + const Shape& shape, DataType data_type, Layout layout, + std::optional shard_params, + const MemoryConfig& memory_config) { + validate_sharded_buffer_allocation(shape, layout, shard_params, memory_config); auto page_shape = shard_params.value().page_shape; uint32_t size_of_element = element_size_bytes_wrapper(data_type); uint32_t page_size = page_shape[0] * page_shape[1] * size_of_element; diff --git a/tt_eager/tensor/tensor_impl.hpp b/tt_eager/tensor/tensor_impl.hpp index 89c9979fa1c2..25c3a235b7c5 100644 --- a/tt_eager/tensor/tensor_impl.hpp +++ b/tt_eager/tensor/tensor_impl.hpp @@ -210,7 +210,7 @@ inline std::vector convert_layout_tile_to_row_major(const Shape& shape, const // Validators // 
====================================================================================== void validate_on_device_dtype_and_layout(Device* device, const Shape& shape, DataType dtype, Layout layout); - +void validate_sharded_buffer_allocation(const Shape& shape, Layout layout, std::optional shard_params, const MemoryConfig& memory_config); // ----------------------------------------------------------------------------------------------------------------------------------------------- // =============================================================================================================================================== // High Level APIs @@ -220,6 +220,9 @@ void validate_on_device_dtype_and_layout(Device* device, const Shape& shape, Dat // ====================================================================================== // Data reader, writer, and initializers // ====================================================================================== + +uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, const Shape& shape); + DeviceBuffer allocate_buffer_on_device( uint32_t buffer_size_bytes, Device* device, diff --git a/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp b/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp index 5065f12446ea..a235918dd28c 100644 --- a/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp +++ b/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp @@ -104,7 +104,7 @@ std::vector all_gather_impl(const std::vector& input_tensors, co // Package output in vector, to populate it with launch_op std::vector output_for_curr_device = {output_tensors[i]}; operation::launch_op( - [is_ring, dim, num_links, i, num_inputs, output_mem_config, topology] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [is_ring, dim, num_links, i, num_inputs, output_mem_config, topology] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { bool is_last_chip_in_clockwise_direction = is_ring ? false : i == (num_inputs - 1); bool is_last_chip_in_counter_clockwise_direction = is_ring ? 
false : i == 0; diff --git a/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp b/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp index 476bf91d0ed0..0fb876f555eb 100644 --- a/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp +++ b/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp @@ -77,7 +77,7 @@ inline Tensor bcast( std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_with_autoformat( - [bcast_op, bcast_dim, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [bcast_op, bcast_dim, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { using tt::constants::TILE_HEIGHT; using tt::constants::TILE_WIDTH; auto& input_tensor_a = input_tensors.at(0); diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp index 98a3f6f6e405..0a93fdde2272 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp @@ -571,7 +571,7 @@ Tensor falcon_lm_head_matmul(const Tensor &input_tensor_a, const Tensor &input_t if (seq_len > 512) { // TODO: Check support for seq_len == 128, 256, 512, ..., 2048 operation::launch_with_autoformat( - [seq_len, mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [seq_len, mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor_a = input_tensors.at(0); auto& input_tensor_b = input_tensors.at(1); auto& bias = optional_input_tensors.at(0); @@ -585,7 +585,7 @@ Tensor falcon_lm_head_matmul(const Tensor &input_tensor_a, const Tensor &input_t } else { operation::launch_op( - [mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor_a = input_tensors.at(0); auto& input_tensor_b = input_tensors.at(1); auto& bias = optional_input_tensors.at(0); @@ -1121,7 +1121,7 @@ MatmulParallelizationStrategy Matmul::get_parallelization_strategy(const std::ve Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, std::optional program_config, const MemoryConfig& mem_config, std::optional output_dtype, std::optional compute_kernel_config, bool untilize_out) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}, {bias}))}; operation::launch_op( - [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); if (!program_config.has_value()) { diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp index 7452f80154d3..2bee9dd433af 100644 
--- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp @@ -283,9 +283,9 @@ inline bool get_broadcast_batch(const Tensor &input_tensor_a, const Tensor &inpu [](const auto& program_config) -> bool { using ProgramConfigType = std::decay_t; if constexpr (std::is_same_v) { - return true; + return true; } - return false; + return false; }, matmul_program_config ); @@ -392,7 +392,7 @@ inline Tensor matmul( if (!needs_autoformat) { operation::launch_op( - [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out, user_core_coord, input_b_is_batched] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out, user_core_coord, input_b_is_batched] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); auto arch = input_tensor_a.device()->arch(); @@ -410,7 +410,7 @@ inline Tensor matmul( {input_tensor_a, input_tensor_b}, output_tensors, optional_input_tensors); } else { operation::launch_with_autoformat( - [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out, user_core_coord, input_b_is_batched] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out, user_core_coord, input_b_is_batched] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); auto arch = input_tensor_a.storage_type() == StorageType::DEVICE ? 
input_tensor_a.device()->arch() : AutoFormat::GetDefaultDevice()->arch(); diff --git a/tt_eager/tt_dnn/op_library/bmm/single_core/bmm_op_single_core_tilize_untilize.cpp b/tt_eager/tt_dnn/op_library/bmm/single_core/bmm_op_single_core_tilize_untilize.cpp index b20bdb04b018..7cb20ac5da8f 100644 --- a/tt_eager/tt_dnn/op_library/bmm/single_core/bmm_op_single_core_tilize_untilize.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/single_core/bmm_op_single_core_tilize_untilize.cpp @@ -31,7 +31,7 @@ Tensor bmm_tilize_untilize(const Tensor& a, const Tensor& b, const Tensor& bias, [out_dt, a_height_nblocks, a_width_nblocks, b_width_nblocks, a_block_height_ntiles, a_block_width_ntiles, b_block_width_ntiles, out_subblock_height_ntiles, out_subblock_width_ntiles, tilize_in0, untilize_out, has_bias, compute_kernel_config] - (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable { + (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable { const auto& a = input_tensors.at(0); const auto& b = input_tensors.at(1); // bias not provided, give it a dummy device handle diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index 09e5fff6530d..68fc0bd7a09c 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -239,7 +239,7 @@ Tensor multigammaln(const Tensor& a, const MemoryConfig& output_mem_config) { Tensor _mish(const Tensor& x, const MemoryConfig& output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({x}))}; operation::launch_op( - [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& x = input_tensors.at(0); Tensor sp_x = softplus(x, 1.0f, 20.0f, output_mem_config); Tensor tanh_x = tanh(sp_x, output_mem_config); @@ -1553,7 +1553,7 @@ Tensor pow(const Tensor& input_a, int exponent, const MemoryConfig& output_mem_c Tensor _argmax(const Tensor& input_a, int64_t _dim, bool all, const MemoryConfig& output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_a}))}; operation::launch_with_autoformat( - [_dim, all, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [_dim, all, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_a = input_tensors.at(0); auto& input_shape = input_a.get_legacy_shape(); TT_FATAL(input_shape.rank() == 4, "supported for rank-4 tensors at this time"); diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp index 2e562d087efd..a2389740295b 100644 --- a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp +++ b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp @@ -89,7 +89,7 @@ Tensor copy(const Tensor& src_tensor, const Tensor& dst_tensor) { Tensor clone(const Tensor& input, const MemoryConfig& output_mem_config, std::optional output_dtype) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input}))}; operation::launch_op( - [output_mem_config, 
output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input = input_tensors.at(0); return operation::run(Copy{output_mem_config, output_dtype.value_or(input.get_dtype())}, {input}); }, {input}, output_tensors); diff --git a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp index a0e2285adc7b..be8f731d7384 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp @@ -114,7 +114,7 @@ struct make_eltwise_binary { std::optional output_dtype = std::nullopt) const { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}))}; operation::launch_with_autoformat( - [fused_activations, output_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [fused_activations, output_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { Tensor in_a = input_tensors.at(0); Tensor in_b = input_tensors.at(1); Shape shape_a = in_a.get_legacy_shape(); @@ -188,7 +188,7 @@ inline Tensor add( std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}))}; operation::launch_op( - [fused_activations, output_mem_config, output_dtype, in_place] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [fused_activations, output_mem_config, output_dtype, in_place] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor_a = input_tensors.at(0); auto& input_tensor_b = input_tensors.at(1); diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index 59fbd768ab40..131d1c4a1793 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -196,14 +196,14 @@ inline Tensor run_eltwise_unary( std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; if(output_mem_config.is_sharded()){ operation::launch_op( - [ops_chain, output_mem_config, fp32_dest_acc_en] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [ops_chain, output_mem_config, fp32_dest_acc_en] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run_without_autoformat( EltwiseUnary{ops_chain, output_mem_config, fp32_dest_acc_en}, input_tensors); }, {input_tensor}, output_tensors); } else { operation::launch_with_autoformat( - [ops_chain, output_mem_config,fp32_dest_acc_en] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [ops_chain, output_mem_config,fp32_dest_acc_en] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) 
mutable -> std::vector { Tensor input_tensor = input_tensors.at(0); Shape pad_shape = AutoFormat::pad_to_tile_shape(input_tensor.get_legacy_shape()); FormatParams input_format_params = {.pad_shape = pad_shape, .pad_value = 0.0, .target_layout = Layout::TILE}; @@ -518,7 +518,7 @@ inline Tensor relu( const Tensor& input_tensor, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); bool fp32_dest_acc_en = input_tensor.get_dtype() == DataType::UINT32 or input_tensor.get_dtype() == DataType::INT32; // MT: Currently only uint32/int32 is moved to DST directly, fp32 is converted to fp16b return operation::run( diff --git a/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp b/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp index 0ed550d41053..f18f663c0d42 100644 --- a/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp +++ b/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp @@ -43,7 +43,7 @@ inline Tensor embeddings( std::optional output_dtype = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor, weights}))}; operation::launch_op( - [tilized, embeddings_type, pad_token, mem_config, output_dtype] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [tilized, embeddings_type, pad_token, mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); auto& weights = input_tensors.at(1); return operation::run_without_autoformat( diff --git a/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.hpp b/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.hpp index b2986bfd1321..b766a0c84e7c 100644 --- a/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.hpp +++ b/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.hpp @@ -97,7 +97,7 @@ struct make_layernorm { const Tensor &a, float eps, std::optional gamma = std::nullopt, std::optional beta = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional compute_kernel_config = std::nullopt) const { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a}))}; operation::launch_with_autoformat( - [eps, mem_config, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [eps, mem_config, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& a = input_tensors.at(0); const auto& gamma = optional_input_tensors.at(0); const auto& beta = optional_input_tensors.at(1); @@ -202,7 +202,7 @@ struct make_layernorm { std::optional compute_kernel_config = std::nullopt) const { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a}))}; operation::launch_op( - [eps, mem_config, program_config, compute_kernel_config] (std::vector input_tensors, const std::vector>& 
optional_input_tensors) mutable -> std::vector { + [eps, mem_config, program_config, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& a = input_tensors.at(0); const auto& gamma = optional_input_tensors.at(0); const auto& beta = optional_input_tensors.at(1); @@ -228,7 +228,7 @@ struct make_add_layernorm { const Tensor &a, const Tensor& b, float eps, std::optional gamma = std::nullopt, std::optional beta = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const LayerNormProgramConfig& program_config = LayerNormDefaultProgramConfig{}, std::optional compute_kernel_config = std::nullopt) const { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a, b}))}; operation::launch_op( - [eps, mem_config, program_config, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [eps, mem_config, program_config, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& a = input_tensors.at(0); const auto& b = input_tensors.at(1); const auto& gamma = optional_input_tensors.at(0); diff --git a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.hpp b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.hpp index 66c19cecf056..5dc115a8b526 100644 --- a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.hpp +++ b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.hpp @@ -134,7 +134,8 @@ inline std::vector nlp_create_qkv_heads_falcon7b(const Tensor& input_ten operation::launch_op( [mem_config]( std::vector input_tensors, - const std::vector>& optional_input_tensors) mutable -> std::vector { + const std::vector>& optional_input_tensors, + const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(NlpCreateHeadsFalcon7B{mem_config}, input_tensors); }, {input_tensor_a}, @@ -165,7 +166,7 @@ inline std::vector nlp_create_qkv_heads( Tensor(operation::get_workers_for_op_output({input_tensor}, {input_tensor_kv})), Tensor(operation::get_workers_for_op_output({input_tensor}, {input_tensor_kv}))}; operation::launch_op( - [num_heads, num_kv_heads, transpose_k_heads, mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [num_heads, num_kv_heads, transpose_k_heads, mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); auto& input_tensor_kv = optional_input_tensors.at(0); const uint32_t num_kv_heads_val = num_kv_heads.value_or(num_heads); @@ -187,7 +188,7 @@ inline std::vector nlp_create_qkv_heads( inline Tensor nlp_concat_heads(const Tensor &input_tensor_a, const MemoryConfig& mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(NlpConcatHeads{mem_config}, input_tensors); }, {input_tensor_a}, output_tensors); return output_tensors.at(0); @@ -195,7 +196,7 @@ inline Tensor 
nlp_concat_heads(const Tensor &input_tensor_a, const MemoryConfig& inline Tensor nlp_concat_heads_decode(const Tensor &input_tensor_a, const uint32_t num_heads) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [num_heads] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [num_heads] (std::vector input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(NlpConcatHeadsDecode{num_heads}, input_tensors); }, {input_tensor_a}, output_tensors); return output_tensors.at(0); diff --git a/tt_eager/tt_dnn/op_library/pad/pad_op.cpp b/tt_eager/tt_dnn/op_library/pad/pad_op.cpp index d652c62a999d..cfd9b3cb07e5 100644 --- a/tt_eager/tt_dnn/op_library/pad/pad_op.cpp +++ b/tt_eager/tt_dnn/op_library/pad/pad_op.cpp @@ -594,7 +594,7 @@ tt::stl::reflection::Attributes Pad::attributes() const { Tensor pad(const Tensor &input_tensor, const Shape &output_tensor_shape, const Shape &input_tensor_start, float pad_value, const MemoryConfig& output_mem_config, bool use_multicore) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_tensor_shape, input_tensor_start, pad_value, output_mem_config, use_multicore] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_tensor_shape, input_tensor_start, pad_value, output_mem_config, use_multicore] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); if (input_tensor.get_legacy_shape() == output_tensor_shape) { return {AutoFormat::move_tensor_to_mem_config(input_tensor, output_mem_config)}; diff --git a/tt_eager/tt_dnn/op_library/permute/permute_op.cpp b/tt_eager/tt_dnn/op_library/permute/permute_op.cpp index 1409d0f99789..a34193a33964 100644 --- a/tt_eager/tt_dnn/op_library/permute/permute_op.cpp +++ b/tt_eager/tt_dnn/op_library/permute/permute_op.cpp @@ -107,7 +107,7 @@ Tensor permute_(const Tensor &a, std::vector dims, const MemoryConfig& Tensor permute(const Tensor &a, std::vector dims, const MemoryConfig& output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a}))}; operation::launch_with_autoformat( - [dims, output_mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [dims, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& a = input_tensors.at(0); std::vector normalized_dims(dims.size()); std::transform(dims.begin(), dims.end(), normalized_dims.begin(), [a](std::int64_t idx) {return a.get_legacy_shape().get_normalized_index(idx);}); diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp index 10f836b3eb02..68d6ee025f12 100644 --- a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp +++ b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp @@ -217,7 +217,7 @@ Tensor max_pool2d_v2(const Tensor &input, {input, reader_indices}).at(0); } -operation::OpPerformanceModel MaxPool::create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector &output_tensors) const { 
+operation::OpPerformanceModel MaxPool::create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors, const std::vector &output_tensors) const { const auto& input = input_tensors.at(0); const auto& input_shape = input.get_shape(); uint32_t batch_size = this->in_n_; diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp index 5f427412f1f7..47cc12c8a1a4 100644 --- a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp +++ b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp @@ -30,7 +30,7 @@ struct MaxPool { std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; - operation::OpPerformanceModel create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector &output_tensors) const; + operation::OpPerformanceModel create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors, const std::vector &output_tensors) const; static constexpr auto attribute_names = std::make_tuple( "in_n", diff --git a/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp b/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp index cdf16aa88a45..56fa3d869219 100644 --- a/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp +++ b/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp @@ -168,7 +168,7 @@ Tensor reduce(const Tensor &input_tensor, ReduceOpMath reduce_math, ReduceOpDim std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; if (is_multicore_hw) { operation::launch_op( - [reduce_math, reduce_dim, pad_value, scaler, output_dtype, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [reduce_math, reduce_dim, pad_value, scaler, output_dtype, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); Device * device; @@ -189,7 +189,7 @@ Tensor reduce(const Tensor &input_tensor, ReduceOpMath reduce_math, ReduceOpDim }, {input_tensor}, output_tensors); } else { operation::launch_with_autoformat( - [reduce_math, reduce_dim, pad_value, scaler, output_dtype, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [reduce_math, reduce_dim, pad_value, scaler, output_dtype, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); return operation::run_with_autoformat(Reduce{reduce_math, reduce_dim, scaler, output_mem_config, output_dtype.value_or(input_tensor.get_dtype())}, {input_tensor}, {}, pad_value); }, {input_tensor}, output_tensors); diff --git a/tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp b/tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp index d8111d40fc5f..304557d1dca9 100644 --- a/tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp +++ b/tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp @@ -74,7 +74,7 @@ operation::ProgramWithCallbacks Repeat::create_program( Tensor 
repeat(const Tensor &input_tensor, const Shape &shape, const MemoryConfig &output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [shape, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) -> std::vector { + [shape, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) -> std::vector { auto& input_tensor = input_tensors.at(0); uint32_t input_rank = input_tensor.get_legacy_shape().rank(); TT_FATAL(shape.rank() == input_rank, "Number of repeat dims must be equal to number of tensor dims"); diff --git a/tt_eager/tt_dnn/op_library/reshape/reshape_op.cpp b/tt_eager/tt_dnn/op_library/reshape/reshape_op.cpp index 14cef5cd3deb..998f16de8e7a 100644 --- a/tt_eager/tt_dnn/op_library/reshape/reshape_op.cpp +++ b/tt_eager/tt_dnn/op_library/reshape/reshape_op.cpp @@ -344,7 +344,7 @@ Tensor reshape (const Tensor &input_tensor_a, int N, int C, int H, int W, const } std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [N, C, H, W, output_mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [N, C, H, W, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run_without_autoformat(Reshape{N, C, H, W, output_mem_config}, input_tensors); }, {input_tensor_a}, output_tensors); return output_tensors.at(0); diff --git a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp index 674c7db8638c..8843a81aca7d 100644 --- a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp +++ b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp @@ -52,7 +52,7 @@ inline Tensor rotary_embedding( std::optional compute_kernel_config = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor, cos, sin}))}; operation::launch_with_autoformat( - [token_idx, output_mem_config, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [token_idx, output_mem_config, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); auto& cos = input_tensors.at(1); auto& sin = input_tensors.at(2); diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp index 30fee3da8fd7..2275b0c18775 100644 --- a/tt_eager/tt_dnn/op_library/run_operation.cpp +++ b/tt_eager/tt_dnn/op_library/run_operation.cpp @@ -370,20 +370,15 @@ OutputTensors run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors) { - // Async Mode: Asserts to ensure that tensors are populated before running op - // TODO : Joseph/Eyon, this was deleted in a prior PR, is that correct? 
- // for (const Tensor& tensor : input_tensors) { - // TT_ASSERT(tensor.metadata_populated(), "Input tensors must be populated before running op."); - // } - // for (auto& tensor : optional_input_tensors) { - // if (tensor.has_value()) { - // TT_ASSERT(tensor.value().metadata_populated(), "Input tensors must be populated before running op."); - // } - // } +#ifdef DEBUG + operation.validate(input_tensors, optional_input_tensors, optional_output_tensors); +#endif if (detail::any_tensor_on_multi_device(input_tensors)) { return detail::decorate_device_operation(detail::run_multi_device_operation)( std::make_optional(std::ref(queue)), operation, input_tensors, optional_input_tensors, optional_output_tensors); } + auto device = detail::get_device(input_tensors, optional_input_tensors); + detail::validate_op_launch(device); return detail::decorate_device_operation(detail::run_device_operation)( queue, operation, input_tensors, optional_input_tensors, optional_output_tensors); } @@ -408,17 +403,10 @@ OutputTensors run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors) { - // Async Mode: Asserts to ensure that tensors are populated before running op - // for (const Tensor& tensor : input_tensors) { - // TT_ASSERT(tensor.metadata_populated(), "Input tensors must be populated before running op."); - // } - // for (auto& tensor : optional_input_tensors) { - // if (tensor.has_value()) { - // TT_ASSERT(tensor.value().metadata_populated(), "Input tensors must be populated before running op."); - // } - // } +#ifdef DEBUG operation.validate(input_tensors, optional_input_tensors, optional_output_tensors); +#endif if (detail::any_tensor_on_multi_device(input_tensors)) { return detail::decorate_device_operation(detail::run_multi_device_operation)( std::nullopt, operation, input_tensors, optional_input_tensors, optional_output_tensors); @@ -640,24 +628,26 @@ Tensors run_with_autoformat( } void launch_with_autoformat( - std::function(const std::vector&, const std::vector>&)>&& op_func, - const std::vector input_tensors, - std::vector& output_tensors, - const std::vector> optional_input_tensors + std::function&& op_func, + const Tensors input_tensors, + Tensors& output_tensors, + const OptionalConstTensors optional_input_tensors, + const OptionalTensors optional_output_tensors ) { // Mark each output tensor as having dynamic storage (can be on host or device, depending // on autoformat behaviour). Multi device tensors do not support dynamic storage. for (auto& output_tensor : output_tensors) { output_tensor.tensor_attributes->dynamic_storage = (output_tensor.workers.size() <= 1); } - launch_op(std::move(op_func), input_tensors, output_tensors, optional_input_tensors); + launch_op(std::move(op_func), input_tensors, output_tensors, optional_input_tensors, optional_output_tensors); } void launch_op( - std::function(const std::vector&, const std::vector>&)>&& op_func, - const std::vector input_tensors, - std::vector& output_tensors, - const std::vector> optional_input_tensors + std::function&& op_func, + const Tensors input_tensors, + Tensors& output_tensors, + const OptionalConstTensors optional_input_tensors, + const OptionalTensors optional_output_tensors ) { // Send host side op compile and run to the worker queue // Assert to ensure that worker threads are specified. 
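// A minimal illustrative sketch (not part of this patch) of the host-side wrapper pattern under the
// new launch_op signature, which now also threads optional preallocated output tensors through to
// the op lambda. It is modeled on the clone() wrapper in copy_op.cpp; "clone_into" is a hypothetical
// name, and Tensors/OptionalConstTensors/OptionalTensors are assumed to be the aliases referenced in
// run_operation.hpp. Whether a given op actually consumes the preallocated output still depends on
// its create_output_tensors implementation.
#include "tt_dnn/op_library/copy/copy_op.hpp"
#include "tt_dnn/op_library/run_operation.hpp"

namespace tt::tt_metal {

inline Tensor clone_into(const Tensor& input, Tensor& preallocated_output, const MemoryConfig& mem_config) {
    std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input}))};
    operation::launch_op(
        [mem_config](
            const Tensors& input_tensors,
            const OptionalConstTensors& optional_input_tensors,
            const OptionalTensors& optional_output_tensors) mutable -> Tensors {
            // Per-device shards of the preallocated output arrive via the third argument and are
            // forwarded to operation::run alongside the inputs.
            const auto& input = input_tensors.at(0);
            return operation::run(
                Copy{mem_config, input.get_dtype()}, input_tensors, optional_input_tensors, optional_output_tensors);
        },
        {input},
        output_tensors,
        /*optional_input_tensors=*/{},
        /*optional_output_tensors=*/{preallocated_output});
    return output_tensors.at(0);
}

}  // namespace tt::tt_metal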
@@ -672,6 +662,7 @@ void launch_op( std::vector input_tensor_ref_count = {}; std::vector optional_input_tensor_ref_count = {}; std::vector output_tensor_ref_count = {}; + std::vector optional_output_tensor_ref_count = {}; std::vector async_safe_input_tensors = {}; std::vector> async_safe_optional_input_tensors = {}; @@ -696,6 +687,14 @@ void launch_op( for (int i = 0; i < output_tensors.size(); i++) { output_tensor_ref_count.push_back(output_tensors[i].tensor_attributes->record_main_thread_ref_count()); } + for (int i = 0; i < optional_output_tensors.size(); i++) { + if (optional_output_tensors[i].has_value()) { + optional_output_tensor_ref_count.push_back(optional_output_tensors[i].value().tensor_attributes->record_main_thread_ref_count()); + } + else { + optional_output_tensor_ref_count.push_back(0); + } + } // Check if this op dispatch step relies on tensors from other workers. // If so, mark them in use by current worker. Tensors shared across workers // are only supported when each tensor is tied to a single device/worker @@ -720,10 +719,13 @@ void launch_op( { ZoneScopedN("PushOpToWorkers"); for (auto target_device : workers) { - target_device->push_work([target_device, workers, op_func, async_safe_optional_input_tensors, inputs = async_safe_input_tensors, outputs = output_tensors, shared_input_idx = cross_worker_input_tensor_idx, shared_optional_input_idx = cross_worker_optional_input_tensor_idx] () mutable { + target_device->push_work([target_device, workers, op_func, optional_output_tensors, async_safe_optional_input_tensors, inputs = async_safe_input_tensors, outputs = output_tensors, shared_input_idx = cross_worker_input_tensor_idx, shared_optional_input_idx = cross_worker_optional_input_tensor_idx] () mutable { + std::vector input_shards = {}; std::vector> optional_input_shards = {}; - + std::vector> optional_output_shards = {}; + // Initialize all optional_outputs to std::nullopt + optional_output_shards.resize(optional_output_tensors.size()); for (const auto& input : inputs) { input_shards.push_back(get_shard_for_device(input, target_device)); } @@ -735,7 +737,12 @@ void launch_op( optional_input_shards.push_back(std::nullopt); } } - auto local_tensors = op_func(input_shards, optional_input_shards); + for (std::size_t optional_output_idx = 0; optional_output_idx < optional_output_tensors.size(); optional_output_idx++) { + if (optional_output_tensors[optional_output_idx].has_value()) { + optional_output_shards[optional_output_idx] = get_shard_for_device(optional_output_tensors[optional_output_idx].value(), target_device); + } + } + auto local_tensors = op_func(input_shards, optional_input_shards, optional_output_shards); // Release shared ownership of tensors belonging to other workers. 
// If the workers for this tensor are stalled to deallocate for (auto& shared_input : shared_input_idx) { @@ -784,6 +791,11 @@ void launch_op( for (int i = 0; i < output_tensors.size(); i++) { output_tensors[i].tensor_attributes->update_main_thread_ref_count(workers.at(0), output_tensor_ref_count[i]); } + for (int i = 0; i < optional_output_tensors.size(); i++) { + if (optional_output_tensors[i].has_value()) { + optional_output_tensors[i].value().tensor_attributes->update_main_thread_ref_count(workers.at(0), optional_output_tensor_ref_count[i]); + } + } } void validate_workers_and_storage(const std::vector& inputs, const std::vector>& optional_inputs, const std::vector& workers) { diff --git a/tt_eager/tt_dnn/op_library/run_operation.hpp b/tt_eager/tt_dnn/op_library/run_operation.hpp index 45fcda4c2450..9cdff1a16d2f 100644 --- a/tt_eager/tt_dnn/op_library/run_operation.hpp +++ b/tt_eager/tt_dnn/op_library/run_operation.hpp @@ -306,7 +306,8 @@ inline auto run( ConcreteOperation&& concrete_op, const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors={}, - const OptionalTensors& optional_output_tensors={} + const OptionalTensors& optional_output_tensors={}, + std::optional> queue = std::nullopt ) -> ProgramOutputTensors { using OutputTensors = ProgramOutputTensors; if constexpr (detail::is_host_operation()) { @@ -315,6 +316,9 @@ inline auto run( return run(operation, input_tensors); } else if constexpr (detail::is_device_operation()) { const auto operation = DeviceOperation(concrete_op); + if (queue.has_value()) { + return run(queue.value(), operation, input_tensors, optional_input_tensors, optional_output_tensors); + } return run(operation, input_tensors, optional_input_tensors, optional_output_tensors); } else { static_assert(tt::stl::concepts::always_false_v, "Unsupported Operation"); @@ -384,17 +388,19 @@ inline auto run_with_autoformat( } void launch_op( - std::function(const Tensors&, const OptionalConstTensors&)>&& op_func, - const std::vector input_tensors, - std::vector& output_tensors, - const std::vector> optional_input_tensors = {} + std::function&& op_func, + const Tensors input_tensors, + Tensors& output_tensors, + const OptionalConstTensors optional_input_tensors = {}, + const OptionalTensors optional_output_tensors = {} ); void launch_with_autoformat( - std::function(const std::vector&, const std::vector>&)>&& op_func, - const std::vector input_tensors, - std::vector& output_tensors, - const std::vector> optional_input_tensors = {} + std::function&& op_func, + const Tensors input_tensors, + Tensors& output_tensors, + const OptionalConstTensors optional_input_tensors = {}, + const OptionalTensors optional_output_tensors = {} ); std::vector get_workers_for_op_output(const std::vector&& inputs, const std::vector>&& optional_inputs = {}); diff --git a/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp b/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp index accefdeb6d08..cedb2ac34fc1 100644 --- a/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp +++ b/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp @@ -59,7 +59,7 @@ inline Tensor interleaved_to_sharded( const std::optional output_dtype = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [grid, shard_shape, shard_scheme, shard_orientation, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) -> std::vector { + [grid, shard_shape, shard_scheme, shard_orientation, output_dtype] (const 
std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) -> std::vector { const auto& input_tensor = input_tensors.at(0); bool row_wise = shard_orientation == ShardOrientation::ROW_MAJOR; CoreCoord grid_size; @@ -154,7 +154,7 @@ inline Tensor interleaved_to_sharded( std::optional output_dtype = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [sharded_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) -> std::vector { + [sharded_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) -> std::vector { const auto& input_tensor = input_tensors.at(0); TT_FATAL(sharded_mem_config.is_sharded()); auto bbox = sharded_mem_config.shard_spec.value().grid.bounding_box(); @@ -178,7 +178,7 @@ inline Tensor sharded_to_interleaved( std::optional output_dtype = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) -> std::vector { + [output_mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) -> std::vector { const auto& input_tensor = input_tensors.at(0); TT_FATAL(input_tensor.shard_spec().has_value()); auto shard_spec = input_tensor.shard_spec().value(); @@ -198,7 +198,7 @@ inline Tensor sharded_to_interleaved( inline Tensor reshard(const Tensor &input_tensor, const MemoryConfig &output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) -> std::vector { + [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) -> std::vector { const auto& input_tensor = input_tensors.at(0); TT_FATAL(input_tensor.shard_spec().has_value()); TT_FATAL(output_mem_config.is_sharded()); diff --git a/tt_eager/tt_dnn/op_library/softmax/softmax_op.cpp b/tt_eager/tt_dnn/op_library/softmax/softmax_op.cpp index 8f8a4e168c52..dfcc8395e9c1 100644 --- a/tt_eager/tt_dnn/op_library/softmax/softmax_op.cpp +++ b/tt_eager/tt_dnn/op_library/softmax/softmax_op.cpp @@ -179,7 +179,7 @@ namespace transformers { Tensor scale_mask_softmax_in_place(Tensor& input_tensor, std::optional scale, std::optional mask, const SoftmaxProgramConfig& program_config, const bool is_causal_mask, std::optional compute_kernel_config) { std::vector dummy_output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [scale, mask, program_config, is_causal_mask, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [scale, mask, program_config, is_causal_mask, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); auto& mask = optional_input_tensors.at(0); auto kernel_config_val = init_device_compute_kernel_config(input_tensor.device()->arch(), compute_kernel_config, 
MathFidelity::HiFi4, true, false, false); @@ -207,7 +207,7 @@ namespace transformers { Tensor scale_mask_softmax(const Tensor& input_tensor, std::optional scale, std::optional mask, const MemoryConfig& output_mem_config, const bool is_causal_mask, std::optional compute_kernel_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_with_autoformat( - [scale, mask, output_mem_config, is_causal_mask, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [scale, mask, output_mem_config, is_causal_mask, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); auto& mask = optional_input_tensors.at(0); Shape input_pad_shape = AutoFormat::pad_to_tile_shape(input_tensor.get_legacy_shape()); diff --git a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp index be0d9e146a16..cb4488eb909e 100644 --- a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp @@ -95,7 +95,7 @@ Tensor tilize(const Tensor &input_tensor_a, const MemoryConfig& output_mem_confi } std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [output_mem_config, output_dtype, use_multicore] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_mem_config, output_dtype, use_multicore] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor_a = input_tensors.at(0); return operation::run_without_autoformat(Tilize{output_mem_config, output_dtype.value_or(input_tensor_a.get_dtype()), use_multicore}, {input_tensor_a}); }, {input_tensor_a}, output_tensors); diff --git a/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.hpp b/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.hpp index 451660151968..239e17f9dd9a 100644 --- a/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.hpp +++ b/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.hpp @@ -44,7 +44,7 @@ struct SplitFusedQKVAndSplitHeads { inline std::tuple split_query_key_value_and_split_heads(const Tensor &input_tensor, const CoreCoord& compute_with_storage_grid_size, const MemoryConfig& mem_config, const uint32_t num_heads = 16) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor})), Tensor(operation::get_workers_for_op_output({input_tensor})), Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [compute_with_storage_grid_size, mem_config, num_heads] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [compute_with_storage_grid_size, mem_config, num_heads] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(SplitFusedQKVAndSplitHeads{compute_with_storage_grid_size, mem_config, num_heads}, input_tensors); }, {input_tensor}, output_tensors); return {output_tensors.at(0), output_tensors.at(1), output_tensors.at(2)}; @@ -64,7 +64,7 @@ struct ConcatenateHeads { inline Tensor concatenate_heads(const Tensor &input_tensor, 
const CoreCoord& compute_with_storage_grid_size, const MemoryConfig& mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [compute_with_storage_grid_size, mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [compute_with_storage_grid_size, mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(ConcatenateHeads{compute_with_storage_grid_size, mem_config}, input_tensors); }, {input_tensor}, output_tensors); return output_tensors.at(0); @@ -89,7 +89,7 @@ struct AttnMatmul { inline Tensor attn_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const CoreCoord& compute_with_storage_grid_size, const MemoryConfig& mem_config, std::optional output_dtype=std::nullopt, std::optional compute_kernel_config = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}))}; operation::launch_op( - [compute_with_storage_grid_size, mem_config, output_dtype, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [compute_with_storage_grid_size, mem_config, output_dtype, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); auto arch = input_tensor_a.storage_type() == StorageType::DEVICE ? input_tensor_a.device()->arch() : AutoFormat::GetDefaultDevice()->arch(); @@ -131,7 +131,7 @@ struct GroupAttnMatmul { inline Tensor group_attn_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const CoreCoord& compute_with_storage_grid_size, const MemoryConfig& mem_config, std::optional output_dtype=std::nullopt, std::optional compute_kernel_config = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}))}; operation::launch_op( - [compute_with_storage_grid_size, mem_config, output_dtype, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [compute_with_storage_grid_size, mem_config, output_dtype, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); bool row_major = false; @@ -185,7 +185,7 @@ struct SSMEltwiseMul { inline Tensor ssm_eltwise_mul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const MemoryConfig& mem_config, std::optional output_dtype=std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}))}; operation::launch_op( - [mem_config, output_dtype] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); return operation::run(SSMEltwiseMul{mem_config, 
output_dtype.value_or(input_tensor_a.get_dtype())}, input_tensors); @@ -207,7 +207,7 @@ struct SSM1DSumReduce { inline Tensor ssm_1d_sum_reduce(const Tensor &input_tensor_a, const MemoryConfig& mem_config, std::optional output_dtype=std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [mem_config, output_dtype] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); return operation::run(SSM1DSumReduce{mem_config, output_dtype.value_or(input_tensor_a.get_dtype())}, input_tensors); }, {input_tensor_a}, output_tensors); diff --git a/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp b/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp index c4960e7dda1d..908532d4ccb4 100644 --- a/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp +++ b/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp @@ -197,7 +197,7 @@ inline Tensor transpose_(const Tensor &a, TransposeOpDim transpose_dim, const Me Tensor transpose(const Tensor &a, std::int64_t dim1, std::int64_t dim2, const MemoryConfig& output_mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a}))}; operation::launch_with_autoformat( - [dim1, dim2, output_mem_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [dim1, dim2, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& a = input_tensors.at(0); uint32_t normalized_dim1 = a.get_legacy_shape().get_normalized_index(dim1); uint32_t normalized_dim2 = a.get_legacy_shape().get_normalized_index(dim2); diff --git a/tt_eager/tt_dnn/op_library/unpad/unpad_op.cpp b/tt_eager/tt_dnn/op_library/unpad/unpad_op.cpp index 665c756b25f2..5089c3e89a7d 100644 --- a/tt_eager/tt_dnn/op_library/unpad/unpad_op.cpp +++ b/tt_eager/tt_dnn/op_library/unpad/unpad_op.cpp @@ -211,7 +211,7 @@ Tensor unpad(const Tensor &input_tensor_a, const Shape &output_tensor_start, con // TODO: We need to run asserts before this std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - [output_tensor_start, output_tensor_end, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_tensor_start, output_tensor_end, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor_a = input_tensors.at(0); auto input_tensor_shape = input_tensor_a.get_legacy_shape(); const Shape output_tensor_shape = { diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp index d5210720d20e..6a247053b9c4 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp @@ -106,7 +106,7 @@ Tensor untilize(const Tensor &input_tensor_a, const MemoryConfig& output_mem_con // No-op (Will do a tensor copy) std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; operation::launch_op( - 
[output_mem_config, use_multicore, use_pack_untilize] (const std::vector& input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [output_mem_config, use_multicore, use_pack_untilize] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor_a = input_tensors.at(0); if (input_tensor_a.get_layout() == Layout::ROW_MAJOR) { log_warning("Perf warning: Trying to untilize non-tilized data."); @@ -121,7 +121,7 @@ Tensor untilize(const Tensor &input_tensor_a, const MemoryConfig& output_mem_con void UntilizeWithUnpadding::validate(const std::vector &input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); - TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operandsneed to be on device!"); + TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operands need to be on device!"); TT_FATAL(input_tensor_a.buffer() != nullptr , "Operands need to be allocated in buffers on device!"); TT_FATAL(input_tensor_a.get_layout() == Layout::TILE, "Can only untilize tile major data"); diff --git a/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp b/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp index fcd99d9620c6..79c4f26b77ce 100644 --- a/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp +++ b/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp @@ -57,7 +57,7 @@ struct UpdateCache { inline Tensor fill_cache(const Tensor& cache_tensor, const Tensor& input_tensor, const uint32_t batch_idx) { std::vector dummy_output_tensors = {Tensor(operation::get_workers_for_op_output({cache_tensor, input_tensor}))}; operation::launch_op( - [batch_idx] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [batch_idx] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { return operation::run(UpdateCache{batch_idx, 0, 0, UpdateCacheOpType::FILL}, input_tensors); }, {cache_tensor, input_tensor}, dummy_output_tensors); return cache_tensor; @@ -66,7 +66,7 @@ inline Tensor fill_cache(const Tensor& cache_tensor, const Tensor& input_tensor, inline Tensor update_cache(const Tensor& cache_tensor, const Tensor& input_tensor, const uint32_t update_idx, const uint32_t batch_offset, std::optional compute_kernel_config = std::nullopt) { std::vector dummy_output_tensors = {Tensor(operation::get_workers_for_op_output({cache_tensor, input_tensor}))}; operation::launch_op( - [update_idx, batch_offset, compute_kernel_config] (std::vector input_tensors, const std::vector>& optional_input_tensors) mutable -> std::vector { + [update_idx, batch_offset, compute_kernel_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& cache_tensor = input_tensors.at(0); auto& input_tensor = input_tensors.at(1); auto kernel_config_val = init_device_compute_kernel_config(input_tensor.device()->arch(), compute_kernel_config); diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 111d818f593d..45c2a6312ad0 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -108,13 +108,14 @@ inline std::tuple>, std::vector shard_parameters - ) + std::optional< ShardSpecBuffer> shard_parameters, + bool allocate) : device_(device), size_(size), 
page_size_(page_size), buffer_type_(buffer_type), buffer_layout_(buffer_layout), shard_parameters_(shard_parameters) { TT_FATAL(this->device_ != nullptr and this->device_->allocator_ != nullptr); validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters); - - this->allocate(); + if (allocate) { + this->allocate(); + } } diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index e4da4d2c64b4..3ed7829c2cc1 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -165,7 +165,8 @@ class Buffer { Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type, const TensorMemoryLayout buffer_layout=TensorMemoryLayout::INTERLEAVED, - std::optional<ShardSpecBuffer> shard_parameter = std::nullopt); + std::optional<ShardSpecBuffer> shard_parameter = std::nullopt, + bool allocate = true); Buffer(const Buffer &other); Buffer& operator=(const Buffer &other); @@ -178,6 +179,7 @@ class Buffer { uint32_t size() const { return static_cast<uint32_t>(size_); } + void set_size(uint64_t size) { size_ = size; } // Returns address of buffer in the first bank uint32_t address() const { return static_cast<uint32_t>(address_); } @@ -235,9 +237,9 @@ class Buffer { } private: - void allocate(); + virtual void allocate(); - void deallocate(); + virtual void deallocate(); friend void DeallocateBuffer(Buffer &buffer); uint64_t translate_page_address(uint64_t offset, uint32_t bank_id) const; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 585162fa1395..b7589e10967b 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2108,7 +2108,6 @@ void EnqueueRecordEventImpl(CommandQueue& cq, std::shared_ptr<Event> event) { } void EnqueueWaitForEvent(CommandQueue& cq, std::shared_ptr<Event> event) { - detail::DispatchStateCheck(true); cq.run_command(CommandInterface{ .type = EnqueueCommandType::ENQUEUE_WAIT_FOR_EVENT, diff --git a/ttnn/cpp/ttnn/async_runtime.cpp b/ttnn/cpp/ttnn/async_runtime.cpp new file mode 100644 index 000000000000..04656515c21b --- /dev/null +++ b/ttnn/cpp/ttnn/async_runtime.cpp @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "async_runtime.hpp" +#include "tt_eager/tensor/tensor_impl.hpp" +#include "tt_eager/tensor/tensor_impl_wrapper.hpp" + +namespace ttnn { + using DeviceBuffer = std::shared_ptr; + using queue_id = uint8_t; + + DeviceBuffer allocate_interleaved_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const Shape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config) { + uint32_t page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape.value()); + return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); + } + + DeviceBuffer allocate_contiguous_buffer_on_device(uint32_t buffer_size_bytes, Device *device, const MemoryConfig& memory_config) { + return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); + } + + DeviceBuffer allocate_sharded_buffer_on_device(uint32_t buffer_size_bytes, Device *device, + const Shape& shape, DataType data_type, Layout layout, + std::optional shard_params, + const MemoryConfig& memory_config) { + tt::tt_metal::tensor_impl::validate_sharded_buffer_allocation(shape.value(), layout, shard_params, memory_config); + auto page_shape = shard_params.value().page_shape; + uint32_t size_of_element = tt::tt_metal::tensor_impl::element_size_bytes_wrapper(data_type); + uint32_t page_size = page_shape[0] * page_shape[1] * size_of_element; + if(layout == Layout::TILE){ + page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape.value()); + } + + return std::make_shared(device, buffer_size_bytes, page_size, + memory_config.buffer_type, + memory_config.memory_layout, + shard_params); + } + + DeviceBuffer allocate_buffer_on_device(uint32_t buffer_size_bytes, types::Device* device, const Shape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config, std::optional shard_spec) { + if (memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { + return allocate_interleaved_buffer_on_device(buffer_size_bytes, device, shape, data_type, layout, memory_config); + } + else if(memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::SINGLE_BANK){ + return allocate_contiguous_buffer_on_device(buffer_size_bytes, device, memory_config); + } + else { + return allocate_sharded_buffer_on_device(buffer_size_bytes, device, shape, data_type, layout, shard_spec, memory_config); + } + } + + void write_buffer(queue_id cq_id, Tensor& dst, std::vector> src, const std::optional transfer_size) { + uint32_t dst_ref_count = dst.tensor_attributes->record_main_thread_ref_count(); + for (const auto worker : dst.get_workers()) { + auto src_for_device = src.at(worker->id()); + worker->push_work( + [worker, src_for_device, dst, cq_id, transfer_size] () { + auto shard = tt::tt_metal::get_shard_for_device(dst, worker); + tt::tt_metal::memcpy(worker->command_queue(cq_id), shard, src_for_device.get(), transfer_size); + }); + } + dst.tensor_attributes->update_main_thread_ref_count(dst.workers.at(0), dst_ref_count); + } + + void read_buffer(queue_id cq_id, Tensor& src, std::vector> dst, const std::optional transfer_size, size_t src_offset) { + TT_ASSERT(src_offset == 0, "src_offset is not supported"); + uint32_t src_ref_count = src.tensor_attributes->record_main_thread_ref_count(); + for (const auto worker : src.get_workers()) { + auto dst_for_device = dst.at(worker->id()); + worker->push_work( + [worker, dst_for_device, src, cq_id, 
transfer_size, src_offset] () { + const auto& shard = tt::tt_metal::get_shard_for_device(src, worker); + tt::tt_metal::memcpy(worker->command_queue(cq_id), dst_for_device.get(), shard, transfer_size); + }); + } + for (auto worker : src.get_workers()) { + worker->synchronize(); + } + src.tensor_attributes->update_main_thread_ref_count(src.workers.at(0), src_ref_count); + } + + void queue_synchronize(CommandQueue& cq) { + // Ensure that all work pushed to async engine has been passed + // off to device CQ + cq.device()->synchronize(); + // Wait for device CQ to finish + Finish(cq); + } + + void event_synchronize(std::shared_ptr event) { + EventSynchronize(event); + } + + bool event_query(std::shared_ptr event) { + return EventQuery(event); + } + + void wait_for_event(CommandQueue& cq, std::shared_ptr event) { + auto cq_id = cq.id(); + auto cq_worker = cq.device(); + cq_worker->push_work( + [cq_worker, cq_id, event] () { + EnqueueWaitForEvent(cq_worker->command_queue(cq_id), event); + }); + } + + void record_event(CommandQueue& cq, std::shared_ptr event) { + auto cq_id = cq.id(); + auto cq_worker = cq.device(); + cq_worker->push_work( + [cq_worker, cq_id, event] () { + EnqueueRecordEvent(cq_worker->command_queue(cq_id), event); + }); + } + +} // namespace::ttnn diff --git a/ttnn/cpp/ttnn/async_runtime.hpp b/ttnn/cpp/ttnn/async_runtime.hpp new file mode 100644 index 000000000000..7800ab20e80b --- /dev/null +++ b/ttnn/cpp/ttnn/async_runtime.hpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "tt_eager/tensor/types.hpp" +#include "tt_eager/tt_dnn/op_library/run_operation.hpp" +#include "types.hpp" + +namespace ttnn { + using DeviceBuffer = std::shared_ptr; + using queue_id = uint8_t; + + DeviceBuffer allocate_buffer_on_device(uint32_t buffer_size_bytes, types::Device* device, const Shape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config, std::optional shard_spec = std::nullopt); + + void write_buffer(queue_id cq_id, Tensor& dst, std::vector> src, const std::optional transfer_size = std::nullopt); + + void read_buffer(queue_id cq_id, Tensor& src, std::vector> dst, const std::optional transfer_size = std::nullopt, size_t src_offset = 0); + + void queue_synchronize(CommandQueue& cq); + + void event_synchronize(std::shared_ptr event); + + bool event_query(std::shared_ptr event); + + void wait_for_event(CommandQueue& cq, std::shared_ptr event); + + void record_event(CommandQueue& cq, std::shared_ptr event); + + // Generic Device Op dispatch function. Templated on Op structs. + template + std::vector run_operation( + queue_id cq_id, + OpConfig devop, + const tt::tt_metal::operation::Tensors& input_tensors, + const tt::tt_metal::operation::OptionalConstTensors& optional_input_tensors = {}, + const tt::tt_metal::operation::OptionalTensors& optional_output_tensors = {}) { + static_assert(tt::tt_metal::operation::detail::is_device_operation(), "ttnn::run_operation can only dispatch Device Operations!"); + // Create output tensor vector by examining the number of output shapes created by the device operation + std::vector outputs(tt::tt_metal::operation::DeviceOperation(devop).compute_output_shapes(input_tensors).size()); + // Populate the workers of the output tensors, based on the input tensors. This is needed for the async engine. 
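// Each output starts out as an empty shell Tensor bound to the worker threads derived from the inputs
// (via get_workers_for_op_output), so the async engine knows which workers will eventually populate its
// storage; the number of shells matches what the op's compute_output_shapes() reports for these inputs.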
+ for (int i = 0; i < outputs.size(); i++) { + outputs[i] = Tensor(tt::tt_metal::operation::get_workers_for_op_output(std::move(input_tensors), std::move(optional_input_tensors))); + } + // Send the operation to the async engine, which will populate the output tensors. + for (auto worker : outputs.at(0).workers) { + tt::tt_metal::operation::launch_op( + [devop, worker, cq_id] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + return operation::run(std::move(devop), input_tensors, optional_input_tensors, optional_output_tensors, worker->command_queue(cq_id)); + }, input_tensors, outputs, optional_input_tensors, optional_output_tensors); + } + return outputs; + } +} diff --git a/ttnn/cpp/ttnn/op_library/binary/binary_op.hpp b/ttnn/cpp/ttnn/op_library/binary/binary_op.hpp index dd9ecaa48250..dafa14ab8949 100644 --- a/ttnn/cpp/ttnn/op_library/binary/binary_op.hpp +++ b/ttnn/cpp/ttnn/op_library/binary/binary_op.hpp @@ -102,7 +102,8 @@ struct Binary { operation::launch_op( [activations, memory_config, dtype]( const std::vector &input_tensors, - const std::vector> &optional_input_tensors) mutable -> std::vector { + const std::vector> &optional_input_tensors, + const std::vector>& optional_output_tensors) mutable -> std::vector { auto &&[input_tensor_a, input_tensor_b] = [](const auto &input_tensor_a_arg, const auto &input_tensor_b_arg) { // Swap tensors if input_tensor_a needs to be broadcasted to input_tensor_b diff --git a/ttnn/cpp/ttnn/operations/transformer.hpp b/ttnn/cpp/ttnn/operations/transformer.hpp index 52c349762a1a..ff9e78983a9e 100644 --- a/ttnn/cpp/ttnn/operations/transformer.hpp +++ b/ttnn/cpp/ttnn/operations/transformer.hpp @@ -227,7 +227,8 @@ struct ConcatenateHeads : public tt::tt_metal::NlpConcatHeads { operation::launch_op( [memory_config]( std::vector input_tensors, - const std::vector>& optional_input_tensors) mutable -> std::vector { + const std::vector>& optional_input_tensors, + const std::vector>& optional_output_tensors) mutable -> std::vector { auto& input_tensor = input_tensors.at(0); return operation::run( ConcatenateHeads{memory_config.value_or(input_tensor.memory_config())}, {input_tensor}); diff --git a/ttnn/cpp/ttnn/operations/unary.hpp b/ttnn/cpp/ttnn/operations/unary.hpp index deec97b16d6f..50fd8b4a98d9 100644 --- a/ttnn/cpp/ttnn/operations/unary.hpp +++ b/ttnn/cpp/ttnn/operations/unary.hpp @@ -44,7 +44,8 @@ inline Tensor execute( operation::launch_op( [op_chain, memory_config]( const std::vector& input_tensors, - const std::vector>& optional_input_tensors) mutable -> std::vector { + const std::vector>& optional_input_tensors, + const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); bool fp32_dest_acc_en = input_tensor.get_dtype() == DataType::UINT32 or diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp index b5c579db3b39..2710ac0048e7 100644 --- a/ttnn/cpp/ttnn/types.hpp +++ b/ttnn/cpp/ttnn/types.hpp @@ -4,6 +4,7 @@ #pragma once +#include "tt_metal/detail/tt_metal.hpp" #include "tt_eager/tensor/tensor.hpp" #include "tt_eager/tensor/types.hpp" @@ -48,6 +49,40 @@ struct CoreGrid { CoreGrid(std::size_t x, std::size_t y) : x(x), y(y) {} }; +// This buffer class is compatible with multithreaded runtime (which lives in tt_eager) +// It is derived from the tt_metal::Buffer class, but defines its own asynchronous allocation functions +class Buffer : public tt::tt_metal::Buffer { + public: + 
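// The base tt_metal::Buffer is constructed with allocate = false below, so the base class skips its
// synchronous allocation; this subclass instead pushes allocate()/deallocate() onto the device's worker
// queue, keeping buffer lifetime ordered with the other asynchronously dispatched work.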
Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type, + const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, + std::optional< ShardSpecBuffer> shard_parameters = std::nullopt + ) : tt::tt_metal::Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, false) { + this->allocate(); + } + ~Buffer() { + this->deallocate(); + } + private: + void allocate() { + TT_ASSERT(this->device()); + this->device()->push_work([this] () mutable { + bool bottom_up = this->buffer_type() == BufferType::DRAM; + tt::tt_metal::detail::AllocateBuffer(this, bottom_up); + + }); + } + void deallocate() { + if (this->device() == nullptr or not this->device()->initialized_ or this->size() == 0) { + return; + } + this->set_size(0); + TT_ASSERT(this->device()->allocator_ != nullptr, "Expected allocator to be initialized!"); + this->device()->push_work([this] () mutable { + tt::tt_metal::detail::DeallocateBuffer(this); + }); + } +}; + static std::ostream &operator<<(std::ostream &os, const CoreGrid &core_grid) { os << "ttnn.CoreGrid(x=" <
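Taken together, the new ttnn async helpers let the host enqueue I/O and compute on separate hardware command queues and order them with events rather than host-side stalls. The sketch below is illustrative only and is not part of the patch: it assumes a `device` opened with two command queues and running in asynchronous worker mode, a device-resident `Tensor input` whose storage was allocated as above, `host_data` in a form accepted by `write_buffer`, and a placeholder device op `MyDeviceOp`.

// Sketch under the assumptions stated above; `device`, `input`, `host_data` and `MyDeviceOp`
// are placeholders, not identifiers introduced by this patch.
constexpr uint8_t io_cq = 1;        // transfers go through command queue 1
constexpr uint8_t workload_cq = 0;  // compute goes through command queue 0

auto write_done = std::make_shared<Event>();
ttnn::write_buffer(io_cq, input, {host_data});                          // async host-to-device copy on CQ1
ttnn::record_event(device->command_queue(io_cq), write_done);           // CQ1 records completion of the copy
ttnn::wait_for_event(device->command_queue(workload_cq), write_done);   // CQ0 waits on-device; the host is not blocked
auto outputs = ttnn::run_operation(workload_cq, MyDeviceOp{}, {input}); // dispatch the op through CQ0
ttnn::queue_synchronize(device->command_queue(workload_cq));            // drain the async engine and CQ0 before reading back

Because each call above only pushes work onto the device's worker thread, the host stays free to prepare the next batch while CQ0 and CQ1 drain independently; `read_buffer` on the I/O queue would then bring `outputs` back to host memory.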