diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 672e458b7..c02765224 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -55,7 +55,11 @@
 #include "ttnn/operations/embedding/embedding.hpp"
 #include "ttnn/operations/matmul/matmul.hpp"
 #include "ttnn/operations/normalization/softmax/softmax.hpp"
+#include "ttnn/operations/pool/maxpool/max_pool2d.hpp"
 #include "ttnn/operations/reduction/generic/generic_reductions.hpp"
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/tensor/types.hpp"
+
 #pragma clang diagnostic pop

 #include "tt/runtime/types.h"
diff --git a/runtime/lib/ttnn/CMakeLists.txt b/runtime/lib/ttnn/CMakeLists.txt
index ad2542456..98aa7fc98 100644
--- a/runtime/lib/ttnn/CMakeLists.txt
+++ b/runtime/lib/ttnn/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(operations)
 add_library(TTRuntimeTTNN
   STATIC
   runtime.cpp
@@ -5,11 +6,10 @@ add_library(TTRuntimeTTNN
 )
 # We have to set the C++ standard to 20 because tt-metal requires it
 set_property(TARGET TTRuntimeTTNN PROPERTY CXX_STANDARD 20)
-target_compile_options(TTRuntimeTTNN PRIVATE -mavx -mavx2)
 target_include_directories(TTRuntimeTTNN PUBLIC
   ${PROJECT_SOURCE_DIR}/runtime/include
   ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
 )
 target_include_directories(TTRuntimeTTNN PUBLIC "$")
-target_link_libraries(TTRuntimeTTNN PUBLIC TTNN_LIBRARY)
-add_dependencies(TTRuntimeTTNN TTNN_LIBRARY tt-metal FBS_GENERATION)
+target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps)
+add_dependencies(TTRuntimeTTNN TTRuntimeTTNNOps)
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
new file mode 100644
index 000000000..2fdc41485
--- /dev/null
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TYPES_H
+#define TTNN_RUNTIME_TYPES_H
+
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn {
+
+using DeviceMap = std::unordered_map<uint32_t, ::ttnn::Device *>;
+using TensorMap = std::unordered_map<uint32_t, ::ttnn::Tensor *>;
+struct ProgramTensorPool {
+  ProgramTensorPool(const TensorMap &liveTensors) : liveTensors(liveTensors) {}
+
+  auto try_emplace(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
+    auto it = liveTensors.find(global_id);
+    if (it != liveTensors.end()) {
+      return std::make_pair(it, false);
+    }
+    assert(!intermedTensors.contains(global_id));
+    intermedTensors.try_emplace(global_id, tensor);
+    return liveTensors.try_emplace(global_id, &intermedTensors.at(global_id));
+  }
+
+  auto insert_or_assign(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
+    intermedTensors.insert_or_assign(global_id, tensor);
+    return liveTensors.insert_or_assign(global_id,
+                                        &intermedTensors.at(global_id));
+  }
+
+  ::ttnn::Tensor &at(std::uint32_t global_id) {
+    assert(liveTensors.contains(global_id));
+    return *liveTensors.at(global_id);
+  }
+
+  size_t erase(std::uint32_t global_id) {
+    assert(liveTensors.contains(global_id) &&
+           intermedTensors.contains(global_id));
+    intermedTensors.erase(global_id);
+    return liveTensors.erase(global_id);
+  }
+
+  bool contains(std::uint32_t global_id) const {
+    return liveTensors.contains(global_id);
+  }
+
+private:
+  // A superset of intermedTensors, containing pointers to all tensors created
+  // by the program and the input/output tensors passed in by the user
+  TensorMap liveTensors;
+
+  // A subset of liveTensors, containing values of any intermediate tensors
+  // created by the program
+  std::unordered_map<uint32_t, ::ttnn::Tensor> intermedTensors;
+};
+
+struct ProgramContext {
+  ProgramTensorPool tensorPool;
+  DeviceMap allDevices;
+  DeviceMap devicePool;
+
+  ProgramContext(const TensorMap &liveTensors, const DeviceMap &allDevices)
+      : tensorPool(ProgramTensorPool(liveTensors)), allDevices(allDevices) {}
+};
+} // namespace tt::runtime::ttnn
+
+#endif
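The two insertion paths of ProgramTensorPool above have deliberately different semantics. A minimal sketch of the difference (the id 42, makeTensor, and the harness are illustrative, not part of this PR):

    ::ttnn::Tensor makeTensor(); // hypothetical stand-in for any tensor source

    void sketch() {
      ::ttnn::Tensor userOutput = makeTensor(); // output owned by the caller
      TensorMap live;
      live.try_emplace(42u, &userOutput);       // 42 = illustrative global id

      ProgramTensorPool pool(live);
      ::ttnn::Tensor produced = makeTensor();

      pool.try_emplace(42u, produced);      // no-op: the caller's output wins
      pool.insert_or_assign(42u, produced); // repoints the live entry at a
                                            // fresh copy in intermedTensors
    }

This is why the creation ops below (empty.cpp, to_device.cpp) use try_emplace, so a user-provided program output is never overwritten, while compute ops use insert_or_assign.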
diff --git a/runtime/lib/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
similarity index 100%
rename from runtime/lib/ttnn/utils.h
rename to runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt
new file mode 100644
index 000000000..70a6b962c
--- /dev/null
+++ b/runtime/lib/ttnn/operations/CMakeLists.txt
@@ -0,0 +1,38 @@
+set(TTNN_OPS_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/operations/utils.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/conv/conv2d.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/creation/empty.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/creation/full.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/concat.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/reshape.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/transpose.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/deletion/dealloc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/binary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/unary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/embedding/embedding.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_device.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_layout.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_memory_config.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/matmul/matmul.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/normalization/softmax.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/pool/maxpool2d.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/reduction/reduction.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/context/get_device.cpp
+)
+
+add_library(TTRuntimeTTNNOps
+  STATIC
+  ${TTNN_OPS_SRCS}
+)
+
+set_property(TARGET TTRuntimeTTNNOps PROPERTY CXX_STANDARD 20)
+target_compile_options(TTRuntimeTTNNOps PUBLIC -mavx -mavx2)
+target_include_directories(TTRuntimeTTNNOps PUBLIC
+  ${PROJECT_SOURCE_DIR}/runtime/include
+  ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include
+  ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include
+  ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
+)
+target_include_directories(TTRuntimeTTNNOps PUBLIC "$")
+target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY)
+add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION)
diff --git a/runtime/lib/ttnn/operations/context/get_device.cpp b/runtime/lib/ttnn/operations/context/get_device.cpp
new file mode 100644
index 000000000..36aa40be3
--- /dev/null
+++ b/runtime/lib/ttnn/operations/context/get_device.cpp
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "get_device.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::context {
+void run(const ::tt::target::ttnn::GetDeviceOp *op, ProgramContext &context) {
+  DeviceMap &devicePool = context.devicePool;
+  DeviceMap &allDevices = context.allDevices;
+  const flatbuffers::Vector<uint32_t> *chipIds = op->chip_ids();
+  assert(chipIds->size() == 1 && "Expected 1 chip id");
+  for (const uint32_t chipId : *chipIds) {
+    assert(allDevices.contains(chipId) && "Device not found");
+    auto [iter, inserted] =
+        devicePool.try_emplace(chipId, allDevices.at(chipId));
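+    // try_emplace yields {iterator, false} when the chip id is already in
+    // devicePool, so the assert below rejects duplicate chip ids.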
assert(inserted && "Duplicate device"); + } +} +} // namespace tt::runtime::ttnn::operations::context diff --git a/runtime/lib/ttnn/operations/context/get_device.h b/runtime/lib/ttnn/operations/context/get_device.h new file mode 100644 index 000000000..1d9b11201 --- /dev/null +++ b/runtime/lib/ttnn/operations/context/get_device.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_GET_DEVICE_H +#define TTNN_RUNTIME_GET_DEVICE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::context { +void run(const ::tt::target::ttnn::GetDeviceOp *op, ProgramContext &context); + +} // namespace tt::runtime::ttnn::operations::context + +#endif diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp new file mode 100644 index 000000000..11bcb1697 --- /dev/null +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "conv2d.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::conv { +void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + DeviceMap &devicePool = context.devicePool; + const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); + const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); + std::optional<::ttnn::Tensor> bias = + op->bias() ? std::make_optional(tensorPool.at(op->bias()->global_id())) + : std::nullopt; + auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); + config.dtype = input.dtype(); + config.weights_dtype = weight.dtype(); + ::ttnn::Device &device = utils::getDevice(op->device(), devicePool); + ::ttnn::Tensor out = + std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( + input, weight, &device, op->in_channels(), op->out_channels(), + op->batch_size(), op->input_height(), op->input_width(), + {op->kernel_height(), op->kernel_width()}, + {op->stride_height(), op->stride_width()}, + {op->padding_height(), op->padding_width()}, + {op->dilation_height(), op->dilation_width()}, op->groups(), bias, + config)); + + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::conv diff --git a/runtime/lib/ttnn/operations/conv/conv2d.h b/runtime/lib/ttnn/operations/conv/conv2d.h new file mode 100644 index 000000000..3ce35f7af --- /dev/null +++ b/runtime/lib/ttnn/operations/conv/conv2d.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_CONV2D_H +#define TTNN_RUNTIME_CONV2D_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::conv { +void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context); + +} // namespace tt::runtime::ttnn::operations::conv + +#endif diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp new file mode 100644 index 000000000..68bf6dc3f --- /dev/null +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// 
SPDX-License-Identifier: Apache-2.0
+
+#include "empty.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(op->out());
+  // TODO(bug #582): ttnn::empty doesn't work properly with tile layout,
+  // using ROW_MAJOR until we fix it
+  auto desiredLayout = ::ttnn::Layout::ROW_MAJOR;
+  auto shape = ::ttnn::Shape(
+      ::tt::tt_metal::Shape(::tt::runtime::ttnn::utils::toShapeFromFBShape(
+          *op->out()->desc()->shape())));
+
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out =
+      ::ttnn::empty(shape, targetDataTypeTTNN, desiredLayout, device);
+  // use try_emplace here so the program output tensor doesn't get overwritten
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::creation
diff --git a/runtime/lib/ttnn/operations/creation/empty.h b/runtime/lib/ttnn/operations/creation/empty.h
new file mode 100644
index 000000000..ecc42a71e
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/empty.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_EMPTY_H
+#define TTNN_RUNTIME_EMPTY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+
+void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::creation
+
+#endif
diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp
new file mode 100644
index 000000000..22b22f0cc
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/full.cpp
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "full.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  auto shape = ::ttnn::Shape(
+      ::tt::tt_metal::Shape(::tt::runtime::ttnn::utils::toShapeFromFBShape(
+          *op->out()->desc()->shape())));
+  float fillValue = op->fill_value();
+  // TODO(bug #272), determine correct layout by tile shape in the future
+  ::ttnn::Layout outputLayout = ::ttnn::Layout::ROW_MAJOR;
+  std::optional<std::reference_wrapper<::ttnn::Device>> outputDevice =
+      std::make_optional(std::ref(device));
+  std::optional<::tt::tt_metal::MemoryConfig> outputMemoryConfig =
+      std::make_optional(utils::createMemoryConfig(op->out()));
+
+  ::ttnn::Tensor out =
+      ::ttnn::full(shape, fillValue, outputDataType, outputLayout, outputDevice,
+                   outputMemoryConfig);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::creation
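A side note on the std::ref(device) line above: std::optional cannot hold a raw reference, so an optional device reference is expressed as std::optional<std::reference_wrapper<...>>. A self-contained illustration, with a plain int standing in for ::ttnn::Device:

    #include <functional>
    #include <optional>

    void illustrate() {
      int device = 7; // stand-in for ::ttnn::Device
      std::optional<std::reference_wrapper<int>> maybeRef = std::ref(device);
      if (maybeRef) {
        maybeRef->get() = 8; // writes through to `device`
      }
    }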
diff --git a/runtime/lib/ttnn/operations/creation/full.h b/runtime/lib/ttnn/operations/creation/full.h
new file mode 100644
index 000000000..2b2e003f6
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/full.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_FULL_H
+#define TTNN_RUNTIME_FULL_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+
+void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::creation
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/concat.cpp b/runtime/lib/ttnn/operations/data_movement/concat.cpp
new file mode 100644
index 000000000..e904adc2a
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/concat.cpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "concat.h"
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ConcatOp *op, ProgramContext &context) {
+  std::vector<::ttnn::Tensor> inputs;
+  for (const auto &input : *op->inputs()) {
+    inputs.push_back(context.tensorPool.at(input->global_id()));
+  }
+  int32_t dim = op->dim();
+  ::ttnn::Tensor out = ::ttnn::concat(inputs, dim);
+  context.tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::data_movement
diff --git a/runtime/lib/ttnn/operations/data_movement/concat.h b/runtime/lib/ttnn/operations/data_movement/concat.h
new file mode 100644
index 000000000..396be6fb3
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/concat.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_CONCAT_H
+#define TTNN_RUNTIME_CONCAT_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ConcatOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::data_movement
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/reshape.cpp b/runtime/lib/ttnn/operations/data_movement/reshape.cpp
new file mode 100644
index 000000000..d91788687
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/reshape.cpp
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "reshape.h"
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+
+template <int32_t Rank>
+static std::array<int32_t, Rank>
+vectorToArray(const std::vector<int32_t> &vec) {
+  if (vec.size() != Rank) {
+    throw std::invalid_argument("Vector size does not match array size");
+  }
+  std::array<int32_t, Rank> arr;
+  std::copy(vec.begin(), vec.end(), arr.begin());
+  return arr;
+}
+
+template <int32_t Rank>
+static ::ttnn::Tensor invoke_reshape(const ::ttnn::Tensor &tensor,
+                                     const std::vector<int32_t> &shape) {
+  return ::ttnn::reshape(tensor, vectorToArray<Rank>(shape));
+}
+
+void run(const ::tt::target::ttnn::ReshapeOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+  const auto *fbShape = op->shape();
+  std::vector<int32_t> shape(fbShape->begin(), fbShape->end());
+  constexpr int32_t Rank1 = 1;
+  constexpr int32_t Rank2 = 2;
+  constexpr int32_t Rank3 = 3;
+  constexpr int32_t Rank4 = 4;
+  constexpr int32_t Rank5 = 5;
+
+  ::ttnn::Tensor out;
+  switch (fbShape->size()) {
+  case Rank1:
+    out = invoke_reshape<Rank1>(in, shape);
+    break;
+  case Rank2:
+    out = invoke_reshape<Rank2>(in, shape);
+    break;
+  case Rank3:
+    out = invoke_reshape<Rank3>(in, shape);
+    break;
+  case Rank4:
+    out = invoke_reshape<Rank4>(in, shape);
+    break;
+  case Rank5:
+    out = invoke_reshape<Rank5>(in, shape);
+    break;
+  default:
+    throw std::invalid_argument("Unsupported rank for reshape");
+  }
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::data_movement
diff --git a/runtime/lib/ttnn/operations/data_movement/reshape.h b/runtime/lib/ttnn/operations/data_movement/reshape.h
new file mode 100644
index 000000000..cfbd5ca9e
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/reshape.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_RESHAPE_H
+#define TTNN_RUNTIME_RESHAPE_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ReshapeOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::data_movement
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.cpp b/runtime/lib/ttnn/operations/data_movement/transpose.cpp
new file mode 100644
index 000000000..fdfe5411b
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/transpose.cpp
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "transpose.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+  int32_t dim0 = op->dim0();
+  int32_t dim1 = op->dim1();
+  auto inputRank = in.get_shape().rank();
+  // for the current version of permute, we need to work in 4D, so we add
+  // leading dimensions of size 1
+  std::vector<std::int64_t> dimensionOrder(4);
+  std::iota(dimensionOrder.begin(), dimensionOrder.end(), 0);
+  if (dim0 < 0) {
+    dim0 += 4;
+  } else {
+    dim0 = dim0 + 4 - inputRank;
+  }
+  if (dim1 < 0) {
+    dim1 += 4;
+  } else {
+    dim1 = dim1 + 4 - inputRank;
+  }
+  std::swap(dimensionOrder[dim0], dimensionOrder[dim1]);
+  // Ideally this would use ttnn::transpose, but since ttnn::transpose doesn't
+  // work at the moment, we use this temporary solution.
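+  // Worked example: for a rank-2 input with dim0 = 0 and dim1 = 1, both dims
+  // map to 4D indices 2 and 3 (dim + 4 - inputRank), so dimensionOrder
+  // becomes {0, 1, 3, 2}: the last two dims of the unsqueezed tensor swap.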
+ ::ttnn::Tensor unsqueezedInput = ::ttnn::unsqueeze_to_4D(in); + ::tt::tt_metal::MemoryConfig outputMemoryConfig = + utils::createMemoryConfig(op->out()); + ::ttnn::Tensor out = + ::ttnn::permute(unsqueezedInput, dimensionOrder, outputMemoryConfig); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::data_movement diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.h b/runtime/lib/ttnn/operations/data_movement/transpose.h new file mode 100644 index 000000000..27f562094 --- /dev/null +++ b/runtime/lib/ttnn/operations/data_movement/transpose.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_TRANSPOSE_H +#define TTNN_RUNTIME_TRANSPOSE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::data_movement { +void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::data_movement + +#endif diff --git a/runtime/lib/ttnn/operations/deletion/dealloc.cpp b/runtime/lib/ttnn/operations/deletion/dealloc.cpp new file mode 100644 index 000000000..0fc992583 --- /dev/null +++ b/runtime/lib/ttnn/operations/deletion/dealloc.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "dealloc.h" +#include "tt/runtime/detail/ttnn.h" + +namespace tt::runtime::ttnn::operations::deletion { +void run(const ::tt::target::ttnn::DeallocOp *op, ProgramContext &context) { + bool force = true; + ProgramTensorPool &tensorPool = context.tensorPool; + ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); + tensor.deallocate(force); + tensorPool.erase(op->in()->global_id()); +} +} // namespace tt::runtime::ttnn::operations::deletion diff --git a/runtime/lib/ttnn/operations/deletion/dealloc.h b/runtime/lib/ttnn/operations/deletion/dealloc.h new file mode 100644 index 000000000..f0b0ea786 --- /dev/null +++ b/runtime/lib/ttnn/operations/deletion/dealloc.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_DEALLOC_H +#define TTNN_RUNTIME_DEALLOC_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::deletion { +void run(const ::tt::target::ttnn::DeallocOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::deletion + +#endif diff --git a/runtime/lib/ttnn/operations/eltwise/binary.cpp b/runtime/lib/ttnn/operations/eltwise/binary.cpp new file mode 100644 index 000000000..b6143d358 --- /dev/null +++ b/runtime/lib/ttnn/operations/eltwise/binary.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "binary.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::binary { + +static void +getEltwiseBinaryOPInputTensors(const ::tt::target::ttnn::EltwiseOp *op, + ProgramTensorPool &tensorPool, + ::ttnn::Tensor **lhs, ::ttnn::Tensor **rhs) { + assert(op->ins()->size() == 2 && "Expected 2 inputs"); + *lhs = &(tensorPool.at(op->ins()->Get(0)->global_id())); + *rhs = &(tensorPool.at(op->ins()->Get(1)->global_id())); +} + +static void runEltwiseBinaryOP( + const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool 
&tensorPool,
+    std::function<::ttnn::Tensor(
+        const ::ttnn::Tensor &, const ::ttnn::Tensor &,
+        const std::optional<const ::ttnn::DataType> &,
+        const std::optional<::tt::tt_metal::MemoryConfig> &,
+        std::optional<::ttnn::Tensor>,
+        std::optional<::ttnn::operations::unary::FusedActivations>,
+        std::optional<::ttnn::operations::unary::UnaryWithParam>)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *lhs = nullptr;
+  ::ttnn::Tensor *rhs = nullptr;
+  getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs);
+
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig,
+                              std::nullopt, std::nullopt, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+static void runEltwiseBinaryCompositeOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &, const ::ttnn::Tensor &,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *lhs = nullptr;
+  ::ttnn::Tensor *rhs = nullptr;
+  getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) {
+  assert(isBinaryOp(op) && "Expected binary operation");
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  /* Eltwise Binary */
+  case ::tt::target::ttnn::EltwiseOpType::Add: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::add);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Multiply: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::multiply);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Subtract: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::subtract);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::GreaterEqual: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::ge);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Div: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::divide);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Maximum: {
+    runEltwiseBinaryCompositeOP(op, tensorPool, ::ttnn::maximum);
+    break;
+  }
+  default:
+    throw std::invalid_argument("Unsupported Eltwise Binary operation");
+  }
+}
+
+} // namespace tt::runtime::ttnn::operations::binary
diff --git a/runtime/lib/ttnn/operations/eltwise/binary.h b/runtime/lib/ttnn/operations/eltwise/binary.h
new file mode 100644
index 000000000..56af49033
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/binary.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_ELTWISE_BINARY_H
+#define TTNN_RUNTIME_ELTWISE_BINARY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::binary {
+
+inline bool isBinaryOp(const ::tt::target::ttnn::EltwiseOp *op) {
+  return op->ins()->size() == 2;
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::binary
+
+#endif
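Extending this dispatch is mechanical. A hypothetical sketch for one more comparison op, assuming a matching ::ttnn::le entry point with the non-composite signature above and a corresponding EltwiseOpType value in the flatbuffer schema:

    case ::tt::target::ttnn::EltwiseOpType::LessEqual: {
      runEltwiseBinaryOP(op, tensorPool, ::ttnn::le);
      break;
    }

No build changes are needed for such a case, since it lives in the existing binary.cpp already listed in TTNN_OPS_SRCS.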
diff --git a/runtime/lib/ttnn/operations/eltwise/unary.cpp b/runtime/lib/ttnn/operations/eltwise/unary.cpp
new file mode 100644
index 000000000..7646f542c
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/unary.cpp
@@ -0,0 +1,88 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "unary.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::unary {
+
+static void
+getEltwiseUnaryOPInputTensor(const ::tt::target::ttnn::EltwiseOp *op,
+                             ProgramTensorPool &tensorPool,
+                             ::ttnn::Tensor **in) {
+  assert(op->ins()->size() == 1 && "Expected 1 input");
+  *in = &(tensorPool.at(op->ins()->Get(0)->global_id()));
+}
+
+static void runEltwiseUnaryOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &,
+                       const std::optional<::ttnn::Tensor> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *in = nullptr;
+  getEltwiseUnaryOPInputTensor(op, tensorPool, &in);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+static void runEltwiseUnaryWithFastAndApproximateModeOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &, const bool,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &,
+                       const std::optional<::ttnn::Tensor> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *in = nullptr;
+  getEltwiseUnaryOPInputTensor(op, tensorPool, &in);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out =
+      ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) {
+  assert(isUnaryOp(op) && "Expected unary operation");
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  case ::tt::target::ttnn::EltwiseOpType::Abs: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::abs);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Relu: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::relu);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Sqrt: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::sqrt);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Sigmoid: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::sigmoid);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Reciprocal: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::reciprocal);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Exp: {
+    runEltwiseUnaryWithFastAndApproximateModeOP(op, tensorPool, ::ttnn::exp);
+    break;
+  }
+  default:
+    throw std::invalid_argument("Unsupported unary operation");
+  }
+}
+
+} // namespace tt::runtime::ttnn::operations::unary
diff --git a/runtime/lib/ttnn/operations/eltwise/unary.h b/runtime/lib/ttnn/operations/eltwise/unary.h
new file mode 100644
index 000000000..691e3ef67
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/unary.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_ELTWISE_UNARY_H
+#define TTNN_RUNTIME_ELTWISE_UNARY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::unary {
+
+inline bool isUnaryOp(const ::tt::target::ttnn::EltwiseOp *op) {
+  return op->ins()->size() == 1;
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::unary
+
+#endif
diff --git a/runtime/lib/ttnn/operations/embedding/embedding.cpp b/runtime/lib/ttnn/operations/embedding/embedding.cpp
new file mode 100644
index 000000000..433428eda
--- /dev/null
+++ b/runtime/lib/ttnn/operations/embedding/embedding.cpp
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "embedding.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::embedding {
+void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) {
+
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id());
+  const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id());
+  // default params for embedding op
+  std::optional<int> padToken = std::nullopt;
+  ::tt::tt_metal::Layout layout = ::ttnn::ROW_MAJOR_LAYOUT;
+  auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC;
+  ::ttnn::DataType outputDataType = utils::getDataType(op->output());
+  ::ttnn::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->output());
+  ::ttnn::Tensor out =
+      ::ttnn::embedding(input, weight, padToken, layout, embeddingsType,
+                        outputDataType, outputMemoryConfig);
+  tensorPool.insert_or_assign(op->output()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::embedding
diff --git a/runtime/lib/ttnn/operations/embedding/embedding.h b/runtime/lib/ttnn/operations/embedding/embedding.h
new file mode 100644
index 000000000..7c287bb50
--- /dev/null
+++ b/runtime/lib/ttnn/operations/embedding/embedding.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_EMBEDDING_H
+#define TTNN_RUNTIME_EMBEDDING_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::embedding {
+void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::embedding
+
+#endif
diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp
new file mode 100644
index 000000000..ce8173244
--- /dev/null
+++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::utils {
+
+bool isOnHost(const ::ttnn::Tensor &tensor) {
+  // Currently only supports borrowed or owned host storage
+  return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or
+         tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED;
+}
+
+bool isOnDevice(const ::ttnn::Tensor &tensor) {
+  // Currently only supports single device storage
+  return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE;
+}
+
+::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) {
+  return ::tt::runtime::ttnn::utils::toTTNNDataType(
+      tensorRef->desc()->layout()->memory_desc()->data_type());
+}
+::ttnn::Device &getDevice(const ::tt::target::DeviceRef *deviceRef,
+                          DeviceMap &devicePool) {
+  uint32_t deviceId = deviceRef->global_id();
+  assert(devicePool.contains(deviceId) && "Device not found in device pool");
+  return *devicePool.at(deviceId);
+}
+
+CoreRangeSet toCoreRangeSet(
+    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet) {
+  std::set<CoreRange> coreRanges;
+  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
+    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
+    // End is inclusive
+    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
+                  coreRange->loc().y() + coreRange->size().y() - 1);
+
+    coreRanges.emplace(start, end);
+  }
+  return CoreRangeSet(coreRanges);
+}
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
+  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
+      layout->memory_desc()->memory_layout();
+  const ::tt::target::MemorySpace targetMemorySpace =
+      layout->memory_desc()->memory_space();
+  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+
+  // TODO (jnie): Hardcoding to interleaved and block sharded for now
+  // Add support for other types once compiler supports it
+  assert(targetMemoryLayout == ::tt::target::TensorMemoryLayout::Interleaved ||
+         targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded);
+  assert((targetMemoryLayout != target::TensorMemoryLayout::BlockSharded ||
+          targetMemorySpace == target::MemorySpace::DeviceL1) &&
+         "Only L1 memory space supports sharded memory layout");
+  assert(targetCoreRangeSet->size() == 1 &&
+         "Currently only single core range/grid is supported");
+  assert(targetShardShape->size() == 2 &&
+         "Only 2D shard shape is supported in TTNN backend");
+
+  CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+
+  if (targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded) {
+    assert(ttnnShardShape[0] % ::tt::constants::TILE_HEIGHT == 0 &&
+           ttnnShardShape[1] % ::tt::constants::TILE_WIDTH == 0 &&
+           "Shard shape must divide tile shape (32, 32) evenly");
+  }
+
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout =
+      ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(targetMemoryLayout);
+
+  ::tt::tt_metal::BufferType ttnnBufferType =
+      ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace);
+
+  return {ttnnMemLayout, ttnnBufferType, shardSpec};
+}
+
+} // namespace tt::runtime::ttnn::operations::utils
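To make the inclusive-end convention in toCoreRangeSet concrete, a small worked instance (values are illustrative):

    // A Dim2dRange with loc = (0, 0) and size = (8, 8), i.e. an 8x8 grid
    // anchored at the origin, maps to the core range (0, 0) .. (7, 7):
    CoreCoord start(0, 0);
    CoreCoord end(0 + 8 - 1, 0 + 8 - 1); // (7, 7), end is inclusive
    CoreRange grid(start, end);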
diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h
new file mode 100644
index 000000000..a6b218536
--- /dev/null
+++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h
@@ -0,0 +1,32 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_OPERATIONS_UTILS_H
+#define TTNN_RUNTIME_OPERATIONS_UTILS_H
+
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+#include "types_generated.h"
+#include <optional>
+
+namespace tt::runtime::ttnn::operations::utils {
+
+bool isOnHost(const ::ttnn::Tensor &tensor);
+
+bool isOnDevice(const ::ttnn::Tensor &tensor);
+
+::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef);
+
+::ttnn::Device &getDevice(const ::tt::target::DeviceRef *deviceRef,
+                          DeviceMap &devicePool);
+
+CoreRangeSet toCoreRangeSet(
+    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet);
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef);
+
+} // namespace tt::runtime::ttnn::operations::utils
+#endif
diff --git a/runtime/lib/ttnn/operations/layout/to_device.cpp b/runtime/lib/ttnn/operations/layout/to_device.cpp
new file mode 100644
index 000000000..f1ff3d44f
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_device.cpp
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "to_device.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  ::ttnn::TensorMemoryLayout tensorMemoryLayout =
+      ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(
+          op->memcfg()->tensor_memory_layout());
+
+  ::ttnn::BufferType bufferType;
+  switch (op->memcfg()->buffer_type()) {
+  case ::tt::target::BufferType::DRAM:
+    bufferType = ::ttnn::BufferType::DRAM;
+    break;
+  case ::tt::target::BufferType::L1:
+    bufferType = ::ttnn::BufferType::L1;
+    break;
+  case ::tt::target::BufferType::SystemMemory:
+    bufferType = ::ttnn::BufferType::SYSTEM_MEMORY;
+    break;
+  case ::tt::target::BufferType::L1Small:
+    bufferType = ::ttnn::BufferType::L1_SMALL;
+    break;
+  case ::tt::target::BufferType::Trace:
+    bufferType = ::ttnn::BufferType::TRACE;
+    break;
+  }
+
+  // TODO(bug #620):
+  // Until ShardSpec support is added in TTNN, read it from the output tensor.
+  // If ShardSpec is not supplied, an error will be thrown in ttnn lib.
+  //
+  const ::tt::target::LayoutDesc *layout = op->out()->desc()->layout();
+  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+  CoreRangeSet ttnnCoreRangeSet = utils::toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::ttnn::MemoryConfig memoryConfig = {tensorMemoryLayout, bufferType,
+                                       shardSpec};
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = ::ttnn::to_device(inputTensor, &device, memoryConfig);
+
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_device.h b/runtime/lib/ttnn/operations/layout/to_device.h
new file mode 100644
index 000000000..237211ed4
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_device.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_DEVICE_H
+#define TTNN_RUNTIME_TO_DEVICE_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::layout
+
+#endif
diff --git a/runtime/lib/ttnn/operations/layout/to_layout.cpp b/runtime/lib/ttnn/operations/layout/to_layout.cpp
new file mode 100644
index 000000000..adfeb8b43
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_layout.cpp
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "to_layout.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  ::ttnn::Layout layout;
+  switch (op->layout()) {
+  case ::tt::target::TensorLayout::RowMajor:
+    layout = ::ttnn::Layout::ROW_MAJOR;
+    break;
+  case ::tt::target::TensorLayout::Tile:
+    layout = ::ttnn::Layout::TILE;
+    break;
+  case ::tt::target::TensorLayout::Invalid:
+    layout = ::ttnn::Layout::INVALID;
+    break;
+  }
+
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = ::ttnn::to_layout(inputTensor, layout, std::nullopt,
+                                         std::nullopt, &device);
+
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_layout.h b/runtime/lib/ttnn/operations/layout/to_layout.h
new file mode 100644
index 000000000..f397b47cc
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_layout.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_LAYOUT_H
+#define TTNN_RUNTIME_TO_LAYOUT_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::layout { +void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::layout + +#endif diff --git a/runtime/lib/ttnn/operations/layout/to_memory_config.cpp b/runtime/lib/ttnn/operations/layout/to_memory_config.cpp new file mode 100644 index 000000000..a24c16f08 --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_memory_config.cpp @@ -0,0 +1,217 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "to_memory_config.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn::operations::layout { + +static ::ttnn::Tensor tilize(::ttnn::Tensor const &input) { + // NOLINTNEXTLINE + return ::ttnn::to_layout(input, ::ttnn::TILE_LAYOUT, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); +} + +static ::ttnn::Tensor untilize(::ttnn::Tensor const &input) { + return ::ttnn::to_layout(input, ::ttnn::ROW_MAJOR_LAYOUT, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); +} + +static ::ttnn::Tensor convertDataType(const ::ttnn::Tensor &input, + const ::ttnn::DataType &targetDataType) { + if (utils::isOnHost(input)) { + return ::ttnn::to_dtype(input, targetDataType); + } + + if (utils::isOnDevice(input)) { + if (input.get_layout() != ::ttnn::TILE_LAYOUT) { + // typecast op requires tilized tensor + ::ttnn::Tensor converted = + ::ttnn::typecast(tilize(input), targetDataType); + // untilize and return + return untilize(converted); + } + return ::ttnn::typecast(input, targetDataType); + } + + throw std::runtime_error("Unsupported storage type"); +} + +/* TODO(bug #272), ideal flow is to determine tilize/untilize with + * tile_shape */ +static ::ttnn::Tensor +updateLayoutAndDataType(const ::ttnn::Tensor &inputTensor, + const ::ttnn::DataType targetDataType, + const bool shouldTilize, const bool shouldUntilize) { + + ::ttnn::Tensor outputTensor = inputTensor; + const bool shouldConvertDataType = inputTensor.get_dtype() != targetDataType; + // const int targetTileX = targetTileShape->x(); + // const int targetTileY = targetTileShape->y(); + // const bool shouldTilize = + // targetTileX == 32 and targetTileY == 32 and + // inputTensor.get_layout() == ::ttnn::ROW_MAJOR_LAYOUT; + // const bool shouldUntilize = (targetTileX != 32 or targetTileY != 32) and + // inputTensor.get_layout() == + // ::ttnn::TILE_LAYOUT; + assert(not(shouldTilize and shouldUntilize) && + "Cannot tilize and untilize tensor at the same time"); + if (shouldTilize) { + outputTensor = tilize(outputTensor); + } else if (shouldUntilize) { + outputTensor = untilize(outputTensor); + } + if (shouldConvertDataType) { + outputTensor = convertDataType(outputTensor, targetDataType); + } + return outputTensor; +} + +static void +handleToHostMemoryConfigOp(const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::Tensor result; + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + shouldTilize = false; + shouldUntilize = true; + result = updateLayoutAndDataType(inputTensor, targetDataTypeTTNN, + shouldTilize, shouldUntilize); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = true; + result 
= updateLayoutAndDataType(inputTensor.cpu(), targetDataTypeTTNN, + shouldTilize, shouldUntilize); + } + // copy the output to the output tensor if it exists + if (tensorPool.contains(outputTensorRef->global_id())) { + ::ttnn::Tensor &outputTensor = tensorPool.at(outputTensorRef->global_id()); + void *src = ::tt::tt_metal::get_raw_host_data_ptr(result); + void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor); + std::uint32_t size = result.volume() * result.element_size(); + std::memcpy(dst, src, size); + } else { + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } +} + +static void +handleToDramMemoryConfigOp(::ttnn::Device &device, + const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + ::tt::tt_metal::MemoryConfig targetMemoryConfig = + utils::createMemoryConfig(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + ::ttnn::Tensor result = inputTensor; + shouldTilize = true; + shouldUntilize = false; + // device tilize requires BFLOAT16, if not then tilize on host + if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { + result = tilize(result); + shouldTilize = false; + } + result = ::ttnn::to_device(result, &device, targetMemoryConfig); + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = false; + ::ttnn::Tensor result = updateLayoutAndDataType( + inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); + result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } +} + +static void +handleToL1MemoryConfigOp(::ttnn::Device &device, + const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + ::tt::tt_metal::MemoryConfig targetMemoryConfig = + utils::createMemoryConfig(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + ::ttnn::Tensor result = inputTensor; + // device tilize requires BFLOAT16, if not then tilize on host + if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { + result = tilize(result); + result = ::ttnn::to_device(result, &device, targetMemoryConfig); + shouldTilize = false; + shouldUntilize = false; + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + } else { + shouldTilize = true; + shouldUntilize = false; + // device tilize op requires height sharded or interleaved tensors + // thus tilize first with default mem config, then convert memory config + result = ::ttnn::to_device(result, &device, std::nullopt); + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + result = + ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); + } + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = false; + ::ttnn::Tensor result = updateLayoutAndDataType( + inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); + result = ::ttnn::to_memory_config(result, targetMemoryConfig, 
std::nullopt);
+    tensorPool.insert_or_assign(outputTensorRef->global_id(), result);
+  }
+}
+
+// TODO(bug #272): right now hardcoding tilize/untilize, should determine with
+// tile shape blocked by issue #272
+void run(const ::tt::target::ttnn::ToMemoryConfigOp *op,
+         ProgramContext &context) {
+
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in0()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  const ::tt::target::Dim2d *targetTileShape =
+      op->out()->desc()->layout()->memory_desc()->tile_shape();
+  assert(::tt::runtime::ttnn::utils::isValidTileShape(targetTileShape) &&
+         "Invalid tile shape");
+
+  const ::tt::target::MemorySpace targetMemorySpace =
+      op->out()->desc()->layout()->memory_desc()->memory_space();
+
+  switch (targetMemorySpace) {
+  // This case should only be used when gathering outputs at the end of the
+  // program
+  case ::tt::target::MemorySpace::System:
+  case ::tt::target::MemorySpace::SystemMMIO: {
+    handleToHostMemoryConfigOp(inputTensor, op->out(), tensorPool);
+    break;
+  }
+  case ::tt::target::MemorySpace::DeviceDRAM: {
+    ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+    handleToDramMemoryConfigOp(device, inputTensor, op->out(), tensorPool);
+    break;
+  }
+  case ::tt::target::MemorySpace::DeviceL1: {
+    ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+    handleToL1MemoryConfigOp(device, inputTensor, op->out(), tensorPool);
+    break;
+  }
+  }
+}
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_memory_config.h b/runtime/lib/ttnn/operations/layout/to_memory_config.h
new file mode 100644
index 000000000..269897981
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_memory_config.h
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_MEMORY_CONFIG_H
+#define TTNN_RUNTIME_TO_MEMORY_CONFIG_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToMemoryConfigOp *op,
+         ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::layout
+
+#endif
diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp
new file mode 100644
index 000000000..458cfe444
--- /dev/null
+++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "matmul.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::matmul {
+// ANCHOR: adding_an_op_matmul_runtime
+void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id());
+  const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id());
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+  ::ttnn::Tensor out = ::ttnn::operations::matmul::matmul(
+      lhs, rhs, /*bias=*/std::nullopt,
+      ::ttnn::operations::matmul::Matmul{/*program_config=*/std::nullopt,
/*bcast_batch=*/std::nullopt, + outputMemoryConfig, outputDataType}); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +// ANCHOR_END: adding_an_op_matmul_runtime + +} // namespace tt::runtime::ttnn::operations::matmul diff --git a/runtime/lib/ttnn/operations/matmul/matmul.h b/runtime/lib/ttnn/operations/matmul/matmul.h new file mode 100644 index 000000000..2db14d7b9 --- /dev/null +++ b/runtime/lib/ttnn/operations/matmul/matmul.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_MATMUL_H +#define TTNN_RUNTIME_MATMUL_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::matmul { +void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::matmul + +#endif diff --git a/runtime/lib/ttnn/operations/normalization/softmax.cpp b/runtime/lib/ttnn/operations/normalization/softmax.cpp new file mode 100644 index 000000000..eba2323d4 --- /dev/null +++ b/runtime/lib/ttnn/operations/normalization/softmax.cpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "softmax.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::normalization { +void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); + int32_t dimension = op->dimension(); + ::tt::tt_metal::MemoryConfig outputMemoryConfig = + utils::createMemoryConfig(op->out()); + ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::normalization diff --git a/runtime/lib/ttnn/operations/normalization/softmax.h b/runtime/lib/ttnn/operations/normalization/softmax.h new file mode 100644 index 000000000..caf2cbc54 --- /dev/null +++ b/runtime/lib/ttnn/operations/normalization/softmax.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_SOFTMAX_H +#define TTNN_RUNTIME_SOFTMAX_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::normalization { +void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::normalization + +#endif diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp new file mode 100644 index 000000000..742fad1bd --- /dev/null +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "maxpool2d.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::pool { +void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + DeviceMap &devicePool = context.devicePool; + const ::ttnn::Tensor &input = tensorPool.at(op->in()->global_id()); + const ::ttnn::operations::pool::MaxPool2DOp operation = + ::ttnn::operations::pool::MaxPool2DOp(); + + 
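+  // The leading 0 in the invoke call below is assumed to be the command
+  // queue id (queue 0); the remaining arguments mirror the MaxPool2dOp
+  // flatbuffer attributes.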
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = operation.invoke(
+      0, input, op->batch_size(), op->input_height(), op->input_width(),
+      op->channels(), {op->kernel_height(), op->kernel_width()},
+      {op->stride_height(), op->stride_width()},
+      {op->padding_height(), op->padding_width()},
+      {op->dilation_height(), op->dilation_width()}, &device);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::pool
diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.h b/runtime/lib/ttnn/operations/pool/maxpool2d.h
new file mode 100644
index 000000000..f8746f817
--- /dev/null
+++ b/runtime/lib/ttnn/operations/pool/maxpool2d.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_MAXPOOL2D_H
+#define TTNN_RUNTIME_MAXPOOL2D_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::pool {
+void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::pool
+
+#endif
diff --git a/runtime/lib/ttnn/operations/reduction/reduction.cpp b/runtime/lib/ttnn/operations/reduction/reduction.cpp
new file mode 100644
index 000000000..88925e3d2
--- /dev/null
+++ b/runtime/lib/ttnn/operations/reduction/reduction.cpp
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "reduction.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::reduction {
+static void runReductionOp(
+    ::tt::target::ttnn::ReductionOp const *op, ProgramTensorPool &tensorPool,
+    std::function<::ttnn::Tensor(
+        const ::ttnn::Tensor &,
+        const std::optional<std::variant<int, std::vector<int>>> &, const bool,
+        const std::optional<::tt::tt_metal::MemoryConfig> &,
+        const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)>
+        ttnnOp) {
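+  // ::ttnn::sum, ::ttnn::mean, and ::ttnn::max all share this call signature,
+  // so the switch in run() below can hand any of them in as `ttnnOp`; the
+  // dim_arg / keep_dim / memory-config decoding is then written only once.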
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+
+  const auto *fbDimArg = op->dim_arg();
+  std::optional<std::vector<int>> dimArg =
+      fbDimArg ? std::make_optional(
+                     std::vector<int>(fbDimArg->begin(), fbDimArg->end()))
+               : std::nullopt;
+
+  ::ttnn::Tensor out = ttnnOp(
+      in, dimArg, op->keep_dim(), outputMemoryConfig /* memory_config_arg */,
+      std::nullopt /* compute_kernel_config */, 1.0f /* scalar */);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::ReductionOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  case ::tt::target::ttnn::ReductionOpType::Sum: {
+    runReductionOp(op, tensorPool, ::ttnn::sum);
+    break;
+  }
+  case ::tt::target::ttnn::ReductionOpType::Mean: {
+    runReductionOp(op, tensorPool, ::ttnn::mean);
+    break;
+  }
+  case ::tt::target::ttnn::ReductionOpType::Max: {
+    runReductionOp(op, tensorPool, ::ttnn::max);
+    break;
+  }
+  }
+}
+} // namespace tt::runtime::ttnn::operations::reduction
diff --git a/runtime/lib/ttnn/operations/reduction/reduction.h b/runtime/lib/ttnn/operations/reduction/reduction.h
new file mode 100644
index 000000000..9e9d668b8
--- /dev/null
+++ b/runtime/lib/ttnn/operations/reduction/reduction.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_REDUCTION_H
+#define TTNN_RUNTIME_REDUCTION_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::reduction {
+void run(const ::tt::target::ttnn::ReductionOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::reduction
+
+#endif
diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp
index 26a967a25..ef070ffe3 100644
--- a/runtime/lib/ttnn/program.cpp
+++ b/runtime/lib/ttnn/program.cpp
@@ -1,994 +1,102 @@
 // SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "tt/runtime/detail/ttnn.h"
-#include "tt/runtime/runtime.h"
+#include "operations/context/get_device.h"
+#include "operations/conv/conv2d.h"
+#include "operations/creation/empty.h"
+#include "operations/creation/full.h"
+#include "operations/data_movement/concat.h"
+#include "operations/data_movement/reshape.h"
+#include "operations/data_movement/transpose.h"
+#include "operations/deletion/dealloc.h"
+#include "operations/eltwise/binary.h"
+#include "operations/eltwise/unary.h"
+#include "operations/embedding/embedding.h"
+#include "operations/layout/to_device.h"
+#include "operations/layout/to_layout.h"
+#include "operations/layout/to_memory_config.h"
+#include "operations/matmul/matmul.h"
+#include "operations/normalization/softmax.h"
+#include "operations/pool/maxpool2d.h"
+#include "operations/reduction/reduction.h"
+#include "tt/runtime/ttnn/types.h"
 #include "ttmlir/Target/TTNN/program_generated.h"
-#include "ttnn/device.hpp"
-#include "ttnn/operations/conv/conv2d/conv2d.hpp"
-#include "ttnn/operations/core/core.hpp"
-#include "ttnn/operations/pool/maxpool/max_pool2d.hpp"
-#include "ttnn/tensor/tensor.hpp"
-#include "ttnn/tensor/types.hpp"
-#include "ttnn/types.hpp"
-#include "types_generated.h"
-#include "utils.h"
-
-#include "ttmlir/Target/TTNN/Target.h"
-#include "ttmlir/Version.h"
 
 namespace tt::runtime::ttnn {
-class ProgramTensorPool {
-public:
-  ProgramTensorPool(
-      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors)
-      : liveTensors(liveTensors) {}
+struct ProgramExecutor {
+  ProgramContext context;
+  ProgramExecutor(const TensorMap &liveTensors, const DeviceMap &allDevices)
+      : context(ProgramContext(liveTensors, allDevices)) {}
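+
+  // Walks the flatbuffer program's operations in their serialized order and
+  // dispatches each one to the per-op run() implementation included above.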
-  auto try_emplace(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
-    auto it = liveTensors.find(global_id);
-    if (it != liveTensors.end()) {
-      return std::make_pair(it, false);
+  void execute(const ::tt::target::ttnn::Program *program) {
+    for (const ::tt::target::ttnn::Operation *op : *program->operations()) {
+      runOperation(op);
     }
-    assert(!intermedTensors.contains(global_id));
-    intermedTensors.try_emplace(global_id, tensor);
-    return liveTensors.try_emplace(global_id, &intermedTensors.at(global_id));
-  }
-
-  auto insert_or_assign(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
-    intermedTensors.insert_or_assign(global_id, tensor);
-    return liveTensors.insert_or_assign(global_id,
-                                        &intermedTensors.at(global_id));
-  }
-
-  ::ttnn::Tensor &at(std::uint32_t global_id) {
-    assert(liveTensors.contains(global_id));
-    return *liveTensors.at(global_id);
-  }
-
-  size_t erase(std::uint32_t global_id) {
-    assert(liveTensors.contains(global_id) &&
-           intermedTensors.contains(global_id));
-    intermedTensors.erase(global_id);
-    return liveTensors.erase(global_id);
-  }
-
-  bool contains(std::uint32_t global_id) const {
-    return liveTensors.contains(global_id);
   }
 
 private:
-  // A superset of intermedTensors, containing all tensors created by the
-  // program and the input/output tensors passed in by the user
-  std::unordered_map<std::uint32_t, ::ttnn::Tensor *> liveTensors;
-
-  // A subset of liveTensors, containing any intermediate tensors created by the
-  // program
-  std::unordered_map<std::uint32_t, ::ttnn::Tensor> intermedTensors;
+  void runOperation(const ::tt::target::ttnn::Operation *op);
 };
 
-static bool isOnHost(const ::ttnn::Tensor &tensor) {
-  // Currently only supports borrowed or owned host storage
-  return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or
-         tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED;
-}
-
-static bool isOnDevice(const ::ttnn::Tensor &tensor) {
-  // Currently only supports single device storage
-  return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE;
-}
-
-static ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) {
-  return utils::toTTNNDataType(
-      tensorRef->desc()->layout()->memory_desc()->data_type());
-}
-
-static ::ttnn::Device &
-getDevice(const ::tt::target::DeviceRef *deviceRef,
-          std::unordered_map<uint32_t, ::ttnn::Device *> &devicePool) {
-  uint32_t deviceId = deviceRef->global_id();
-  assert(devicePool.contains(deviceId) && "Device not found in device pool");
-  return *devicePool.at(deviceId);
-}
-
-static CoreRangeSet toCoreRangeSet(
-    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet) {
-  std::set<CoreRange> coreRanges;
-  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
-    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
-    // End is inclusive
-    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
-                  coreRange->loc().y() + coreRange->size().y() - 1);
-
-    coreRanges.emplace(start, end);
-  }
-  return CoreRangeSet(coreRanges);
-}
-
-static ::tt::tt_metal::MemoryConfig
-createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
-  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
-  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
-      layout->memory_desc()->memory_layout();
-  const ::tt::target::MemorySpace targetMemorySpace =
-      layout->memory_desc()->memory_space();
-  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
-      *targetCoreRangeSet = layout->core_range_set();
-  const ::flatbuffers::Vector<int32_t> *targetShardShape =
-      layout->memory_desc()->shape();
-
-  // TODO (jnie):
Hardcoding to interleaved and block sharded for now - // Add support for other types once compiler supports it - assert(targetMemoryLayout == ::tt::target::TensorMemoryLayout::Interleaved || - targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded); - assert(targetMemoryLayout != target::TensorMemoryLayout::BlockSharded || - targetMemorySpace == target::MemorySpace::DeviceL1 && - "Only L1 memory space supports sharded memory layout"); - assert(targetCoreRangeSet->size() == 1 && - "Currently only single core range/grid is supported"); - assert(targetShardShape->size() == 2 && - "Only 2D shard shape is supported in TTNN backend"); - - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - - if (targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded) { - assert(ttnnShardShape[0] % ::tt::constants::TILE_HEIGHT == 0 && - ttnnShardShape[1] % ::tt::constants::TILE_WIDTH == 0 && - "Shard shape must divide tile shape (32, 32) evenly"); - } - - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = - utils::toTTNNTensorMemoryLayout(targetMemoryLayout); - - ::tt::tt_metal::BufferType ttnnBufferType = - utils::toTTNNBufferType(targetMemorySpace); - - return {ttnnMemLayout, ttnnBufferType, shardSpec}; -} - -static ::ttnn::Tensor tilize(::ttnn::Tensor const &input) { - // NOLINTNEXTLINE - return ::ttnn::to_layout(input, ::ttnn::TILE_LAYOUT, std::nullopt, - std::nullopt, - static_cast<::ttnn::Device *>(nullptr)); -} - -static ::ttnn::Tensor untilize(::ttnn::Tensor const &input) { - return ::ttnn::to_layout(input, ::ttnn::ROW_MAJOR_LAYOUT, std::nullopt, - std::nullopt, - static_cast<::ttnn::Device *>(nullptr)); -} - -static ::ttnn::Tensor convertDataType(const ::ttnn::Tensor &input, - const ::ttnn::DataType &targetDataType) { - if (isOnHost(input)) { - return ::ttnn::to_dtype(input, targetDataType); - } - - if (isOnDevice(input)) { - if (input.get_layout() != ::ttnn::TILE_LAYOUT) { - // typecast op requires tilized tensor - ::ttnn::Tensor converted = - ::ttnn::typecast(tilize(input), targetDataType); - // untilize and return - return untilize(converted); - } - return ::ttnn::typecast(input, targetDataType); - } - - throw std::runtime_error("Unsupported storage type"); -} - -/* TODO(bug #272), ideal flow is to determine tilize/untilize with - * tile_shape */ -static ::ttnn::Tensor -updateLayoutAndDataType(const ::ttnn::Tensor &inputTensor, - const ::ttnn::DataType targetDataType, - const bool shouldTilize, const bool shouldUntilize) { - - ::ttnn::Tensor outputTensor = inputTensor; - const bool shouldConvertDataType = inputTensor.get_dtype() != targetDataType; - // const int targetTileX = targetTileShape->x(); - // const int targetTileY = targetTileShape->y(); - // const bool shouldTilize = - // targetTileX == 32 and targetTileY == 32 and - // inputTensor.get_layout() == ::ttnn::ROW_MAJOR_LAYOUT; - // const bool shouldUntilize = (targetTileX != 32 or targetTileY != 32) and - // inputTensor.get_layout() == - // ::ttnn::TILE_LAYOUT; - assert(not(shouldTilize and shouldUntilize) && - "Cannot tilize and untilize tensor at the same time"); - if (shouldTilize) { - outputTensor = tilize(outputTensor); - } else if (shouldUntilize) { - outputTensor = untilize(outputTensor); - } - if (shouldConvertDataType) { - outputTensor = 
convertDataType(outputTensor, targetDataType); - } - return outputTensor; -} - -static void -handleToHostMemoryConfigOp(const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::Tensor result; - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - shouldTilize = false; - shouldUntilize = true; - result = updateLayoutAndDataType(inputTensor, targetDataTypeTTNN, - shouldTilize, shouldUntilize); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = true; - result = updateLayoutAndDataType(inputTensor.cpu(), targetDataTypeTTNN, - shouldTilize, shouldUntilize); - } - // copy the output to the output tensor if it exists - if (tensorPool.contains(outputTensorRef->global_id())) { - ::ttnn::Tensor &outputTensor = tensorPool.at(outputTensorRef->global_id()); - void *src = ::tt::tt_metal::get_raw_host_data_ptr(result); - void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor); - std::uint32_t size = result.volume() * result.element_size(); - std::memcpy(dst, src, size); - } else { - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -static void -handleToDramMemoryConfigOp(::ttnn::Device &device, - const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - ::tt::tt_metal::MemoryConfig targetMemoryConfig = - createMemoryConfig(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - ::ttnn::Tensor result = inputTensor; - shouldTilize = true; - shouldUntilize = false; - // device tilize requires BFLOAT16, if not then tilize on host - if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { - result = tilize(result); - shouldTilize = false; - } - result = ::ttnn::to_device(result, &device, targetMemoryConfig); - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = false; - ::ttnn::Tensor result = updateLayoutAndDataType( - inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); - result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -static void -handleToL1MemoryConfigOp(::ttnn::Device &device, - const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - ::tt::tt_metal::MemoryConfig targetMemoryConfig = - createMemoryConfig(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - ::ttnn::Tensor result = inputTensor; - // device tilize requires BFLOAT16, if not then tilize on host - if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { - result = tilize(result); - result = ::ttnn::to_device(result, &device, targetMemoryConfig); - shouldTilize = false; - shouldUntilize = false; - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - } else { - shouldTilize = true; - shouldUntilize = false; - // device tilize op requires height sharded or interleaved tensors - // thus tilize first with default mem config, then convert memory config 
- result = ::ttnn::to_device(result, &device, std::nullopt); - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - result = - ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - } - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = false; - ::ttnn::Tensor result = updateLayoutAndDataType( - inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); - result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -// TODO(bug #272): right now hardcoding tilize/untilize, should determine with -// tile shape blocked by issue #272 -static void run(::tt::target::ttnn::ToMemoryConfigOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in0()->global_id()); - assert(isOnHost(inputTensor) or - isOnDevice(inputTensor) && "Unsupported storage type"); - - const ::tt::target::Dim2d *targetTileShape = - op->out()->desc()->layout()->memory_desc()->tile_shape(); - assert(utils::isValidTileShape(targetTileShape) && "Invalid tile shape"); - - const ::tt::target::MemorySpace targetMemorySpace = - op->out()->desc()->layout()->memory_desc()->memory_space(); - - switch (targetMemorySpace) { - // This case should only be used when gathering outputs at the end of the - // program - case ::tt::target::MemorySpace::System: - case ::tt::target::MemorySpace::SystemMMIO: { - handleToHostMemoryConfigOp(inputTensor, op->out(), tensorPool); - break; - } - case ::tt::target::MemorySpace::DeviceDRAM: { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - handleToDramMemoryConfigOp(device, inputTensor, op->out(), tensorPool); - break; - } - case ::tt::target::MemorySpace::DeviceL1: { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - handleToL1MemoryConfigOp(device, inputTensor, op->out(), tensorPool); - break; - } - } -} - -static void run(::tt::target::ttnn::ToLayoutOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); - assert((isOnHost(inputTensor) or isOnDevice(inputTensor)) && - "Unsupported storage type"); - - ::ttnn::Layout layout; - switch (op->layout()) { - case ::tt::target::TensorLayout::RowMajor: - layout = ::ttnn::Layout::ROW_MAJOR; - break; - case ::tt::target::TensorLayout::Tile: - layout = ::ttnn::Layout::TILE; - break; - case ::tt::target::TensorLayout::Invalid: - layout = ::ttnn::Layout::INVALID; - break; - } - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::to_layout(inputTensor, layout, std::nullopt, - std::nullopt, &device); - - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ToDeviceOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); - assert((isOnHost(inputTensor) or isOnDevice(inputTensor)) && - "Unsupported storage type"); - - op->memcfg()->tensor_memory_layout(); - op->memcfg()->buffer_type(); - - ::ttnn::TensorMemoryLayout tensorMemoryLayout; - switch (op->memcfg()->tensor_memory_layout()) { - case ::tt::target::TensorMemoryLayout::Interleaved: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::INTERLEAVED; - break; - case 
::tt::target::TensorMemoryLayout::SingleBank: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::SINGLE_BANK; - break; - case ::tt::target::TensorMemoryLayout::HeightSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::WidthSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::BlockSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::None: - assert(false && - "Unsupported tensor memory layout TensorMemoryLayout::None"); - break; - } - - ::ttnn::BufferType bufferType; - switch (op->memcfg()->buffer_type()) { - case ::tt::target::BufferType::DRAM: - bufferType = ::ttnn::BufferType::DRAM; - break; - case ::tt::target::BufferType::L1: - bufferType = ::ttnn::BufferType::L1; - break; - case ::tt::target::BufferType::SystemMemory: - bufferType = ::ttnn::BufferType::SYSTEM_MEMORY; - break; - case ::tt::target::BufferType::L1Small: - bufferType = ::ttnn::BufferType::L1_SMALL; - break; - case ::tt::target::BufferType::Trace: - bufferType = ::ttnn::BufferType::TRACE; - break; - } - - // TODO(bug #620): - // Until ShardSpec support is added in TTNN, read it from the output tensor. - // If ShardSpec is not supplied, an error will be thrown in ttnn lib. - // - const ::tt::target::LayoutDesc *layout = op->out()->desc()->layout(); - const ::flatbuffers::Vector - *targetCoreRangeSet = layout->core_range_set(); - const ::flatbuffers::Vector *targetShardShape = - layout->memory_desc()->shape(); - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::ttnn::MemoryConfig memoryConfig = {tensorMemoryLayout, bufferType, - shardSpec}; - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::to_device(inputTensor, &device, memoryConfig); - - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EmptyOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(op->out()); - // TODO(bug #582): ttnn::empty doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - auto desiredLayout = ::ttnn::Layout::ROW_MAJOR; - auto shape = ::ttnn::Shape(::tt::tt_metal::Shape( - utils::toShapeFromFBShape(*op->out()->desc()->shape()))); - - // Create output memory config for the op - // - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::empty(shape, targetDataTypeTTNN, desiredLayout, - device, outputMemoryConfig); - - // use try emplace here so the program output tensor doesn't get overwritten - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void -getEltwiseBinaryOPInputTensors(::tt::target::ttnn::EltwiseOp const *op, - ProgramTensorPool &tensorPool, - ::ttnn::Tensor **lhs, ::ttnn::Tensor **rhs) { - assert(op->ins()->size() == 2 && "Expected 2 inputs"); - *lhs = &(tensorPool.at(op->ins()->Get(0)->global_id())); - *rhs = &(tensorPool.at(op->ins()->Get(1)->global_id())); -} - -static void runEltwiseBinaryOP( - ::tt::target::ttnn::EltwiseOp 
const *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor( - const ::ttnn::Tensor &, const ::ttnn::Tensor &, - const std::optional &, - const std::optional<::tt::tt_metal::MemoryConfig> &, - std::optional<::ttnn::Tensor>, - std::optional<::ttnn::operations::unary::FusedActivations>, - std::optional<::ttnn::operations::unary::UnaryWithParam>)> - ttnnOp) { - - ::ttnn::Tensor *lhs = nullptr; - ::ttnn::Tensor *rhs = nullptr; - getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs); - - ::ttnn::DataType outputDataType = getDataType(op->out()); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig, - std::nullopt, std::nullopt, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void runEltwiseBinaryCompositeOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, const ::ttnn::Tensor &, - const std::optional<::tt::tt_metal::MemoryConfig> &)> - ttnnOp) { - - ::ttnn::Tensor *lhs = nullptr; - ::ttnn::Tensor *rhs = nullptr; - getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void -getEltwiseUnaryOPInputTensor(::tt::target::ttnn::EltwiseOp const *op, - ProgramTensorPool &tensorPool, - ::ttnn::Tensor **in) { - assert(op->ins()->size() == 1 && "Expected 1 input"); - *in = &(tensorPool.at(op->ins()->Get(0)->global_id())); -} - -static void runEltwiseUnaryOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::Tensor> &)> - ttnnOp) { - - ::ttnn::Tensor *in = nullptr; - getEltwiseUnaryOPInputTensor(op, tensorPool, &in); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void runEltwiseUnaryWithFastAndApproximateModeOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, const bool, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::Tensor> &)> - ttnnOp) { - - ::ttnn::Tensor *in = nullptr; - getEltwiseUnaryOPInputTensor(op, tensorPool, &in); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = - ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EltwiseOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - switch (op->type()) { - /* Eltwise Binary */ - case ::tt::target::ttnn::EltwiseOpType::Add: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::add); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Multiply: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::multiply); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Subtract: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::subtract); - break; - } - case ::tt::target::ttnn::EltwiseOpType::GreaterEqual: { - 
runEltwiseBinaryOP(op, tensorPool, ::ttnn::ge); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Div: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::divide); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Maximum: { - runEltwiseBinaryCompositeOP(op, tensorPool, ::ttnn::maximum); - break; - } - /* Eltwise Unary */ - case ::tt::target::ttnn::EltwiseOpType::Abs: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::abs); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Neg: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::neg); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Relu: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::relu); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Sqrt: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::sqrt); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Sigmoid: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::sigmoid); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Reciprocal: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::reciprocal); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Exp: { - runEltwiseUnaryWithFastAndApproximateModeOP(op, tensorPool, ::ttnn::exp); - break; - } - } -} - -static void runReductionOp( - ::tt::target::ttnn::ReductionOp const *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor( - const ::ttnn::Tensor &, - const std::optional>> &, const bool, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)> - ttnnOp) { - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - - const auto *fbDimArg = op->dim_arg(); - std::optional> dimArg = - fbDimArg ? std::make_optional( - std::vector(fbDimArg->begin(), fbDimArg->end())) - : std::nullopt; - - ::ttnn::Tensor out = ttnnOp( - in, dimArg, op->keep_dim(), outputMemoryConfig /* memory_config_arg */, - std::nullopt /* compute_kernel_config */, 1.0f /* scalar */); - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ReductionOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - switch (op->type()) { - case ::tt::target::ttnn::ReductionOpType::Sum: { - runReductionOp(op, tensorPool, ::ttnn::sum); - break; - } - case ::tt::target::ttnn::ReductionOpType::Mean: { - runReductionOp(op, tensorPool, ::ttnn::mean); - break; - } - case ::tt::target::ttnn::ReductionOpType::Max: { - runReductionOp(op, tensorPool, ::ttnn::max); - break; - } - } -} - -template -static std::array -vectorToArray(const std::vector &vec) { - if (vec.size() != Rank) { - throw std::invalid_argument("Vector size does not match array size"); - } - std::array arr; - std::copy(vec.begin(), vec.end(), arr.begin()); - return arr; -} - -template -static ::ttnn::Tensor invoke_reshape(const ::ttnn::Tensor &tensor, - const std::vector &shape) { - return ::ttnn::reshape(tensor, vectorToArray(shape)); -} - -static void run(::tt::target::ttnn::ReshapeOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - const auto *fbShape = op->shape(); - std::vector shape(fbShape->begin(), fbShape->end()); - constexpr int32_t Rank1 = 1; - constexpr int32_t Rank2 = 2; - constexpr int32_t Rank3 = 3; - constexpr int32_t Rank4 = 4; - constexpr int32_t Rank5 = 5; - - ::ttnn::Tensor out; - switch (fbShape->size()) { - case Rank1: - out = invoke_reshape(in, shape); - break; - 
case Rank2: - out = invoke_reshape(in, shape); - break; - case Rank3: - out = invoke_reshape(in, shape); - break; - case Rank4: - out = invoke_reshape(in, shape); - break; - case Rank5: - out = invoke_reshape(in, shape); - break; - default: - throw std::invalid_argument("Unsupported rank for reshape"); - } - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EmbeddingOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); - const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); - // default params for embedding op - std::optional padToken = std::nullopt; - ::tt::tt_metal::Layout layout = ::ttnn::ROW_MAJOR_LAYOUT; - auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC; - ::ttnn::DataType outputDataType = getDataType(op->output()); - ::ttnn::MemoryConfig outputMemoryConfig = createMemoryConfig(op->output()); - ::ttnn::Tensor out = - ::ttnn::embedding(input, weight, padToken, layout, embeddingsType, - outputDataType, outputMemoryConfig); - tensorPool.insert_or_assign(op->output()->global_id(), out); -} - -static void run(::tt::target::ttnn::SoftmaxOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - int32_t dimension = op->dimension(); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::TransposeOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - int32_t dim0 = op->dim0(); - int32_t dim1 = op->dim1(); - auto inputRank = in.get_shape().rank(); - // for the current version of permute, we need to work in 4D, so we add - // leading dimensions of size 1 - std::vector dimensionOrder(4); - std::iota(dimensionOrder.begin(), dimensionOrder.end(), 0); - if (dim0 < 0) { - dim0 += 4; - } else { - dim0 = dim0 + 4 - inputRank; - } - if (dim1 < 0) { - dim1 += 4; - } else { - dim1 = dim1 + 4 - inputRank; - } - std::swap(dimensionOrder[dim0], dimensionOrder[dim1]); - // Ideally this would use ttnn::transpose, but since ttnn::transpose doesn't - // work at the moment, we use this temporary solution. 
- ::ttnn::Tensor unsqueezedInput = ::ttnn::unsqueeze_to_4D(in); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = - ::ttnn::permute(unsqueezedInput, dimensionOrder, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ConcatOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - std::vector<::ttnn::Tensor> inputs; - for (const auto &input : *op->inputs()) { - inputs.push_back(tensorPool.at(input->global_id())); - } - int32_t dim = op->dim(); - ::ttnn::Tensor out = ::ttnn::concat(inputs, dim); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -// ANCHOR: adding_an_op_matmul_runtime -static void run(::tt::target::ttnn::MatmulOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id()); - const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id()); - ::ttnn::DataType outputDataType = getDataType(op->out()); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = ::ttnn::operations::matmul::matmul( - lhs, rhs, /*bias=*/std::nullopt, - ::ttnn::operations::matmul::Matmul{/*program_config=*/std::nullopt, - /*bcast_batch=*/std::nullopt, - outputMemoryConfig, outputDataType}); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} -// ANCHOR_END: adding_an_op_matmul_runtime - -static void run(::tt::target::ttnn::Conv2dOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); - const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); - std::optional<::ttnn::Tensor> bias = - op->bias() ? 
std::make_optional(tensorPool.at(op->bias()->global_id())) - : std::nullopt; - auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); - config.dtype = input.dtype(); - config.weights_dtype = weight.dtype(); - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = - std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( - input, weight, &device, op->in_channels(), op->out_channels(), - op->batch_size(), op->input_height(), op->input_width(), - {op->kernel_height(), op->kernel_width()}, - {op->stride_height(), op->stride_width()}, - {op->padding_height(), op->padding_width()}, - {op->dilation_height(), op->dilation_width()}, op->groups(), bias, - config)); - - tensorPool.insert_or_assign(op->out()->global_id(), out); - return; -} - -static void run(::tt::target::ttnn::MaxPool2dOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->in()->global_id()); - const ::ttnn::operations::pool::MaxPool2DOp operation = - ::ttnn::operations::pool::MaxPool2DOp(); - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = operation.invoke( - 0, input, op->batch_size(), op->input_height(), op->input_width(), - op->channels(), {op->kernel_height(), op->kernel_width()}, - {op->stride_height(), op->stride_width()}, - {op->padding_height(), op->padding_width()}, - {op->dilation_height(), op->dilation_width()}, &device); - - tensorPool.insert_or_assign(op->out()->global_id(), out); - return; -} - -static void run(::tt::target::ttnn::DeallocOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - bool force = true; - ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); - tensor.deallocate(force); - tensorPool.erase(op->in()->global_id()); -} - -static void -run(::tt::target::ttnn::GetDeviceOp const *op, - const std::unordered_map &allDevices, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const flatbuffers::Vector *chipIds = op->chip_ids(); - assert(chipIds->size() == 1 && "Expected 1 chip id"); - for (const uint32_t chipId : *chipIds) { - assert(allDevices.contains(chipId) && "Device not found"); - auto [iter, inserted] = - devicePool.try_emplace(chipId, allDevices.at(chipId)); - assert(inserted && "Duplicate device"); - } -} - -static void run(::tt::target::ttnn::FullOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::DataType outputDataType = getDataType(op->out()); - auto shape = ::ttnn::Shape(::tt::tt_metal::Shape( - utils::toShapeFromFBShape(*op->out()->desc()->shape()))); - float fillValue = op->fill_value(); - // TODO(bug #272), determine correct layout by tile shape in the future - ::ttnn::Layout outputLayout = ::ttnn::Layout::ROW_MAJOR; - std::optional> outputDevice = - std::make_optional(std::ref(device)); - std::optional<::tt::tt_metal::MemoryConfig> outputMemoryConfig = - std::make_optional(createMemoryConfig(op->out())); - - ::ttnn::Tensor out = - ::ttnn::full(shape, fillValue, outputDataType, outputLayout, outputDevice, - outputMemoryConfig); - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void -run(::tt::target::ttnn::Operation const *op, - const std::unordered_map &allDevices, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - +void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { switch 
(op->type_type()) { case ::tt::target::ttnn::OpType::GetDeviceOp: { - return run(op->type_as_GetDeviceOp(), allDevices, devicePool, tensorPool); - break; + return operations::context::run(op->type_as_GetDeviceOp(), context); } case ::tt::target::ttnn::OpType::ToMemoryConfigOp: { - return run(op->type_as_ToMemoryConfigOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToMemoryConfigOp(), context); } case ::tt::target::ttnn::OpType::ToLayoutOp: { - return run(op->type_as_ToLayoutOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToLayoutOp(), context); } case ::tt::target::ttnn::OpType::ToDeviceOp: { - return run(op->type_as_ToDeviceOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToDeviceOp(), context); } case ::tt::target::ttnn::OpType::EmptyOp: { - return run(op->type_as_EmptyOp(), devicePool, tensorPool); + return operations::creation::run(op->type_as_EmptyOp(), context); } case ::tt::target::ttnn::OpType::FullOp: { - return run(op->type_as_FullOp(), devicePool, tensorPool); - break; + return operations::creation::run(op->type_as_FullOp(), context); } case ::tt::target::ttnn::OpType::EltwiseOp: { - return run(op->type_as_EltwiseOp(), devicePool, tensorPool); + const ::tt::target::ttnn::EltwiseOp *eltwiseOp = op->type_as_EltwiseOp(); + if (operations::unary::isUnaryOp(eltwiseOp)) { + return operations::unary::run(eltwiseOp, context); + } + assert(operations::binary::isBinaryOp(eltwiseOp) && + "Eltwise op should be either unary or binary"); + return operations::binary::run(eltwiseOp, context); } case ::tt::target::ttnn::OpType::MatmulOp: { - return run(op->type_as_MatmulOp(), devicePool, tensorPool); + return operations::matmul::run(op->type_as_MatmulOp(), context); } case ::tt::target::ttnn::OpType::ReductionOp: { - return run(op->type_as_ReductionOp(), devicePool, tensorPool); + return operations::reduction::run(op->type_as_ReductionOp(), context); } case ::tt::target::ttnn::OpType::EmbeddingOp: { - return run(op->type_as_EmbeddingOp(), devicePool, tensorPool); + return operations::embedding::run(op->type_as_EmbeddingOp(), context); } case ::tt::target::ttnn::OpType::SoftmaxOp: { - return run(op->type_as_SoftmaxOp(), devicePool, tensorPool); + return operations::normalization::run(op->type_as_SoftmaxOp(), context); } case ::tt::target::ttnn::OpType::TransposeOp: { - return run(op->type_as_TransposeOp(), devicePool, tensorPool); - } - case ::tt::target::ttnn::OpType::Conv2dOp: { - return run(op->type_as_Conv2dOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_TransposeOp(), context); } case ::tt::target::ttnn::OpType::ConcatOp: { - return run(op->type_as_ConcatOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_ConcatOp(), context); } case ::tt::target::ttnn::OpType::ReshapeOp: { - return run(op->type_as_ReshapeOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_ReshapeOp(), context); + } + case ::tt::target::ttnn::OpType::Conv2dOp: { + return operations::conv::run(op->type_as_Conv2dOp(), context); } case ::tt::target::ttnn::OpType::DeallocOp: { - return run(op->type_as_DeallocOp(), devicePool, tensorPool); + return operations::deletion::run(op->type_as_DeallocOp(), context); } case ::tt::target::ttnn::OpType::MaxPool2dOp: { - return run(op->type_as_MaxPool2dOp(), devicePool, tensorPool); + return operations::pool::run(op->type_as_MaxPool2dOp(), context); } default: { throw std::runtime_error("Unsupported operation 
type"); @@ -997,9 +105,9 @@ run(::tt::target::ttnn::Operation const *op, } // Nop is single input, output tensor where input is returned as output. -bool handleNopProgram(::tt::target::ttnn::Program const *program, - std::vector<::ttnn::Tensor *> const &inputs, - std::vector<::ttnn::Tensor *> const &outputs) { +static bool handleNopProgram(::tt::target::ttnn::Program const *program, + std::vector<::ttnn::Tensor *> const &inputs, + std::vector<::ttnn::Tensor *> const &outputs) { bool isNop = program->inputs()->size() == 1 && program->outputs()->size() == 1 && @@ -1022,9 +130,8 @@ void runProgram(::ttnn::Device &device, if (handleNopProgram(program, inputs, outputs)) { return; } - std::unordered_map liveTensors; - std::unordered_map allDevices; - std::unordered_map devicePool; + TensorMap liveTensors; + DeviceMap allDevices; int inputIndex = 0; assert(program->inputs()->size() == inputs.size()); // Assuming single device for now until we support multichip @@ -1042,9 +149,8 @@ void runProgram(::ttnn::Device &device, liveTensors.try_emplace(output->global_id(), outputs[outputIndex++]); assert(inserted && "Duplicate output tensor"); } - ProgramTensorPool tensorPool(liveTensors); - for (::tt::target::ttnn::Operation const *op : *program->operations()) { - run(op, allDevices, devicePool, tensorPool); - } + ProgramExecutor executor(liveTensors, allDevices); + executor.execute(program); } + } // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index e1f786bc4..a535e5e2a 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt/runtime/runtime.h" #include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/utils.h" #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/Target.h" #include "ttmlir/Version.h" -#include "utils.h" namespace tt::runtime::ttnn { diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index 98ed63833..1e191f3dc 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -58,7 +58,7 @@ linklibs = ["TTBinary", "TTRuntimeSysDesc"] if enable_ttnn: runlibs += ["_ttnn.so"] - linklibs += ["TTRuntimeTTNN", ":_ttnn.so"] + linklibs += ["TTRuntimeTTNN", "TTRuntimeTTNNOps", ":_ttnn.so"] if enable_ttmetal: runlibs += ["libtt_metal.so"] @@ -179,6 +179,7 @@ def package_files(directory): f"{src_dir}/build/runtime/lib", f"{src_dir}/build/runtime/lib/common", f"{src_dir}/build/runtime/lib/ttnn", + f"{src_dir}/build/runtime/lib/ttnn/operations", f"{src_dir}/build/runtime/lib/ttmetal", f"{toolchain}/lib", f"{src_dir}/build/runtime/tools/python/ttrt/runtime",