From 273a940cbcb91549a3e44dc05662d121a5a0b394 Mon Sep 17 00:00:00 2001
From: Marko Bezulj <156311081+mbezuljTT@users.noreply.github.com>
Date: Sat, 30 Nov 2024 09:46:17 +0100
Subject: [PATCH] fix graph_processor buffer_alloc_id calculus (#15426)

buffer_alloc_id was assigned per memory address; however, this produced
wrong results when capturing a composite op that allocates and
deallocates entire tensors, because two distinct buffers can end up at
the same address.

Add buffer->unique_id() and use it as the map key. Split id_to_counter
into tensor_id_to_counter and buffer_id_to_counter.
---
 tests/ttnn/unit_tests/gtests/CMakeLists.txt   |   1 +
 .../unit_tests/gtests/test_graph_basic.cpp    | 123 ++++++++++++++++++
 tt_metal/impl/buffers/buffer.cpp              |   6 +
 tt_metal/impl/buffers/buffer.hpp              |   4 +
 tt_metal/include/tt_metal/buffer.hpp          |   8 ++
 ttnn/cpp/ttnn/graph/graph_processor.cpp       |  32 ++---
 ttnn/cpp/ttnn/graph/graph_processor.hpp       |   3 +-
 7 files changed, 160 insertions(+), 17 deletions(-)
 create mode 100644 tests/ttnn/unit_tests/gtests/test_graph_basic.cpp

diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
index fcdd7a0b35f..52e1e69c098 100644
--- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt
+++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(TTNN_UNIT_TESTS_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/test_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_graph_add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_graph_basic.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_async_runtime.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_multiprod_queue.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_cq_multi_dev.cpp
diff --git a/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp b/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp
new file mode 100644
index 00000000000..9cfd0dec7b2
--- /dev/null
+++ b/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp
@@ -0,0 +1,123 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gtest/gtest.h"
+#include "ttnn_test_fixtures.hpp"
+#include "ttnn/device.hpp"
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/graph/graph_processor.hpp"
+#include "ttnn/graph/graph_consts.hpp"
+
+#include <string>
+
+namespace ttnn::graph::test {
+
+struct BufferTestParam {
+    ttnn::SimpleShape shape_a;
+    ttnn::SimpleShape shape_b;
+};
+
+class BufferTestFixture
+    : public TTNNFixtureWithDevice,
+      public testing::WithParamInterface<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>> {};
+
+TEST_P(BufferTestFixture, BufferTest) {
+    auto param_combination = GetParam();
+    auto params = std::get<0>(param_combination);
+    auto run_mode = std::get<1>(param_combination);
+
+    tt::tt_metal::Device* device = &(this->getDevice());
+    {
+        ttnn::graph::GraphProcessor::begin_graph_capture(run_mode);
+        {
+            const auto input_a = ttnn::TensorSpec(
+                params.shape_a,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);
+        }
+        {
+            const auto input_a = ttnn::TensorSpec(
+                params.shape_a,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);
+
+            const auto input_b = ttnn::TensorSpec(
+                params.shape_b,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+
+            const auto input_tensor_b = tt::tt_metal::create_device_tensor(input_b, device);
+        }
+        auto trace = ttnn::graph::GraphProcessor::end_graph_capture();
+
+        auto find_nodes_by_type = [](const auto& trace, const std::string& type) {
+            std::vector<nlohmann::json> nodes;
+            for (const auto& node : trace) {
+                if (node.at(kNodeType) == type) {
+                    nodes.push_back(node);
+                }
+            }
+            return nodes;
+        };
+
+        // Check that there are three buffer_allocate nodes and that each is connected to a different buffer
+        auto buffer_allocate_nodes = find_nodes_by_type(trace, kNodeBufferAllocate);
+        EXPECT_EQ(buffer_allocate_nodes.size(), 3);
+        for (const auto& node : buffer_allocate_nodes) {
+            EXPECT_EQ(node.at(kConnections).size(), 1);
+        }
+        auto connection_a = buffer_allocate_nodes[0].at(kConnections)[0].get<int>();
+        auto connection_a2 = buffer_allocate_nodes[1].at(kConnections)[0].get<int>();
+        auto connection_c = buffer_allocate_nodes[2].at(kConnections)[0].get<int>();
+        EXPECT_NE(connection_a, connection_a2);
+        EXPECT_NE(connection_a, connection_c);
+        EXPECT_NE(connection_a2, connection_c);
+
+        // Check that there are three buffer nodes and that they have the correct sizes
+        auto buffer_nodes = find_nodes_by_type(trace, kNodeBuffer);
+        EXPECT_EQ(buffer_nodes.size(), 3);
+        auto size_a = std::stoi(buffer_nodes[0].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_a.volume() * 2, size_a);
+        auto size_a2 = std::stoi(buffer_nodes[1].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_a.volume() * 2, size_a2);
+        auto size_b = std::stoi(buffer_nodes[2].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_b.volume() * 2, size_b);
+
+        // Print the trace for reference
+        std::cout << trace << std::endl;
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BufferTest,
+    BufferTestFixture,
+    ::testing::Combine(
+        ::testing::Values(BufferTestParam{
+            .shape_a = ttnn::SimpleShape(tt::tt_metal::Array4D{1, 1, 32, 32}),
+            .shape_b = ttnn::SimpleShape(tt::tt_metal::Array4D{32, 1, 32, 32})}),
+        ::testing::Values(
+            tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH, tt::tt_metal::IGraphProcessor::RunMode::NORMAL)),
+    [](const testing::TestParamInfo<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>>& info) {
+        std::stringstream ss;
+
+        static uint32_t uid = 0;
+        ss << uid++;
+
+        const auto& run_mode = std::get<1>(info.param);
+        switch (run_mode) {
+            case tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH: ss << "_NO_DISPATCH"; break;
+            case tt::tt_metal::IGraphProcessor::RunMode::NORMAL: ss << "_NORMAL"; break;
+            default: break;
+        }
+        return ss.str();
+    });
+}  // namespace ttnn::graph::test
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 54756581fee..34975b0442b 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/types.hpp"
 
 #include <algorithm>
+#include <atomic>
 #include <mutex>
 
 #include "tt_metal/common/base.hpp"
@@ -25,6 +26,8 @@ namespace tt {
 
 namespace tt_metal {
 
+std::atomic<size_t> Buffer::next_unique_id = 0;
+
 std::ostream& operator<<(std::ostream& os, const ShardSpec& spec) {
     tt::stl::reflection::operator<<(os, spec);
     return os;
 }
@@ -247,6 +250,7 @@ Buffer::Buffer(
     if (size != 0) {
         validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters);
     }
+    unique_id_ = next_unique_id.fetch_add(1);
 }
 
 std::shared_ptr<Buffer> Buffer::create(
@@ -524,6 +528,8 @@ v1::BufferHandle v1::CreateBuffer(InterleavedBufferConfig config) { return v1::BufferHandle{v0::CreateBuffer(config)}; }
 
 void v1::DeallocateBuffer(const BufferHandle& buffer) { v0::DeallocateBuffer(*buffer); }
 
+std::size_t v1::GetId(const BufferHandle& buffer) { return buffer->unique_id(); }
+
 void v1::WriteToBuffer(const BufferHandle& buffer, stl::Span<const std::byte> host_buffer) {
     detail::WriteToBuffer(
         *buffer, stl::Span<const uint8_t>{reinterpret_cast<const uint8_t*>(host_buffer.data()), host_buffer.size()});
 }
diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp
index f17a39104bc..4539da12fd8 100644
--- a/tt_metal/impl/buffers/buffer.hpp
+++ b/tt_metal/impl/buffers/buffer.hpp
@@ -248,6 +248,8 @@ class Buffer final {
     std::optional<SubDeviceId> sub_device_id() const { return sub_device_id_; }
     std::optional<SubDeviceManagerId> sub_device_manager_id() const { return sub_device_manager_id_; }
 
+    size_t unique_id() const { return unique_id_; }
+
     Buffer(
         Device *device,
         DeviceAddr size,
@@ -300,6 +302,8 @@ class Buffer final {
     std::shared_ptr<const BufferPageMapping> buffer_page_mapping_;
 
     std::weak_ptr<Buffer> weak_self;
+    size_t unique_id_ = 0;
+    static std::atomic<size_t> next_unique_id;
 };
 
 }  // namespace v0
diff --git a/tt_metal/include/tt_metal/buffer.hpp b/tt_metal/include/tt_metal/buffer.hpp
index a69f6a436eb..ea7d61f5c72 100644
--- a/tt_metal/include/tt_metal/buffer.hpp
+++ b/tt_metal/include/tt_metal/buffer.hpp
@@ -28,6 +28,14 @@ BufferHandle CreateBuffer(InterleavedBufferConfig config);
  */
 void DeallocateBuffer(const BufferHandle& buffer);
 
+/**
+ * @brief Retrieves the ID of the specified buffer.
+ *
+ * @param buffer The buffer to get the ID from.
+ * @return The unique ID of the buffer.
+ */
+std::size_t GetId(const BufferHandle& buffer);
+
 /**
  * @brief Copies data from a host buffer into the specified device buffer.
  *
diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp
index f702edc1a26..637e936c9f0 100644
--- a/ttnn/cpp/ttnn/graph/graph_processor.cpp
+++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -95,9 +95,8 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) {
 }
 void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
     const std::lock_guard<std::mutex> lock(mutex);
-    auto buf_id = add_buffer(buffer);
+    auto buffer_id = add_buffer(buffer);
 
-    auto alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
     auto counter = graph.size();
 
     std::unordered_map<std::string, std::string> params = {
@@ -113,7 +112,7 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
         .counter = counter,
         .node_type = kNodeBufferAllocate,
         .params = params,
-        .connections = {buf_id}
+        .connections = {buffer_id}
     });
     graph[current_op_id.top()].connections.push_back(counter);
 }
@@ -121,7 +120,7 @@
 
 void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
     const std::lock_guard<std::mutex> lock(mutex);
-    auto buffer_idx = add_buffer(buffer);
+    auto buffer_id = add_buffer(buffer);
     auto counter = graph.size();
     std::unordered_map<std::string, std::string> params = {
         {kSize, std::to_string(buffer->size())},
@@ -135,7 +134,7 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
         .counter = counter,
         .node_type = kNodeBufferDeallocate,
         .params = params,
-        .connections = {buffer_idx}
+        .connections = {buffer_id}
     });
     graph[current_op_id.top()].connections.push_back(counter);
 }
@@ -286,21 +285,21 @@ int GraphProcessor::add_tensor(const Tensor& t) {
     } else {
         tensor_id = t.tensor_id.value();
     }
-    auto tensor_counter = id_to_counter.count(tensor_id) > 0 ? id_to_counter[tensor_id] : graph.size();
+    auto tensor_counter = tensor_id_to_counter.count(tensor_id) > 0 ? tensor_id_to_counter[tensor_id] : graph.size();
     auto shape = t.get_shape();
 
     std::unordered_map<std::string, std::string> params = {
         {kShape, fmt::format("{}", shape)},
         {kTensorId, fmt::format("{}", tensor_id)},
     };
-    if (id_to_counter.count(tensor_id) == 0) {
+    if (tensor_id_to_counter.count(tensor_id) == 0) {
         graph.push_back(Vertex{.counter = tensor_counter, .node_type = kNodeTensor, .params = params, .connections = {}});
-        id_to_counter[tensor_id] = tensor_counter;
+        tensor_id_to_counter[tensor_id] = tensor_counter;
     }
 
     if (buffer) {
-        auto buffer_idx = add_buffer(buffer);
-        graph[buffer_idx].connections.push_back(tensor_counter);
+        auto buffer_id = add_buffer(buffer);
+        graph[buffer_id].connections.push_back(tensor_counter);
     } else {
         tt::log_info("Tensor doesn't have buffer, but storage is {}", demangle(get_type_in_var(t.get_storage()).name()));
     }
@@ -308,9 +307,9 @@ int GraphProcessor::add_tensor(const Tensor& t) {
 }
 
 int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) {
-    auto buffer_alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
-    auto counter = id_to_counter.count(buffer_alloc_id) > 0 ? id_to_counter[buffer_alloc_id] : graph.size();
-    if (id_to_counter.count(buffer_alloc_id) == 0) {
+    auto buffer_id = buffer->unique_id();
+    auto counter = buffer_id_to_counter.count(buffer_id) > 0 ? buffer_id_to_counter[buffer_id] : graph.size();
+    if (buffer_id_to_counter.count(buffer_id) == 0) {
        std::unordered_map<std::string, std::string> params = {
            {kSize, std::to_string(buffer->size())},
"DRAM" : "L1"}, @@ -324,10 +323,10 @@ int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) { .connections = {} }); graph[current_op_id.top()].connections.push_back(counter); - id_to_counter[buffer_alloc_id] = counter; + buffer_id_to_counter[buffer_id] = counter; return counter; } - return id_to_counter[buffer_alloc_id]; + return buffer_id_to_counter[buffer_id]; } @@ -428,7 +427,8 @@ void GraphProcessor::end_function_process_optional_tensor(const std::any& any_va void GraphProcessor::begin_capture(RunMode mode) { const std::lock_guard lock(mutex); graph.clear(); - id_to_counter.clear(); + buffer_id_to_counter.clear(); + tensor_id_to_counter.clear(); graph.push_back(Vertex{ .counter = 0, .node_type = kNodeCaptureStart, diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp index 7f89251622d..488d7cf1f06 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.hpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp @@ -73,7 +73,8 @@ namespace ttnn::graph { std::mutex mutex; RunMode run_mode = RunMode::NORMAL; std::stack current_op_id; - std::unordered_map id_to_counter; + std::unordered_map buffer_id_to_counter; + std::unordered_map tensor_id_to_counter; int last_finished_op_id = -1; std::vector graph; std::unordered_map begin_function_any_map;