fix graph_processor buffer_alloc_id calculus (#15426)
buffer_alloc_id was assigned per memory address; however, this produced wrong results when capturing a composite op that allocates and deallocates entire tensors, because two different buffers could end up with the same address.

Add buffer->unique_id() and use it as the key into the id_to_counter map.
Split id_to_counter into tensor_id_to_counter and buffer_id_to_counter.
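
To make the failure mode concrete, here is a minimal standalone sketch (not tt-metal code; FakeBuffer and main are hypothetical stand-ins): keying graph nodes on a buffer's memory address conflates two distinct buffers once the first is freed and its address is reused, while an id drawn from an atomic counter never repeats.

// Standalone sketch: an address-keyed map can conflate two distinct buffers
// once the first is freed and the allocator reuses its address; an id drawn
// from an atomic counter never repeats.
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

struct FakeBuffer {  // hypothetical stand-in for tt::tt_metal::Buffer
    static std::atomic<std::size_t> next_unique_id;
    std::size_t unique_id = next_unique_id.fetch_add(1);
};
std::atomic<std::size_t> FakeBuffer::next_unique_id{0};

int main() {
    auto* a = new FakeBuffer();  // composite op allocates a tensor's buffer...
    auto a_addr = reinterpret_cast<std::uintptr_t>(a);
    auto a_id = a->unique_id;
    delete a;                    // ...and deallocates it mid-capture

    auto* b = new FakeBuffer();
    // If the allocator reuses the freed address, an address-keyed map would
    // map b onto a's graph node:
    bool address_reused = (reinterpret_cast<std::uintptr_t>(b) == a_addr);
    (void)address_reused;          // may or may not happen on a given run
    assert(b->unique_id != a_id);  // the unique id, however, never collides
    delete b;
    return 0;
}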
mbezuljTT authored Nov 30, 2024
1 parent 35cd995 commit 273a940
Showing 7 changed files with 160 additions and 17 deletions.
1 change: 1 addition & 0 deletions tests/ttnn/unit_tests/gtests/CMakeLists.txt
@@ -1,6 +1,7 @@
set(TTNN_UNIT_TESTS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/test_add.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_graph_add.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/test_graph_basic.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_async_runtime.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_multiprod_queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_multi_cq_multi_dev.cpp
123 changes: 123 additions & 0 deletions tests/ttnn/unit_tests/gtests/test_graph_basic.cpp
@@ -0,0 +1,123 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "gtest/gtest.h"
#include "ttnn_test_fixtures.hpp"
#include "ttnn/device.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/graph/graph_processor.hpp"
#include "ttnn/graph/graph_consts.hpp"

#include <string>

namespace ttnn::graph::test {

struct BufferTestParam {
    ttnn::SimpleShape shape_a;
    ttnn::SimpleShape shape_b;
};

class BufferTestFixture
    : public TTNNFixtureWithDevice,
      public testing::WithParamInterface<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>> {};

TEST_P(BufferTestFixture, BufferTest) {
    auto param_combination = GetParam();
    auto params = std::get<0>(param_combination);
    auto run_mode = std::get<1>(param_combination);

    tt::tt_metal::Device* device = &(this->getDevice());
    {
        ttnn::graph::GraphProcessor::begin_graph_capture(run_mode);
        {
            const auto input_a = ttnn::TensorSpec(
                params.shape_a,
                tt::tt_metal::TensorLayout(
                    tt::tt_metal::DataType::BFLOAT16,
                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
                    ttnn::L1_MEMORY_CONFIG));
            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);
        }
        {
            const auto input_a = ttnn::TensorSpec(
                params.shape_a,
                tt::tt_metal::TensorLayout(
                    tt::tt_metal::DataType::BFLOAT16,
                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
                    ttnn::L1_MEMORY_CONFIG));
            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);

            const auto input_b = ttnn::TensorSpec(
                params.shape_b,
                tt::tt_metal::TensorLayout(
                    tt::tt_metal::DataType::BFLOAT16,
                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
                    ttnn::L1_MEMORY_CONFIG));

            const auto input_tensor_b = tt::tt_metal::create_device_tensor(input_b, device);
        }
        auto trace = ttnn::graph::GraphProcessor::end_graph_capture();

        auto find_nodes_by_type = [](const auto& trace, const std::string& type) {
            std::vector<nlohmann::json> nodes;
            for (const auto& node : trace) {
                if (node.at(kNodeType) == type) {
                    nodes.push_back(node);
                }
            }
            return nodes;
        };

        // Check that there are three buffer_allocate nodes and that each connects to exactly one distinct buffer
        auto buffer_allocate_nodes = find_nodes_by_type(trace, kNodeBufferAllocate);
        EXPECT_EQ(buffer_allocate_nodes.size(), 3);
        for (const auto& node : buffer_allocate_nodes) {
            EXPECT_EQ(node.at(kConnections).size(), 1);
        }
        auto connection_a = buffer_allocate_nodes[0].at(kConnections)[0].get<int>();
        auto connection_a2 = buffer_allocate_nodes[1].at(kConnections)[0].get<int>();
        auto connection_c = buffer_allocate_nodes[2].at(kConnections)[0].get<int>();
        EXPECT_NE(connection_a, connection_a2);
        EXPECT_NE(connection_a, connection_c);
        EXPECT_NE(connection_a2, connection_c);

        // Check that there are three buffer nodes and that they have the correct sizes
        auto buffer_nodes = find_nodes_by_type(trace, kNodeBuffer);
        EXPECT_EQ(buffer_nodes.size(), 3);
        auto size_a = std::stoi(buffer_nodes[0].at(kParams).at(kSize).get<std::string>());
        EXPECT_EQ(params.shape_a.volume() * 2, size_a);
        auto size_a2 = std::stoi(buffer_nodes[1].at(kParams).at(kSize).get<std::string>());
        EXPECT_EQ(params.shape_a.volume() * 2, size_a2);
        auto size_b = std::stoi(buffer_nodes[2].at(kParams).at(kSize).get<std::string>());
        EXPECT_EQ(params.shape_b.volume() * 2, size_b);

        // Print the trace for reference
        std::cout << trace << std::endl;
    }
}

INSTANTIATE_TEST_SUITE_P(
    BufferTest,
    BufferTestFixture,
    ::testing::Combine(
        ::testing::Values(BufferTestParam{
            .shape_a = ttnn::SimpleShape(tt::tt_metal::Array4D{1, 1, 32, 32}),
            .shape_b = ttnn::SimpleShape(tt::tt_metal::Array4D{32, 1, 32, 32})}),
        ::testing::Values(
            tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH, tt::tt_metal::IGraphProcessor::RunMode::NORMAL)),
    [](const testing::TestParamInfo<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>>& info) {
        std::stringstream ss;

        static uint32_t uid = 0;
        ss << uid++;

        const auto& run_mode = std::get<1>(info.param);
        switch (run_mode) {
            case tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH: ss << "_NO_DISPATCH"; break;
            case tt::tt_metal::IGraphProcessor::RunMode::NORMAL: ss << "_NORMAL"; break;
            default: break;
        }
        return ss.str();
    });
} // namespace ttnn::graph::test
6 changes: 6 additions & 0 deletions tt_metal/impl/buffers/buffer.cpp
@@ -13,6 +13,7 @@
#include "tt_metal/types.hpp"

#include <algorithm>
+#include <atomic>
#include <mutex>
#include <utility>
#include "tt_metal/common/base.hpp"
@@ -25,6 +26,8 @@ namespace tt {

namespace tt_metal {

+std::atomic<size_t> Buffer::next_unique_id = 0;

std::ostream& operator<<(std::ostream& os, const ShardSpec& spec) {
tt::stl::reflection::operator<<(os, spec);
return os;
@@ -247,6 +250,7 @@ Buffer::Buffer(
if (size != 0) {
validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters);
}
+unique_id_ = next_unique_id.fetch_add(1);
}

std::shared_ptr<Buffer> Buffer::create(
@@ -524,6 +528,8 @@ v1::BufferHandle v1::CreateBuffer(InterleavedBufferConfig config) { return v1::B

void v1::DeallocateBuffer(const BufferHandle& buffer) { v0::DeallocateBuffer(*buffer); }

+std::size_t v1::GetId(const BufferHandle& buffer) { return buffer->unique_id(); }

void v1::WriteToBuffer(const BufferHandle& buffer, stl::Span<const std::byte> host_buffer) {
detail::WriteToBuffer(*buffer, stl::Span<const uint8_t>{reinterpret_cast<const std::uint8_t *>(host_buffer.data()), host_buffer.size()});
}
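As a sanity check of the property the change relies on, a minimal standalone model (not tt-metal code): fetch_add is an atomic read-modify-write, so constructors racing on next_unique_id still observe pairwise-distinct ids.

// Minimal model: ten threads each draw 100 ids from one atomic counter;
// all 1000 results are distinct regardless of interleaving.
#include <atomic>
#include <cstddef>
#include <set>
#include <thread>
#include <vector>

int main() {
    std::atomic<std::size_t> next_unique_id{0};
    std::vector<std::size_t> ids(1000);
    std::vector<std::thread> threads;
    for (int t = 0; t < 10; ++t) {
        threads.emplace_back([&ids, &next_unique_id, t] {
            for (int i = 0; i < 100; ++i) {
                ids[t * 100 + i] = next_unique_id.fetch_add(1);
            }
        });
    }
    for (auto& th : threads) {
        th.join();
    }
    std::set<std::size_t> distinct(ids.begin(), ids.end());
    return distinct.size() == 1000 ? 0 : 1;  // 0: every id was unique
}

A relaxed memory order would also suffice here, since only the uniqueness of the returned values matters; the sketch keeps the default sequentially consistent ordering for simplicity.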
4 changes: 4 additions & 0 deletions tt_metal/impl/buffers/buffer.hpp
@@ -248,6 +248,8 @@ class Buffer final {
std::optional<SubDeviceId> sub_device_id() const { return sub_device_id_; }
std::optional<SubDeviceManagerId> sub_device_manager_id() const { return sub_device_manager_id_; }

+size_t unique_id() const { return unique_id_; }

Buffer(
Device *device,
DeviceAddr size,
@@ -300,6 +302,8 @@
std::shared_ptr<const BufferPageMapping> buffer_page_mapping_;

std::weak_ptr<Buffer> weak_self;
+size_t unique_id_ = 0;
+static std::atomic<size_t> next_unique_id;
};

} // namespace v0
8 changes: 8 additions & 0 deletions tt_metal/include/tt_metal/buffer.hpp
@@ -28,6 +28,14 @@ BufferHandle CreateBuffer(InterleavedBufferConfig config);
*/
void DeallocateBuffer(const BufferHandle& buffer);

+/**
+ * @brief Retrieves the ID of the specified buffer.
+ *
+ * @param buffer The buffer to get the ID from.
+ * @return The unique ID of the buffer.
+ */
+std::size_t GetId(const BufferHandle& buffer);

/**
* @brief Copies data from a host buffer into the specified device buffer.
*
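A hypothetical caller of the new v1::GetId, based only on the declarations visible in this commit; the InterleavedBufferConfig field names and the include path are assumptions and may not match the actual headers.

// Hypothetical usage sketch (include path and config fields assumed).
#include "tt_metal/buffer.hpp"

#include <cstddef>

void example(tt::tt_metal::Device* device) {
    using namespace tt::tt_metal;
    InterleavedBufferConfig config{
        .device = device,                  // assumed field
        .size = 2048,                      // assumed field, bytes
        .page_size = 1024,                 // assumed field, bytes
        .buffer_type = BufferType::DRAM};  // assumed field

    v1::BufferHandle buf = v1::CreateBuffer(config);
    std::size_t id = v1::GetId(buf);  // stable across address reuse
    v1::DeallocateBuffer(buf);

    v1::BufferHandle buf2 = v1::CreateBuffer(config);
    // v1::GetId(buf2) != id even if buf2 lands at the freed device address.
}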
32 changes: 16 additions & 16 deletions ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -95,9 +95,8 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) {
}
void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
const std::lock_guard<std::mutex> lock(mutex);
-auto buf_id = add_buffer(buffer);
+auto buffer_id = add_buffer(buffer);

-auto alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
auto counter = graph.size();

std::unordered_map<std::string, std::string> params = {
@@ -113,15 +112,15 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
.counter = counter,
.node_type = kNodeBufferAllocate,
.params = params,
-.connections = {buf_id}
+.connections = {buffer_id}
});
graph[current_op_id.top()].connections.push_back(counter);
}
}

void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
const std::lock_guard<std::mutex> lock(mutex);
-auto buffer_idx = add_buffer(buffer);
+auto buffer_id = add_buffer(buffer);
auto counter = graph.size();
std::unordered_map<std::string, std::string> params = {
{kSize, std::to_string(buffer->size())},
@@ -135,7 +134,7 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
.counter = counter,
.node_type = kNodeBufferDeallocate,
.params = params,
-.connections = {buffer_idx}
+.connections = {buffer_id}
});
graph[current_op_id.top()].connections.push_back(counter);
}
@@ -286,31 +285,31 @@ int GraphProcessor::add_tensor(const Tensor& t) {
} else {
tensor_id = t.tensor_id.value();
}
-auto tensor_counter = id_to_counter.count(tensor_id) > 0 ? id_to_counter[tensor_id] : graph.size();
+auto tensor_counter = tensor_id_to_counter.count(tensor_id) > 0 ? tensor_id_to_counter[tensor_id] : graph.size();
auto shape = t.get_shape();
std::unordered_map<std::string, std::string> params = {
{kShape, fmt::format("{}", shape)},
{kTensorId, fmt::format("{}", tensor_id)},
};

-if (id_to_counter.count(tensor_id) == 0) {
+if (tensor_id_to_counter.count(tensor_id) == 0) {
graph.push_back(Vertex{.counter = tensor_counter, .node_type = kNodeTensor, .params = params, .connections = {}});
-id_to_counter[tensor_id] = tensor_counter;
+tensor_id_to_counter[tensor_id] = tensor_counter;
}

if (buffer) {
-auto buffer_idx = add_buffer(buffer);
-graph[buffer_idx].connections.push_back(tensor_counter);
+auto buffer_id = add_buffer(buffer);
+graph[buffer_id].connections.push_back(tensor_counter);
} else {
tt::log_info("Tensor doesn't have buffer, but storage is {}", demangle(get_type_in_var(t.get_storage()).name()));
}
return tensor_counter;
}

int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) {
-auto buffer_alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
-auto counter = id_to_counter.count(buffer_alloc_id) > 0 ? id_to_counter[buffer_alloc_id] : graph.size();
-if (id_to_counter.count(buffer_alloc_id) == 0) {
+auto buffer_id = buffer->unique_id();
+auto counter = buffer_id_to_counter.count(buffer_id) > 0 ? buffer_id_to_counter[buffer_id] : graph.size();
+if (buffer_id_to_counter.count(buffer_id) == 0) {
std::unordered_map<std::string, std::string> params = {
{kSize, std::to_string(buffer->size())},
{kType, buffer->is_dram() ? "DRAM" : "L1"},
Expand All @@ -324,10 +323,10 @@ int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) {
.connections = {}
});
graph[current_op_id.top()].connections.push_back(counter);
-id_to_counter[buffer_alloc_id] = counter;
+buffer_id_to_counter[buffer_id] = counter;
return counter;
}
-return id_to_counter[buffer_alloc_id];
+return buffer_id_to_counter[buffer_id];
}


@@ -428,7 +427,8 @@ void GraphProcessor::end_function_process_optional_tensor(const std::any& any_val
void GraphProcessor::begin_capture(RunMode mode) {
const std::lock_guard<std::mutex> lock(mutex);
graph.clear();
-id_to_counter.clear();
+buffer_id_to_counter.clear();
+tensor_id_to_counter.clear();
graph.push_back(Vertex{
.counter = 0,
.node_type = kNodeCaptureStart,
Expand Down
3 changes: 2 additions & 1 deletion ttnn/cpp/ttnn/graph/graph_processor.hpp
@@ -73,7 +73,8 @@ namespace ttnn::graph {
std::mutex mutex;
RunMode run_mode = RunMode::NORMAL;
std::stack<int> current_op_id;
-std::unordered_map<std::int64_t, int> id_to_counter;
+std::unordered_map<std::int64_t, int> buffer_id_to_counter;
+std::unordered_map<std::int64_t, int> tensor_id_to_counter;
int last_finished_op_id = -1;
std::vector<Vertex> graph;
std::unordered_map<std::type_index, ProcessFunc> begin_function_any_map;
