diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 964caa6a5..b79bde0e1 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -161,6 +161,19 @@ createBufferFromTensorRef(::tt::tt_metal::Device *device, std::shared_ptr<::tt::tt_metal::Buffer> buffer = ::tt::tt_metal::CreateBuffer(shardedBufferConfig); assert(tensorRef->address()); + + // Issue #408: Temporary Hack, remove when fix available. + // Update tt-metal BUFFER_MAP with updated address and remove + // entry for original alloc'd address. + auto &buffer_map = tt::tt_metal::detail::BUFFER_MAP; + auto map_copy = buffer_map.value(); + auto old_key = std::make_tuple(device->id(), buffer->address()); + if (auto it = map_copy.find(old_key); it != map_copy.end()) { + auto new_key = std::make_tuple(device->id(), tensorRef->address()); + buffer_map.insert(new_key, it->second); + buffer_map.erase(old_key); + } + buffer->set_address(tensorRef->address()); return buffer; } diff --git a/runtime/lib/ttmetal/command_queue.cpp b/runtime/lib/ttmetal/command_queue.cpp index 589989653..d93e012c7 100644 --- a/runtime/lib/ttmetal/command_queue.cpp +++ b/runtime/lib/ttmetal/command_queue.cpp @@ -242,8 +242,10 @@ void CQExecutor::execute( void CQExecutor::execute( ::tt::target::metal::CreateBufferCommand const *command) { - buffers[command->ref()->global_id()] = - createBufferFromTensorRef(device, command->ref()); + if (buffers.find(command->ref()->global_id()) == buffers.end()) { + buffers[command->ref()->global_id()] = + createBufferFromTensorRef(device, command->ref()); + } } void CQExecutor::execute( @@ -252,7 +254,7 @@ void CQExecutor::execute( assert(iter != buffers.end() && "Buffer not allocated"); assert(iter->second != nullptr && "Buffer already deallocated"); ::tt::tt_metal::DeallocateBuffer(*iter->second); - iter->second.reset(); + buffers.erase(iter); } void CQExecutor::execute( diff --git a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir index 494e3f19a..fdd65864d 100644 --- a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-to-ttmetal-backend-pipeline --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #any_device = #tt.operand_constraint func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 1664ceba3..1cebfe451 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -1,5 +1,4 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s -// UNSUPPORTED: true +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s #l1_ = #tt.memory_space #untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> @@ -7,11 +6,17 @@ #tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>> func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #untilized>, tensor<64x128xf32, #tilized>) -> tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<64x128xf32, #tilized>, tensor<64x128xf32, #tilized2x2>) -> tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<64x128xf32, #untilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<64x128xf32, #tilized2x2>, tensor<64x128xf32, #untilized2x2>) -> tensor<64x128xf32, #untilized2x2> return %5 : tensor<64x128xf32, #untilized2x2> } @@ -22,13 +27,19 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64 #tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>> func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<2x3x64x128xf32, #untilized4D>, tensor<2x3x64x128xf32, #tilized4D>) -> tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<2x3x64x128xf32, #tilized4D>, tensor<2x3x64x128xf32, #tilized4D_2x2>) -> tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<2x3x64x128xf32, #untilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<2x3x64x128xf32, #tilized4D_2x2>, tensor<2x3x64x128xf32, #untilized4D_2x2>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2> @@ -40,23 +51,33 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten #tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>> func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> { // move to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<96x192xf32, #untilized_big>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // move to tilized 2x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #tilized_big_3x2>) -> tensor<96x192xf32, #tilized_big_3x2> // move to tilized 3x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x6> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<96x192xf32, #tilized_big_3x2>, tensor<96x192xf32, #tilized_big_3x6>) -> tensor<96x192xf32, #tilized_big_3x6> // move back to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %6 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %7 = "ttir.to_layout"(%5, %6) : (tensor<96x192xf32, #tilized_big_3x6>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // untilize + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %8 = tensor.empty() : tensor<96x192xf32, #untilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %9 = "ttir.to_layout"(%7, %8) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> return %9 : tensor<96x192xf32, #untilized_big> diff --git a/test/ttmlir/Silicon/TTMetal/to_layout.mlir b/test/ttmlir/Silicon/TTMetal/to_layout.mlir index 6b361a76d..f268e7b39 100644 --- a/test/ttmlir/Silicon/TTMetal/to_layout.mlir +++ b/test/ttmlir/Silicon/TTMetal/to_layout.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #l1_ = #tt.memory_space #layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>>