Skip to content

Commit

Permalink
Merge branch 'main' into llama32-vision
Browse files Browse the repository at this point in the history
  • Loading branch information
cglagovichTT authored Oct 24, 2024
2 parents 2c5ff7f + af83135 commit 4e79091
Show file tree
Hide file tree
Showing 55 changed files with 607 additions and 792 deletions.
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu
ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT
ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @yan-zaretskiy @jaykru-tt
ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957
ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
ttnn/cpp/ttnn/operations/experimental/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
Expand Down
46 changes: 29 additions & 17 deletions tests/tt_eager/tensors/test_async_tensor_apis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ using namespace tt;
using namespace tt_metal;
using namespace constants;

namespace {
uint32_t get_device_buffer_address(const Tensor& tensor) {
TT_FATAL(std::holds_alternative<DeviceStorage>(tensor.get_storage()), "Tensor storage is not DeviceStorage");
auto buffer = std::get<DeviceStorage>(tensor.get_storage()).buffer;
uint32_t result = 0;
buffer->device()->push_work([&]() {
result = buffer->address();
}, true);
return result;
}
}

TEST_F(CommonFixture, TestTensorOwnershipSanity) {
// Sanity test tensor read, write and update paths with synchronous
// Ensure that tensor data is copied and owned as expected
Expand Down Expand Up @@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) {
Tensor output_tensor_host = output_tensor_device_2.cpu();
// Test tensor deallocation in async mode: deallocate tensors after using them
if (i == 0) {
input_a_addr = std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address();
input_b_addr = std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address();
input_c_addr = std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address();
output_1_addr = std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address();
output_2_addr = std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address();
input_a_addr = get_device_buffer_address(input_tensor_a);
input_b_addr = get_device_buffer_address(input_tensor_b);
input_c_addr = get_device_buffer_address(input_tensor_c);
output_1_addr = get_device_buffer_address(output_tensor_device);
output_2_addr = get_device_buffer_address(output_tensor_device_2);
} else {
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address(), input_a_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address(), input_b_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address(), input_c_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address(), output_1_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr);
}
input_tensor_a.deallocate();
input_tensor_b.deallocate();
Expand Down Expand Up @@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
Tensor tensor2 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address();
uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2);
// Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2
// deallocated
tensor2 = tensor1;
Expand All @@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
// prev addr for tensor2
Tensor tensor3 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr);
EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address());
EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr);
EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2));
}
log_info(LogTest, "Testing Device tensor self-assignment through function");
for (int i = 0; i < 5; i++) {
Tensor device_tensor =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t device_tensor_address = device_tensor.device_buffer()->address();
uint32_t device_tensor_address = get_device_buffer_address(device_tensor);
// This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor
// Ensure ref count and address remain unchanged
device_tensor = tensor_identity_copy_function(device_tensor);
EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1);
EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address);
EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address);
}

log_info(LogTest, "Testing Device tensor move assignment");
Expand All @@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
log_info(LogTest, "Testing Device tensor self-assignment");
Tensor tensor_to_self_assign =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(0), DataType::BFLOAT16).to(device);
uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address();
uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign);
tensor_to_self_assign = tensor_to_self_assign;
EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1);
tensor_to_self_assign = std::move(tensor_to_self_assign);
EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address);
EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address);
auto barrier_tensor = tensor_to_self_assign.cpu();
device->enable_async(false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ int main(int argc, char** argv) {

// limit size of the L1 buffer to do not exceed global L1 size
uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 256 : num_tiles) * page_size;
auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);
auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);

////////////////////////////////////////////////////////////////////////////
// Application Setup
Expand Down Expand Up @@ -166,7 +166,7 @@ int main(int argc, char** argv) {
for (int j = 0; j < num_cores_c; j++) {
CoreCoord core = {(std::size_t)j, (std::size_t)i};
uint32_t core_index = i * num_cores_c + j;
uint32_t l1_buffer_addr = l1_buffer.address();
uint32_t l1_buffer_addr = l1_buffer->address();

const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c};
SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,14 @@ int main(int argc, char** argv) {
log_error(tt::LogTest, "Command line arguments found exception", e.what());
}

TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);
TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);

// Device setup
int device_id = 0;
tt_metal::Device* device = tt_metal::CreateDevice(device_id);

// Application setup
auto buffer = tt_metal::Buffer(
auto buffer = tt_metal::Buffer::create(
device, transfer_size, page_size, buffer_type == 0 ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1);

std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
Expand All @@ -104,7 +104,7 @@ int main(int argc, char** argv) {
// Execute application
if (!skip_write) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false);
EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false);
Finish(device->command_queue());
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
Expand All @@ -119,7 +119,7 @@ int main(int argc, char** argv) {

if (!skip_read) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true);
EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true);
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
input_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
tt_metal::Buffer input_buffer(
auto input_buffer = Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -212,7 +212,7 @@ int main(int argc, char **argv) {
num_tiles_per_core_group_1,
num_tiles_per_core_group_2,
kernel,
input_buffer.address(),
input_buffer->address(),
num_reqs_at_a_time,
single_tile_size,
tile_format);
Expand All @@ -221,7 +221,7 @@ int main(int argc, char **argv) {
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
if (access_type == 0) {
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
} else {
for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) {
CoreCoord core = {i / num_cores_y, i % num_cores_y};
Expand Down Expand Up @@ -276,7 +276,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
num_cores_y,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,18 +673,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address());
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
Expand Down Expand Up @@ -713,7 +713,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_cores_list,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -869,18 +869,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address());
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
Expand Down Expand Up @@ -909,7 +909,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_l1_writer_cores_ordered,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2023, 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// NULL kernel is not 0, subtract off overhead
#if KERNEL_BYTES > 16
constexpr uint32_t empty_kernel_bytes = 16;
uint8_t data1[KERNEL_BYTES - empty_kernel_bytes] __attribute__ ((section ("l1_data_test_only"))) __attribute__((used));
// An empty kernel is 16 bytes, so only pad above that to fake a
// bigger kernel.
#define EMPTY_KERNEL_BYTES 16
#if KERNEL_BYTES > EMPTY_KERNEL_BYTES
[[gnu::section(".text"), gnu::used]]
static uint8_t lorem_ipsum[KERNEL_BYTES - EMPTY_KERNEL_BYTES];
#endif

#ifdef KERNEL_GLOBAL
volatile uint32_t global = 4;
[[gnu::section(".data"), gnu::used]]
static uint32_t global;
#endif

#ifdef COMPILE_FOR_TRISC
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ int main(int argc, char **argv) {
activations_addr,
activations_addr / 1024,
Nt);
std::vector<tt_metal::Buffer> l1_buffers;
std::vector<std::shared_ptr<tt_metal::Buffer>> l1_buffers;

int l1_buffers_size = 1;
if (!(single_read || one_buffer_share)) {
Expand All @@ -199,8 +199,8 @@ int main(int argc, char **argv) {
l1_buffers.reserve(l1_buffers_size);
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1);
tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);
l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1));
tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);

if (single_read || one_buffer_share)
break;
Expand All @@ -213,7 +213,7 @@ int main(int argc, char **argv) {
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
std::vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec);
tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec);
auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);

if (print_tensor) {
Expand Down Expand Up @@ -260,7 +260,7 @@ int main(int argc, char **argv) {
CoreCoord core = {(size_t)c, (size_t)r};

int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c);
auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address();
auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address();

uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {

uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device);

std::vector<std::unique_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<Buffer>> buffers;
int alloc_size_idx = 0;
uint32_t total_buffer_size = 0;
while (total_size_bytes < alloc_limit) {
Expand All @@ -44,7 +44,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {
if (total_buffer_size + buffer_size >= alloc_limit) {
break;
}
std::unique_ptr<tt::tt_metal::Buffer> buffer = std::make_unique<tt::tt_metal::Buffer>(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
buffers.emplace_back(std::move(buffer));
total_buffer_size += buffer_size;
EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size);
Expand Down
Loading

0 comments on commit 4e79091

Please sign in to comment.