Skip to content

Commit

Permalink
#13394: Galaxy 2cq support
Browse files Browse the repository at this point in the history
        Increase demux_d to 2 for galaxy tunnels.
        Add multicq galaxy tests. Fix queue flow control register indexing.
  • Loading branch information
ubcheema committed Oct 3, 2024
1 parent 595932a commit feb8d31
Show file tree
Hide file tree
Showing 10 changed files with 876 additions and 348 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ void init(int argc, char **argv) {
log_info(LogTest, " -hp: host huge page issue buffer size (default {})", DEFAULT_HUGEPAGE_ISSUE_BUFFER_SIZE);
log_info(LogTest, " -pq: prefetch queue entries (default {})", DEFAULT_PREFETCH_Q_ENTRIES);
log_info(LogTest, " -cs: cmddat q size (default {})", DEFAULT_CMDDAT_Q_SIZE);
log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", dispatch_constants::get(CoreType::WORKER).prefetch_d_buffer_size());
log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", dispatch_constants::get(CoreType::WORKER, 1).prefetch_d_buffer_size());
log_info(LogTest, " -ss: scratch cb size (default {})", DEFAULT_SCRATCH_DB_SIZE);
log_info(LogTest, " -pcies: size of data to transfer in pcie bw test type (default: {})", PCIE_TRANSFER_SIZE_DEFAULT);
log_info(LogTest, " -dpgs: dram page size in dram bw test type (default: {})", DRAM_PAGE_SIZE_DEFAULT);
Expand Down Expand Up @@ -154,7 +154,7 @@ void init(int argc, char **argv) {
pcie_transfer_size_g = test_args::get_command_option_uint32(input_args, "-pcies", PCIE_TRANSFER_SIZE_DEFAULT);
dram_page_size_g = test_args::get_command_option_uint32(input_args, "-dpgs", DRAM_PAGE_SIZE_DEFAULT);
dram_pages_to_read_g = test_args::get_command_option_uint32(input_args, "-dpgr", DRAM_PAGES_TO_READ_DEFAULT);
prefetch_d_buffer_size_g = test_args::get_command_option_uint32(input_args, "-pdcs", dispatch_constants::get(CoreType::WORKER).prefetch_d_buffer_size());
prefetch_d_buffer_size_g = test_args::get_command_option_uint32(input_args, "-pdcs", dispatch_constants::get(CoreType::WORKER, 1).prefetch_d_buffer_size());

test_type_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_TEST_TYPE);
all_workers_g.end_coord.x = test_args::get_command_option_uint32(input_args, "-wx", all_workers_g.end_coord.x);
Expand Down Expand Up @@ -1516,7 +1516,7 @@ void configure_for_single_chip(Device *device,
uint32_t packetized_path_test_results_size) {

const CoreType dispatch_core_type = CoreType::WORKER;
uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;

CoreCoord prefetch_core = {0, 0};
Expand Down Expand Up @@ -1887,7 +1887,7 @@ void configure_for_single_chip(Device *device,
std::vector<uint32_t> dispatch_compile_args = {
dispatch_buffer_base,
dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE,
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages(),
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
dispatch_cb_sem, // overridden below for h
split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, // overridden below for dispatch_h
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS,
Expand Down Expand Up @@ -2144,7 +2144,7 @@ void configure_for_multi_chip(Device *device,
uint32_t packetized_path_test_results_size) {

const CoreType dispatch_core_type = CoreType::WORKER;
uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;
TT_ASSERT(num_compute_cores == (device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y));

Expand Down Expand Up @@ -2620,7 +2620,7 @@ void configure_for_multi_chip(Device *device,
std::vector<uint32_t> dispatch_compile_args = {
dispatch_buffer_base,
dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE,
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages(),
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
dispatch_cb_sem, // overridden below for h
split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem,
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRo
TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) {
for (Device *device : devices_) {
TestBufferConfig config = {.num_pages = 131072, .page_size = 128, .buftype = BufferType::DRAM};

tt::log_info("Running On Device {}", device->id());
local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,106 @@ bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, v
namespace basic_tests {
namespace dram_tests {

TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToDramBank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}

}

TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllDramBanks) {
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
TestBufferConfig config = {
.num_pages = uint32_t(device->num_banks(BufferType::DRAM)),
.page_size = 2048,
.buftype = BufferType::DRAM};

CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) {
constexpr uint32_t num_round_robins = 2;
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
TestBufferConfig config = {
.num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)),
.page_size = 2048,
.buftype = BufferType::DRAM};

CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueMultiDeviceFixture, Sending131072Pages) {
// Was a failing case where we used to accidentally program cb num pages to be total
// pages instead of cb num pages.
TestBufferConfig config = {
.num_pages = 131072,
.page_size = 128,
.buftype = BufferType::DRAM};
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram) {
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM};

CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram2) {
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
// From stable diffusion read buffer
TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM};

CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueMultiDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
for (Device *device : devices_) {
tt::log_info("Running On Device {}", device->id());
uint32_t page_size = 2048;
uint32_t command_queue_size = device->sysmem_manager().get_cq_size();
uint32_t num_pages = command_queue_size / page_size;

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};

CommandQueue& a = device->command_queue(0);
CommandQueue& b = device->command_queue(1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
}
}

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
CommandQueue& a = this->device_->command_queue(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "gtest/gtest.h"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/test_utils/env_vars.hpp"
#include "tt_metal/impl/dispatch/command_queue.hpp"
#include "tt_metal/llrt/rtoptions.hpp"
Expand All @@ -26,8 +27,10 @@ class MultiCommandQueueSingleDeviceFixture : public ::testing::Test {
arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());
DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER;
if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) {
tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()");
dispatch_core_type = DispatchCoreType::ETH;
if (!tt::tt_metal::IsGalaxyCluster()) {
tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()");
dispatch_core_type = DispatchCoreType::ETH;
}
}
device_ = tt::tt_metal::CreateDevice(0, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type);
}
Expand All @@ -40,6 +43,48 @@ class MultiCommandQueueSingleDeviceFixture : public ::testing::Test {
tt::ARCH arch_;
};

class MultiCommandQueueMultiDeviceFixture : public ::testing::Test {
protected:
void SetUp() override {
auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE");
if (slow_dispatch) {
TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset");
GTEST_SKIP();
}
auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs();
if (num_cqs != 2) {
TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2");
GTEST_SKIP();
}
arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());


DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER;
if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) {
if (!tt::tt_metal::IsGalaxyCluster()) {
tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()");
dispatch_core_type = DispatchCoreType::ETH;
}
}

const chip_id_t mmio_device_id = 0;
reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type);
for (const auto &[id, device] : reserved_devices_) {
devices_.push_back(device);
}

num_devices_ = reserved_devices_.size();
}

void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); }

std::vector<tt::tt_metal::Device*> devices_;
std::map<chip_id_t, tt::tt_metal::Device*> reserved_devices_;
size_t num_devices_;
tt::ARCH arch_;
};


class SingleDeviceTraceFixture: public ::testing::Test {
protected:
Device* device_;
Expand Down
14 changes: 10 additions & 4 deletions tt_metal/core_descriptors/wormhole_b0_80_arch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ nebula_x1:

tg_compute_with_storage_grid_range: # Logical only start and end [x, y]
start: [0, 0]
end: [7, 3]
end: [7, 1]

storage_cores:
[]
Expand All @@ -53,7 +53,10 @@ nebula_x1:
tg_dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1],
[0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2],
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]]
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3],
[0, -4], [1, -4], [2, -4], [3, -4], [4, -4], [5, -4], [6, -4], [7, -4],
[0, -5], [1, -5], [2, -5], [3, -5], [4, -5], [5, -5], [6, -5], [7, -5],
[0, -6], [1, -6], [2, -6], [3, -6], [4, -6], [5, -6], [6, -6], [7, -6]]

dispatch_core_type:
"tensix"
Expand All @@ -65,7 +68,7 @@ nebula_x1:

tg_compute_with_storage_grid_range: # Logical only start and end [x, y]
start: [0, 0]
end: [7, 3]
end: [7, 1]

storage_cores:
[]
Expand All @@ -76,7 +79,10 @@ nebula_x1:
tg_dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1],
[0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2],
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]]
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3],
[0, -4], [1, -4], [2, -4], [3, -4], [4, -4], [5, -4], [6, -4], [7, -4],
[0, -5], [1, -5], [2, -5], [3, -5], [4, -5], [5, -5], [6, -5], [7, -5],
[0, -6], [1, -6], [2, -6], [3, -6], [4, -6], [5, -6], [6, -6], [7, -6]]

dispatch_core_type:
"tensix"
Expand Down
7 changes: 7 additions & 0 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ class Buffer;
*/
size_t GetNumAvailableDevices();

/**
* Returns whether Tenstorrent devices are in a Galaxy cluster
*
* Return value: bool
*/
bool IsGalaxyCluster();

/**
* Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted
*
Expand Down
Loading

0 comments on commit feb8d31

Please sign in to comment.