Skip to content

Commit

Permalink
#0: Enable metal on galaxy.
Browse files Browse the repository at this point in the history
#8305: add Galaxy cluster apis
#8305: cleanup, add print
#8450: Establish tunnels originating from an mmio device. Determine the remote chips as well as their order on the tunnel.
#8452: add tests for tg pipeline
#0:    patch for tg workflows.
#8450: Add tables for tunnel dispatch workers with build settings.
       Populate build settings for tunnel kernels.
       Launch FD2 kernels based on information in tunnel device dispatch worker map.
       Enable 4 devices per hugepage/channel
#0:    disable hanging/failing tests for Galaxy
#0:    Skip using channels 3 and 7, which use hugepage channel 3. This (4th) hugepage is not a full 1 GB in size: 256 MB of the 4th hugepage is taken up by syseng tools.
#0:    re-enable Galaxy sharded tests, reduce one test runtime for Galaxy
#0:    fix cluster init for Galaxy
#8953: Fix hardcoding of queue sizes in tests.
#8450: Fix compute grid selection for N150. An N150 can be a standalone system or part of a TG system; on TG, the compute grid for the N150 is different from that of a standalone N150.
#0:    Reduce prefetch q entries to account for Galaxy CQ size.
#0:    galaxy mesh return any available device
#0:    Fix device mesh close for Galaxy
#8450: Update Galaxy device creation.
  • Loading branch information
ubcheema committed Jun 11, 2024
1 parent fa443d2 commit d35ea9d
Show file tree
Hide file tree
Showing 14 changed files with 1,462 additions and 710 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue
// Clear out command queue
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint32_t cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs();
uint32_t cq_size = device->sysmem_manager().get_cq_size();

std::vector<uint32_t> cq_zeros((cq_size - CQ_START) / sizeof(uint32_t), 0);

tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), CQ_START, mmio_device_id, channel);
tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), get_absolute_cq_offset(channel, 0, cq_size) + CQ_START, mmio_device_id, channel);

for (const bool cq_write : {true, false}) {
for (const bool cq_read : {true, false}) {
Expand Down Expand Up @@ -327,6 +327,7 @@ namespace dram_tests {
// Runs the shared EnqueueWriteBuffer/EnqueueReadBuffer helper against every
// device in the fixture's `devices_`, using a DRAM buffer config made of a
// single page with page_size 2048.
TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) {
    TestBufferConfig single_page_dram_cfg = {
        .num_pages = 1,
        .page_size = 2048,
        .buftype = BufferType::DRAM};

    for (Device *target_device : devices_) {
        // Log the device id first so any failure below can be attributed to a specific device.
        tt::log_info("Running On Device {}", target_device->id());

        local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(
            target_device,
            target_device->command_queue(),
            single_page_dram_cfg);
    }
}
Expand Down Expand Up @@ -428,11 +429,9 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) {
// Requires enqueue write buffer
TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {
for (Device *device : this->devices_) {
tt::log_info("Running On Device {}", device->id());
uint32_t page_size = 2048;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t command_issue_region_size = 805310400;
uint32_t command_issue_region_size = device->sysmem_manager().get_issue_queue_size(0);

uint32_t max_command_size = command_issue_region_size - CQ_START;
uint32_t buffer = 14240;
Expand All @@ -446,10 +445,9 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {

TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
for (Device *device : this->devices_) {
tt::log_info("Running On Device {}", device->id());
uint32_t page_size = 2048;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t command_queue_size = device->sysmem_manager().get_cq_size();
uint32_t num_pages = command_queue_size / page_size;

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};
Expand All @@ -465,10 +463,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) {
uint32_t small_page_size = 2048; // page size for second read

for (Device *device : devices_) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t command_completion_region_size = 268431360;
tt::log_info("Running On Device {}", device->id());
uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0);

uint32_t first_buffer_size = tt::round_up(command_completion_region_size * 0.95, large_page_size);

Expand Down Expand Up @@ -504,10 +500,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) {
TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) {
// Using default 75-25 issue and completion queue split
for (Device *device : devices_) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t command_completion_region_size = 268431360;
tt::log_info("Running On Device {}", device->id());
uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0);

uint32_t num_pages_buff_1 = 9;
uint32_t page_size_buff_1 = 2048;
Expand Down Expand Up @@ -653,6 +647,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki
.seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16};

for (Device *device : devices_) {
tt::log_info("Running on Device {}", device->id());
EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer<true>(
device, device->command_queue(), config));
}
Expand All @@ -672,34 +667,34 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo

// TODO: Split this into separate tests
TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) {
std::map<std::string, std::vector<std::array<uint32_t, 2>>> test_params;

for (Device *device : devices_) {
for (const std::array<uint32_t, 2> cores :
{std::array<uint32_t, 2>{1, 1},
std::array<uint32_t, 2>{5, 1},
std::array<uint32_t, 2>{1, 5},
std::array<uint32_t, 2>{5, 3},
std::array<uint32_t, 2>{3, 5},
std::array<uint32_t, 2>{5, 5},
std::array<uint32_t, 2>{
static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}) {
for (const std::array<uint32_t, 2> num_pages : {
std::array<uint32_t, 2>{1, 1},
std::array<uint32_t, 2>{2, 1},
std::array<uint32_t, 2>{1, 2},
std::array<uint32_t, 2>{2, 2},
std::array<uint32_t, 2>{7, 11},
std::array<uint32_t, 2>{3, 65},
std::array<uint32_t, 2>{67, 4},
std::array<uint32_t, 2>{3, 137},
}) {
for (const std::array<uint32_t, 2> page_shape : {
std::array<uint32_t, 2>{32, 32},
std::array<uint32_t, 2>{1, 4},
std::array<uint32_t, 2>{1, 120},
std::array<uint32_t, 2>{1, 1024},
std::array<uint32_t, 2>{1, 2048},
}) {
if (tt::Cluster::instance().is_galaxy_cluster()) {
test_params = {
{"cores",
{{1, 1},
{static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
{"num_pages", {{3, 65}}},
{"page_shape", {{32, 32}}}};
} else {
test_params = {
{"cores",
{{1, 1},
{5, 1},
{1, 5},
{5, 3},
{3, 5},
{5, 5},
{static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
{"num_pages", {{1, 1}, {2, 1}, {1, 2}, {2, 2}, {7, 11}, {3, 65}, {67, 4}, {3, 137}}},
{"page_shape", {{32, 32}, {1, 4}, {1, 120}, {1, 1024}, {1, 2048}}}};
}
for (const std::array<uint32_t, 2> cores : test_params.at("cores")) {
for (const std::array<uint32_t, 2> num_pages : test_params.at("num_pages")) {
for (const std::array<uint32_t, 2> page_shape : test_params.at("page_shape")) {
for (const TensorMemoryLayout shard_strategy :
{TensorMemoryLayout::HEIGHT_SHARDED,
TensorMemoryLayout::WIDTH_SHARDED,
Expand All @@ -712,7 +707,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) {
config.num_iterations = num_iterations;
config.mem_config = shard_strategy;
config.page_shape = page_shape;
tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
device, device->command_queue(), config, BufferType::L1, false);
}
Expand Down Expand Up @@ -800,7 +795,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) {
config.num_iterations = num_iterations;
config.mem_config = shard_strategy;
config.page_shape = page_shape;
tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
device, device->command_queue(), config, BufferType::L1, true);
}
Expand Down
10 changes: 9 additions & 1 deletion tt_metal/common/core_descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id,

auto compute_with_storage_start = desc_yaml["compute_with_storage_grid_range"]["start"];
auto compute_with_storage_end = desc_yaml["compute_with_storage_grid_range"]["end"];
if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") {
compute_with_storage_start = desc_yaml["tg_compute_with_storage_grid_range"]["start"];
compute_with_storage_end = desc_yaml["tg_compute_with_storage_grid_range"]["end"];
}
TT_ASSERT(compute_with_storage_start.IsSequence() and compute_with_storage_end.IsSequence());
TT_ASSERT(compute_with_storage_end[0].as<size_t>() >= compute_with_storage_start[0].as<size_t>());
TT_ASSERT(compute_with_storage_end[1].as<size_t>() >= compute_with_storage_start[1].as<size_t>());
Expand All @@ -136,7 +140,11 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id,
}

std::vector<RelativeCoreCoord> dispatch_cores;
for (const auto& core_node : desc_yaml["dispatch_cores"]) {
auto dispatch_cores_string = "dispatch_cores";
if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") {
dispatch_cores_string = "tg_dispatch_cores";
}
for (const auto& core_node : desc_yaml[dispatch_cores_string]) {
RelativeCoreCoord coord = {};
if (core_node.IsSequence()) {
// Logical coord
Expand Down
9 changes: 9 additions & 0 deletions tt_metal/core_descriptors/wormhole_b0_80_arch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,21 @@ nebula_x1:
start: [0, 0]
end: [7, 7]

tg_compute_with_storage_grid_range: # Logical only start and end [x, y]
start: [0, 0]
end: [7, 3]

storage_cores:
[]

dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]]

tg_dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1],
[0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2],
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]]

dispatch_core_type:
"tensix"

Expand Down
Loading

0 comments on commit d35ea9d

Please sign in to comment.