Skip to content

Commit

Permalink
Revert "#16356: Program Dispatch Modifications for MeshWorkload"
Browse files Browse the repository at this point in the history
This reverts commit 3755428.
  • Loading branch information
rayraykay committed Jan 1, 2025
1 parent 4da41ff commit d9304dd
Show file tree
Hide file tree
Showing 20 changed files with 1,503 additions and 1,825 deletions.
1 change: 0 additions & 1 deletion tt_metal/impl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ set(IMPL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/allocator/basic_allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/allocator/l1_banking_allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program/program.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program/program_dispatch_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp
Expand Down
3 changes: 1 addition & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -938,7 +938,7 @@ void Device::init_command_queue_host() {
this->sysmem_manager_ = std::make_unique<SystemMemoryManager>(this->id_, this->num_hw_cqs());
hw_command_queues_.resize(num_hw_cqs());
for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) {
hw_command_queues_[cq_id] = std::make_unique<HWCommandQueue>(this, cq_id, dispatch_downstream_noc);
hw_command_queues_[cq_id] = std::make_unique<HWCommandQueue>(this, cq_id, NOC::NOC_0);
// Need to do this since CommandQueue constructor is private
sw_command_queues_.push_back(std::unique_ptr<CommandQueue>(new CommandQueue(this, cq_id)));
}
Expand Down Expand Up @@ -1750,7 +1750,6 @@ LaunchMessageRingBufferState& Device::get_worker_launch_message_buffer_state(Sub
return this->active_sub_device_manager_->get_worker_launch_message_buffer_state(sub_device_id);
}

// Main source to get NOC idx for dispatch core.
// Returns NOC::NOC_1 when dispatch_s_enabled() reports true, and NOC::NOC_0 otherwise.
NOC Device::dispatch_go_signal_noc() const {
return this->dispatch_s_enabled() ? NOC::NOC_1 : NOC::NOC_0;
}
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/device/device_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,12 @@ void DevicePool::add_devices_to_pool(const std::vector<chip_id_t>& device_ids) {
}
}

populate_fd_kernels(devices_to_activate, this->num_hw_cqs);
for (const auto& device_id : devices_to_activate) {
if (not this->is_device_active(device_id)) {
this->activate_device(device_id);
}
}
populate_fd_kernels(devices_to_activate, this->num_hw_cqs);
}

void DevicePool::register_worker_thread_for_device(v1::DeviceHandle device, std::thread::id worker_thread_id) {
Expand Down
1,255 changes: 1,193 additions & 62 deletions tt_metal/impl/dispatch/command_queue.cpp

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions tt_metal/impl/dispatch/command_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,16 @@ class EnqueueProgramCommand : public Command {
uint32_t unicast_cores_launch_message_wptr,
SubDeviceId sub_device_id);

void assemble_preamble_commands(
ProgramCommandSequence& program_command_sequence, const tt::stl::Span<ConfigBufferEntry> kernel_config_addrs);
void assemble_stall_commands(ProgramCommandSequence& program_command_sequence, bool prefetch_stall);
void assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence);
void assemble_device_commands(
ProgramCommandSequence& program_command_sequence, const tt::stl::Span<ConfigBufferEntry> kernel_config_addrs);
void update_device_commands(
ProgramCommandSequence& cached_program_command_sequence,
const tt::stl::Span<ConfigBufferEntry> kernel_config_addrs);

void write_program_command_sequence(
const ProgramCommandSequence& program_command_sequence, bool stall_first, bool stall_before_program);

Expand Down Expand Up @@ -517,6 +527,7 @@ class HWCommandQueue {
void set_num_worker_sems_on_dispatch(uint32_t num_worker_sems);
void set_go_signal_noc_data_on_dispatch(const vector_memcpy_aligned<uint32_t>& go_signal_noc_data);
void reset_worker_state(bool reset_launch_msg_state);

private:
uint32_t id;
uint32_t size_B;
Expand Down
5 changes: 0 additions & 5 deletions tt_metal/impl/dispatch/dispatch_core_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

#include "common/core_descriptor.hpp"
#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/kernels/data_types.hpp"
#include "tt_metal/llrt/get_platform_architecture.hpp"
#include "tt_metal/tt_stl/reflection.hpp"

Expand All @@ -33,10 +32,6 @@ enum DispatchWorkerType : uint32_t {
COUNT = 17
};

// NOC ID used by dispatch kernels to communicate with downstream cores. This parameter
// is required when setting up Command Queue objects on host.
static constexpr NOC dispatch_downstream_noc = NOC::NOC_0;

enum class DispatchCoreType : uint32_t { WORKER, ETH, COUNT };

enum class DispatchCoreAxis { ROW, COL, COUNT };
Expand Down
3 changes: 3 additions & 0 deletions tt_metal/impl/dispatch/kernel_config/dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ void DispatchKernel::GenerateStaticConfigs() {
uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size;
uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_);

logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_->id(), channel, cq_id_);
static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down Expand Up @@ -76,6 +77,7 @@ void DispatchKernel::GenerateStaticConfigs() {
uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size;
uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_);

logical_core_ = dispatch_core_manager::instance().dispatcher_core(servicing_device_id_, channel, cq_id_);
static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down Expand Up @@ -122,6 +124,7 @@ void DispatchKernel::GenerateStaticConfigs() {
uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size;
uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_);

logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_->id(), channel, cq_id_);
static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.dispatch_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down
16 changes: 0 additions & 16 deletions tt_metal/impl/dispatch/kernel_config/dispatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,24 +65,8 @@ class DispatchKernel : public FDKernel {
bool h_variant,
bool d_variant) :
FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {
TT_FATAL(
noc_selection.downstream_noc == dispatch_downstream_noc,
"Invalid downstream NOC specified for Dispatcher kernel");
TT_FATAL(
noc_selection.upstream_noc != noc_selection.downstream_noc,
"Dispatcher kernel cannot have identical upstream and downstream NOCs.");
static_config_.is_h_variant = h_variant;
static_config_.is_d_variant = d_variant;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
if (h_variant && d_variant) {
this->logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id);
} else if (h_variant) {
channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id);
this->logical_core_ =
dispatch_core_manager::instance().dispatcher_core(servicing_device_id, channel, cq_id);
} else if (d_variant) {
this->logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id);
}
}
void CreateKernel() override;
void GenerateStaticConfigs() override;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void DispatchSKernel::GenerateStaticConfigs() {
dispatch_s_buffer_base = dispatch_buffer_base;
}
}

logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_->id(), channel, cq_id_);
static_config_.cb_base = dispatch_s_buffer_base;
static_config_.cb_log_page_size = dispatch_constants::DISPATCH_S_BUFFER_LOG_PAGE_SIZE;
static_config_.cb_size = my_dispatch_constants.dispatch_s_buffer_size();
Expand Down
5 changes: 1 addition & 4 deletions tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@ class DispatchSKernel : public FDKernel {
public:
DispatchSKernel(
int node_id, chip_id_t device_id, chip_id_t servicing_device_id, uint8_t cq_id, noc_selection_t noc_selection) :
FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
this->logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id_);
}
FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {}
void CreateKernel() override;
void GenerateStaticConfigs() override;
void GenerateDependentConfigs() override;
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() {
} else if (auto tk = dynamic_cast<EthTunnelerKernel*>(k)) {
tunneler_kernel = tk;
} else {
TT_FATAL(false, "Unexpected kernel type upstream of TUNNELER");
TT_FATAL(false, "Unexpected kernelt tyoe downstream of TUNNELER");
}
}
TT_ASSERT(tunneler_kernel && !tunneler_kernel->IsRemote());
Expand Down Expand Up @@ -175,7 +175,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() {
} else if (auto tk = dynamic_cast<EthTunnelerKernel*>(k)) {
ds_tunneler_kernel = tk;
} else {
TT_FATAL(false, "Unexpected kernel type downstream of TUNNELER");
TT_FATAL(false, "Unexpected kernelt tyoe downstream of TUNNELER");
}
}
TT_ASSERT(ds_tunneler_kernel && ds_tunneler_kernel == tunneler_kernel);
Expand Down
6 changes: 6 additions & 0 deletions tt_metal/impl/dispatch/kernel_config/prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ void PrefetchKernel::GenerateStaticConfigs() {
uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start;
uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_);

logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_->id(), channel, cq_id_);

dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.downstream_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down Expand Up @@ -81,6 +83,8 @@ void PrefetchKernel::GenerateStaticConfigs() {
uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start;
uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_);

logical_core_ = dispatch_core_manager::instance().prefetcher_core(servicing_device_id_, channel, cq_id_);

static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
if (tt::Cluster::instance().is_galaxy_cluster()) { // TODO: why is this hard-coded for galaxy?
static_config_.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(1);
Expand Down Expand Up @@ -119,6 +123,8 @@ void PrefetchKernel::GenerateStaticConfigs() {
static_config_.dispatch_s_buffer_size = 0;
static_config_.dispatch_s_cb_log_page_size = 0;
} else if (static_config_.is_d_variant.value()) {
logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_->id(), channel, cq_id_);

dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down
13 changes: 0 additions & 13 deletions tt_metal/impl/dispatch/kernel_config/prefetch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,6 @@ class PrefetchKernel : public FDKernel {
FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {
static_config_.is_h_variant = h_variant;
static_config_.is_d_variant = d_variant;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
TT_FATAL(
noc_selection.downstream_noc == dispatch_downstream_noc,
"Invalid downstream NOC specified for Prefetcher kernel");
if (h_variant && d_variant) {
this->logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id);
} else if (h_variant) {
channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id);
this->logical_core_ =
dispatch_core_manager::instance().prefetcher_core(servicing_device_id, channel, cq_id);
} else if (d_variant) {
this->logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id);
}
}
void CreateKernel() override;
void GenerateStaticConfigs() override;
Expand Down
37 changes: 16 additions & 21 deletions tt_metal/impl/dispatch/topology.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,15 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_1cq = {

{3, 0, 1, 0, PREFETCH_H, {x, x, x, x}, {5, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{4, 0, 1, 0, DISPATCH_H, {6, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},

{5, 0, 1, 0, PACKET_ROUTER_MUX, {3, x, x, x}, {7, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{6, 0, 1, 0, DEMUX, {7, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{7, 0, 1, 0, US_TUNNELER_REMOTE, {11, 5, x, x}, {11, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},

{8, 1, x, 0, PREFETCH_D, {13, x, x, x}, {9, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{9, 1, x, 0, DISPATCH_D, {8, x, x, x}, {10, 12, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{10, 1, x, 0, DISPATCH_S, {8, x, x, x}, {9, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},

{11, 1, x, 0, US_TUNNELER_LOCAL, {7, 12, x, x}, {7, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{12, 1, x, 0, MUX_D, {9, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{7, 0, 1, 0, US_TUNNELER_REMOTE, {8, 5, x, x}, {8, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{8, 1, x, 0, US_TUNNELER_LOCAL, {7, 9, x, x}, {7, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{9, 1, x, 0, MUX_D, {12, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{10, 1, x, 0, PACKET_ROUTER_DEMUX, {8, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{11, 1, x, 0, PREFETCH_D, {10, x, x, x}, {12, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{12, 1, x, 0, DISPATCH_D, {11, x, x, x}, {13, 9, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{13, 1, x, 0, DISPATCH_S, {11, x, x, x}, {12, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
};

static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {
Expand All @@ -106,17 +103,15 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {

{8, 0, 1, 0, PACKET_ROUTER_MUX, {4, 5, x, x}, {10, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{9, 0, 1, 0, DEMUX, {10, x, x, x}, {6, 7, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{10, 0, 1, 0, US_TUNNELER_REMOTE, {15, 8, x, x}, {15, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},

{11, 1, x, 0, PREFETCH_D, {17, x, x, x}, {13, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{12, 1, x, 1, PREFETCH_D, {17, x, x, x}, {14, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{13, 1, x, 0, DISPATCH_D, {11, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{14, 1, x, 1, DISPATCH_D, {12, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},

{15, 1, x, 0, US_TUNNELER_LOCAL, {10, 16, x, x}, {10, 17, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{16, 1, x, 0, MUX_D, {13, 14, x, x}, {15, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{17, 1, x, 0, PACKET_ROUTER_DEMUX, {15, x, x, x}, {11, 12, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},

{10, 0, 1, 0, US_TUNNELER_REMOTE, {11, 8, x, x}, {11, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{11, 1, x, 0, US_TUNNELER_LOCAL, {10, 12, x, x}, {10, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{12, 1, x, 0, MUX_D, {16, 17, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {14, 15, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},

{14, 1, x, 0, PREFETCH_D, {13, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{15, 1, x, 1, PREFETCH_D, {13, x, x, x}, {17, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{16, 1, x, 0, DISPATCH_D, {14, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{17, 1, x, 1, DISPATCH_D, {15, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
};

static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_1cq = {
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/kernels/kernel_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

namespace tt::tt_metal {

using KernelHandle = std::uint32_t;
using KernelHandle = std::uint16_t;

struct DataMovementConfig {
DataMovementProcessor processor = DataMovementProcessor::RISCV_0; // For data transfer kernels: NCRISC & BRISC
Expand Down
Loading

0 comments on commit d9304dd

Please sign in to comment.