Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove old fd init code path #16321

Merged
merged 2 commits into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,133 changes: 19 additions & 2,114 deletions tt_metal/impl/device/device.cpp

Large diffs are not rendered by default.

10 changes: 0 additions & 10 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,6 @@ class Device {
void initialize_synchronous_sw_cmd_queue();
void update_dispatch_cores_for_multi_cq_eth_dispatch();

void configure_kernel_variant(Program& program, const string& path, const std::vector<uint32_t>& compile_args, CoreCoord kernel_core, CoreCoord Kernel_virtual_core,
CoreType dispatch_core_type, CoreCoord upstream_virtual_core, CoreCoord downstream_virtual_core, CoreCoord downstream_slave_virtual_core, std::map<string, string> defines_in, NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false);

// Puts device into reset
bool close();
friend bool CloseDevice(Device *device);
Expand Down Expand Up @@ -309,9 +306,7 @@ class Device {
void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span<const std::uint32_t> l1_bank_remap);

void compile_command_queue_programs();
void compile_command_queue_programs_new();
void configure_command_queue_programs();
void configure_command_queue_programs_new();
void clear_l1_state();
void get_associated_dispatch_virtual_cores(
std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &my_dispatch_cores,
Expand All @@ -322,10 +317,6 @@ class Device {

void generate_device_bank_to_noc_tables();

void setup_tunnel_for_remote_devices();

void update_workers_build_settings(std::vector<std::vector<std::tuple<tt_cxy_pair, dispatch_worker_build_settings_t>>> &device_worker_variants);

SubDeviceManagerId get_next_sub_device_manager_id();
void reset_sub_devices_state(const std::unique_ptr<detail::SubDeviceManager>& sub_device_manager);

Expand All @@ -339,7 +330,6 @@ class Device {

chip_id_t id_;
uint32_t build_key_ = 0;
std::map<uint32_t, std::map<chip_id_t, std::vector<std::vector<std::tuple<tt_cxy_pair, dispatch_worker_build_settings_t>>>>> tunnel_device_dispatch_workers_;
std::vector<std::vector<chip_id_t>> tunnels_from_mmio_;

// Leaving here for compatibility with current reacharounds
Expand Down
4 changes: 1 addition & 3 deletions tt_metal/impl/device/device_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,7 @@ void DevicePool::add_devices_to_pool(const std::vector<chip_id_t>& device_ids) {
}
}

if (llrt::RunTimeOptions::get_instance().get_use_new_fd_init()) {
populate_fd_kernels(devices_to_activate, this->num_hw_cqs);
}
populate_fd_kernels(devices_to_activate, this->num_hw_cqs);
for (const auto& device_id : devices_to_activate) {
if (not this->is_device_active(device_id)) {
this->activate_device(device_id);
Expand Down
29 changes: 0 additions & 29 deletions tt_metal/impl/dispatch/dispatch_core_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,6 @@ namespace tt::tt_metal {
// One core dispatches commands to worker cores on the device `dispatcher`
// The `remote_x` cores are used for remote fast dispatch and receive / transmit fast dispatch packets from ethernet cores

struct dispatch_worker_build_settings_t{
std::string kernel_file;
std::string dispatch_s_kernel_file;
std::vector<uint32_t> compile_args;
std::vector<tt_cxy_pair> upstream_cores;
std::vector<tt_cxy_pair> downstream_cores;
tt_cxy_pair worker_virtual_core;
tt_cxy_pair eth_partner_virtual_core;
CoreType dispatch_core_type;
uint32_t command_queue_start_addr;
uint32_t issue_queue_start_addr;
uint32_t issue_queue_size;
uint32_t completion_queue_start_addr;
uint32_t completion_queue_size;
std::vector<uint32_t> semaphores;
uint32_t producer_semaphore_id;
uint32_t consumer_semaphore_id;
uint32_t consumer_slave_semaphore_id;
tt_cxy_pair dispatch_s_logical_core;
tt_cxy_pair dispatch_s_physical_core;
uint32_t cb_start_address;
uint32_t cb_size_bytes;
uint32_t cb_log_page_size;
uint32_t cb_pages;
uint32_t tunnel_stop;
uint32_t num_compute_cores;
uint32_t vc_count;
};

// std::optional is used to determine whether core has been assigned
// tt_cxy_pair is used over CoreCoord to denote location because remote device command queue interface cores are on the associated MMIO device
struct dispatch_core_placement_t {
Expand Down
16 changes: 2 additions & 14 deletions tt_metal/impl/dispatch/kernel_config/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,13 @@ void DemuxKernel::GenerateStaticConfigs() {
static_config_.test_results_buf_size_bytes = 0;
static_config_.timeout_cycles = 0;

// TODO: Do we need an upstream sem here?
for (int idx = 0; idx < downstream_kernels_.size(); idx++) {
FDKernel* k = downstream_kernels_[idx];
static_config_.remote_tx_queue_id[idx] = 0;
static_config_.remote_tx_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0;
static_config_.output_depacketize_cb_log_page_size[idx] = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
// Only connected dispatchers need a semaphore. TODO: can initialize anyways, but this matches previous
// implementation
if (dynamic_cast<DispatchKernel*>(k)) {
static_config_.output_depacketize_local_sem_id[idx] =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
}
static_config_.output_depacketize_local_sem_id[idx] =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
static_config_.output_depacketize_remove_header[idx] = 1;
}
}
Expand Down Expand Up @@ -89,9 +84,6 @@ void DemuxKernel::GenerateDependentConfigs() {
dependent_config_.remote_tx_queue_start_addr_words[idx] =
demux_kernel->GetStaticConfig().rx_queue_start_addr_words.value();
dependent_config_.remote_tx_queue_size_words[idx] = 0x1000; // TODO: hard-coded on previous implementation
// Match previous implementation where downstream demux has output_depacketize fields zeroed out. TODO: can
// remove this later
dependent_config_.output_depacketize_downstream_sem_id[idx] = 0;
uint64_t dest_endpoint_output_map;
if (device_->num_hw_cqs() == 1) {
uint32_t dest_map_array[4] = {0, 0, 1, 1}; // TODO: how to set these generically? Currently just
Expand All @@ -107,10 +99,6 @@ void DemuxKernel::GenerateDependentConfigs() {
TT_FATAL(false, "Unexpected kernel type downstream of DEMUX");
}
}
// TODO: this is just to match the previous implementation hard-code, remove later
if (!tt::Cluster::instance().is_galaxy_cluster()) {
dependent_config_.output_depacketize = 0x3;
}
}

void DemuxKernel::CreateKernel() {
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/dispatch/kernel_config/dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,8 @@ void DispatchKernel::GenerateDependentConfigs() {
TT_ASSERT(downstream_kernels_.size() == 1);
auto prefetch_h_kernel = dynamic_cast<PrefetchKernel*>(downstream_kernels_[0]);
TT_ASSERT(prefetch_h_kernel);
dependent_config_.downstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
dependent_config_.downstream_logical_core = UNUSED_LOGICAL_CORE;
dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
dependent_config_.prefetch_h_noc_xy = tt::tt_metal::hal.noc_xy_encoding(
prefetch_h_kernel->GetVirtualCore().x, prefetch_h_kernel->GetVirtualCore().y);
dependent_config_.prefetch_h_local_downstream_sem_addr =
Expand Down
9 changes: 2 additions & 7 deletions tt_metal/impl/dispatch/kernel_config/eth_router.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,19 +60,14 @@ void EthRouterKernel::GenerateStaticConfigs() {
dependent_config_.input_packetize_dst_endpoint = {0x0};

static_config_.fwd_vc_count = this->static_config_.vc_count;
uint32_t created_semaphores = 0;
for (int idx = 0; idx < downstream_kernels_.size(); idx++) {
static_config_.output_depacketize_local_sem[idx] =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
// Forwward VCs are the ones that don't connect to a prefetch
if (auto pk = dynamic_cast<PrefetchKernel*>(downstream_kernels_[idx])) {
static_config_.fwd_vc_count = this->static_config_.fwd_vc_count.value() - 1;
static_config_.output_depacketize_local_sem[idx] = // TODO: to match for now, init one per vc after
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
created_semaphores++;
}
}
if (created_semaphores == 0) { // Just to match previous implementation
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
}

for (int idx = 0; idx < static_config_.vc_count.value(); idx++) {
static_config_.output_depacketize_log_page_size[idx] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
Expand Down
2 changes: 0 additions & 2 deletions tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"

#define UNUSED_LOGICAL_CORE tt_cxy_pair(device_->id(), 0, 0)
// TODO: Just to make match with previous implementation, remove later
#define UNUSED_LOGICAL_CORE_ADJUSTED tt_cxy_pair(servicing_device_id_, 0, 0)
#define UNUSED_SEM_ID 0

typedef struct {
Expand Down
11 changes: 3 additions & 8 deletions tt_metal/impl/dispatch/kernel_config/mux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,8 @@ void MuxKernel::GenerateStaticConfigs() {
static_config_.output_depacketize_info = 0x0;

for (int idx = 0; idx < upstream_kernels_.size(); idx++) {
// Only connected dispatchers need a semaphore. TODO: can initialize anyways, but this matches previous
// implementation
if (dynamic_cast<DispatchKernel*>(upstream_kernels_[idx])) {
static_config_.input_packetize_local_sem[idx] =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
}
static_config_.input_packetize_local_sem[idx] =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
}
}

Expand Down Expand Up @@ -121,8 +117,7 @@ void MuxKernel::CreateKernel() {
compile_args[4 + idx] |= (static_config_.remote_rx_network_type[idx].value() & 0xFF) << 24;
}
if (dependent_config_.input_packetize[idx]) {
// Zero out if input packetize not set to match previous implementation. TODO: don't have to do this
if (dependent_config_.input_packetize[idx].value() != 0) {
if (dependent_config_.input_packetize[idx]) {
compile_args[19 + idx] |= (dependent_config_.input_packetize[idx].value() & 0xFF);
compile_args[19 + idx] |= (dependent_config_.input_packetize_log_page_size[idx].value() & 0xFF) << 8;
compile_args[19 + idx] |= (dependent_config_.input_packetize_upstream_sem[idx].value() & 0xFF) << 16;
Expand Down
13 changes: 4 additions & 9 deletions tt_metal/impl/dispatch/kernel_config/prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ void PrefetchKernel::GenerateStaticConfigs() {
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore(
*program_, logical_core_, static_config_.downstream_cb_pages.value(), GetCoreType());
tt::tt_metal::CreateSemaphore(
*program_, logical_core_, 0, GetCoreType()); // TODO: what is this third semaphore for?
static_config_.cmddat_q_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
static_config_.cmddat_q_blocks = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS;

Expand All @@ -128,6 +126,8 @@ void PrefetchKernel::GenerateStaticConfigs() {
dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base();
static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore(
*program_, logical_core_, my_dispatch_constants.dispatch_buffer_pages(), GetCoreType());

static_config_.pcie_base = 0;
static_config_.pcie_size = 0;
Expand All @@ -152,11 +152,6 @@ void PrefetchKernel::GenerateStaticConfigs() {
static_config_.cmddat_q_pages = my_dispatch_constants.prefetch_d_buffer_pages();
static_config_.my_upstream_cb_sem_id =
tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType());
static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore(
*program_,
logical_core_,
my_dispatch_constants.dispatch_buffer_pages(),
GetCoreType()); // TODO: this is out of order to match previous implementation
static_config_.cmddat_q_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
static_config_.cmddat_q_blocks = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS;

Expand Down Expand Up @@ -231,15 +226,15 @@ void PrefetchKernel::GenerateDependentConfigs() {
} else if (static_config_.is_h_variant.value()) {
// Upstream, just host so no dispatch core
TT_ASSERT(upstream_kernels_.size() == 0);
dependent_config_.upstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
dependent_config_.upstream_logical_core = UNUSED_LOGICAL_CORE;
dependent_config_.upstream_cb_sem_id = 0; // Used in prefetch_d only

// Downstream, expect just one ROUTER
TT_ASSERT(downstream_kernels_.size() == 1);
auto router_kernel = dynamic_cast<EthRouterKernel*>(downstream_kernels_[0]);
TT_ASSERT(router_kernel);
dependent_config_.downstream_logical_core = router_kernel->GetLogicalCore();
dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
uint32_t router_idx = router_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream
dependent_config_.downstream_cb_base =
(router_kernel->GetStaticConfig().rx_queue_start_addr_words.value() << 4) +
Expand Down
6 changes: 0 additions & 6 deletions tt_metal/llrt/rtoptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,6 @@ RunTimeOptions::RunTimeOptions() {
enable_dispatch_data_collection = true;
}

if (getenv("TT_METAL_OLD_FD_INIT")) {
this->use_new_fd_init = false;
} else {
this->use_new_fd_init = true;
}

if (getenv("TT_METAL_GTEST_ETH_DISPATCH")) {
this->dispatch_core_config.set_dispatch_core_type(tt_metal::DispatchCoreType::ETH);
}
Expand Down
3 changes: 0 additions & 3 deletions tt_metal/llrt/rtoptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ class RunTimeOptions {
tt_metal::DispatchCoreConfig dispatch_core_config = tt_metal::DispatchCoreConfig{};

bool skip_deleting_built_cache = false;
bool use_new_fd_init = false;

RunTimeOptions();

Expand Down Expand Up @@ -305,8 +304,6 @@ class RunTimeOptions {
inline bool get_hw_cache_invalidation_enabled() const { return this->enable_hw_cache_invalidation; }

inline tt_metal::DispatchCoreConfig get_dispatch_core_config() { return dispatch_core_config; }
inline bool get_use_new_fd_init() { return use_new_fd_init; }
inline void set_use_new_fd_init(bool enable) { use_new_fd_init = enable; }

inline bool get_skip_deleting_built_cache() { return skip_deleting_built_cache; }

Expand Down
Loading