diff --git a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp index 227adf612e5..6072e34599f 100644 --- a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp @@ -85,7 +85,8 @@ std::vector get_device_ring(std::vector> adj(devices.size(), std::vector(devices.size(), 0)); for (uint32_t i = 0; i < devices.size(); ++i) { const auto& device = devices[i]; - for (const auto& connected_device_id : device->get_ethernet_connected_device_ids()) { + auto ethernet_connected_device_ids = tt::Cluster::instance().get_ethernet_connected_device_ids(device->id()); + for (const auto& connected_device_id : ethernet_connected_device_ids) { for (uint32_t j = 0; j < devices.size(); ++j) { if (devices[j]->id() == connected_device_id) { adj[i][j] = 1; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index cb49710dc63..39ef6314e6f 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -63,6 +63,18 @@ bool Device::is_inactive_ethernet_core(CoreCoord logical_core) const { return inactive_ethernet_cores.find(logical_core) != inactive_ethernet_cores.end(); } +std::tuple Device::get_connected_ethernet_core(CoreCoord eth_core) const { + return tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(this->id_, eth_core)); +} + +std::vector Device::get_ethernet_sockets(chip_id_t connected_chip_id) const { + return tt::Cluster::instance().get_ethernet_sockets(this->id_, connected_chip_id); +} + +bool Device::is_mmio_capable() const { + return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_; +} + CoreRangeSet Device::worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const { return this->active_sub_device_manager_->sub_device(sub_device_id).cores(core_type); } @@ -3223,15 +3235,15 @@ CoreCoord Device::logical_grid_size() const { return tt::Cluster::instance().get_soc_desc(id_).worker_grid_size; } +CoreCoord Device::dram_grid_size() const { + return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size(); +} + CoreCoord Device::compute_with_storage_grid_size() const { const auto &dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(id_); return tt::get_compute_grid_size(id_, num_hw_cqs_, dispatch_core_config); } -CoreCoord Device::dram_grid_size() const { - return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size(); -} - CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); if (soc_desc.physical_cores.find(physical_coord) == soc_desc.physical_cores.end()) @@ -3249,7 +3261,6 @@ CoreType Device::core_type_from_virtual_core(const CoreCoord &virtual_coord) con return this->core_type_from_physical_core(virtual_coord); } - CoreCoord Device::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const { if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) { // Coordinate already in virtual space: NOC0 and NOC1 are the same @@ -3304,6 +3315,7 @@ std::vector Device::ethernet_cores_from_logical_cores(const std::vect } return eth_cores; } + CoreCoord Device::virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const { return tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(this->id_, logical_coord, core_type); } @@ -3679,14 +3691,18 @@ void Device::enable_async(bool enable) { } bool Device::using_slow_dispatch() const { - return not (this->using_fast_dispatch_); + return !using_fast_dispatch(); +} + +bool Device::using_fast_dispatch() const { + return using_fast_dispatch_; } void Device::begin_trace(const uint8_t cq_id, const uint32_t tid) { ZoneScoped; TracyTTMetalBeginTrace(this->id(), tid); TT_FATAL(!this->hw_command_queues_[cq_id]->tid.has_value(), "CQ {} is already being used for tracing tid {}", (uint32_t)cq_id, tid); - this->MarkAllocationsSafe(); + this->mark_allocations_safe(); // Create an empty trace buffer here. This will get initialized in end_trace TT_FATAL(this->active_sub_device_manager_->get_trace(tid) == nullptr, "Trace already exists for tid {} on device {}'s active sub-device manager {}", tid, this->id_, this->active_sub_device_manager_id_); auto &trace_buffer = this->active_sub_device_manager_->create_trace(tid); @@ -3701,7 +3717,7 @@ void Device::end_trace(const uint8_t cq_id, const uint32_t tid) { TT_FATAL(trace_buffer != nullptr, "Trace instance {} must exist on device {}'s active sub-device manager {}", tid, this->id_, this->active_sub_device_manager_id_); this->hw_command_queues_[cq_id]->record_end(); Trace::initialize_buffer(this->command_queue(cq_id), trace_buffer); - this->MarkAllocationsUnsafe(); + this->mark_allocations_unsafe(); } void Device::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) { @@ -3724,7 +3740,7 @@ void Device::release_trace(const uint32_t tid) { // Only enable allocations once all captured traces are released if (this->trace_buffers_size_ == 0) { - this->MarkAllocationsSafe(); + this->mark_allocations_safe(); } } @@ -3750,11 +3766,11 @@ std::size_t Device::num_program_cache_entries() { return program_cache_.num_entries(); } -void Device::MarkAllocationsUnsafe() { +void Device::mark_allocations_unsafe() { tt::tt_metal::allocator::mark_allocations_unsafe(*this->get_initialized_allocator()); } -void Device::MarkAllocationsSafe() { +void Device::mark_allocations_safe() { tt::tt_metal::allocator::mark_allocations_safe(*this->get_initialized_allocator()); } @@ -3964,6 +3980,35 @@ std::vector Device::get_optimal_dram_bank_to_logical_worker_assignmen return this->optimal_dram_bank_to_logical_worker_assignment_; } +HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const { + if (!tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { + return HalProgrammableCoreType::TENSIX; + } + + // Eth pcores have a different address, but only active ones. + CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core); + if (this->is_active_ethernet_core(logical_core)) { + return HalProgrammableCoreType::ACTIVE_ETH; + } + + return HalProgrammableCoreType::IDLE_ETH; +} + +// TODO: Find a better home for this function +// Extracts all the pairs of noc multicast encodings given a set of core ranges +std::vector> Device::extract_dst_noc_multicast_info(const std::vector& ranges, const CoreType core_type) { + std::vector> dst_noc_multicast_info; + dst_noc_multicast_info.reserve(ranges.size()); + for (const CoreRange& core_range : ranges) { + CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type); + CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type); + + uint32_t num_receivers = core_range.size(); + dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers)); + } + return dst_noc_multicast_info; +} + size_t v1::GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index d37cfa6f038..cbd7e6b96d7 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -61,16 +61,7 @@ inline namespace v0 { // A physical PCIexpress Tenstorrent device class Device { - private: - static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES"); - static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1; - - CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const; - CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; - CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; - CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const; - - public: +public: // friend void tt_gdb(Device* device, int chip_id, const vector cores, vector ops); Device () = delete; Device( @@ -103,67 +94,45 @@ class Device { bool is_initialized() const { return this->initialized_; } int num_dram_channels() const; - uint32_t l1_size_per_core() const; uint32_t dram_size_per_channel() const; - CoreCoord grid_size() const; - CoreCoord logical_grid_size() const; - - CoreCoord compute_with_storage_grid_size() const; - CoreCoord dram_grid_size() const; - CoreType core_type_from_virtual_core(const CoreCoord& virtual_coord) const; + // Given a Virtual coordinate in noc_index space, get the equivalent coordinate in Virtual NOC0 space CoreCoord virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const; // Given a coordinate in Virtual NOC0 Space, get the equivalent coordinate in Virtual noc_index space CoreCoord virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const; + std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const; std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; std::vector get_optimal_dram_bank_to_logical_worker_assignment(); CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const; - CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const; // Ethernet API CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const; CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const; - - std::unordered_set get_ethernet_connected_device_ids() const { - return tt::Cluster::instance().get_ethernet_connected_device_ids(this->id_); - } - std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const; - - bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const; - std::unordered_set get_inactive_ethernet_cores() const; - + bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const; + std::tuple get_connected_ethernet_core(CoreCoord eth_core) const; + std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const; bool is_inactive_ethernet_core(CoreCoord logical_core) const; + CoreCoord compute_with_storage_grid_size() const; + CoreRangeSet worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const; uint32_t num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const; - std::tuple get_connected_ethernet_core(CoreCoord eth_core) const { - return tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(this->id_, eth_core)); - } - - std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const { - return tt::Cluster::instance().get_ethernet_sockets(this->id_, connected_chip_id); - } - - bool is_mmio_capable() const { - return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_; - } - - void setup_tunnel_for_remote_devices(); - - void update_workers_build_settings(std::vector>> &device_worker_variants); + const std::unique_ptr &get_initialized_allocator() const; + const std::unique_ptr &get_initialized_allocator(SubDeviceId sub_device_id) const; - uint32_t num_sub_devices() const; + DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const; + DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const; uint32_t num_banks(const BufferType &buffer_type) const; uint32_t num_banks(const BufferType &buffer_type, SubDeviceId sub_device_id) const; @@ -176,9 +145,6 @@ class Device { CoreCoord logical_core_from_dram_channel(uint32_t dram_channel) const; uint32_t dram_channel_from_logical_core(const CoreCoord& logical_core) const; - const std::unique_ptr &get_initialized_allocator() const; - const std::unique_ptr &get_initialized_allocator(SubDeviceId sub_device_id) const; - int32_t bank_offset(BufferType buffer_type, uint32_t bank_id) const; int32_t bank_offset(BufferType buffer_type, uint32_t bank_id, SubDeviceId sub_device_id) const; @@ -188,10 +154,8 @@ class Device { const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel) const; const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const; - const std::vector &bank_ids_from_logical_core( - BufferType buffer_type, const CoreCoord &logical_core) const; - const std::vector &bank_ids_from_logical_core( - BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const; + const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core) const; + const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const; allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const; allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const; @@ -199,35 +163,31 @@ class Device { uint32_t get_allocator_alignment() const; uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const; + std::optional lowest_occupied_compute_l1_address() const; + std::optional lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const; + size_t get_l1_small_size() const; size_t get_l1_small_size(SubDeviceId sub_device_id) const; - void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const; - void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const; + const std::unordered_set &get_allocated_buffers() const; + const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const; - // Set of logical storage only core coordinates - const std::set &storage_only_cores() const { return this->storage_only_cores_; } + void deallocate_buffers(); + void deallocate_buffers(SubDeviceId sub_device_id); - // Set of logical dispatch core coordinates + void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const; + void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const; // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one const std::set ðernet_cores() const { return this->ethernet_cores_; } + const std::set &storage_only_cores() const { return this->storage_only_cores_; } uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const; uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const; - const std::unordered_set &get_allocated_buffers() const; - const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const; - - void deallocate_buffers(); - void deallocate_buffers(SubDeviceId sub_device_id); - - std::optional lowest_occupied_compute_l1_address() const; - std::optional lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const; - // machine epsilon float sfpu_eps() const; @@ -237,13 +197,13 @@ class Device { // machine inf float sfpu_inf() const; - void generate_device_bank_to_noc_tables(); const JitBuildEnv& build_env() const { return this->build_env_; } const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const; const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const; const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const; const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const; const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const; + SystemMemoryManager& sysmem_manager() { return *sysmem_manager_; } HWCommandQueue& hw_command_queue(size_t cq_id = 0); CommandQueue& command_queue(size_t cq_id = 0); @@ -254,70 +214,57 @@ class Device { void replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking); void release_trace(const uint32_t tid); std::shared_ptr get_trace(uint32_t tid); + uint32_t get_trace_buffers_size() const { return trace_buffers_size_; } + void set_trace_buffers_size(uint32_t size) { trace_buffers_size_ = size; } bool using_slow_dispatch() const; + bool using_fast_dispatch() const; // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false); - void initialize_cluster(); - std::unique_ptr initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); - void initialize_build(); - void initialize_device_kernel_defines(); void build_firmware(); - void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); - void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); void init_command_queue_host(); void init_command_queue_device(); void initialize_synchronous_sw_cmd_queue(); + void update_dispatch_cores_for_multi_cq_eth_dispatch(); + void configure_kernel_variant(Program& program, const string& path, const std::vector& compile_args, CoreCoord kernel_core, CoreCoord Kernel_virtual_core, CoreType dispatch_core_type, CoreCoord upstream_virtual_core, CoreCoord downstream_virtual_core, CoreCoord downstream_slave_virtual_core, std::map defines_in, NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false); - void compile_command_queue_programs(); - void compile_command_queue_programs_new(); - void configure_command_queue_programs(); - void configure_command_queue_programs_new(); - void clear_l1_state(); - void get_associated_dispatch_virtual_cores( - std::unordered_map> &my_dispatch_cores, - std::unordered_map> &other_dispatch_cores); - std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; // Puts device into reset bool close(); friend bool CloseDevice(Device *device); - // APIs to access this device's work executor - bool can_use_passthrough_scheduling() const; - template - void push_work(F&& work, bool blocking = false) { - this->work_executor_.push_work(std::forward(work), blocking); - } - void synchronize(); - void set_worker_mode(const WorkExecutorMode& mode); void enable_async(bool enable); + void synchronize(); WorkExecutorMode get_worker_mode() { return work_executor_.get_worker_mode(); } void set_worker_queue_mode(const WorkerQueueMode& mode) { this->work_executor_.set_worker_queue_mode(mode); } WorkerQueueMode get_worker_queue_mode() { return this->work_executor_.get_worker_queue_mode(); } + bool is_worker_queue_empty() const { return work_executor_.worker_queue.empty(); } + bool can_use_passthrough_scheduling() const; - void update_dispatch_cores_for_multi_cq_eth_dispatch(); + template + void push_work(F&& work, bool blocking = false) { + this->work_executor_.push_work(std::forward(work), blocking); + } // Program cache interface. Syncrhonize with worker worker threads before querying or // modifying this structure, since worker threads use this for compiling ops void enable_program_cache(); void disable_and_clear_program_cache(); + program_cache::detail::ProgramCache& get_program_cache() { return program_cache_; } std::size_t num_program_cache_entries(); HalProgrammableCoreType get_programmable_core_type(CoreCoord virtual_core) const; + template T get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const; - // Returns address where allocator starts allocating buffer - DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const; - DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const; - template - std::vector> extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type); + std::vector> extract_dst_noc_multicast_info(const std::vector& ranges, const CoreType core_type); + bool dispatch_s_enabled() const; bool distributed_dispatcher() const; NOC dispatch_go_signal_noc() const; @@ -327,43 +274,68 @@ class Device { uint8_t num_noc_unicast_txns(SubDeviceId sub_device_id) const; uint8_t noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data=true, bool unicast_data=true) const; - LaunchMessageRingBufferState& get_worker_launch_message_buffer_state(SubDeviceId sub_device_id); - SubDeviceManagerId get_active_sub_device_manager_id() const; SubDeviceManagerId get_default_sub_device_manager_id() const; SubDeviceManagerId create_sub_device_manager(tt::stl::Span sub_devices, DeviceAddr local_l1_size); + void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id); void load_sub_device_manager(SubDeviceManagerId sub_device_manager_id); void clear_loaded_sub_device_manager(); - void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id); + LaunchMessageRingBufferState& get_worker_launch_message_buffer_state(SubDeviceId sub_device_id); const std::vector &get_sub_device_ids() const; + uint32_t num_sub_devices() const; // TODO #15944: Temporary api until migration to actual fabric is complete std::tuple create_sub_device_manager_with_fabric(tt::stl::Span sub_devices, DeviceAddr local_l1_size); std::optional get_fabric_sub_device_id() const; - program_cache::detail::ProgramCache& get_program_cache() { return program_cache_; } - - std::set get_compute_cores() const { return compute_cores_; } - uint32_t get_completion_queue_reader_core() const { return completion_queue_reader_core_; } - bool is_worker_queue_empty() const { return work_executor_.worker_queue.empty(); } - - uint32_t get_trace_buffers_size() const { return trace_buffers_size_; } - void set_trace_buffers_size(uint32_t size) { trace_buffers_size_ = size; } - - bool using_fast_dispatch() const { return using_fast_dispatch_; } - + bool is_mmio_capable() const; std::vector> get_tunnels_from_mmio() const { return tunnels_from_mmio_; } static constexpr MemoryAllocator allocator_scheme_ = MemoryAllocator::L1_BANKING; private: + static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES"); + static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1; + + void initialize_cluster(); + std::unique_ptr initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); + void initialize_build(); + void initialize_device_kernel_defines(); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); + void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); + void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap); + + void compile_command_queue_programs(); + void compile_command_queue_programs_new(); + void configure_command_queue_programs(); + void configure_command_queue_programs_new(); + void clear_l1_state(); + void get_associated_dispatch_virtual_cores( + std::unordered_map> &my_dispatch_cores, + std::unordered_map> &other_dispatch_cores); + std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; + + void set_worker_mode(const WorkExecutorMode& mode); + + void generate_device_bank_to_noc_tables(); + + void setup_tunnel_for_remote_devices(); + + void update_workers_build_settings(std::vector>> &device_worker_variants); + SubDeviceManagerId get_next_sub_device_manager_id(); void reset_sub_devices_state(const std::unique_ptr& sub_device_manager); - void MarkAllocationsUnsafe(); - void MarkAllocationsSafe(); + + void mark_allocations_unsafe(); + void mark_allocations_safe(); + + CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const; + CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; + CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; + CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const; chip_id_t id_; uint32_t build_key_ = 0; @@ -372,7 +344,7 @@ class Device { // Leaving here for compatibility with current reacharounds // TODO: Replace with get_initialized_allocator() - Allocator * allocator_ = nullptr; + Allocator* allocator_ = nullptr; bool initialized_ = false; std::vector> command_queue_programs_; @@ -421,43 +393,11 @@ class Device { } // namespace v0 -inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const { - - HalProgrammableCoreType programmable_core_type = HalProgrammableCoreType::TENSIX; - if (tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { - // Eth pcores have a different address, but only active ones. - CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core); - if (this->is_active_ethernet_core(logical_core)) { - programmable_core_type = HalProgrammableCoreType::ACTIVE_ETH; - } else { - programmable_core_type = HalProgrammableCoreType::IDLE_ETH; - } - } - - return programmable_core_type; -} - template inline T Device::get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const { return hal.get_dev_addr(this->get_programmable_core_type(virtual_core), addr_type); } -// TODO: Find a better home for this function -template -std::vector> Device::extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type) { - // This API extracts all the pairs of noc multicast encodings given a set of core ranges - std::vector> dst_noc_multicast_info; - dst_noc_multicast_info.reserve(ranges.size()); - for (const CoreRange& core_range : ranges) { - CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type); - CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type); - - uint32_t num_receivers = core_range.size(); - dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers)); - } - return dst_noc_multicast_info; -} - } // namespace tt_metal } // namespace tt diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 1eed5f60384..f6c4d500289 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -724,7 +724,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc } } else { std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info>( + device->extract_dst_noc_multicast_info( kernel->logical_coreranges(), core_type); common_sub_cmds.emplace>( std::vector()); diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 59be426029b..fbbe914ac85 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -1019,7 +1019,7 @@ void detail::Program_::populate_dispatch_data(Device *device) { if (semaphore.core_type() == CoreType::WORKER) { uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info>( + device->extract_dst_noc_multicast_info( semaphore.core_range_set().ranges(), CoreType::WORKER); transfer_info transfer_info = { .dst_base_addr = semaphore.offset(), @@ -1111,7 +1111,7 @@ void detail::Program_::populate_dispatch_data(Device *device) { // TODO: add a bit in the hal that says if this core type is unicast/multicast if (core_type == CoreType::WORKER) { std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info>( + device->extract_dst_noc_multicast_info( kernel_group.core_ranges.ranges(), core_type); std::vector kernel_ids; for (int dispatch_class = 0; dispatch_class < kernel_group.kernel_ids.size(); dispatch_class++) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp index 9f69e69b474..c5be2c09d7a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp @@ -36,11 +36,8 @@ inline uint32_t get_estimated_size_of_cbs(const Tensor& input_tensor_a) { } inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) { - tt::tt_metal::Device* device = input_tensor_a.device(); - const std::vector& bank_ids = - device->bank_ids_from_logical_core(BufferType::L1, *device->get_compute_cores().begin()); - std::optional lowest_address = - allocator::lowest_occupied_l1_address(*device->get_initialized_allocator(), bank_ids[0]); + auto device = input_tensor_a.device(); + auto lowest_address = device->lowest_occupied_compute_l1_address(); uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core(); max_l1_space = max_l1_space - device->get_base_allocator_addr(HalMemType::L1); return max_l1_space; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index bfb8350a644..2c0c62c0edc 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -171,11 +171,8 @@ inline uint32_t get_estimated_size_of_cbs( } inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) { - tt::tt_metal::Device* device = input_tensor_a.device(); - const std::vector& bank_ids = - device->bank_ids_from_logical_core(BufferType::L1, *device->get_compute_cores().begin()); - std::optional lowest_address = - allocator::lowest_occupied_l1_address(*device->get_initialized_allocator(), bank_ids[0]); + auto device = input_tensor_a.device(); + auto lowest_address = device->lowest_occupied_compute_l1_address(); uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core(); max_l1_space = max_l1_space - device->get_base_allocator_addr(HalMemType::L1); return max_l1_space;