Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
broskoTT committed Oct 17, 2024
1 parent 79bcb01 commit 5cdd297
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 72 deletions.
2 changes: 1 addition & 1 deletion tt_metal/llrt/tlb_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_
}
}

device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); });
device_driver.setup_core_to_tlb_map(mmio_device_id, get_static_tlb_index);
}

std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config(tt::ARCH arch) {
Expand Down
92 changes: 41 additions & 51 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,18 +207,16 @@ void Cluster::generate_cluster_descriptor() {
void Cluster::initialize_device_drivers() {
for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);
}

this->open_driver(mmio_device_id, controlled_devices);
this->open_driver();

tt_device_params default_params;
this->start_driver(mmio_device_id, default_params);
}
tt_device_params default_params;
this->start_driver(default_params);
}

void Cluster::assert_risc_reset() {
for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
this->get_driver(mmio_device_id).assert_risc_reset();
}
this->driver_->assert_risc_reset();
}

void Cluster::assign_mem_channels_to_devices(
Expand Down Expand Up @@ -247,35 +245,35 @@ void Cluster::get_metal_desc_from_tt_desc(
}
}

void Cluster::open_driver(
chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs) {
void Cluster::open_driver(const bool &skip_driver_allocs) {
const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_);

std::unique_ptr<tt_device> device_driver;
if (this->target_type_ == TargetDevice::Silicon) {
// This is the target/desired number of mem channels per arch/device.
// Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than
// available. Metal currently uses assigns 1 channel per device
uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
if (is_tg_cluster_) {
num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS;
}
uint32_t num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS;
std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = ll_api::get_dynamic_tlb_config(this->arch_);
// This will remove harvested rows from the soc descriptor
const bool perform_harvesting = true;
const bool clean_system_resources = true;
std::unordered_set<chip_id_t> all_chips = this->cluster_desc_->get_all_chips();
std::set<chip_id_t> all_chips_set(all_chips.begin(), all_chips.end());
device_driver = std::make_unique<tt_SiliconDevice>(
sdesc_path,
this->cluster_desc_path_,
controlled_device_ids,
all_chips_set,
num_host_mem_ch_per_mmio_device,
dynamic_tlb_config,
skip_driver_allocs,
clean_system_resources,
perform_harvesting);
if (this->arch_ == tt::ARCH::WORMHOLE_B0 and not this->is_galaxy_cluster()) {
// Give UMD Limited access to eth cores 8 and 9 for Non-Galaxy Wormhole Clusters
device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {});
for (const auto &[mmio_device_id, _]: this->cluster_desc_->get_chips_with_mmio()) {
device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {});
}
}
device_driver->set_driver_host_address_params(host_address_params);
device_driver->set_driver_eth_interface_params(eth_interface_params);
Expand All @@ -292,29 +290,28 @@ void Cluster::open_driver(

this->get_metal_desc_from_tt_desc(
device_driver->get_virtual_soc_descriptors(), device_driver->get_harvesting_masks_for_soc_descriptors());
this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver);
this->driver_ = std::move(device_driver);
}

void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const {
void Cluster::start_driver(tt_device_params &device_params) const {
device_params.init_device = true;

TT_FATAL(this->sdesc_per_chip_.size(), "Descriptor must be loaded. Try open_driver()");

if (this->target_type_ == TargetDevice::Silicon && device_params.init_device) {
ll_api::configure_static_tlbs(
this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id));
for (const auto [mmio_device_id, _]: this->cluster_desc_->get_chips_with_mmio()) {
ll_api::configure_static_tlbs(
this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), *this->driver_);
}
}

this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params);
this->driver_->start_device(device_params);
}

Cluster::~Cluster() {
log_info(tt::LogDevice, "Closing user mode device drivers");
for (const auto &[mmio_device_id, device_driver] : this->mmio_device_id_to_driver_) {
device_driver->close_device();
}
this->driver_->close_device();

this->mmio_device_id_to_driver_.clear();
this->sdesc_per_chip_.clear();
this->devices_grouped_by_assoc_mmio_device_.clear();
this->device_to_mmio_device_.clear();
Expand All @@ -323,11 +320,6 @@ Cluster::~Cluster() {
this->tunnels_from_mmio_device.clear();
}

tt_device &Cluster::get_driver(chip_id_t device_id) const {
chip_id_t mmio_device_id = this->device_to_mmio_device_.at(device_id);
return *(this->mmio_device_id_to_driver_.at(mmio_device_id));
}

std::unordered_map<chip_id_t, eth_coord_t> Cluster::get_user_chip_ethernet_coordinates() const {
auto user_chip_ethernet_coordinates = this->cluster_desc_->get_chip_locations();
if (this->is_galaxy_cluster()) {
Expand All @@ -353,7 +345,7 @@ uint32_t Cluster::get_harvested_rows(chip_id_t chip) const {
if (this->target_type_ == TargetDevice::Simulator) {
return 0;
} else {
return this->get_driver(chip).harvested_rows_per_target.at(chip);
return this->driver_->harvested_rows_per_target.at(chip);
}
}

Expand All @@ -379,7 +371,7 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const {
// get_clocks returns MMIO device ID -> clock frequency
// There is one driver per MMIO device, so we use that to index returned map
chip_id_t mmio_device_id = this->device_to_mmio_device_.at(chip_id);
return this->get_driver(chip_id).get_clocks().at(mmio_device_id);
return this->driver_->get_clocks().at(mmio_device_id);
}
TT_THROW("Cannot get frequency for device {} that is not initialized!", chip_id);
return 0;
Expand All @@ -388,13 +380,13 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const {
void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const {
const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip);
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord);
this->get_driver(virtual_chip_coord.chip).deassert_risc_reset_at_core(virtual_chip_coord);
this->driver_->deassert_risc_reset_at_core(virtual_chip_coord);
}

void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const {
const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip);
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord);
this->get_driver(virtual_chip_coord.chip).assert_risc_reset_at_core(virtual_chip_coord);
this->driver_->assert_risc_reset_at_core(virtual_chip_coord);
}

inline uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
Expand Down Expand Up @@ -454,10 +446,9 @@ void Cluster::write_core(
tt::watcher_sanitize_host_noc_write(soc_desc, {core.x, core.y}, addr, sz_in_bytes);
}
tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core);
this->get_driver(chip_id).write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB");
if (this->get_driver(chip_id).get_target_remote_device_ids().find(virtual_core.chip) !=
this->get_driver(chip_id).get_target_remote_device_ids().end()) {
this->get_driver(chip_id).wait_for_non_mmio_flush();
this->driver_->write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB");
if (this->cluster_desc_->is_chip_remote(chip_id)) {
this->driver_->wait_for_non_mmio_flush(chip_id);
}
}

Expand All @@ -471,7 +462,7 @@ void Cluster::read_core(
}

tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core);
this->get_driver(chip_id).read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB");
this->driver_->read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB");
}

void Cluster::read_core(
Expand All @@ -489,10 +480,9 @@ void Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64
tt::watcher_sanitize_host_noc_write(soc_desc, {target.x, target.y}, addr, size_in_bytes);
}
tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target);
this->get_driver(chip_id).write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB");
if (this->get_driver(chip_id).get_target_remote_device_ids().find(virtual_target.chip) !=
this->get_driver(chip_id).get_target_remote_device_ids().end()) {
this->get_driver(chip_id).wait_for_non_mmio_flush();
this->driver_->write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB");
if (this->cluster_desc_->is_chip_remote(chip_id)) {
this->driver_->wait_for_non_mmio_flush(chip_id);
}
}

Expand All @@ -505,19 +495,19 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
tt::watcher_sanitize_host_noc_read(soc_desc, {target.x, target.y}, addr, size_in_bytes);
}
tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target);
this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
this->driver_->read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
}

void Cluster::write_sysmem(
const void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id);
this->driver_->write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id);
}

void Cluster::read_sysmem(
void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id);
this->driver_->read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id);
}

void Cluster::verify_sw_fw_versions(
Expand Down Expand Up @@ -547,7 +537,7 @@ void Cluster::dram_barrier(chip_id_t chip_id) const {
for (uint32_t channel = 0; channel < this->get_soc_desc(chip_id).get_num_dram_channels(); channel++) {
dram_channels.insert(channel);
}
this->get_driver(chip_id).dram_membar(chip_id, "LARGE_WRITE_TLB", dram_channels);
this->driver_->dram_membar(chip_id, "LARGE_WRITE_TLB", dram_channels);
}

// L1 barrier is used to implement host-to-device synchronization and should be used when all previous writes to L1 need
Expand All @@ -556,26 +546,26 @@ void Cluster::dram_barrier(chip_id_t chip_id) const {
// binaries, metadata, and data to compute on are committed before launching kernels
void Cluster::l1_barrier(chip_id_t chip_id) const {
// Sets and resets L1 barrier of all tensix cores and ethernet cores
this->get_driver(chip_id).l1_membar(chip_id, "LARGE_WRITE_TLB");
this->driver_->l1_membar(chip_id, "LARGE_WRITE_TLB");
}

uint32_t Cluster::get_num_host_channels(chip_id_t device_id) const {
bool mmio_capable = this->cluster_desc_->is_chip_mmio_capable(device_id);
return mmio_capable ? this->get_driver(device_id).get_num_host_channels(device_id) : 0;
return mmio_capable ? this->driver_->get_num_host_channels(device_id) : 0;
}

uint32_t Cluster::get_host_channel_size(chip_id_t device_id, uint32_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(device_id));
return this->get_driver(device_id).get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK);
return this->driver_->get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK);
}

void *Cluster::host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK);
return this->driver_->host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK);
}

uint64_t Cluster::get_pcie_base_addr_from_device(chip_id_t chip_id) const {
return this->get_driver(chip_id).get_pcie_base_addr_from_device();
return this->driver_->get_pcie_base_addr_from_device();
}

std::unordered_map<chip_id_t, std::vector<CoreCoord>> Cluster::get_ethernet_cores_grouped_by_connected_chips(
Expand Down
28 changes: 9 additions & 19 deletions tt_metal/llrt/tt_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class Cluster {
const metal_SocDescriptor &get_soc_desc(chip_id_t chip) const;
uint32_t get_harvested_rows(chip_id_t chip) const;
uint32_t get_harvesting_mask(chip_id_t chip) const {
return this->get_driver(chip).get_harvesting_masks_for_soc_descriptors().at(chip);
return this->driver_->get_harvesting_masks_for_soc_descriptors().at(chip);
}

//! device driver and misc apis
Expand All @@ -97,43 +97,36 @@ class Cluster {
vector<uint32_t> &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const;

std::optional<std::tuple<uint32_t, uint32_t>> get_tlb_data(const tt_cxy_pair &target) const {
chip_id_t mmio_device_id = device_to_mmio_device_.at(target.chip);
tt_SiliconDevice *device =
dynamic_cast<tt_SiliconDevice *>(this->mmio_device_id_to_driver_.at(mmio_device_id).get());
tt_SiliconDevice *device = dynamic_cast<tt_SiliconDevice *>(driver_.get());
const metal_SocDescriptor &soc_desc = this->get_soc_desc(target.chip);
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(target);
return device->get_tlb_data_from_target(virtual_chip_coord);
}

uint32_t get_m_dma_buf_size(chip_id_t chip_id) const {
chip_id_t mmio_device_id = device_to_mmio_device_.at(chip_id);
tt_SiliconDevice *device =
dynamic_cast<tt_SiliconDevice *>(this->mmio_device_id_to_driver_.at(mmio_device_id).get());
return 0;
}

std::function<void(uint32_t, uint32_t, const uint8_t *)> get_fast_pcie_static_tlb_write_callable(
int chip_id) const {
chip_id_t mmio_device_id = device_to_mmio_device_.at(chip_id);
tt_SiliconDevice *device =
dynamic_cast<tt_SiliconDevice *>(this->mmio_device_id_to_driver_.at(mmio_device_id).get());
tt_SiliconDevice *device = dynamic_cast<tt_SiliconDevice *>(driver_.get());
return device->get_fast_pcie_static_tlb_write_callable(mmio_device_id);
}

// Returns a writer object which holds a pointer to a static tlb
// Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals
tt::Writer get_static_tlb_writer(tt_cxy_pair target) const {
chip_id_t mmio_device_id = device_to_mmio_device_.at(target.chip);
tt_SiliconDevice* device = dynamic_cast<tt_SiliconDevice*>(this->mmio_device_id_to_driver_.at(mmio_device_id).get());
tt_SiliconDevice *device = dynamic_cast<tt_SiliconDevice *>(driver_.get());
const metal_SocDescriptor &soc_desc = this->get_soc_desc(target.chip);
tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target);
return device->get_static_tlb_writer(virtual_target);
}

std::uint32_t get_numa_node_for_device(uint32_t device_id) const {
uint32_t associated_mmio_device_id = this->get_associated_mmio_device(device_id);
tt_SiliconDevice* driver = dynamic_cast<tt_SiliconDevice*>(this->mmio_device_id_to_driver_.at(associated_mmio_device_id).get());
return driver->get_numa_node_for_pcie_device(associated_mmio_device_id);
uint32_t mmio_device_id = this->get_associated_mmio_device(device_id);
tt_SiliconDevice *device = dynamic_cast<tt_SiliconDevice *>(driver_.get());
return driver_->get_numa_node_for_pcie_device(mmio_device_id);
}

void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
Expand Down Expand Up @@ -239,12 +232,9 @@ class Cluster {
void assert_risc_reset();
void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids);
void open_driver(
chip_id_t mmio_device_id,
const std::set<chip_id_t> &controlled_device_ids,
const bool &skip_driver_allocs = false);
void start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const;
void start_driver(tt_device_params &device_params) const;

tt_device &get_driver(chip_id_t device_id) const;
void get_metal_desc_from_tt_desc(
const std::unordered_map<chip_id_t, tt_SocDescriptor> &input,
const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks);
Expand All @@ -265,7 +255,7 @@ class Cluster {

// There is one device driver per PCIe card. This map points id of the MMIO device points to the associated device
// driver
std::unordered_map<chip_id_t, std::unique_ptr<tt_device>> mmio_device_id_to_driver_;
std::unique_ptr<tt_device> driver_;

// Need to hold reference to cluster descriptor to detect total number of devices available in cluster
// UMD static APIs `detect_available_device_ids` and `detect_number_of_chips` only returns number of MMIO mapped
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/third_party/umd
Submodule umd updated 64 files
+26 −0 .github/docker_install_common.sh
+6 −0 .github/ubuntu-20.04.Dockerfile
+7 −0 .github/ubuntu-22.04.Dockerfile
+53 −0 .github/workflows/build-device.yml
+52 −0 .github/workflows/build-image.yml
+80 −0 .github/workflows/build-tests.yml
+45 −0 .github/workflows/on-pr-opt.yml
+14 −0 .github/workflows/on-pr.yml
+50 −0 .github/workflows/on-push.yml
+103 −0 .github/workflows/run-tests.yml
+121 −0 .github/workflows/test-runner.yaml
+1 −1 .gitignore
+0 −100 .gitlab-ci.yml
+0 −3 .gitmodules
+26 −25 CMakeLists.txt
+2,781 −0 Doxyfile
+0 −93 Makefile
+7 −41 README.md
+46 −0 cmake/compilers.cmake
+10 −0 cmake/dependencies.cmake
+2 −1 device/CMakeLists.txt
+5 −6 device/cpuset_lib.cpp
+0 −32 device/kmdif.h
+107 −0 device/mockup/tt_mockup_device.hpp
+0 −178 device/module.mk
+586 −0 device/pcie/pci_device.cpp
+198 −0 device/pcie/pci_device.hpp
+3 −2 device/simulation/tt_simulation_device.cpp
+2 −1 device/simulation/tt_simulation_device.h
+3 −0 device/tt_arch_types.h
+9 −3 device/tt_cluster_descriptor.cpp
+3 −2 device/tt_cluster_descriptor.h
+3 −1 device/tt_device.cpp
+228 −235 device/tt_device.h
+341 −1,300 device/tt_silicon_driver.cpp
+5 −3 device/tt_soc_descriptor.cpp
+4 −1 device/tt_soc_descriptor.h
+4 −2 device/xy_pair.h
+17 −0 docs/README.md
+14 −0 docs/build_docs.sh
+2 −0 docs/install_docs_requirements.sh
+27 −2 tests/CMakeLists.txt
+12 −0 tests/api/CMakeLists.txt
+160 −0 tests/api/test_chip.cpp
+266 −0 tests/api/test_cluster.cpp
+165 −0 tests/api/test_cluster_descriptor.cpp
+39 −39 tests/blackhole/test_silicon_driver_bh.cpp
+4 −4 tests/emulation/test_emulation_device.cpp
+10 −6 tests/galaxy/test_galaxy_common.cpp
+3 −4 tests/galaxy/test_galaxy_common.h
+19 −17 tests/galaxy/test_umd_concurrent_threads.cpp
+9 −6 tests/galaxy/test_umd_remote_api.cpp
+0 −1 tests/galaxy/test_umd_remote_api_stability.cpp
+29 −28 tests/grayskull/test_silicon_driver.cpp
+7 −6 tests/microbenchmark/test_rw_tensix.cpp
+0 −75 tests/module.mk
+6 −0 tests/pcie/CMakeLists.txt
+36 −0 tests/pcie/test_pcie_device.cpp
+5 −4 tests/simulation/test_simulation_device.cpp
+25 −0 tests/test_utils/device_test_utils.hpp
+1 −1 tests/test_utils/generate_cluster_desc.cpp
+12 −3 tests/test_utils/generate_cluster_desc.hpp
+137 −41 tests/wormhole/test_silicon_driver_wh.cpp
+0 −1 third_party/fmt

0 comments on commit 5cdd297

Please sign in to comment.