
Commit

#8530: Pull static/dynamic tlb config out of tt_cluster.cpp and into tlb_config.cpp
abhullar-tt committed May 17, 2024
1 parent 40ea9ec commit b5d4b97
Showing 5 changed files with 198 additions and 120 deletions.
1 change: 1 addition & 0 deletions tt_metal/llrt/CMakeLists.txt
@@ -2,6 +2,7 @@
set(LLRT_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/tt_hexfile.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/tt_memory.cpp)
175 changes: 175 additions & 0 deletions tt_metal/llrt/tlb_config.cpp
@@ -0,0 +1,175 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "tlb_config.hpp"
#include "device_data.hpp"

#include "third_party/umd/device/blackhole_implementation.h"
#include "third_party/umd/device/grayskull_implementation.h"
#include "third_party/umd/device/wormhole_implementation.h"

namespace ll_api {

namespace grayskull {

static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0;
static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::grayskull::DYNAMIC_TLB_16M_SIZE;

int32_t get_static_tlb_index(CoreCoord target) {
    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
    // The default 1MB TLB is not used for DRAM cores
    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
    // }
    int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x;
    if (flat_index == 0) {
        return -1;
    }
    return flat_index;
}

} // namespace grayskull

namespace wormhole {

static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0;
static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::wormhole::DYNAMIC_TLB_16M_SIZE;

int32_t get_static_tlb_index(CoreCoord target) {
    bool is_eth_location =
        std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) !=
        std::cend(DEVICE_DATA.ETH_LOCATIONS);
    bool is_tensix_location =
        std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) !=
            std::cend(DEVICE_DATA.T6_X_LOCATIONS) &&
        std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) !=
            std::cend(DEVICE_DATA.T6_Y_LOCATIONS);
    // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix
    // repo (t6py-wormhole-bringup branch)

    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
    // The default 1MB TLB is not used for DRAM cores
    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
    // }

    if (is_eth_location) {
        if (target.y == 6) {
            target.y = 1;
        }

        if (target.x >= 5) {
            target.x -= 1;
        }
        target.x -= 1;

        int flat_index = target.y * 8 + target.x;
        int tlb_index = flat_index;
        return tlb_index;

    } else if (is_tensix_location) {
        if (target.x >= 5) {
            target.x -= 1;
        }
        target.x -= 1;

        if (target.y >= 6) {
            target.y -= 1;
        }
        target.y -= 1;

        int flat_index = target.y * 8 + target.x;

        // All 80 get single 1MB TLB.
        int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index;

        return tlb_index;
    } else {
        return -1;
    }
}

} // namespace wormhole

namespace blackhole {

static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = tt::umd::blackhole::DYNAMIC_TLB_2M_SIZE;
static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 0;

int32_t get_static_tlb_index(CoreCoord target) {
    return -1;
}

} // namespace blackhole

void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver) {
    using get_static_tlb_index_ptr = std::int32_t (*)(tt_xy_pair);
    get_static_tlb_index_ptr get_static_tlb_index;
    uint32_t DYNAMIC_TLB_BASE_INDEX, DYNAMIC_TLB_COUNT, DYNAMIC_TLB_16M_SIZE, DYNAMIC_TLB_2M_SIZE;

    switch (arch) {
        case tt::ARCH::GRAYSKULL:
            get_static_tlb_index = grayskull::get_static_tlb_index;
            DYNAMIC_TLB_BASE_INDEX = grayskull::DYNAMIC_TLB_BASE_INDEX;
            DYNAMIC_TLB_COUNT = grayskull::DYNAMIC_TLB_COUNT;
            DYNAMIC_TLB_16M_SIZE = grayskull::DYNAMIC_TLB_16M_SIZE;
            DYNAMIC_TLB_2M_SIZE = grayskull::DYNAMIC_TLB_2M_SIZE;
            break;
        case tt::ARCH::WORMHOLE:
        case tt::ARCH::WORMHOLE_B0:
            get_static_tlb_index = wormhole::get_static_tlb_index;
            DYNAMIC_TLB_BASE_INDEX = wormhole::DYNAMIC_TLB_BASE_INDEX;
            DYNAMIC_TLB_COUNT = wormhole::DYNAMIC_TLB_COUNT;
            DYNAMIC_TLB_16M_SIZE = wormhole::DYNAMIC_TLB_16M_SIZE;
            DYNAMIC_TLB_2M_SIZE = wormhole::DYNAMIC_TLB_2M_SIZE;
            break;
        case tt::ARCH::BLACKHOLE:
            get_static_tlb_index = blackhole::get_static_tlb_index;
            DYNAMIC_TLB_BASE_INDEX = blackhole::DYNAMIC_TLB_BASE_INDEX;
            DYNAMIC_TLB_COUNT = blackhole::DYNAMIC_TLB_COUNT;
            DYNAMIC_TLB_2M_SIZE = blackhole::DYNAMIC_TLB_2M_SIZE;
            DYNAMIC_TLB_16M_SIZE = blackhole::DYNAMIC_TLB_16M_SIZE;
            break;
        default: TT_THROW("Configuring static TLBs is not supported for {}", tt::get_string(arch));
    }

    auto statically_mapped_cores = sdesc.workers;
    statically_mapped_cores.insert(
        statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end());
    std::int32_t address = 0;

    // Setup static TLBs for all worker cores
    for (auto &core : statically_mapped_cores) {
        auto tlb_index = get_static_tlb_index(core);
        device_driver.configure_tlb(mmio_device_id, core, tlb_index, address);
    }
    // Setup static TLBs for MMIO mapped data space
    uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START;
    for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) {
        device_driver.configure_tlb(
            mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset);
        // Align address space of 16MB TLB to 16MB boundary
        peer_dram_offset += DYNAMIC_TLB_16M_SIZE;
    }
    device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); });
}

std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config() {
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config;
    dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
    return dynamic_tlb_config;
}

} // namespace ll_api
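
For readers following the Wormhole remapping above, here is a standalone sketch of the Tensix branch of that index math with the DEVICE_DATA lookups replaced by plain integers. The 8-column row stride comes from the function body; the example coordinate and the assumed count of 16 Ethernet TLBs ahead of the Tensix range are illustrative only and are not part of this commit.

// Standalone illustration of the Wormhole static-TLB index math above.
// The row stride of 8 mirrors the function body; the example inputs and the
// assumed 16 Ethernet TLBs preceding the Tensix range are illustrative only.
#include <cstdint>
#include <cstdio>

static std::int32_t tensix_tlb_index(int x, int y, int num_eth_tlbs) {
    // Collapse out the non-worker column/row gaps (x == 0, x == 5; y == 0, y == 6),
    // matching the adjustments in wormhole::get_static_tlb_index.
    if (x >= 5) x -= 1;
    x -= 1;
    if (y >= 6) y -= 1;
    y -= 1;
    // Tensix TLBs are allocated after the Ethernet TLBs, one per core.
    return num_eth_tlbs + y * 8 + x;
}

int main() {
    // Tensix core at physical (x=1, y=1) with 16 Ethernet TLBs ahead of it -> prints 16.
    std::printf("%d\n", tensix_tlb_index(1, 1, 16));
    return 0;
}
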
19 changes: 19 additions & 0 deletions tt_metal/llrt/tlb_config.hpp
@@ -0,0 +1,19 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "third_party/umd/device/device_api.h"
#include "tt_metal/common/tt_backend_api_types.hpp"
#include "tt_metal/common/metal_soc_descriptor.h"

#include <unordered_map>

namespace ll_api {

void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver);

std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config();

} // namespace ll_api
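
As a usage sketch of the two entry points declared above (not part of the commit): a caller that already holds an open tt_device driver and a metal_SocDescriptor would invoke them roughly as follows, mirroring the tt_cluster.cpp changes below. The wrapper function and variable names here are illustrative assumptions.

// Hypothetical caller-side sketch of the new ll_api entry points; all names
// other than the ll_api functions themselves are illustrative, not from this commit.
#include "tt_metal/llrt/tlb_config.hpp"

void init_device_tlbs(tt::ARCH arch, chip_id_t mmio_device_id,
                      const metal_SocDescriptor &soc_desc, tt_device &driver) {
    // Dynamic TLB settings (e.g. REG_TLB) are fetched once; in tt_cluster.cpp
    // this map is handed to the silicon driver when it is opened.
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config =
        ll_api::get_dynamic_tlb_config();
    (void)dynamic_tlb_config;

    // Static TLBs for worker/ethernet cores and the MMIO-mapped DRAM window are
    // programmed per architecture once the driver is open.
    ll_api::configure_static_tlbs(arch, mmio_device_id, soc_desc, driver);
}
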
122 changes: 3 additions & 119 deletions tt_metal/llrt/tt_cluster.cpp
@@ -18,17 +18,7 @@
#include "tools/profiler/profiler.hpp"
#include "tt_metal/impl/debug/sanitize_noc_host.hpp"
#include "tt_metal/llrt/rtoptions.hpp"

#ifdef ARCH_GRAYSKULL
static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;

#else
static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
#endif
#include "tt_metal/llrt/tlb_config.hpp"

namespace tt {

@@ -218,8 +208,7 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
    // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available.
    // Metal currently assigns 1 channel per device
    uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
    dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = ll_api::get_dynamic_tlb_config();
    // This will remove harvested rows from the soc descriptor
    const bool perform_harvesting = true;
    const bool clean_system_resources = true;
@@ -248,119 +237,14 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
    this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver);
}

#ifdef ARCH_WORMHOLE
std::int32_t get_static_tlb_index(CoreCoord target) {
    bool is_eth_location =
        std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) !=
        std::cend(DEVICE_DATA.ETH_LOCATIONS);
    bool is_tensix_location =
        std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) !=
            std::cend(DEVICE_DATA.T6_X_LOCATIONS) &&
        std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) !=
            std::cend(DEVICE_DATA.T6_Y_LOCATIONS);
    // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix
    // repo (t6py-wormhole-bringup branch)

    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
    // The default 1MB TLB is not used for DRAM cores
    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
    // }

    if (is_eth_location) {
        if (target.y == 6) {
            target.y = 1;
        }

        if (target.x >= 5) {
            target.x -= 1;
        }
        target.x -= 1;

        int flat_index = target.y * 8 + target.x;
        int tlb_index = flat_index;
        return tlb_index;

    } else if (is_tensix_location) {
        if (target.x >= 5) {
            target.x -= 1;
        }
        target.x -= 1;

        if (target.y >= 6) {
            target.y -= 1;
        }
        target.y -= 1;

        int flat_index = target.y * 8 + target.x;

        // All 80 get single 1MB TLB.
        int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index;

        return tlb_index;
    } else {
        return -1;
    }
}
#endif

#ifdef ARCH_GRAYSKULL
std::int32_t get_static_tlb_index(CoreCoord target) {
    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
    // The default 1MB TLB is not used for DRAM cores
    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
    // }
    int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x;
    if (flat_index == 0) {
        return -1;
    }
    return flat_index;
}
#endif

// TODO: pull tlb config into sep file similar to BBE
#ifdef ARCH_BLACKHOLE
std::int32_t get_static_tlb_index(CoreCoord target) {
    return -1;
}
#endif

void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const {
    auto sdesc = get_soc_desc(mmio_device_id);
    auto statically_mapped_cores = sdesc.workers;
    statically_mapped_cores.insert(
        statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end());
    std::int32_t address = 0;

    // Setup static TLBs for all worker cores
    for (auto &core : statically_mapped_cores) {
        auto tlb_index = get_static_tlb_index(core);
        this->get_driver(mmio_device_id).configure_tlb(mmio_device_id, core, tlb_index, address);
    }
    // Setup static TLBs for MMIO mapped data space
    uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START;
    for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) {
        this->get_driver(mmio_device_id).configure_tlb(
            mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset);
        // Align address space of 16MB TLB to 16MB boundary
#ifndef ARCH_BLACKHOLE // TODO (abhullar): clean this up
        peer_dram_offset += DEVICE_DATA.DYNAMIC_TLB_16M_SIZE;
#endif
    }
    this->get_driver(mmio_device_id).setup_core_to_tlb_map([](CoreCoord core) { return get_static_tlb_index(core); });
}

void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const {
    device_params.init_device = true;

    TT_FATAL(this->sdesc_per_chip_.size(), "Descriptor must be loaded. Try open_driver()");

    // static TLBs avoided for Blackhole bring up
    if (this->target_type_ == TargetDevice::Silicon && device_params.init_device && this->arch_ != tt::ARCH::BLACKHOLE) {
        configure_static_tlbs(mmio_device_id);
        ll_api::configure_static_tlbs(this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id));
    }

    this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params);
1 change: 0 additions & 1 deletion tt_metal/llrt/tt_cluster.hpp
@@ -185,7 +185,6 @@ class Cluster {
    tt_device &get_driver(chip_id_t device_id) const;
    void get_metal_desc_from_tt_desc(const std::unordered_map<chip_id_t, tt_SocDescriptor> &input, const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks);
    tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const;
    void configure_static_tlbs(chip_id_t mmio_device_id) const;

    // Returns map of connected chip ids to active ethernet cores
    std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped_by_connected_chips(

