Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Blackhole board types #437

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions device/api/umd/device/blackhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <stdexcept>

#include "umd/device/architecture_implementation.h"
#include "umd/device/types/cluster_descriptor_types.h"
#include "umd/device/types/tlb.h"

namespace tt::umd {
Expand Down Expand Up @@ -99,8 +100,9 @@ static const std::vector<tt_xy_pair> ARC_CORES = {{8, 0}};
static const std::vector<tt_xy_pair> ARC_LOCATIONS = ARC_CORES;

static const tt_xy_pair PCIE_GRID_SIZE = {1, 1};
static const std::vector<tt_xy_pair> PCIE_CORES = {{{11, 0}}};
static const std::vector<tt_xy_pair> PCI_LOCATIONS = PCIE_CORES;
static const std::vector<tt_xy_pair> PCIE_CORES_TYPE2 = {{{2, 0}}};
static const std::vector<tt_xy_pair> PCI_LOCATIONS = PCIE_CORES_TYPE2;
static const std::vector<tt_xy_pair> PCIE_CORES_TYPE1 = {{{11, 0}}};

static const tt_xy_pair ETH_GRID_SIZE = {14, 1};
static const std::vector<tt_xy_pair> ETH_CORES = {
Expand Down Expand Up @@ -195,6 +197,13 @@ static const size_t pcie_translated_coordinate_start_y = 24;
static const size_t dram_translated_coordinate_start_x = 17;
static const size_t dram_translated_coordinate_start_y = 12;

/*
* Get the PCIE cores that can be used for communication with the host
* based on the board type and whether the chip is remote or not.
* Information on remote chip is used only if the board type is P300.
*/
std::vector<tt_xy_pair> get_pcie_cores(const BoardType board_type, const bool is_chip_remote);
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved

} // namespace blackhole

class blackhole_implementation : public architecture_implementation {
Expand Down
10 changes: 9 additions & 1 deletion device/api/umd/device/coordinate_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,15 @@
#include "umd/device/tt_core_coordinates.h"
#include "umd/device/tt_xy_pair.h"
#include "umd/device/types/arch.h"
#include "umd/device/types/cluster_descriptor_types.h"

class CoordinateManager {
public:
/*
* Creates a Coordinate Manager object.
* Board type and is_chip_remote are used only for Blackhole, since PCIe cores are different
* for different boards and whether the chip is remote or not.
*/
static std::shared_ptr<CoordinateManager> create_coordinate_manager(
tt::ARCH arch,
const tt_xy_pair& tensix_grid_size,
Expand All @@ -36,7 +42,9 @@ class CoordinateManager {
tt::ARCH arch,
const size_t tensix_harvesting_mask = 0,
const size_t dram_harvesting_mask = 0,
const size_t eth_harvesting_mask = 0);
const size_t eth_harvesting_mask = 0,
const BoardType board_type = BoardType::UNKNOWN,
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved
const bool is_chip_remote = false);

static size_t get_num_harvested(const size_t harvesting_mask);

Expand Down
12 changes: 0 additions & 12 deletions device/api/umd/device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,6 @@ namespace YAML {
class Node;
}

enum BoardType : uint32_t {
E75,
E150,
E300,
N150,
N300,
P100,
P150A,
GALAXY,
UNKNOWN,
};

class tt_ClusterDescriptor {
private:
tt_ClusterDescriptor() = default;
Expand Down
4 changes: 3 additions & 1 deletion device/api/umd/device/tt_soc_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "umd/device/tt_core_coordinates.h"
#include "umd/device/tt_xy_pair.h"
#include "umd/device/types/arch.h"
#include "umd/device/types/cluster_descriptor_types.h"

namespace YAML {
class Node;
Expand Down Expand Up @@ -92,7 +93,8 @@ class tt_SocDescriptor {
// CoreCoord conversions.
tt::umd::CoreCoord translate_coord_to(const tt::umd::CoreCoord core_coord, const CoordSystem coord_system) const;

static std::string get_soc_descriptor_path(tt::ARCH arch);
static std::string get_soc_descriptor_path(
tt::ARCH arch, const BoardType board_type = BoardType::UNKNOWN, const bool is_chip_remote = false);

std::vector<tt::umd::CoreCoord> get_cores(const CoreType core_type) const;
std::vector<tt::umd::CoreCoord> get_harvested_cores(const CoreType core_type) const;
Expand Down
13 changes: 13 additions & 0 deletions device/api/umd/device/types/cluster_descriptor_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ struct eth_coord_t {
}
};

// Board type associated with a chip.
// Enumerator values are written out explicitly; they are the same values the
// implicit numbering of this list produces.
enum BoardType : uint32_t {
    E75 = 0,
    E150 = 1,
    E300 = 2,
    N150 = 3,
    N300 = 4,
    P100 = 5,
    P150A = 6,
    P300 = 7,
    GALAXY = 8,
    UNKNOWN = 9,
};

namespace std {
template <>
struct hash<eth_coord_t> {
Expand Down
19 changes: 19 additions & 0 deletions device/blackhole/blackhole_implementation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "blackhole/eth_l1_address_map.h"
#include "blackhole/host_mem_address_map.h"
#include "blackhole/l1_address_map.h"
#include "logger.hpp"
#include "umd/device/cluster.h"

constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH
Expand Down Expand Up @@ -104,4 +105,22 @@ tt_driver_noc_params blackhole_implementation::get_noc_params() const {
return {NOC_ADDR_LOCAL_BITS, NOC_ADDR_NODE_ID_BITS};
}

namespace blackhole {
std::vector<tt_xy_pair> get_pcie_cores(const BoardType board_type, const bool is_chip_remote) {
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved
if (is_chip_remote) {
log_assert(board_type == BoardType::P300, "Remote chip is supported only for Blackhole P300 board.");
}

if (board_type == BoardType::UNKNOWN || board_type == BoardType::P100) {
return PCIE_CORES_TYPE1;
} else if (board_type == BoardType::P150A) {
return PCIE_CORES_TYPE2;
} else if (board_type == BoardType::P300) {
return is_chip_remote ? PCIE_CORES_TYPE1 : PCIE_CORES_TYPE2;
}
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved

return PCIE_CORES_TYPE2;
}
} // namespace blackhole

} // namespace tt::umd
12 changes: 7 additions & 5 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,9 @@ std::unique_ptr<Chip> Cluster::construct_chip_from_cluster(

std::unique_ptr<Chip> Cluster::construct_chip_from_cluster(chip_id_t chip_id, tt_ClusterDescriptor* cluster_desc) {
tt::ARCH arch = cluster_desc->get_arch(chip_id);
std::string soc_desc_path = tt_SocDescriptor::get_soc_descriptor_path(arch);
const BoardType chip_board_type = cluster_desc->get_board_type(chip_id);
std::string soc_desc_path =
tt_SocDescriptor::get_soc_descriptor_path(arch, chip_board_type, cluster_desc->is_chip_remote(chip_id));
// Note that initially soc_descriptors are not harvested, but will be harvested later if perform_harvesting is
// true.
// TODO: This should be changed, harvesting should be done in tt_socdescriptor's constructor and not as part of
Expand Down Expand Up @@ -600,7 +602,7 @@ Cluster::Cluster(
// rather than ClusterDescriptor.
tt::ARCH arch = tt::ARCH::GRAYSKULL;
chip_id_t mock_chip_id = 0;
tt_SocDescriptor soc_desc = tt_SocDescriptor(tt_SocDescriptor::get_soc_descriptor_path(arch));
tt_SocDescriptor soc_desc = tt_SocDescriptor(tt_SocDescriptor::get_soc_descriptor_path(arch, BoardType::UNKNOWN));
std::unique_ptr<Chip> chip = std::make_unique<MockChip>(soc_desc);

std::unordered_map<chip_id_t, std::unique_ptr<Chip>> chips;
Expand Down Expand Up @@ -2286,14 +2288,14 @@ void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) {
}

void Cluster::wait_for_non_mmio_flush(const chip_id_t chip_id) {
log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole");
std::string read_tlb = "LARGE_READ_TLB";

if (!this->cluster_desc->is_chip_remote(chip_id)) {
log_debug(LogSiliconDriver, "Chip {} is not a remote chip, skipping wait_for_non_mmio_flush", chip_id);
return;
}

std::string read_tlb = "LARGE_READ_TLB";
log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole");

chip_id_t mmio_connected_chip = cluster_desc->get_closest_mmio_capable_chip(chip_id);
wait_for_connected_non_mmio_flush(mmio_connected_chip);
}
Expand Down
10 changes: 7 additions & 3 deletions device/coordinate_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,9 @@ std::shared_ptr<CoordinateManager> CoordinateManager::create_coordinate_manager(
tt::ARCH arch,
const size_t tensix_harvesting_mask,
const size_t dram_harvesting_mask,
const size_t eth_harvesting_mask) {
const size_t eth_harvesting_mask,
const BoardType board_type,
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved
bool is_chip_remote) {
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved
switch (arch) {
case tt::ARCH::GRAYSKULL:
return create_coordinate_manager(
Expand Down Expand Up @@ -495,7 +497,8 @@ std::shared_ptr<CoordinateManager> CoordinateManager::create_coordinate_manager(
tt::umd::wormhole::ARC_CORES,
tt::umd::wormhole::PCIE_GRID_SIZE,
tt::umd::wormhole::PCIE_CORES);
case tt::ARCH::BLACKHOLE:
case tt::ARCH::BLACKHOLE: {
const std::vector<tt_xy_pair> pcie_cores = tt::umd::blackhole::get_pcie_cores(board_type, is_chip_remote);
return create_coordinate_manager(
arch,
tt::umd::blackhole::TENSIX_GRID_SIZE,
Expand All @@ -510,7 +513,8 @@ std::shared_ptr<CoordinateManager> CoordinateManager::create_coordinate_manager(
tt::umd::blackhole::ARC_GRID_SIZE,
tt::umd::blackhole::ARC_CORES,
tt::umd::blackhole::PCIE_GRID_SIZE,
tt::umd::blackhole::PCIE_CORES);
pcie_cores);
}
case tt::ARCH::Invalid:
throw std::runtime_error("Invalid architecture for creating coordinate manager");
default:
Expand Down
2 changes: 2 additions & 0 deletions device/tt_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,8 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
board_type = BoardType::P100;
} else if (chip_board_type.second == "p150A") {
board_type = BoardType::P150A;
} else if (chip_board_type.second == "p300") {
board_type = BoardType::P300;
} else if (chip_board_type.second == "GALAXY") {
board_type = BoardType::GALAXY;
} else {
Expand Down
25 changes: 21 additions & 4 deletions device/tt_soc_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,17 +273,34 @@ bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const {
return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end();
}

std::string tt_SocDescriptor::get_soc_descriptor_path(tt::ARCH arch) {
std::string tt_SocDescriptor::get_soc_descriptor_path(
tt::ARCH arch, const BoardType board_type, const bool is_chip_remote) {
switch (arch) {
case tt::ARCH::GRAYSKULL:
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/grayskull_10x12.yaml");
case tt::ARCH::WORMHOLE_B0:
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/wormhole_b0_8x10.yaml");
case tt::ARCH::BLACKHOLE:
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_no_eth.yaml");
case tt::ARCH::BLACKHOLE: {
if (board_type == BoardType::P100 || board_type == BoardType::UNKNOWN) {
pjanevskiTT marked this conversation as resolved.
Show resolved Hide resolved
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_no_eth.yaml");
} else if (board_type == BoardType::P150A) {
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_local.yaml");
} else if (board_type == BoardType::P300) {
if (is_chip_remote) {
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_remote.yaml");
} else {
// TODO: this path needs to be changed to point to soc descriptors outside of tests directory.
return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_local.yaml");
}
} else {
throw std::runtime_error("Invalid board type for Blackhole architecture.");
}
}
default:
throw std::runtime_error("Invalid architecture");
}
Expand Down
2 changes: 1 addition & 1 deletion tests/api/test_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ TEST(ApiClusterTest, DifferentConstructors) {
// 3. Constructor taking a custom soc descriptor in addition.
tt::ARCH device_arch = tt_ClusterDescriptor::detect_arch(logical_device_id);
// You can add a custom soc descriptor here.
std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch);
std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch, BoardType::UNKNOWN);
umd_cluster = std::make_unique<Cluster>(sdesc_path, target_devices);
umd_cluster = nullptr;

Expand Down
42 changes: 35 additions & 7 deletions tests/api/test_core_coord_translation_bh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,19 +496,47 @@ TEST(CoordinateManager, CoordinateManagerBlackholeDRAMPMoreThanOneDRAMBankHarves
}

// Test that virtual and physical coordinates are the same for all logical PCIE coordinates.
TEST(CoordinateManager, CoordinateManagerBlackholePCIETranslation) {
TEST(CoordinateManager, CoordinateManagerBlackholePCIETranslationLocal) {
std::shared_ptr<CoordinateManager> coordinate_manager =
CoordinateManager::create_coordinate_manager(tt::ARCH::BLACKHOLE);
CoordinateManager::create_coordinate_manager(tt::ARCH::BLACKHOLE, 0, 0, 0, BoardType::P300, false);
const tt_xy_pair pcie_grid_size = tt::umd::blackhole::PCIE_GRID_SIZE;
const std::vector<tt_xy_pair> pcie_cores = tt::umd::blackhole::PCIE_CORES_TYPE2;

for (size_t x = 0; x < pcie_grid_size.x; x++) {
for (size_t y = 0; y < pcie_grid_size.y; y++) {
const CoreCoord arc_logical = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL);
const CoreCoord arc_virtual = coordinate_manager->translate_coord_to(arc_logical, CoordSystem::VIRTUAL);
const CoreCoord arc_physical = coordinate_manager->translate_coord_to(arc_logical, CoordSystem::PHYSICAL);
const CoreCoord pcie_logical = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL);
const CoreCoord pcie_virtual = coordinate_manager->translate_coord_to(pcie_logical, CoordSystem::VIRTUAL);
const CoreCoord pcie_physical = coordinate_manager->translate_coord_to(pcie_logical, CoordSystem::PHYSICAL);
const tt_xy_pair pcie_core = pcie_cores[y * pcie_grid_size.x + x];

EXPECT_EQ(arc_virtual.x, arc_physical.x);
EXPECT_EQ(arc_virtual.y, arc_physical.y);
EXPECT_EQ(pcie_virtual.x, pcie_physical.x);
EXPECT_EQ(pcie_virtual.y, pcie_physical.y);

EXPECT_EQ(pcie_core.x, pcie_physical.x);
EXPECT_EQ(pcie_core.y, pcie_physical.y);
}
}
}

// Remote chip on a P300 board: for every logical PCIE coordinate, the virtual and physical
// coordinates must agree with each other and with the matching entry of PCIE_CORES_TYPE1.
TEST(CoordinateManager, CoordinateManagerBlackholePCIETranslationRemote) {
    const std::shared_ptr<CoordinateManager> manager =
        CoordinateManager::create_coordinate_manager(tt::ARCH::BLACKHOLE, 0, 0, 0, BoardType::P300, true);
    const tt_xy_pair grid = tt::umd::blackhole::PCIE_GRID_SIZE;
    const std::vector<tt_xy_pair> expected_cores = tt::umd::blackhole::PCIE_CORES_TYPE1;

    for (size_t col = 0; col < grid.x; ++col) {
        for (size_t row = 0; row < grid.y; ++row) {
            const CoreCoord logical_coord = CoreCoord(col, row, CoreType::PCIE, CoordSystem::LOGICAL);
            const CoreCoord virtual_coord = manager->translate_coord_to(logical_coord, CoordSystem::VIRTUAL);
            const CoreCoord physical_coord = manager->translate_coord_to(logical_coord, CoordSystem::PHYSICAL);

            // Expected cores are indexed row-major over the PCIE grid.
            const size_t flat_index = row * grid.x + col;
            const tt_xy_pair expected = expected_cores[flat_index];

            EXPECT_EQ(virtual_coord.x, physical_coord.x);
            EXPECT_EQ(virtual_coord.y, physical_coord.y);

            EXPECT_EQ(expected.x, physical_coord.x);
            EXPECT_EQ(expected.y, physical_coord.y);
        }
    }
}
Expand Down
2 changes: 1 addition & 1 deletion tests/api/test_mockup_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ std::string get_soc_descriptor_file(tt::ARCH arch) {
case tt::ARCH::WORMHOLE_B0:
return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml");
case tt::ARCH::BLACKHOLE:
return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml");
return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_local.yaml");
case tt::ARCH::Invalid:
throw std::runtime_error("Invalid arch not supported");
default:
Expand Down
2 changes: 1 addition & 1 deletion tests/api/test_soc_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ TEST(SocDescriptor, SocDescriptorBlackholeETHHarvesting) {
}

tt_SocDescriptor soc_desc(
test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"), 0, 0, eth_harvesting_mask);
test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_local.yaml"), 0, 0, eth_harvesting_mask);

const std::vector<CoreCoord> eth_cores = soc_desc.get_cores(CoreType::ETH);

Expand Down
1 change: 0 additions & 1 deletion tests/blackhole/test_cluster_bh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,6 @@ TEST(SiliconDriverBH, RandomSysmemTestWithPcie) {
auto target_devices = get_target_devices();

Cluster cluster(
test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"),
target_devices,
num_channels,
false, // skip driver allocs - no (don't skip)
Expand Down
Loading
Loading