diff --git a/.github/workflows/on-pr-opt.yml b/.github/workflows/build-and-run-all-tests.yml similarity index 90% rename from .github/workflows/on-pr-opt.yml rename to .github/workflows/build-and-run-all-tests.yml index 18364734..ff75e6c2 100644 --- a/.github/workflows/on-pr-opt.yml +++ b/.github/workflows/build-and-run-all-tests.yml @@ -1,10 +1,12 @@ -# Optional PR checks -name: On PR - Optional +# Build and then run all tests, on all supported archs. +name: Build and run all tests on: workflow_dispatch: pull_request: branches: ["main"] + push: + branches: ["main"] jobs: build-tests: diff --git a/.github/workflows/build-clients.yml b/.github/workflows/build-clients.yml new file mode 100644 index 00000000..a2502367 --- /dev/null +++ b/.github/workflows/build-clients.yml @@ -0,0 +1,56 @@ +name: Build clients on newest UMD + +on: + workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 30 + pull_request: + branches: ["main"] + push: + branches: ["main"] + +jobs: + build-tt-metal: + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. 
+ timeout-minutes: ${{ fromJSON(inputs.timeout || '30') }} + strategy: + fail-fast: false + matrix: + arch_name: [grayskull, wormhole_b0, blackhole] + + name: Build tt-metal for ${{ matrix.arch_name }} with newest UMD + runs-on: ubuntu-20.04 + container: + image: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:latest + options: --user root + + steps: + - name: Checkout client repo + uses: actions/checkout@v4 + with: + # Clone under tt-metal directory + path: tt-metal + repository: tenstorrent/tt-metal + submodules: recursive + lfs: 'true' + + - name: Checkout UMD + uses: actions/checkout@v4 + with: + # Clone directly into tt-metal directory for umd + path: tt-metal/tt_metal/third_party/umd + submodules: recursive + lfs: 'true' + + - name: Build tt-metal + run: | + cd tt-metal + export ARCH_NAME=${{ matrix.arch_name }} + export TT_METAL_HOME=$(pwd) + export PYTHONPATH=$(pwd) + ./build_metal.sh diff --git a/.github/workflows/build-device.yml b/.github/workflows/build-device.yml index 5a8c0648..335dd2c0 100644 --- a/.github/workflows/build-device.yml +++ b/.github/workflows/build-device.yml @@ -1,19 +1,19 @@ # Builds device. # Build is performed on all supported OS versions. -name: Build Target +name: Build Device on: - workflow_call: - inputs: - timeout: - required: true - type: number workflow_dispatch: inputs: timeout: required: true - description: 'The timeout for the build job in minutes' + description: 'The timeout for the job in minutes' type: number + default: 15 + pull_request: + branches: ["main"] + push: + branches: ["main"] env: BUILD_TARGET: device @@ -25,7 +25,9 @@ env: jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. 
+ timeout-minutes: ${{ fromJSON(inputs.timeout || '15') }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 3c21f65d..5affd5c2 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -3,11 +3,17 @@ name: Build and Publish Docker Image on: workflow_dispatch: - workflow_call: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 15 jobs: build: - timeout-minutes: 15 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 5edd35eb..3916e4bf 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -33,10 +33,12 @@ env: LIB_OUTPUT_DIR: ./build/lib DEPS_OUTPUT_DIR: ./build/_deps TEST_OUTPUT_DIR: ./build/test + CLUSTER_DESCRIPTORS_DIR: ./tests/api/cluster_descriptor_examples jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: @@ -74,7 +76,8 @@ jobs: run: | tar cvf artifact.tar ${{ env.TEST_OUTPUT_DIR }} \ ${{ env.LIB_OUTPUT_DIR }} \ - ${{ env.DEPS_OUTPUT_DIR }} + ${{ env.DEPS_OUTPUT_DIR }} \ + ${{ env.CLUSTER_DESCRIPTORS_DIR }} - name: Upload build artifacts archive uses: actions/upload-artifact@v4 diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml deleted file mode 100644 index 158026dd..00000000 --- a/.github/workflows/on-pr.yml +++ /dev/null @@ -1,18 +0,0 @@ -# Mandatory PR checks -name: On PR - -on: - workflow_dispatch: - pull_request: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: 
./.github/workflows/pre-commit.yml diff --git a/.github/workflows/on-push.yml b/.github/workflows/on-push.yml deleted file mode 100644 index 673be510..00000000 --- a/.github/workflows/on-push.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: On Push - -on: - workflow_dispatch: - push: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: ./.github/workflows/pre-commit.yml - - build-tests: - secrets: inherit - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards with specified architecture. - {arch: grayskull}, - {arch: wormhole_b0}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/build-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - timeout: 15 - - test-all: - secrets: inherit - needs: build-tests - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards. - {arch: grayskull, card: e75, timeout: 10}, - {arch: grayskull, card: e150, timeout: 10}, - {arch: grayskull, card: e300, timeout: 10}, - {arch: wormhole_b0, card: n150, timeout: 5}, - {arch: wormhole_b0, card: n300, timeout: 15}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/run-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - card: ${{ matrix.test-group.card }} - timeout: ${{ matrix.test-group.timeout }} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a4ecb678..c4b2b9a0 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -2,8 +2,11 @@ name: Run Pre-commit Hooks on: - workflow_call: workflow_dispatch: + pull_request: + branches: ["main"] + push: + branches: ["main"] jobs: pre-commit: diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 98b2a526..e9d1e6b2 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -45,7 +45,8 @@ env: jobs: test: - 
timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/test-runner.yaml b/.github/workflows/test-runner.yaml index 74a4d6bf..c871c773 100644 --- a/.github/workflows/test-runner.yaml +++ b/.github/workflows/test-runner.yaml @@ -2,10 +2,17 @@ name: Check runner on: workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 10 jobs: check-runners-host: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: @@ -52,7 +59,8 @@ jobs: du -h --max-depth=1 | sort -rh check-runners-docker: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/common/disjoint_set.hpp b/common/disjoint_set.hpp new file mode 100644 index 00000000..b2187173 --- /dev/null +++ b/common/disjoint_set.hpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +// A standard disjoint set data structure to track connected components. 
+template +class DisjointSet { +public: + void add_item(T item) { parent[item] = item; } + + int get_set(T item) { + while (parent[item] != item) { + item = parent[item]; + } + return item; + } + + void merge(T item1, T item2) { + T set1 = get_set(item1); + T set2 = get_set(item2); + parent[set1] = set2; + } + + bool are_same_set(T item1, T item2) { return get_set(item1) == get_set(item2); } + + int get_num_sets() { + std::unordered_set sets; + for (auto [item, _] : parent) { + sets.insert(get_set(item)); + } + return sets.size(); + } + +private: + std::unordered_map parent; +}; diff --git a/common/utils.hpp b/common/utils.hpp new file mode 100644 index 00000000..b8cba9f5 --- /dev/null +++ b/common/utils.hpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +namespace tt::umd::utils { + +std::string get_abs_path(std::string path) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. 
+ std::filesystem::path current_file_path = std::filesystem::path(__FILE__); + std::filesystem::path umd_root; + if (current_file_path.is_absolute()) { + umd_root = current_file_path.parent_path().parent_path(); + } else { + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path(), "../"); + umd_root = std::filesystem::canonical(umd_root_relative); + } + std::filesystem::path abs_path = umd_root / path; + return abs_path.string(); +} + +} // namespace tt::umd::utils diff --git a/device/.clang-format b/device/.clang-format deleted file mode 100644 index 9d159247..00000000 --- a/device/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/device/api/umd/device/architecture_implementation.h b/device/api/umd/device/architecture_implementation.h index f157dc8e..60ce5368 100644 --- a/device/api/umd/device/architecture_implementation.h +++ b/device/api/umd/device/architecture_implementation.h @@ -12,8 +12,8 @@ #include #include "umd/device/tlb.h" -#include "umd/device/xy_pair.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/xy_pair.h" struct tt_driver_host_address_params; struct tt_driver_eth_interface_params; @@ -22,7 +22,7 @@ struct tt_driver_noc_params; namespace tt::umd { class architecture_implementation { - public: +public: virtual ~architecture_implementation() = default; virtual tt::ARCH get_architecture() const = 0; @@ -65,7 +65,8 @@ class architecture_implementation { virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0; virtual std::optional> describe_tlb(std::int32_t tlb_index) const = 0; - virtual std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0; + virtual std::pair get_tlb_data( + std::uint32_t tlb_index, const tlb_data& data) const = 0; virtual tt_driver_host_address_params 
get_host_address_params() const = 0; virtual tt_driver_eth_interface_params get_eth_interface_params() const = 0; diff --git a/device/api/umd/device/blackhole_implementation.h b/device/api/umd/device/blackhole_implementation.h index 9fdf819c..3ff3493b 100644 --- a/device/api/umd/device/blackhole_implementation.h +++ b/device/api/umd/device/blackhole_implementation.h @@ -7,10 +7,10 @@ #pragma once #include +#include #include "umd/device/architecture_implementation.h" #include "umd/device/tlb.h" -#include namespace tt::umd { @@ -59,30 +59,8 @@ enum class arc_message_type { // DEVICE_DATA static constexpr std::array DRAM_LOCATIONS = { - {{0, 0}, - {0, 1}, - {0, 11}, - {0, 2}, - {0, 10}, - {0, 3}, - {0, 9}, - {0, 4}, - {0, 8}, - {0, 5}, - {0, 7}, - {0, 6}, - {9, 0}, - {9, 1}, - {9, 11}, - {9, 2}, - {9, 10}, - {9, 3}, - {9, 9}, - {9, 4}, - {9, 8}, - {9, 5}, - {9, 7}, - {9, 6}}}; + {{0, 0}, {0, 1}, {0, 11}, {0, 2}, {0, 10}, {0, 3}, {0, 9}, {0, 4}, {0, 8}, {0, 5}, {0, 7}, {0, 6}, + {9, 0}, {9, 1}, {9, 11}, {9, 2}, {9, 10}, {9, 3}, {9, 9}, {9, 4}, {9, 8}, {9, 5}, {9, 7}, {9, 6}}}; static constexpr std::array ARC_LOCATIONS = {{{8, 0}}}; static constexpr std::array PCI_LOCATIONS = {{{11, 0}}}; @@ -113,14 +91,14 @@ static constexpr uint32_t BROADCAST_TLB_INDEX = 0; // TODO: Copied from worm static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_COUNT_2M = 202; -static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 +static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 static constexpr uint32_t TLB_BASE_INDEX_2M = 0; static constexpr uint32_t TLB_2M_SIZE = 2 * 1024 * 1024; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 12; static constexpr uint32_t TLB_COUNT_4G = 8; -static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 +static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 static constexpr uint32_t TLB_BASE_INDEX_4G = TLB_COUNT_2M; static constexpr uint64_t TLB_4G_SIZE = 4ULL * 1024ULL * 1024ULL * 1024ULL; static constexpr 
uint64_t DYNAMIC_TLB_4G_SIZE = TLB_4G_SIZE; @@ -168,59 +146,108 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97; } // namespace blackhole class blackhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(blackhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(blackhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(blackhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(blackhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(blackhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(blackhole::arc_message_type::TEST); } - uint32_t get_arc_csm_mailbox_offset() const override { throw std::runtime_error("Not supported for Blackhole arch"); return 0; } + + uint32_t get_arc_csm_mailbox_offset() const override { + throw std::runtime_error("Not supported for Blackhole arch"); + return 0; + } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return blackhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return blackhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return blackhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + 
uint32_t get_dram_channel_0_x() const override { return blackhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return blackhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return blackhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return blackhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return blackhole::DYNAMIC_TLB_2M_SIZE; } - uint32_t get_dynamic_tlb_16m_base() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_size() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_cfg_addr() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_dynamic_tlb_16m_base() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_size() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_mem_large_read_tlb() const override { return blackhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return blackhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return blackhole::STATIC_TLB_CFG_ADDR; } - uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + + uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; } - uint32_t get_tlb_base_index_16m() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_tlb_base_index_16m() const override { + throw 
std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_tensix_soft_reset_addr() const override { return blackhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return blackhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return blackhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return blackhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return blackhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return blackhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return blackhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -231,7 +258,6 @@ class blackhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index b387ea15..8e97fc79 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -8,21 +8,20 @@ #include #include #include +#include #include #include #include -#include -#include "umd/device/tt_soc_descriptor.h" -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_cluster_descriptor_types.h" +#include "fmt/core.h" +#include "tt_silicon_driver_common.hpp" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" +#include "umd/device/pci_device.hpp" #include "umd/device/tlb.h" +#include "umd/device/tt_cluster_descriptor_types.h" #include "umd/device/tt_io.hpp" -#include 
"umd/device/pci_device.hpp" -#include "fmt/core.h" - using TLB_DATA = tt::umd::tlb_data; // TODO: Remove this - it's here for Metal backwards compatibility. @@ -30,29 +29,32 @@ using TLB_DATA = tt::umd::tlb_data; tt::ARCH detect_arch(int pci_device_num); tt::ARCH detect_arch(); -namespace boost::interprocess{ - class named_mutex; +namespace boost::interprocess { +class named_mutex; } class tt_ClusterDescriptor; -enum tt_DevicePowerState { - BUSY, - SHORT_IDLE, - LONG_IDLE -}; +enum tt_DevicePowerState { BUSY, SHORT_IDLE, LONG_IDLE }; enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, }; -inline std::ostream &operator <<(std::ostream &os, const tt_DevicePowerState power_state) { +inline std::ostream& operator<<(std::ostream& os, const tt_DevicePowerState power_state) { switch (power_state) { - case tt_DevicePowerState::BUSY: os << "Busy"; break; - case tt_DevicePowerState::SHORT_IDLE: os << "SHORT_IDLE"; break; - case tt_DevicePowerState::LONG_IDLE: os << "LONG_IDLE"; break; - default: throw ("Unknown DevicePowerState"); + case tt_DevicePowerState::BUSY: + os << "Busy"; + break; + case tt_DevicePowerState::SHORT_IDLE: + os << "SHORT_IDLE"; + break; + case tt_DevicePowerState::LONG_IDLE: + os << "LONG_IDLE"; + break; + default: + throw("Unknown DevicePowerState"); } return os; } @@ -116,20 +118,22 @@ struct tt_version { std::uint16_t major = 0xffff; std::uint8_t minor = 0xff; std::uint8_t patch = 0xff; + tt_version() {} + tt_version(std::uint16_t major_, std::uint8_t minor_, std::uint8_t patch_) { major = major_; minor = minor_; patch = patch_; } + tt_version(std::uint32_t version) { major = (version >> 16) & 0xff; minor = (version >> 12) & 0xf; patch = version & 0xfff; } - std::string str() const { - return fmt::format("{}.{}.{}", major, minor, patch); - } + + std::string str() const { return fmt::format("{}.{}.{}", major, minor, patch); } }; struct tt_device_params { @@ -140,29 +144,32 @@ struct tt_device_params { bool init_device = true; bool early_open_device = 
false; int aiclk = 0; + // The command-line input for vcd_dump_cores can have the following format: // {"*-2", "1-*", "*-*", "1-2"} // '*' indicates we must dump all the cores in that dimension. // This function takes the vector above and unrolles the coords with '*' in one or both dimensions. std::vector unroll_vcd_dump_cores(tt_xy_pair grid_size) const { std::vector unrolled_dump_core; - for (auto &dump_core: vcd_dump_cores) { + for (auto& dump_core : vcd_dump_cores) { // If the input is a single *, then dump all cores. if (dump_core == "*") { for (size_t x = 0; x < grid_size.x; x++) { - for (size_t y = 0; y < grid_size.y; y++) { - std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { - unrolled_dump_core.push_back(current_core_coord); + for (size_t y = 0; y < grid_size.y; y++) { + std::string current_core_coord = fmt::format("{}-{}", x, y); + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { + unrolled_dump_core.push_back(current_core_coord); + } } } - } continue; } // Each core coordinate must contain three characters: "core.x-core.y". assert(dump_core.size() <= 5); size_t delimiter_pos = dump_core.find('-'); - assert (delimiter_pos != std::string::npos); // y-dim should exist in core coord. + assert(delimiter_pos != std::string::npos); // y-dim should exist in core coord. 
std::string core_dim_x = dump_core.substr(0, delimiter_pos); size_t core_dim_y_start = delimiter_pos + 1; @@ -172,7 +179,9 @@ struct tt_device_params { for (size_t x = 0; x < grid_size.x; x++) { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -180,14 +189,16 @@ struct tt_device_params { } else if (core_dim_x == "*") { for (size_t x = 0; x < grid_size.x; x++) { std::string current_core_coord = fmt::format("{}-{}", x, core_dim_y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } } else if (core_dim_y == "*") { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", core_dim_x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -199,10 +210,9 @@ struct tt_device_params { } std::vector expand_plusargs() const { - std::vector all_plusargs { + std::vector all_plusargs{ fmt::format("+enable_perf_scoreboard={}", enable_perf_scoreboard), - fmt::format("+register_monitor={}", register_monitor) - }; + fmt::format("+register_monitor={}", register_monitor)}; all_plusargs.insert(all_plusargs.end(), plusargs.begin(), 
plusargs.end()); @@ -216,18 +226,18 @@ struct tt_device_params { * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. - */ -class tt_device -{ - public: - tt_device(const std::string& sdesc_path); + */ +class tt_device { +public: + tt_device(); virtual ~tt_device(); + // Setup/Teardown Functions /** * Set L1 Address Map parameters used by UMD to communicate with the TT Device. * * @param l1_address_params_ All the L1 parameters required by UMD - */ + */ virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { throw std::runtime_error("---- tt_device::set_device_l1_address_params is not implemented\n"); } @@ -240,9 +250,9 @@ class tt_device * Set Host Address Map parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param host_address_params_ All the Host Address space parameters required by UMD. - */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_host_address_params( + const tt_driver_host_address_params& host_address_params_) { throw std::runtime_error("---- tt_device::set_driver_host_address_params is not implemented\n"); } @@ -250,9 +260,9 @@ class tt_device * Set ERISC Firmware parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param eth_interface_params_ All the Ethernet Firmware parameters required by UMD. 
- */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_eth_interface_params( + const tt_driver_eth_interface_params& eth_interface_params_) { throw std::runtime_error("---- tt_device::set_driver_eth_interface_params is not implemented\n"); } @@ -264,8 +274,13 @@ class tt_device * @param tlb_index TLB id that will be programmed. * @param address Start address TLB is mapped to. * @param ordering Ordering mode for the TLB. - */ - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Relaxed) { + */ + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Relaxed) { throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); } @@ -274,45 +289,51 @@ class tt_device * * @param fallback_tlb Dynamic TLB being targeted. * @param ordering Ordering mode for the TLB. - */ + */ virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted) { throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n"); } - + /** - * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). + * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per + * core). * * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. 
*/ - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } /** - * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to use a subset of cores from the active_eth_cores_per_chip set for all host->cluster - * non-MMIO transfers. If this function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). - * If default behaviour is not desired, this function must be called for all MMIO devices. + * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to + * use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this + * function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). If + * default behaviour is not desired, this function must be called for all MMIO devices. * * @param mmio_chip Device being targeted. * @param active_eth_cores_per_chip The active ethernet cores for this chip. 
*/ - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { - throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + throw std::runtime_error( + "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. + * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize + * iATUs for PCIe devices and ethernet queues for remote chips. * * @param device_params Object specifying initialization configuration. */ - virtual void start_device(const tt_device_params &device_params) { + virtual void start_device(const tt_device_params& device_params) { throw std::runtime_error("---- tt_device::start_device is not implemented\n"); } /** * Broadcast deassert soft Tensix Reset to the entire device (to be done after start_device is called). - */ + */ virtual void deassert_risc_reset() { throw std::runtime_error("---- tt_device::deassert_risc_reset is not implemented\n"); } @@ -321,14 +342,15 @@ class tt_device * Send a soft deassert reset signal to a single tensix core. * * @param core Chip and core being targeted. 
- */ - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) { + */ + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) { throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); } /** * Broadcast assert soft Tensix Reset to the entire device. - */ + */ virtual void assert_risc_reset() { throw std::runtime_error("---- tt_device::assert_risc_reset is not implemented\n"); } @@ -337,7 +359,7 @@ class tt_device * Send a soft assert reset signal to a single tensix core. * * @param core Chip and core being targeted. - */ + */ virtual void assert_risc_reset_at_core(tt_cxy_pair core) { throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); } @@ -345,17 +367,15 @@ class tt_device /** * To be called at the end of a run. * Set power state to idle, assert tensix reset at all cores. - */ - virtual void close_device() { - throw std::runtime_error("---- tt_device::close_device is not implemented\n"); - } + */ + virtual void close_device() { throw std::runtime_error("---- tt_device::close_device is not implemented\n"); } // Runtime functions /** * Non-MMIO (ethernet) barrier. - * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding with the next one. - * This will be applied to all chips in the cluster. - */ + * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding + * with the next one. This will be applied to all chips in the cluster. + */ virtual void wait_for_non_mmio_flush() { throw std::runtime_error("---- tt_device::wait_for_non_mmio_flush is not implemented\n"); } @@ -377,12 +397,20 @@ class tt_device * @param addr Address to write to. * @param tlb_to_use Specifies fallback/dynamic TLB to use. 
*/ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb) { + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb) { throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } @@ -395,44 +423,54 @@ class tt_device * @param size Number of bytes to read. * @param fallback_tlb Specifies fallback/dynamic TLB to use. */ - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); } /** * Write uint32_t vector to specified address and channel on host (defined for Silicon). - * + * * @param vec Data to write. * @param addr Address to write to. * @param channel Host channel to target. * @param src_device_id Chip to target. 
*/ - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::write_to_sysmem is not implemented\n"); } - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { + + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::read_from_sysmem is not implemented\n"); } - virtual void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::l1_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } // Misc. Functions to Query/Set Device State /** - * Query post harvesting SOC descriptors from UMD in virtual coordinates. + * Query post harvesting SOC descriptors from UMD in virtual coordinates. * These descriptors should be used for looking up cores that are passed into UMD APIs. 
*/ virtual std::unordered_map& get_virtual_soc_descriptors() { throw std::runtime_error("---- tt_device:get_virtual_soc_descriptors is not implemented\n"); } - + /** * Determine if UMD performed harvesting on SOC descriptors. */ @@ -440,18 +478,18 @@ class tt_device throw std::runtime_error("---- tt_device:using_harvested_soc_descriptors is not implemented\n"); return 0; } - + /** * Get harvesting masks for all chips/SOC Descriptors in the cluster. * Each mask represents a map of enabled (0) and disabled (1) rows on a specific chip (in NOC0 Coordinateds). - */ + */ virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } /** * Issue message to device, meant to be picked up by ARC firmware. - * + * * @param logical_device_id Chip to target. * @param msg_code Specifies type of ARC message. * @param wait_for_done Block until ARC responds. @@ -460,8 +498,16 @@ class tt_device * @param timeout Timeout on ARC. * @param return3 Return value from ARC. * @param return4 Return value from ARC. - */ - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr) { + */ + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr) { throw std::runtime_error("---- tt_device::arc_msg is not implemented\n"); } @@ -471,28 +517,28 @@ class tt_device * @param device_id Chip to target. * @param r Row coordinate. * @param c Column coordinate. 
- */ - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { + */ + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { throw std::runtime_error("---- tt_device::translate_to_noc_table_coords is not implemented\n"); } /** * Get the total number of chips in the cluster based on the network descriptor. - */ + */ virtual int get_number_of_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_number_of_chips_in_cluster is not implemented\n"); } /** * Get the logical ids for all chips in the cluster - */ + */ virtual std::unordered_set get_all_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_all_chips_in_cluster is not implemented\n"); } /** * Get cluster descriptor object being used in UMD instance. - */ + */ virtual tt_ClusterDescriptor* get_cluster_description() { throw std::runtime_error("---- tt_device::get_cluster_description is not implemented\n"); } @@ -514,9 +560,9 @@ class tt_device /** * Get clock frequencies for all MMIO devices targeted by UMD. */ - virtual std::map get_clocks() { + virtual std::map get_clocks() { throw std::runtime_error("---- tt_device::get_clocks is not implemented\n"); - return std::map(); + return std::map(); } virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { @@ -534,7 +580,7 @@ class tt_device * Query number of DRAM channels on a specific device. * * @param device_id Logical device id to query. - */ + */ virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_dram_channels is not implemented\n"); return 0; @@ -542,10 +588,10 @@ class tt_device /** * Get size for a specific DRAM channel on a device. - * + * * @param device_id Device to target. * @param channel DRAM channel to target. 
- */ + */ virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_dram_channel_size is not implemented\n"); return 0; @@ -555,7 +601,7 @@ class tt_device * Query number of Host channels (hugepages) allocated for a specific device. * * @param device_id Logical device id to target. - */ + */ virtual std::uint32_t get_num_host_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_host_channels is not implemented\n"); return 0; @@ -566,20 +612,21 @@ class tt_device * * @param device_id Logical device id to target. * @param channel Logical host channel to target. - */ + */ virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_host_channel_size is not implemented\n"); return 0; } /** - * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific device. - * + * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific + * device. + * * @param offset Offset wrt the start of the channel's address space. - * @param src_device_id Device to target. + * @param src_device_id Device to target. * @param channel Host memory channel. 
 */ - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { throw std::runtime_error("---- tt_device::host_dma_address is not implemented\n"); return nullptr; } @@ -588,28 +635,28 @@ class tt_device throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n"); return 0; } + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - protected: +protected: std::unordered_map soc_descriptor_per_chip = {}; }; namespace tt::umd { /** -* Silicon Driver Class, derived from the tt_device class + * Cluster class, derived from the tt_device class * Implements APIs to communicate with a physical Tenstorrent Device. -*/ -class Cluster: public tt_device -{ - public: + */ +class Cluster : public tt_device { +public: // Constructor /** - * Silicon Driver constructor. + * Cluster constructor. * * @param sdesc_path SOC descriptor specifying single chip. * @param ndesc_path Network Descriptor specifying the network topology of the system. @@ -619,43 +666,106 @@ class Cluster: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
* @param simulated_harvesting_masks - */ - Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); - - //Setup/Teardown Functions + */ + Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + /** + * Cluster constructor. This constructor should be used to work towards removing all + * of the params from the constructor of tt_SiliconDevice (to become Cluster). + * + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. + * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. + * @param simulated_harvesting_masks + */ + Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + /** + * Cluster constructor. This constructor should be used to target specific devices in a cluster. + * + * @param target_devices Devices to target. + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. + * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
+ * @param simulated_harvesting_masks + */ + Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); - virtual void start_device(const tt_device_params &device_params); + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void 
deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); - void 
l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // These functions are used by Debuda, so make them public - void bar_write32 (int logical_device_id, uint32_t addr, uint32_t data); - uint32_t bar_read32 (int logical_device_id, uint32_t addr); + void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); + uint32_t bar_read32(int logical_device_id, uint32_t addr); /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size @@ -673,16 +783,24 @@ class Cluster: public tt_device * - the mapping is unchanged during the lifetime of the returned object. * - the Cluster instance outlives the returned object. * - use of the returned object is congruent with the target's TLB setup. - * + * * @param target The target chip and core to write to. */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); // Misc. 
Functions to Query/Set Device State - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); @@ -690,13 +808,16 @@ class Cluster: public tt_device static std::vector detect_available_device_ids(); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; - static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); - static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); + static std::vector extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows); + static void remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& 
row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); - static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); + static std::unordered_map create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -705,72 +826,154 @@ class Cluster: public tt_device virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; // TODO: This should be accessible through public API, probably to be moved to tt_device. - PCIDevice *get_pci_device(int device_id) const; + PCIDevice* get_pci_device(int device_id) const; // Destructor - virtual ~Cluster (); + virtual ~Cluster(); - private: +private: // Helper functions // Startup + teardown - void create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources); + void create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources); void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &cores); - void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); - void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); - void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, 
const TensixSoftResetOptions &soft_resets); + void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores); + void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets); + void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); + void send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); // No more p2p support. + void init_pcie_iatus(); // No more p2p support. void check_pcie_device_initialized(int device_id); void set_pcie_power_state(tt_DevicePowerState state); - int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); + int set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); uint32_t get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); void deassert_resets_and_set_power_state(); - int iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); - uint32_t get_harvested_noc_rows (uint32_t harvesting_mask); - uint32_t get_harvested_rows (int logical_device_id); + int iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); + uint32_t get_harvested_noc_rows(uint32_t harvesting_mask); + uint32_t get_harvested_rows(int logical_device_id); int get_clock(int logical_device_id); // Communication Functions - void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t 
src_device_id); - void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); - void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); - void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); - void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); + void read_buffer( + void* mem_ptr, + std::uint32_t address, + std::uint16_t channel, + std::uint32_t size_in_bytes, + chip_id_t src_device_id); + void write_buffer( + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb); + void write_to_non_mmio_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast = false, + std::vector broadcast_header = {}); + void read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); - void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb); - void 
ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords); - void set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb); - void insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb); + void read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb); + void ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords); + void set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb); + void insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb); void init_membars(); uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset); uint16_t get_sys_rack(uint32_t rack_x, uint32_t rack_y); bool is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr); - int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, 
uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); + int pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + int remote_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + bool address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); - virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips - void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); - std::unordered_map>>& get_ethernet_broadcast_headers(const std::set& chips_to_exclude); + virtual uint32_t get_harvested_noc_rows_for_chip( + int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips + void generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude); + std::unordered_map>>& get_ethernet_broadcast_headers( + const std::set& chips_to_exclude); // Test functions void verify_eth_fw(); - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int 
test_setup_interface (); + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions); + int test_setup_interface(); // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush. void wait_for_connected_non_mmio_flush(chip_id_t chip_id); + void construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks); + // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; @@ -781,22 +984,24 @@ class Cluster: public tt_device std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; tt::ARCH arch_name; - std::unordered_map> m_pci_device_map; // Map of enabled pci devices - int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) + std::unordered_map> m_pci_device_map; // Map of enabled pci devices + int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // remote eth transfer setup static constexpr std::uint32_t NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 6; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 4; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_START_ID = 0; - static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); - static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = 
(EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = + NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = + NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - std::vector< std::vector > remote_transfer_ethernet_cores; + std::vector> remote_transfer_ethernet_cores; std::unordered_map flush_non_mmio_per_chip = {}; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; @@ -821,7 +1026,7 @@ class Cluster: public tt_device bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; bool use_virtual_coords_for_eth_broadcast = true; - tt_version eth_fw_version; // Ethernet FW the driver is interfacing with + tt_version eth_fw_version; // Ethernet FW the driver is interfacing with // Named Mutexes static constexpr char NON_MMIO_MUTEX_NAME[] = "NON_MMIO"; static constexpr char ARC_MSG_MUTEX_NAME[] = "ARC_MSG"; @@ -830,13 +1035,13 @@ class Cluster: public tt_device static constexpr std::uint32_t SW_VERSION = 0x06060000; }; -} +} // namespace tt::umd -constexpr inline bool operator==(const tt_version &a, const tt_version &b) { +constexpr inline bool operator==(const tt_version& a, const tt_version& b) { return a.major == b.major && a.minor == b.minor && a.patch == b.patch; } -constexpr inline bool operator>=(const tt_version &a, const tt_version &b) { +constexpr inline bool operator>=(const tt_version& a, const tt_version& b) { bool fw_major_greater = a.major > b.major; bool fw_minor_greater = (a.major == b.major) && (a.minor > b.minor); bool patch_greater_or_equal = (a.major == b.major) && (a.minor == b.minor) && (a.patch >= b.patch); diff --git 
a/device/api/umd/device/coordinate_manager.h b/device/api/umd/device/coordinate_manager.h index e24e370b..bf98be70 100644 --- a/device/api/umd/device/coordinate_manager.h +++ b/device/api/umd/device/coordinate_manager.h @@ -7,17 +7,17 @@ #pragma once #include -#include #include +#include -#include "umd/device/tt_xy_pair.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/tt_xy_pair.h" class CoordinateManager { - public: - CoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} + CoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} virtual void perform_harvesting(std::size_t harvesting_mask); @@ -49,14 +49,17 @@ class CoordinateManager { protected: virtual void clear_harvesting_structures(); - + virtual std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask); virtual std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask); virtual void fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested); + virtual void fill_logical_to_virtual_mapping( const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); - virtual void fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); std::map physical_y_to_logical_y; std::map physical_x_to_logical_x; diff --git a/device/api/umd/device/driver_atomics.h b/device/api/umd/device/driver_atomics.h index ec213438..4ed3e7a6 100644 --- a/device/api/umd/device/driver_atomics.h +++ b/device/api/umd/device/driver_atomics.h @@ -12,54 +12,44 @@ namespace 
tt_driver_atomics { #if defined(__x86_64__) || defined(__i386__) // Store-Any barrier. -static inline __attribute__((always_inline)) void sfence() { - _mm_sfence(); -} +static inline __attribute__((always_inline)) void sfence() { _mm_sfence(); } + // Load-Any barrier. -static inline __attribute__((always_inline)) void lfence() { - _mm_lfence(); -} +static inline __attribute__((always_inline)) void lfence() { _mm_lfence(); } + // Any-Any barrier. -static inline __attribute__((always_inline)) void mfence() { - _mm_mfence(); -} +static inline __attribute__((always_inline)) void mfence() { _mm_mfence(); } #elif defined(__ARM_ARCH) static inline __attribute__((always_inline)) void sfence() { // Full memory barrier (full system). ARM does not have a Store-Any barrier. // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } static inline __attribute__((always_inline)) void lfence() { // Load-Any barrier (full system) // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB LD" : : : "memory"); + asm volatile("DMB LD" : : : "memory"); } static inline __attribute__((always_inline)) void mfence() { // Full memory barrier (full system). 
// https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } #elif defined(__riscv) -static inline __attribute__((always_inline)) void sfence() { - asm volatile ("fence ow, ow" : : : "memory"); -} +static inline __attribute__((always_inline)) void sfence() { asm volatile("fence ow, ow" : : : "memory"); } -static inline __attribute__((always_inline)) void lfence() { - asm volatile ("fence ir, ir" : : : "memory"); -} +static inline __attribute__((always_inline)) void lfence() { asm volatile("fence ir, ir" : : : "memory"); } -static inline __attribute__((always_inline)) void mfence() { - asm volatile ("fence iorw, iorw" : : : "memory"); -} +static inline __attribute__((always_inline)) void mfence() { asm volatile("fence iorw, iorw" : : : "memory"); } #else #error "Unsupported architecture" #endif -} // namespace tt_driver_atomics +} // namespace tt_driver_atomics diff --git a/device/api/umd/device/grayskull_implementation.h b/device/api/umd/device/grayskull_implementation.h index 8f476ade..0a93e9b2 100644 --- a/device/api/umd/device/grayskull_implementation.h +++ b/device/api/umd/device/grayskull_implementation.h @@ -104,7 +104,8 @@ enum class arc_message_type { }; // DEVICE_DATA -static const std::array DRAM_LOCATIONS = {{{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; +static const std::array DRAM_LOCATIONS = { + {{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; static const std::array ARC_LOCATIONS = {{{0, 2}}}; static const std::array PCI_LOCATIONS = {{{0, 4}}}; static const std::array ETH_LOCATIONS = {}; @@ -134,7 +135,8 @@ static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 8; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static 
constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -171,59 +173,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace grayskull class grayskull_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(grayskull::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(grayskull::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(grayskull::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(grayskull::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(grayskull::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(grayskull::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return grayskull::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return grayskull::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return grayskull::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return 
grayskull::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return grayskull::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return grayskull::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return grayskull::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return grayskull::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return grayskull::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return grayskull::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return grayskull::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return grayskull::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return grayskull::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return grayskull::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return grayskull::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return grayskull::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return grayskull::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return grayskull::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return grayskull::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return grayskull::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return grayskull::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return 
grayskull::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -234,7 +270,6 @@ class grayskull_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/hugepage.h b/device/api/umd/device/hugepage.h index 18840ec5..1bf37dac 100644 --- a/device/api/umd/device/hugepage.h +++ b/device/api/umd/device/hugepage.h @@ -6,10 +6,10 @@ #pragma once -#include "umd/device/tt_cluster_descriptor_types.h" - -#include #include +#include + +#include "umd/device/tt_cluster_descriptor_types.h" namespace tt::umd { @@ -17,7 +17,8 @@ namespace tt::umd { uint32_t get_num_hugepages(); // Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch. -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize); @@ -27,4 +28,4 @@ std::string find_hugepage_dir(std::size_t pagesize); // Today we assume there's only one pipeline running within the system. // One hugepage per device such that each device gets unique memory. 
int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel); -} +} // namespace tt::umd diff --git a/device/api/umd/device/pci_device.hpp b/device/api/umd/device/pci_device.hpp index c0c2c20d..914663d8 100644 --- a/device/api/umd/device/pci_device.hpp +++ b/device/api/umd/device/pci_device.hpp @@ -12,28 +12,30 @@ #include #include -#include "umd/device/tt_xy_pair.h" +#include "umd/device/tlb.h" #include "umd/device/tt_arch_types.h" #include "umd/device/tt_cluster_descriptor_types.h" -#include "umd/device/tlb.h" +#include "umd/device/tt_xy_pair.h" // TODO: this is used up in cluster.cpp but that logic ought to be // lowered into the PCIDevice class since it is specific to PCIe cards. // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; -// TODO: this is a bit of a hack... something to revisit when we formalize an +// TODO: this is a bit of a hack... something to revisit when we formalize an // abstraction for IO. // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; constexpr unsigned int c_hang_read_value = 0xffffffffu; -namespace tt::umd { class architecture_implementation; } +namespace tt::umd { +class architecture_implementation; +} struct dynamic_tlb { - uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. - uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. + uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. + uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. 
}; struct hugepage_mapping { @@ -42,8 +44,7 @@ struct hugepage_mapping { uint64_t physical_address = 0; }; -struct PciDeviceInfo -{ +struct PciDeviceInfo { uint16_t vendor_id; uint16_t device_id; uint16_t pci_domain; @@ -57,14 +58,14 @@ struct PciDeviceInfo }; class PCIDevice { - const std::string device_path; // Path to character device: /dev/tenstorrent/N - const int pci_device_num; // N in /dev/tenstorrent/N - const int logical_id; // Unique identifier for each device in entire network topology - const int pci_device_file_desc; // Character device file descriptor - const PciDeviceInfo info; // PCI device info - const int numa_node; // -1 if non-NUMA - const int revision; // PCI revision value from sysfs - const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole + const std::string device_path; // Path to character device: /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N + const int logical_id; // Unique identifier for each device in entire network topology + const int pci_device_file_desc; // Character device file descriptor + const PciDeviceInfo info; // PCI device info + const int numa_node; // -1 if non-NUMA + const int revision; // PCI revision value from sysfs + const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole std::unique_ptr architecture_implementation; public: @@ -83,7 +84,7 @@ class PCIDevice { * * Opens the character device file descriptor, reads device information from * sysfs, and maps device memory region(s) into the process address space. 
- * + * * @param pci_device_number N in /dev/tenstorrent/N * @param logical_device_id unique identifier for this device in the network topology */ @@ -95,8 +96,8 @@ class PCIDevice { */ ~PCIDevice(); - PCIDevice(const PCIDevice&) = delete; // copy - void operator=(const PCIDevice&) = delete; // copy assignment + PCIDevice(const PCIDevice &) = delete; // copy + void operator=(const PCIDevice &) = delete; // copy assignment /** * @return PCI device info @@ -155,21 +156,39 @@ class PCIDevice { // NOC endpoints. Probably worth waiting for the KMD to start owning the // resource management aspect of these PCIe->NOC mappings (the "TLBs") // before doing too much work here... - void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); - void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr); void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); // TLB related functions. // TODO: These are architecture specific, and will be moved out of the class. 
- void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - dynamic_tlb set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - - tt::umd::architecture_implementation* get_architecture_implementation() const; + void write_tlb_reg( + uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + dynamic_tlb set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + + tt::umd::architecture_implementation *get_architecture_implementation() const; void detect_hang_read(uint32_t data_read = c_hang_read_value); // TODO: this also probably has more sense to live in the future TTDevice class. @@ -197,8 +216,8 @@ class PCIDevice { // and simplify the code. 
void *system_reg_mapping = nullptr; size_t system_reg_mapping_size; - uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. uint32_t read_checking_offset; @@ -206,7 +225,7 @@ class PCIDevice { bool is_hardware_hung(); template - T* get_register_address(uint32_t register_offset); + T *get_register_address(uint32_t register_offset); // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""); diff --git a/device/api/umd/device/tlb.h b/device/api/umd/device/tlb.h index 3e8fb826..30094202 100644 --- a/device/api/umd/device/tlb.h +++ b/device/api/umd/device/tlb.h @@ -8,8 +8,8 @@ #include #include -#include #include +#include namespace tt::umd { @@ -41,10 +41,10 @@ struct tlb_data { // Orderings static constexpr uint64_t Relaxed = 0; - static constexpr uint64_t Strict = 1; - static constexpr uint64_t Posted = 2; + static constexpr uint64_t Strict = 1; + static constexpr uint64_t Posted = 2; - bool check(const tlb_offsets & offset) const; + bool check(const tlb_offsets &offset) const; std::pair apply_offset(const tlb_offsets &offset) const; }; diff --git a/device/api/umd/device/tt_arch_types.h b/device/api/umd/device/tt_arch_types.h index 8a7c5dba..c165bf1b 100644 --- a/device/api/umd/device/tt_arch_types.h +++ b/device/api/umd/device/tt_arch_types.h @@ -17,4 +17,4 @@ enum class ARCH { BLACKHOLE = 3, Invalid = 0xFF, }; -} +} // namespace tt diff --git a/device/api/umd/device/tt_cluster_descriptor.h b/device/api/umd/device/tt_cluster_descriptor.h index 85d62c33..c39bdd93 100644 --- a/device/api/umd/device/tt_cluster_descriptor.h +++ b/device/api/umd/device/tt_cluster_descriptor.h @@ -4,23 
+4,24 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once -#include "umd/device/tt_xy_pair.h" - #include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include #include -#include + #include "umd/device/tt_cluster_descriptor_types.h" +#include "umd/device/tt_xy_pair.h" -namespace YAML { class Node; } +namespace YAML { +class Node; +} enum BoardType : uint32_t { N150 = 0, @@ -32,88 +33,93 @@ enum BoardType : uint32_t { }; class tt_ClusterDescriptor { - - private: - int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; - - protected: - - std::unordered_map > > ethernet_connections; - std::unordered_map chip_locations; - // reverse map: rack/shelf/y/x -> chip_id - std::map > > > coords_to_chip_ids; - std::unordered_map chips_with_mmio; - std::unordered_set all_chips; - std::unordered_map noc_translation_enabled = {}; - std::unordered_map harvesting_masks = {}; - std::unordered_set enabled_active_chips; - std::unordered_map closest_mmio_chip_cache = {}; - std::unordered_map chip_board_type = {}; - std::unordered_map> chips_grouped_by_closest_mmio; - - // one-to-many chip connections - struct Chip2ChipConnection { - eth_coord_t source_chip_coord; - std::unordered_set destination_chip_coords; - }; - - // shelf_id -> y dim -> list of chip2chip connections between different shelves - // assumption is that on every row of the shelf there is a chip that is connected to the other shelf - // there could be one-to-many connections between shelves, i.e. 
one chip is connected to multiple chips on the other shelf (in case of nebula->galaxy) - std::unordered_map > galaxy_shelves_exit_chip_coords_per_y_dim = {}; - // rack_id -> x dim -> list of chip2chip connections between different racks - // assumption is that on every row of the rack there is a chip that is connected to the other rack - std::unordered_map > galaxy_racks_exit_chip_coords_per_x_dim = {}; - - static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); - - void fill_chips_grouped_by_closest_mmio(); - - public: - tt_ClusterDescriptor() = default; - tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default; - - /* - * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument ordering when calling the function - * An empty result implies that the two chips do not share any direct connection - */ - std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - - bool is_chip_mmio_capable(const chip_id_t chip_id) const; - bool is_chip_remote(const chip_id_t chip_id) const; - chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); - chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); - - // TODO: These following functions will be removed, and ClusterDescriptor will be created without any parameters. - // get_cluster_descriptor_file_path will create ethernet map in the background. - static std::string get_cluster_descriptor_file_path(); - static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); - - // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. 
- // The name of the function is kept to not gate the changes regarding create-ethernet-map. - // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. - static std::unique_ptr create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids); - - const std::unordered_map& get_harvesting_info() const; - const std::unordered_map& get_noc_translation_table_en() const; - const std::unordered_map& get_chip_locations() const; - const std::unordered_map > > get_ethernet_connections() const; - const std::unordered_map get_chips_with_mmio() const; - const std::unordered_set& get_all_chips() const; - const std::unordered_map>& get_chips_grouped_by_closest_mmio() const; - std::size_t get_number_of_chips() const; - - int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; - - BoardType get_board_type(chip_id_t chip_id) const; - - bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - - void enable_all_devices(); - +private: + int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; + +protected: + std::unordered_map>> + ethernet_connections; + std::unordered_map chip_locations; + // reverse map: rack/shelf/y/x -> chip_id + std::map>>> coords_to_chip_ids; + std::unordered_map chips_with_mmio; + std::unordered_set all_chips; + std::unordered_map noc_translation_enabled = {}; + std::unordered_map harvesting_masks = {}; + std::unordered_set enabled_active_chips; + std::unordered_map closest_mmio_chip_cache = {}; + std::unordered_map chip_board_type = {}; + std::unordered_map> chips_grouped_by_closest_mmio; + + // one-to-many chip connections + struct Chip2ChipConnection { + eth_coord_t source_chip_coord; + 
std::unordered_set destination_chip_coords; + }; + + // shelf_id -> y dim -> list of chip2chip connections between different shelves + // assumption is that on every row of the shelf there is a chip that is connected to the other shelf + // there could be one-to-many connections between shelves, i.e. one chip is connected to multiple chips on the other + // shelf (in case of nebula->galaxy) + std::unordered_map> galaxy_shelves_exit_chip_coords_per_y_dim = + {}; + // rack_id -> x dim -> list of chip2chip connections between different racks + // assumption is that on every row of the rack there is a chip that is connected to the other rack + std::unordered_map> galaxy_racks_exit_chip_coords_per_x_dim = {}; + + static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void fill_galaxy_connections(tt_ClusterDescriptor &desc); + static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void merge_cluster_ids(tt_ClusterDescriptor &desc); + static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); + + void fill_chips_grouped_by_closest_mmio(); + +public: + tt_ClusterDescriptor() = default; + tt_ClusterDescriptor(const tt_ClusterDescriptor &) = default; + + /* + * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument + * ordering when calling the function An empty result implies that the two chips do not share any direct connection + */ + std::vector> + get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; + + bool is_chip_mmio_capable(const chip_id_t chip_id) const; + bool is_chip_remote(const chip_id_t chip_id) const; + chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); + chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); + + // TODO: These following functions will be removed, and 
ClusterDescriptor will be created without any parameters. + // get_cluster_descriptor_file_path will create ethernet map in the background. + static std::string get_cluster_descriptor_file_path(); + static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); + + // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. + // The name of the function is kept to not gate the changes regarding create-ethernet-map. + // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. + static std::unique_ptr create_for_grayskull_cluster( + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); + + const std::unordered_map &get_harvesting_info() const; + const std::unordered_map &get_noc_translation_table_en() const; + const std::unordered_map &get_chip_locations() const; + const std:: + unordered_map>> + get_ethernet_connections() const; + const std::unordered_map get_chips_with_mmio() const; + const std::unordered_set &get_all_chips() const; + const std::unordered_map> &get_chips_grouped_by_closest_mmio() const; + std::size_t get_number_of_chips() const; + + int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; + + BoardType get_board_type(chip_id_t chip_id) const; + + bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + std::tuple get_chip_and_channel_of_remote_ethernet_core( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + + void enable_all_devices(); }; diff --git a/device/api/umd/device/tt_cluster_descriptor_types.h b/device/api/umd/device/tt_cluster_descriptor_types.h index 142c9fef..81b652f5 100644 --- a/device/api/umd/device/tt_cluster_descriptor_types.h +++ b/device/api/umd/device/tt_cluster_descriptor_types.h @@ -4,25 +4,47 @@ * SPDX-License-Identifier: Apache-2.0 */ -#pragma once +#pragma once #include 
#include using chip_id_t = int; using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf + +struct eth_coord_t { + int cluster_id; // This is the same for connected chips. + int x; + int y; + int rack; + int shelf; + + // in C++20 this should be defined as: + // constexpr bool operator==(const eth_coord_t &other) const noexcept = default; + constexpr bool operator==(const eth_coord_t &other) const noexcept { + return ( + cluster_id == other.cluster_id and x == other.x and y == other.y and rack == other.rack and + shelf == other.shelf); + } +}; + +// Small performant hash combiner taken from boost library. +// Not using boost::hash_combine due to dependency complications. +inline void boost_hash_combine(std::size_t &seed, const int value) { + seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} namespace std { template <> struct hash { - std::size_t operator()(eth_coord_t const &c) const { - std::size_t seed = 0; - seed = std::hash()(std::get<0>(c)) << 48 | - std::hash()(std::get<1>(c)) << 32 | - std::hash()(std::get<2>(c)) << 16 | - std::hash()(std::get<3>(c)); - return seed; - } + std::size_t operator()(eth_coord_t const &c) const { + std::size_t seed = 0; + boost_hash_combine(seed, c.cluster_id); + boost_hash_combine(seed, c.x); + boost_hash_combine(seed, c.y); + boost_hash_combine(seed, c.rack); + boost_hash_combine(seed, c.shelf); + return seed; + } }; -} +} // namespace std diff --git a/device/api/umd/device/tt_io.hpp b/device/api/umd/device/tt_io.hpp index 8d0203e3..174903cb 100644 --- a/device/api/umd/device/tt_io.hpp +++ b/device/api/umd/device/tt_io.hpp @@ -11,7 +11,7 @@ namespace tt { namespace umd { - class Cluster; +class Cluster; } /** @@ -22,20 +22,18 @@ namespace umd { * * It is the caller's responsibility to manage the lifetime of Writer objects. */ -class Writer -{ +class Writer { friend class tt::umd::Cluster; public: /** * @brief Write to a SoC core. 
- * + * * @param address must be aligned to the size of T - * @param value + * @param value */ template - void write(uint32_t address, T value) - { + void write(uint32_t address, T value) { auto dst = reinterpret_cast(base) + address; if (address >= tlb_size) { @@ -46,27 +44,23 @@ class Writer throw std::runtime_error("Unaligned write"); } - *reinterpret_cast(dst) = value; + *reinterpret_cast(dst) = value; } private: /** * @brief tt::umd::Cluster interface to construct a new Writer object. - * + * * @param base pointer to the base address of a mapped TLB. * @param tlb_size size of the mapped TLB. */ - Writer(void *base, size_t tlb_size) - : base(base) - , tlb_size(tlb_size) - { + Writer(void *base, size_t tlb_size) : base(base), tlb_size(tlb_size) { assert(base); assert(tlb_size > 0); } - void *base{ nullptr }; - size_t tlb_size{ 0 }; + void *base{nullptr}; + size_t tlb_size{0}; }; - -} // namespace tt +} // namespace tt diff --git a/device/api/umd/device/tt_silicon_driver_common.hpp b/device/api/umd/device/tt_silicon_driver_common.hpp index 9f275668..6dc6d7f4 100644 --- a/device/api/umd/device/tt_silicon_driver_common.hpp +++ b/device/api/umd/device/tt_silicon_driver_common.hpp @@ -9,53 +9,42 @@ #include #include -enum class TensixSoftResetOptions: std::uint32_t { +enum class TensixSoftResetOptions : std::uint32_t { NONE = 0, - BRISC = ((std::uint32_t) 1 << 11), - TRISC0 = ((std::uint32_t) 1 << 12), - TRISC1 = ((std::uint32_t) 1 << 13), - TRISC2 = ((std::uint32_t) 1 << 14), - NCRISC = ((std::uint32_t) 1 << 18), - STAGGERED_START = ((std::uint32_t) 1 << 31) + BRISC = ((std::uint32_t)1 << 11), + TRISC0 = ((std::uint32_t)1 << 12), + TRISC1 = ((std::uint32_t)1 << 13), + TRISC2 = ((std::uint32_t)1 << 14), + NCRISC = ((std::uint32_t)1 << 18), + STAGGERED_START = ((std::uint32_t)1 << 31) }; std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value); + constexpr TensixSoftResetOptions operator|(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - 
return static_cast( - static_cast(lhs) | - static_cast(rhs) - ); + return static_cast(static_cast(lhs) | static_cast(rhs)); } constexpr TensixSoftResetOptions operator&(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) & - static_cast(rhs) - ); + return static_cast(static_cast(lhs) & static_cast(rhs)); } constexpr bool operator!=(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return - static_cast(lhs) != - static_cast(rhs); + return static_cast(lhs) != static_cast(rhs); } -static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = TensixSoftResetOptions::TRISC0 | - TensixSoftResetOptions::TRISC1 | - TensixSoftResetOptions::TRISC2; +static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = + TensixSoftResetOptions::TRISC0 | TensixSoftResetOptions::TRISC1 | TensixSoftResetOptions::TRISC2; -static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - TensixSoftResetOptions::STAGGERED_START | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | TensixSoftResetOptions::STAGGERED_START | + ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET | - TensixSoftResetOptions::STAGGERED_START; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET | TensixSoftResetOptions::STAGGERED_START; -static constexpr TensixSoftResetOptions 
TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; diff --git a/device/api/umd/device/tt_simulation_device.h b/device/api/umd/device/tt_simulation_device.h index 955dd288..9b4778aa 100644 --- a/device/api/umd/device/tt_simulation_device.h +++ b/device/api/umd/device/tt_simulation_device.h @@ -13,43 +13,49 @@ #include "umd/device/cluster.h" #include "umd/device/tt_simulation_host.hpp" -class tt_SimulationDevice: public tt_device { - public: - tt_SimulationDevice(const std::string &sdesc_path); +class tt_SimulationDevice : public tt_device { +public: + tt_SimulationDevice(const std::string& sdesc_path); ~tt_SimulationDevice(); tt_SimulationHost host; - //Setup/Teardown Functions + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void start_device(const tt_device_params &device_params); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void 
*mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // Misc. 
Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); static std::vector detect_available_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -57,7 +63,7 @@ class tt_SimulationDevice: public tt_device { virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - private: +private: // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; diff --git a/device/api/umd/device/tt_simulation_host.hpp b/device/api/umd/device/tt_simulation_host.hpp index 636c51e8..2db54394 100644 --- a/device/api/umd/device/tt_simulation_host.hpp +++ b/device/api/umd/device/tt_simulation_host.hpp @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include #include #include +#include #include "umd/device/tt_xy_pair.h" @@ -20,6 +20,7 @@ class tt_SimulationHost { void start_host(); void send_to_device(uint8_t *buf, size_t buf_size); size_t recv_from_device(void **data_ptr); + private: std::unique_ptr host_socket; std::unique_ptr host_dialer; diff --git a/device/api/umd/device/tt_soc_descriptor.h b/device/api/umd/device/tt_soc_descriptor.h index 3a284d0a..e59d4416 100644 --- a/device/api/umd/device/tt_soc_descriptor.h +++ b/device/api/umd/device/tt_soc_descriptor.h @@ -7,29 +7,26 @@ #pragma once #include -#include +#include +#include #include +#include #include #include -#include -#include -#include - -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_arch_types.h" - -#include "umd/device/coordinate_manager.h" - #include "fmt/core.h" +#include "tt_xy_pair.h" +#include "umd/device/coordinate_manager.h" +#include "umd/device/tt_arch_types.h" +#include "umd/device/tt_xy_pair.h" namespace YAML { - class Node; +class Node; } std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); -static inline std::string get_arch_str(const tt::ARCH arch_name){ +static inline std::string get_arch_str(const tt::ARCH arch_name) { std::string arch_name_str; if (arch_name == tt::ARCH::GRAYSKULL) { @@ -45,16 +42,18 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){ return arch_name_str; } -static inline tt::ARCH get_arch_name(const std::string &arch_str){ +static inline tt::ARCH get_arch_name(const std::string &arch_str) { tt::ARCH arch; if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { arch = tt::ARCH::GRAYSKULL; - } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ + } else if ( + (arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || + (arch_str == "WORMHOLE_B0")) { arch = tt::ARCH::WORMHOLE_B0; - } else if ((arch_str == 
"blackhole") || (arch_str == "BLACKHOLE")){ + } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) { arch = tt::ARCH::BLACKHOLE; - }else { + } else { throw std::runtime_error( fmt::format("At LoadSocDescriptorFromYaml: \"{}\" is not recognized as tt::ARCH.", arch_str)); } @@ -69,13 +68,13 @@ tt_xy_pair format_node(std::string str); //! SocCore type enumerations /*! Superset for all chip generations */ enum class CoreType { - ARC, - DRAM, - ETH, - PCIE, - WORKER, - HARVESTED, - ROUTER_ONLY, + ARC, + DRAM, + ETH, + PCIE, + WORKER, + HARVESTED, + ROUTER_ONLY, }; @@ -84,10 +83,10 @@ enum class CoreType { Should only contain relevant configuration for SOC */ struct CoreDescriptor { - tt_xy_pair coord = tt_xy_pair(0, 0); - CoreType type; + tt_xy_pair coord = tt_xy_pair(0, 0); + CoreType type; - std::size_t l1_size = 0; + std::size_t l1_size = 0; }; //! tt_SocDescriptor contains information regarding the SOC configuration targetted. @@ -95,7 +94,6 @@ struct CoreDescriptor { Should only contain relevant configuration for SOC */ class tt_SocDescriptor { - public: tt::ARCH arch; tt_xy_pair grid_size; @@ -110,13 +108,15 @@ class tt_SocDescriptor { std::unordered_map worker_log_to_routing_y; std::unordered_map routing_x_to_worker_x; std::unordered_map routing_y_to_worker_y; - std::vector> dram_cores; // per channel list of dram cores + std::vector> dram_cores; // per channel list of dram cores std::unordered_map> dram_core_channel_map; // map dram core to chan/subchan - std::vector ethernet_cores; // ethernet cores (index == channel id) - std::unordered_map ethernet_core_channel_map; + std::vector ethernet_cores; // ethernet cores (index == channel id) + std::unordered_map ethernet_core_channel_map; std::vector trisc_sizes; // Most of software stack assumes same trisc size for whole chip.. 
std::string device_descriptor_file_path = std::string(""); + bool has(tt_xy_pair input) { return cores.find(input) != cores.end(); } + int overlay_version; int unpacker_version; int dst_size_alignment; @@ -129,15 +129,15 @@ class tt_SocDescriptor { int get_num_dram_channels() const; bool is_worker_core(const tt_xy_pair &core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - bool is_ethernet_core(const tt_xy_pair& core) const; + bool is_ethernet_core(const tt_xy_pair &core) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; - // Constructor used to build object from device descriptor file. + // Constructor used to build object from device descriptor file. tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask = 0); // Copy constructor - tt_SocDescriptor(const tt_SocDescriptor& other) : + tt_SocDescriptor(const tt_SocDescriptor &other) : arch(other.arch), grid_size(other.grid_size), physical_grid_size(other.physical_grid_size), @@ -167,7 +167,7 @@ class tt_SocDescriptor { dram_bank_size(other.dram_bank_size) { coordinate_manager.reset(new CoordinateManager(*other.coordinate_manager)); } - + // Coordinate conversions. // Conversions from logical coordinates should be used just for worker cores. @@ -189,11 +189,14 @@ class tt_SocDescriptor { void perform_harvesting(std::size_t harvesting_mask); + static std::string get_soc_descriptor_path(tt::ARCH arch); + private: - std::unique_ptr coordinate_manager = nullptr; void create_coordinate_manager(std::size_t harvesting_mask); void load_core_descriptors_from_device_descriptor(YAML::Node &device_descriptor_yaml); void load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml); + + std::unique_ptr coordinate_manager = nullptr; }; // Allocates a new soc descriptor on the heap. Returns an owning pointer. 
diff --git a/device/api/umd/device/tt_xy_pair.h b/device/api/umd/device/tt_xy_pair.h index b86d568e..9375182f 100644 --- a/device/api/umd/device/tt_xy_pair.h +++ b/device/api/umd/device/tt_xy_pair.h @@ -15,44 +15,56 @@ using tt_cxy_pair = tt::umd::cxy_pair; struct tt_physical_coords : public tt_xy_pair { tt_physical_coords() : tt_xy_pair() {} + tt_physical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_physical_coords : public tt_cxy_pair { tt_chip_physical_coords() : tt_cxy_pair() {} + tt_chip_physical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_physical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_logical_coords : public tt_xy_pair { tt_logical_coords() : tt_xy_pair() {} + tt_logical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_logical_coords : public tt_cxy_pair { tt_chip_logical_coords() : tt_cxy_pair() {} + tt_chip_logical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_logical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_virtual_coords : public tt_xy_pair { tt_virtual_coords() : tt_xy_pair() {} + tt_virtual_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_virtual_coords : public tt_cxy_pair { tt_chip_virtual_coords() : tt_cxy_pair() {} + tt_chip_virtual_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_virtual_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_translated_coords : public tt_xy_pair { tt_translated_coords() : tt_xy_pair() {} + tt_translated_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_translated_coords : public tt_cxy_pair { tt_chip_translated_coords() : tt_cxy_pair() {} + tt_chip_translated_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + 
tt_chip_translated_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; diff --git a/device/api/umd/device/wormhole_implementation.h b/device/api/umd/device/wormhole_implementation.h index 7bef1e9e..3dfebb96 100644 --- a/device/api/umd/device/wormhole_implementation.h +++ b/device/api/umd/device/wormhole_implementation.h @@ -167,7 +167,8 @@ static constexpr uint32_t TLB_BASE_INDEX_16M = TLB_BASE_INDEX_2M + TLB_COUNT_2M; static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -205,59 +206,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace wormhole class wormhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(wormhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(wormhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(wormhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return 
static_cast(wormhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(wormhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(wormhole::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return wormhole::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return wormhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return wormhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return wormhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return wormhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return wormhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return wormhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return wormhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return wormhole::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return wormhole::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return wormhole::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return wormhole::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return wormhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return wormhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; } + uint32_t get_tlb_base_index_16m() const 
override { return wormhole::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return wormhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return wormhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return wormhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return wormhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return wormhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return wormhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return wormhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -268,7 +303,6 @@ class wormhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/xy_pair.h b/device/api/umd/device/xy_pair.h index ca717052..b989b31e 100644 --- a/device/api/umd/device/xy_pair.h +++ b/device/api/umd/device/xy_pair.h @@ -12,6 +12,7 @@ namespace tt::umd { struct xy_pair { constexpr xy_pair() : x{}, y{} {} + constexpr xy_pair(std::size_t x, std::size_t y) : x(x), y(y) {} std::size_t x; @@ -30,7 +31,9 @@ constexpr inline bool operator<(const xy_pair &left, const xy_pair &right) { struct cxy_pair : public xy_pair { cxy_pair() : xy_pair{}, chip{} {} + cxy_pair(std::size_t ichip, xy_pair pair) : xy_pair(pair.x, pair.y), chip(ichip) {} + cxy_pair(std::size_t ichip, std::size_t x, std::size_t y) : xy_pair(x, y), chip(ichip) {} std::size_t chip; diff --git a/device/architecture_implementation.cpp 
b/device/architecture_implementation.cpp index 4b7b7faf..dc0c0b00 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -12,10 +12,14 @@ namespace tt::umd { std::unique_ptr architecture_implementation::create(tt::ARCH architecture) { switch (architecture) { - case tt::ARCH::BLACKHOLE: return std::make_unique(); - case tt::ARCH::GRAYSKULL: return std::make_unique(); - case tt::ARCH::WORMHOLE_B0: return std::make_unique(); - default: return nullptr; + case tt::ARCH::BLACKHOLE: + return std::make_unique(); + case tt::ARCH::GRAYSKULL: + return std::make_unique(); + case tt::ARCH::WORMHOLE_B0: + return std::make_unique(); + default: + return nullptr; } } diff --git a/device/blackhole/blackhole_coordinate_manager.h b/device/blackhole/blackhole_coordinate_manager.h index 7491f272..9a00b46d 100644 --- a/device/blackhole/blackhole_coordinate_manager.h +++ b/device/blackhole/blackhole_coordinate_manager.h @@ -9,15 +9,15 @@ #include "umd/device/coordinate_manager.h" class BlackholeCoordinateManager : public CoordinateManager { - public: - BlackholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + BlackholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask) override; }; diff --git a/device/blackhole/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp index 0421e2d1..14de739e 100644 --- a/device/blackhole/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ 
-4,13 +4,12 @@ #include "umd/device/blackhole_implementation.h" -#include "blackhole/host_mem_address_map.h" #include "blackhole/eth_interface.h" - +#include "blackhole/host_mem_address_map.h" #include "umd/device/cluster.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH namespace tt::umd { @@ -26,10 +25,9 @@ std::tuple blackhole_implementation::multicast_workaround(xy_p } tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_index) const { - // If TLB index is in range for 4GB tlbs (8 TLBs after 202 TLBs for 2MB) if (tlb_index >= blackhole::TLB_COUNT_2M && tlb_index < blackhole::TLB_COUNT_2M + blackhole::TLB_COUNT_4G) { - return tlb_configuration { + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_4G_SIZE, .base = blackhole::DYNAMIC_TLB_4G_BASE, .cfg_addr = blackhole::DYNAMIC_TLB_4G_CFG_ADDR, @@ -37,7 +35,7 @@ tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_i .offset = blackhole::TLB_4G_OFFSET, }; } - + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_2M_SIZE, .base = blackhole::DYNAMIC_TLB_2M_BASE, @@ -73,17 +71,17 @@ std::optional> blackhole_implementation std::pair blackhole_implementation::get_tlb_data( std::uint32_t tlb_index, const tlb_data& data) const { - if (tlb_index < blackhole::TLB_COUNT_2M) { return data.apply_offset(blackhole::TLB_2M_OFFSET); } else { throw std::runtime_error("Invalid TLB index for Blackhole arch"); } - } tt_driver_host_address_params blackhole_implementation::get_host_address_params() const { - return {::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, 
::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params blackhole_implementation::get_eth_interface_params() const { diff --git a/device/cluster.cpp b/device/cluster.cpp index be574bfe..7163223d 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -3,61 +3,59 @@ // SPDX-License-Identifier: Apache-2.0 #include "umd/device/cluster.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include -#include #include - +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include #include #include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include -#include "yaml-cpp/yaml.h" #include "logger.hpp" - -#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/architecture_implementation.h" #include "umd/device/driver_atomics.h" #include "umd/device/hugepage.h" -#include "umd/device/architecture_implementation.h" #include "umd/device/tlb.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "yaml-cpp/yaml.h" using namespace boost::interprocess; using namespace tt; using namespace tt::umd; - static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; -static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) +// Remove 256MB from full 1GB for channel 3 (iATU limitation) +static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // TODO: Remove in favor of cluster descriptor method, when it 
becomes available. // Metal uses this function to determine the architecture of the first PCIe chip @@ -93,7 +91,7 @@ tt::ARCH detect_arch() { } template -void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -103,11 +101,9 @@ void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes // TODO: To be removed when tt_device is removed -tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) { -} +tt_device::tt_device() : soc_descriptor_per_chip({}) {} -tt_device::~tt_device() { -} +tt_device::~tt_device() {} const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); @@ -117,12 +113,12 @@ const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- -#include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_xy_pair.h" -#include #include #include +#include +#include "umd/device/tt_silicon_driver_common.hpp" +#include "umd/device/tt_xy_pair.h" struct routing_cmd_t { uint64_t sys_addr; @@ -131,49 +127,53 @@ struct routing_cmd_t { uint16_t rack; uint16_t src_resp_buf_index; uint32_t local_buf_index; - uint8_t src_resp_q_id; - uint8_t host_mem_txn_id; + uint8_t src_resp_q_id; + uint8_t host_mem_txn_id; uint16_t padding; - uint32_t src_addr_tag; //upper 32-bits of request source address. + uint32_t src_addr_tag; // upper 32-bits of request source address. 
}; -struct remote_update_ptr_t{ - uint32_t ptr; - uint32_t pad[3]; +struct remote_update_ptr_t { + uint32_t ptr; + uint32_t pad[3]; }; namespace { - struct tt_4_byte_aligned_buffer { - // Stores a 4 byte aligned buffer - // If the input buffer is already 4 byte aligned, this is a nop - std::uint32_t* local_storage = nullptr; - std::uint32_t input_size = 0; - std::uint32_t block_size = 0; - - tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { - input_size = size_in_bytes; - local_storage = (uint32_t*)mem_ptr; - uint32_t alignment_mask = sizeof(uint32_t) - 1; - uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; +struct tt_4_byte_aligned_buffer { + // Stores a 4 byte aligned buffer + // If the input buffer is already 4 byte aligned, this is a nop + std::uint32_t* local_storage = nullptr; + std::uint32_t input_size = 0; + std::uint32_t block_size = 0; - if(size_in_bytes < aligned_size) { - local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; - } - block_size = aligned_size; + tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { + input_size = size_in_bytes; + local_storage = (uint32_t*)mem_ptr; + uint32_t alignment_mask = sizeof(uint32_t) - 1; + uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; + + if (size_in_bytes < aligned_size) { + local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; } + block_size = aligned_size; + } - ~tt_4_byte_aligned_buffer() { - if(block_size > input_size) { - delete [] local_storage; - } + ~tt_4_byte_aligned_buffer() { + if (block_size > input_size) { + delete[] local_storage; } - }; -} + } +}; +} // namespace namespace tt::umd { -bool Cluster::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - return ((tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && address >= tlb_config_map.at(chip).at(tlb_index) && (address + size_in_bytes <= 
tlb_config_map.at(chip).at(tlb_index) + tlb_size)); +bool Cluster::address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { + return ( + (tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && + address >= tlb_config_map.at(chip).at(tlb_index) && + (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); } std::unordered_map& Cluster::get_virtual_soc_descriptors() { @@ -181,10 +181,10 @@ std::unordered_map& Cluster::get_virtual_soc_descri } void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) { - // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here (during device init) - // since its unsafe to modify shared state during multithreaded runtime. - // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible for initializing the driver with this - // field set to cleanup after an aborted process. + // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here + // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm + // is tied to clean_system_resources from the constructor. The main process is responsible for initializing the + // driver with this field set to cleanup after an aborted process. 
// Store old mask and clear processes umask auto old_umask = umask(0); @@ -193,236 +193,292 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for(auto &tlb : dynamic_tlb_config) { + for (auto& tlb : dynamic_tlb_config) { mutex_name = tlb.first + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); } // Initialize ARC core mutex mutex_name = fmt::format("ARC_MSG{}", pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); if (arch_name == tt::ARCH::WORMHOLE_B0) { mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id); - // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for + // ethernet broadcast + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, 
mutex_name.c_str(), unrestricted_permissions); } // Initialize interprocess mutexes to make host -> device memory barriers atomic mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); - + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Restore old mask umask(old_umask); } -void Cluster::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { +void Cluster::create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources) { log_debug(LogSiliconDriver, "Cluster::Cluster"); // Don't buffer stdout. setbuf(stdout, NULL); - // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to use available devices. + // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to + // use available devices. 
auto logical_to_physical_device_id_map = ndesc->get_chips_with_mmio(); - log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); + log_assert( + target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); - for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); + for (const chip_id_t& logical_device_id : target_mmio_device_ids) { + log_assert( + logical_to_physical_device_id_map.count(logical_device_id) != 0, + "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", + logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); if (!m_pci_device_map.count(logical_device_id)) { - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + log_debug( + LogSiliconDriver, + "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", + pci_interface_id, + logical_device_id); + m_pci_device_map.insert( + {logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); } auto dev = m_pci_device_map.at(logical_device_id).get(); uint16_t pcie_device_id = dev->get_pci_device_id(); uint32_t pcie_revision = dev->get_pci_revision(); // TODO: get rid of this, it doesn't make any sense. 
- int num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); + int num_host_mem_channels = + get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); if (dev->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. - log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); + log_warning( + LogSiliconDriver, + "Forcing a single channel for Blackhole device. Multiple host channels not supported."); num_host_mem_channels = 1; } - log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})", - num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id); + log_debug( + LogSiliconDriver, + "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} " + "device_id: 0x{:x} revision: {})", + num_host_mem_channels, + logical_device_id, + pci_interface_id, + pci_device->get_device_num(), + pci_device->revision_id); initialize_interprocess_mutexes(pci_interface_id, clean_system_resources); // MT: Initial BH - hugepages will fail init // For using silicon driver without workload to query mission mode params, no need for hugepage. - if (!skip_driver_allocs){ + if (!skip_driver_allocs) { // TODO: Implement support for multiple host channels on BLACKHOLE. 
- log_assert(!(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); - bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Same number of host channels per device for now + log_assert( + !(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), + "More channels are not yet supported for Blackhole"); + // Same number of host channels per device for now + bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Large writes to remote chips require hugepages to be initialized. - // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused if using remote only for small transactions) - if(target_remote_chips.size()) { - log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); + // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused + // if using remote only for small transactions) + if (target_remote_chips.size()) { + log_assert( + hugepages_initialized, + "Hugepages must be successfully initialized if workload contains remote chips!"); } if (not m_pci_device_map.at(logical_device_id)->get_hugepage_mapping(0).mapping) { log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id); } } - harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map + // translation layer for harvested coords. 
Default is identity map + harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); } - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { // Initialize identity mapping for Non-MMIO chips as well - if(!ndesc -> is_chip_mmio_capable(chip)) { + if (!ndesc->is_chip_mmio_capable(chip)) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, true)}); flush_non_mmio_per_chip[chip] = false; } } } -bool Cluster::using_harvested_soc_descriptors() { - return perform_harvesting_on_sdesc && performed_harvesting; -} +bool Cluster::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } std::unordered_map Cluster::get_harvested_coord_translation_map(chip_id_t logical_device_id) { return harvested_coord_translation.at(logical_device_id); } std::unordered_map Cluster::get_harvesting_masks_for_soc_descriptors() { - if(using_harvested_soc_descriptors()) { + if (using_harvested_soc_descriptors()) { return harvested_rows_per_target; } std::unordered_map default_harvesting_masks = {}; - for(const auto chip : target_devices_in_cluster) default_harvesting_masks.insert({chip, 0}); + for (const auto chip : target_devices_in_cluster) { + default_harvesting_masks.insert({chip, 0}); + } return default_harvesting_masks; } -Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device(sdesc_path) { +void Cluster::construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map 
simulated_harvesting_masks) { std::unordered_set target_mmio_device_ids; - target_devices_in_cluster = target_devices; - arch_name = tt_SocDescriptor(sdesc_path).arch; - perform_harvesting_on_sdesc = perform_harvesting; - - auto available_device_ids = detect_available_device_ids(); - m_num_pci_devices = available_device_ids.size(); - - if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); - log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); - } - - std::string cluster_descriptor_path = ndesc_path; - if (cluster_descriptor_path == "") { - cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - - ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); - - for (auto &d: target_devices){ - if (ndesc->is_chip_mmio_capable(d)){ + for (auto& d : target_devices_in_cluster) { + log_assert( + ndesc->get_all_chips().find(d) != ndesc->get_all_chips().end(), + "Target device {} not present in current cluster!", + d); + if (ndesc->is_chip_mmio_capable(d)) { target_mmio_device_ids.insert(d); - } - else { + } else { target_remote_chips.insert(d); } } - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes. + // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and + // writes. 
auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); + dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - for(const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH + // All dynamic TLBs use Relaxed Ordering by default + for (const auto& tlb : dynamic_tlb_config) { + dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); } create_device(target_mmio_device_ids, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware - if(arch_name == tt::ARCH::BLACKHOLE) { + if (arch_name == tt::ARCH::BLACKHOLE) { use_ethernet_ordered_writes = false; use_ethernet_broadcast = false; use_virtual_coords_for_eth_broadcast = false; } - if(arch_name == tt::ARCH::WORMHOLE_B0) { - const auto& harvesting_masks = ndesc -> get_harvesting_info(); - const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en(); + if (arch_name == tt::ARCH::WORMHOLE_B0) { + const auto& harvesting_masks = ndesc->get_harvesting_info(); + const auto& noc_translation_enabled = ndesc->get_noc_translation_table_en(); translation_tables_en = false; - for(auto& masks : harvesting_masks) { - if(target_devices.find(masks.first) != target_devices.end()) { + for (auto& masks : harvesting_masks) { + if (target_devices_in_cluster.find(masks.first) != target_devices_in_cluster.end()) { harvested_rows_per_target[masks.first] = 
get_harvested_noc_rows(masks.second); noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first); num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()}); - if(harvested_rows_per_target[masks.first]) { + if (harvested_rows_per_target[masks.first]) { performed_harvesting = true; } } } - if(noc_translation_enabled_for_chip.size() > 0) { - auto const consistent_translation_table_state = [&] (std::pair const& i) { - return noc_translation_enabled_for_chip.begin() -> second == i.second; + if (noc_translation_enabled_for_chip.size() > 0) { + auto const consistent_translation_table_state = [&](std::pair const& i) { + return noc_translation_enabled_for_chip.begin()->second == i.second; }; - bool translation_tables_match_on_all_chips = std::all_of(noc_translation_enabled_for_chip.begin(), noc_translation_enabled_for_chip.end(), consistent_translation_table_state); - log_assert(translation_tables_match_on_all_chips, "Cluster uses NOC translation tables inconsistently across chips."); - translation_tables_en = noc_translation_enabled_for_chip.begin() -> second; + bool translation_tables_match_on_all_chips = std::all_of( + noc_translation_enabled_for_chip.begin(), + noc_translation_enabled_for_chip.end(), + consistent_translation_table_state); + log_assert( + translation_tables_match_on_all_chips, + "Cluster uses NOC translation tables inconsistently across chips."); + translation_tables_en = noc_translation_enabled_for_chip.begin()->second; } - if(translation_tables_en) { + if (translation_tables_en) { harvested_coord_translation.clear(); - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, false)}); } } - log_assert(performed_harvesting ? 
translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); - } - else if(arch_name == tt::ARCH::BLACKHOLE) { + log_assert( + performed_harvesting ? translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Default harvesting info for Blackhole, describing no harvesting - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = 0; //get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = 0; // get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } - } - else if(arch_name == tt::ARCH::GRAYSKULL) { + } else if (arch_name == tt::ARCH::GRAYSKULL) { // Multichip harvesting is supported for GS. - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. 
- if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } } - if(simulated_harvesting_masks.size()) { + if (simulated_harvesting_masks.size()) { performed_harvesting = true; - for (auto device_id = target_devices.begin(); device_id != target_devices.end(); device_id++) { - log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id); - if(arch_name == tt::ARCH::GRAYSKULL) { - if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) { - log_warning(LogSiliconDriver, - "Simulated harvesting config for device {} does not include the actual harvesting config. Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - *device_id, harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); + for (auto device_id = target_devices_in_cluster.begin(); device_id != target_devices_in_cluster.end(); + device_id++) { + log_assert( + simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), + "Could not find harvesting mask for device_id {}", + *device_id); + if (arch_name == tt::ARCH::GRAYSKULL) { + if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != + harvested_rows_per_target[*device_id]) { + log_warning( + LogSiliconDriver, + "Simulated harvesting config for device {} does not include the actual harvesting config. 
" + "Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : " + "{} Simulated Harvested Rows : {}", + *device_id, + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); } simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id]; - } - else if(arch_name == tt::ARCH::WORMHOLE_B0) { - log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(), - "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); - num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); - log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { + log_assert( + std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= + std::bitset<32>(harvested_rows_per_target[*device_id]).count(), + "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. " + "Actual Harvested Rows : {} Simulated Harvested Rows : {}", + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); + num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); + log_assert( + performed_harvesting ? 
translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); } harvested_rows_per_target[*device_id] = simulated_harvesting_masks.at(*device_id); } @@ -432,18 +488,18 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c populate_cores(); // MT: Initial BH - skip this for BH - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size()); - for (const auto &logical_mmio_chip_id : target_mmio_device_ids) { + for (const auto& logical_mmio_chip_id : target_mmio_device_ids) { const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id); // 4-5 is for send_epoch_commands, 0-3 are for everything else for (std::uint32_t i = 0; i < NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS; i++) { - if(remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { + if (remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { remote_transfer_ethernet_cores.resize(logical_mmio_chip_id + 1); } - remote_transfer_ethernet_cores.at(logical_mmio_chip_id).push_back( - tt_cxy_pair(logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y) - ); + remote_transfer_ethernet_cores.at(logical_mmio_chip_id) + .push_back(tt_cxy_pair( + logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y)); } } } @@ -456,20 +512,164 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c // Default initialize noc_params based on detected arch noc_params = architecture_implementation->get_noc_params(); +} + +Cluster::Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + 
m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device(physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + std::set target_devices; + for (const chip_id_t& d : ndesc->get_all_chips()) { + target_devices.insert(d); + } + target_devices_in_cluster = target_devices; + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device(physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if 
(!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + target_devices_in_cluster = target_devices; + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + target_devices_in_cluster = target_devices; + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? 
"s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string cluster_descriptor_path = ndesc_path; + if (cluster_descriptor_path == "") { + cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + } + + ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); } -void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { +void Cluster::configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { // Makes UMD aware of which ethernet cores have active links. // Based on this information, UMD determines which ethernet cores can be used for host->cluster non-MMIO transfers. - // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be called for all MMIO devices, if default behaviour - // is not desired. - log_assert(get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, "{} can only be called for Wormhole arch", __FUNCTION__); + // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be + // called for all MMIO devices, if default behaviour is not desired. 
+ log_assert( + get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, + "{} can only be called for Wormhole arch", + __FUNCTION__); auto& eth_cores = get_soc_descriptor(mmio_chip).ethernet_cores; // Cores 0, 1, 6, 7 are only available if in the active set - static std::unordered_set eth_cores_available_if_active = {eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; + static std::unordered_set eth_cores_available_if_active = { + eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; // Eth cores 8 and 9 are always available - std::vector non_mmio_access_cores_for_chip = {tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; + std::vector non_mmio_access_cores_for_chip = { + tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; for (const auto& active_eth_core : active_eth_cores_per_chip) { if (eth_cores_available_if_active.find(active_eth_core) != eth_cores_available_if_active.end()) { non_mmio_access_cores_for_chip.push_back(tt_cxy_pair(mmio_chip, active_eth_core)); @@ -483,27 +683,33 @@ void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chi void Cluster::populate_cores() { std::uint32_t count = 0; - for(const auto chip : soc_descriptor_per_chip) { - workers_per_chip.insert({chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); - if(count == 0) { - eth_cores = std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); - for(std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { - dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)) ; + for (const auto chip : soc_descriptor_per_chip) { + workers_per_chip.insert( + {chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); + if (count == 0) { + eth_cores = + std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); + 
for (std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { + dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)); } } count++; } } -std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { +std::vector Cluster::extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH - log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); + log_assert( + !((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), + "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); std::vector row_coordinates_to_remove; int row_coordinate = 0; int tmp = harvested_rows; while (tmp) { - if (tmp & 1) + if (tmp & 1) { row_coordinates_to_remove.push_back(row_coordinate); + } tmp = tmp >> 1; row_coordinate++; @@ -517,13 +723,14 @@ std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int return row_coordinates_to_remove; } -void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { +void Cluster::remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { std::vector workers_to_keep; - for(auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++){ - if(find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == row_coordinates_to_remove.end()){ + for (auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++) { + if (find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == + row_coordinates_to_remove.end()) { 
workers_to_keep.push_back(*worker); - } - else{ + } else { (full_soc_descriptor.harvested_workers).push_back(*worker); full_soc_descriptor.cores.at(*worker).type = CoreType::HARVESTED; } @@ -535,28 +742,32 @@ void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descr std::set modified_y_coords = {}; - for(const auto& core : full_soc_descriptor.workers) { + for (const auto& core : full_soc_descriptor.workers) { modified_y_coords.insert(core.y); } int logical_y_coord = 0; - for(const auto& y_coord : modified_y_coords) { + for (const auto& y_coord : modified_y_coords) { full_soc_descriptor.routing_y_to_worker_y.insert({y_coord, logical_y_coord}); - full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); + full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); logical_y_coord++; } } void Cluster::harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows) { - std::uint32_t max_row_to_remove = (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [] (const auto& a, const auto& b) { return a.y < b.y; })).y; + std::uint32_t max_row_to_remove = + (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [](const auto& a, const auto& b) { + return a.y < b.y; + })).y; std::vector row_coordinates_to_remove = extract_rows_to_remove(arch, max_row_to_remove, harvested_rows); remove_worker_row_from_descriptor(sdesc, row_coordinates_to_remove); } -void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting) { +void Cluster::perform_harvesting_and_populate_soc_descriptors( + const std::string& sdesc_path, const bool perform_harvesting) { const auto default_sdesc = tt_SocDescriptor(sdesc_path); - for(const auto& chip : harvested_rows_per_target) { + for (const auto& chip : harvested_rows_per_target) { auto temp_sdesc = default_sdesc; - if(perform_harvesting) { + if (perform_harvesting) { 
harvest_rows_in_soc_descriptor(arch_name, temp_sdesc, chip.second); } soc_descriptor_per_chip.insert({chip.first, temp_sdesc}); @@ -564,25 +775,24 @@ void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& } void Cluster::check_pcie_device_initialized(int device_id) { - - PCIDevice *pci_device = get_pci_device(device_id); + PCIDevice* pci_device = get_pci_device(device_id); tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { if (device_arch != tt::ARCH::GRAYSKULL) { - throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { if (device_arch != tt::ARCH::WORMHOLE_B0) { - throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { if (device_arch != tt::ARCH::BLACKHOLE) { - throw std::runtime_error(fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } auto architecture_implementation = pci_device->get_architecture_implementation(); @@ -590,29 +800,36 @@ void Cluster::check_pcie_device_initialized(int device_id) { // MT Initial BH - Add check for blackhole once access to ARC registers is setup through TLBs if (arch_name != 
tt::ARCH::BLACKHOLE) { log_debug(LogSiliconDriver, "== Check if device_id: {} is initialized", device_id); - uint32_t bar_read_initial = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + uint32_t bar_read_initial = + bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); uint32_t arg = bar_read_initial == 500 ? 325 : 500; uint32_t bar_read_again; - uint32_t arc_msg_return = arc_msg(device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); + uint32_t arc_msg_return = arc_msg( + device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); if (arc_msg_return != 0 || bar_read_again != arg + 1) { auto postcode = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset()); - throw std::runtime_error(fmt::format("Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} bar_read_again: {}", - postcode, - arc_msg_return, - arg, - bar_read_initial, - bar_read_again)); + throw std::runtime_error(fmt::format( + "Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} " + "bar_read_again: {}", + postcode, + arc_msg_return, + arg, + bar_read_initial, + bar_read_again)); } } - if (test_setup_interface()) { - throw std::runtime_error("Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); + throw std::runtime_error( + "Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC " + "Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); } } -std::unordered_map Cluster::create_harvested_coord_translation(const tt::ARCH arch, bool identity_map) { - log_assert(identity_map ? 
true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); +std::unordered_map Cluster::create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map) { + log_assert( + identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); std::unordered_map translation_table = {}; tt_xy_pair grid_size; @@ -620,29 +837,29 @@ std::unordered_map Cluster::create_harvested_coord_trans std::vector T6_y = {}; std::vector ethernet = {}; // Store device specific data for GS and WH depending on arch - if(arch == tt::ARCH::GRAYSKULL) { + if (arch == tt::ARCH::GRAYSKULL) { grid_size = tt_xy_pair(13, 12); T6_x = {12, 1, 11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; T6_y = {11, 1, 10, 2, 9, 3, 8, 4, 7, 5}; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { grid_size = tt_xy_pair(17, 12); T6_x = {16, 1, 15, 2, 14, 3, 13, 4, 12, 5, 11, 6, 10, 7}; T6_y = {11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; - } - else { + } else { grid_size = tt_xy_pair(10, 12); T6_x = {1, 2, 3, 4, 6, 7, 8, 9}; T6_y = {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}; - ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format off + ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, + {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format on } - - if(identity_map) { + if (identity_map) { // When device is initialized, assume no harvesting and create an identity map for cores // This flow is always used for GS, since there is no hardware harvesting - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); translation_table.insert({curr_core, curr_core}); } @@ -653,34 +870,50 @@ std::unordered_map 
Cluster::create_harvested_coord_trans // If this function is called with identity_map = false, we have perform NOC translation // This can only happen for WH devices // Setup coord translation for workers. Map all worker cores - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); - if(std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && - std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { + if (std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && + std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { // This is a worker core. Apply translation for WH. tt_xy_pair harvested_worker; - if(x >= 1 && x <= 4) harvested_worker.x = x + 17; - else if(x <= 9 && x > 5) harvested_worker.x = x + 16; - else log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_worker.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_worker.x = x + 16; + } else { + log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + } - if(y >= 1 && y <= 5) harvested_worker.y = y + 17; - else if(y <= 11 && y > 6) harvested_worker.y = y + 16; - else log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + if (y >= 1 && y <= 5) { + harvested_worker.y = y + 17; + } else if (y <= 11 && y > 6) { + harvested_worker.y = y + 16; + } else { + log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_worker}); } - else if(std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()){ + else if (std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()) { // This is an eth core. Apply translation for WH. 
tt_xy_pair harvested_eth_core; - if(x >= 1 && x <= 4) harvested_eth_core.x = x + 17; - else if(x <= 9 && x > 5) harvested_eth_core.x = x + 16; - else log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_eth_core.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_eth_core.x = x + 16; + } else { + log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + } - if(y == 0) harvested_eth_core.y = y + 16; - else if(y == 6) harvested_eth_core.y = y + 11; - else log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + if (y == 0) { + harvested_eth_core.y = y + 16; + } else if (y == 6) { + harvested_eth_core.y = y + 11; + } else { + log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_eth_core}); } @@ -693,7 +926,7 @@ std::unordered_map Cluster::create_harvested_coord_trans return translation_table; } -void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { +void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { auto translated_coords = harvested_coord_translation[device_id].at(tt_xy_pair(c, r)); c = translated_coords.x; r = translated_coords.y; @@ -702,7 +935,7 @@ void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, void Cluster::initialize_pcie_devices() { log_debug(LogSiliconDriver, "Cluster::start"); - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { check_pcie_device_initialized(device_it.first); } @@ -711,7 +944,7 @@ void Cluster::initialize_pcie_devices() { init_membars(); } -void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &soft_resets) { +void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& 
soft_resets) { log_debug(LogSiliconDriver, "Cluster::broadcast_tensix_risc_reset"); PCIDevice* device = get_pci_device(chip_id); @@ -719,7 +952,10 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; auto logical_id = device->get_logical_id(); - log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str()); + log_debug( + LogSiliconDriver, + "== For all tensix set soft-reset for {} risc cores.", + TensixSoftResetOptionsToString(valid).c_str()); auto architecture_implementation = device->get_architecture_implementation(); @@ -738,77 +974,87 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo } std::set Cluster::get_target_mmio_device_ids() { - if(!all_target_mmio_devices.size()) { - for (const auto &it: m_pci_device_map) { + if (!all_target_mmio_devices.size()) { + for (const auto& it : m_pci_device_map) { all_target_mmio_devices.insert(it.first); } } return all_target_mmio_devices; } -void Cluster::assert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); -} +void Cluster::assert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); -} +void Cluster::deassert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), 
get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot deassert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); +void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot deassert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, soft_resets); - } - else { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Can't issue access to remote core in BH"); send_remote_tensix_risc_reset_to_core(core, soft_resets); } } void Cluster::assert_risc_reset_at_core(tt_cxy_pair core) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - 
std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot assert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot assert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); - } - else { + } else { send_remote_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); } } // Free memory during teardown, and remove (clean/unlock) from any leftover mutexes. 
void Cluster::cleanup_shared_host_state() { - for(auto &mutex : hardware_resource_mutex_map) { + for (auto& mutex : hardware_resource_mutex_map) { mutex.second.reset(); mutex.second = nullptr; named_mutex::remove(mutex.first.c_str()); } } -std::unordered_set Cluster::get_all_chips_in_cluster() { - return ndesc -> get_all_chips(); -} +std::unordered_set Cluster::get_all_chips_in_cluster() { return ndesc->get_all_chips(); } + int Cluster::get_number_of_chips_in_cluster() { // Returns the number of chips seen in the network descriptor - return ndesc -> get_all_chips().size(); + return ndesc->get_all_chips().size(); } -tt_ClusterDescriptor* Cluster::get_cluster_description() {return ndesc.get();} +tt_ClusterDescriptor* Cluster::get_cluster_description() { return ndesc.get(); } + // Can be used before instantiating a silicon device int Cluster::detect_number_of_chips() { - auto available_device_ids = detect_available_device_ids(); return available_device_ids.size(); - } // Can be used before instantiating a silicon device @@ -822,7 +1068,8 @@ std::vector Cluster::detect_available_device_ids() { return PCIDevice::enumerate_devices(); } -std::function Cluster::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function Cluster::get_fast_pcie_static_tlb_write_callable( + int device_id) { PCIDevice* dev = get_pci_device(device_id); const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { @@ -841,7 +1088,7 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *dev = get_pci_device(target.chip); + auto* dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -855,26 +1102,39 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { } auto [tlb_offset, tlb_size] = tlb_data.value(); - auto *base = reinterpret_cast(dev->bar0_wc); + auto* base = reinterpret_cast(dev->bar0_wc); 
return tt::Writer(base + tlb_offset, tlb_size); } -void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb) { + PCIDevice* dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); - log_debug(LogSiliconDriver, "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", - target.chip, target.x, target.y, address, size_in_bytes, small_access); + log_debug( + LogSiliconDriver, + "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes, + small_access); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to write to DRAM (BAR4 space), we add offset @@ -887,9 +1147,9 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -901,22 +1161,36 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t } } -void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb) { - // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. - log_debug(LogSiliconDriver, "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", target.chip, target.x, target.y, address, size_in_bytes); - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb) { + // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this + // function will cause a segfault. 
+ log_debug( + LogSiliconDriver, + "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes); + PCIDevice* dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to read from DRAM (BAR4 space), we add offset @@ -930,9 +1204,9 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -945,55 +1219,61 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ } void Cluster::read_buffer( - void* mem_ptr, - std::uint32_t address, - std::uint16_t channel, - std::uint32_t size_in_bytes, - chip_id_t src_device_id) { - + void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id) { log_assert(src_device_id != -1, "Must provide src_device_id for host_resident read/write"); - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); + log_assert( + hugepage_map.mapping, + "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", + src_device_id, + channel, + user_scratchspace); - log_debug(LogSiliconDriver, "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", src_device_id, channel, user_scratchspace); - memcpy(mem_ptr, user_scratchspace, size_in_bytes); } void Cluster::write_buffer( - const void *mem_ptr, - std::uint32_t size, - std::uint32_t address, - std::uint16_t channel, - chip_id_t src_device_id) { - - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id) { + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); - - log_assert(size <= hugepage_map.mapping_size, "write_buffer data has larger size {} than destination buffer {}", size, hugepage_map.mapping_size); - log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", + log_assert( + hugepage_map.mapping, + "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + log_assert( + size <= hugepage_map.mapping_size, + "write_buffer data has larger size {} than destination buffer {}", + size, + hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_map.mapping, (address % hugepage_map.mapping_size), channel, size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); memcpy(user_scratchspace, mem_ptr, size); } - uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state) { PCIDevice* pci_device = get_pci_device(chip_id); uint32_t msg = 0xaa00; @@ -1010,34 +1290,37 @@ uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } - default: throw std::runtime_error("Unrecognized power state."); + default: + throw std::runtime_error("Unrecognized power state."); } return msg; } void Cluster::set_pcie_power_state(tt_DevicePowerState state) { - - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { int chip_id = device_it.first; uint32_t msg = get_power_state_arc_msg(chip_id, state); std::stringstream ss; ss << 
state; auto exit_code = arc_msg(chip_id, 0xaa00 | msg, true, 0, 0); if (exit_code != 0) { - throw std::runtime_error(fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); + throw std::runtime_error( + fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); } } } int Cluster::get_clock(int logical_device_id) { - // TODO: remove this once ARC messages work. // This is currently used only for testing and bringing up Blackhole on Buda. if (arch_name == tt::ARCH::BLACKHOLE) { char* clk_env_var = getenv("TT_SILICON_DRIVER_AICLK"); if (clk_env_var != nullptr) { - log_warning(LogSiliconDriver, "ARC messages are not enabled on Blackhole. " - "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}" , clk_env_var); + log_warning( + LogSiliconDriver, + "ARC messages are not enabled on Blackhole. " + "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}", + clk_env_var); return std::stoi(clk_env_var); } } @@ -1045,7 +1328,14 @@ int Cluster::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + auto exit_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), + true, + 0xFFFF, + 0xFFFF, + 1, + &clock); if (exit_code != 0) { throw std::runtime_error(fmt::format("Failed to get aiclk value with exit code {}", exit_code)); } @@ -1053,16 +1343,15 @@ int Cluster::get_clock(int logical_device_id) { } std::map Cluster::get_clocks() { - std::map clock_freq_map; - for (auto &device_it : m_pci_device_map){ + std::map clock_freq_map; + for (auto& device_it : m_pci_device_map) { int d = device_it.first; 
clock_freq_map.insert({d, get_clock(d)}); } return clock_freq_map; } -Cluster::~Cluster () { - +Cluster::~Cluster() { log_debug(LogSiliconDriver, "Cluster::~Cluster"); cleanup_shared_host_state(); @@ -1083,23 +1372,34 @@ std::optional> Cluster::get_tlb_data_from_target( tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); tlb_data = architecture_implementation->describe_tlb(tlb_index); - } + } return tlb_data; } -void Cluster::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb"); - PCIDevice *pci_device = get_pci_device(logical_device_id); +void Cluster::configure_tlb( + chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb"); + PCIDevice* pci_device = get_pci_device(logical_device_id); pci_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation, ordering); auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); - if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); + if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { + tlb_config_map.insert({logical_device_id, {}}); + } tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || 
ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb."); - log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb."); + log_assert( + dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), + "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); + log_assert( + fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", + "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } @@ -1109,7 +1409,7 @@ void Cluster::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); log_debug(LogSiliconDriver, "Cluster::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices); - for (auto &src_device_it : m_pci_device_map){ + for (auto& src_device_it : m_pci_device_map) { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); @@ -1119,72 +1419,86 @@ void Cluster::init_pcie_iatus() { if (hugepage_map.mapping) { std::uint32_t region_size = hugepage_map.mapping_size; if (channel_id == 3) { - region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; + region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; } // This log message doesn't look right. 
- log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); + log_debug( + LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); iatu_configure_peer_region(logical_id, channel_id, hugepage_map.physical_address, region_size); } else { - throw std::runtime_error(fmt::format("init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel_id)); + throw std::runtime_error(fmt::format( + "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", + logical_id, + channel_id)); } } } } -int Cluster::test_setup_interface () { +int Cluster::test_setup_interface() { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(0, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 
0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(1, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // MT Inital BH - Try to enable this, but double check "regval == 33" // int ret_val = 0; // PCIDevice *dev = m_pci_device_map.begin()->second->hdev; - // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, + // dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, + // harvested_coord_translation).bar_offset; // uint32_t regval = 0; // read_regs(dev, mapped_reg, 1, ®val); // ret_val = (regval != 0xffffffff && (regval == 33)) ? 
0 : 1; // return ret_val; return 0; - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } } -void Cluster::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - PCIDevice *dev = get_pci_device(logical_device_id); +void Cluster::bar_write32(int logical_device_id, uint32_t addr, uint32_t data) { + PCIDevice* dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? + dev->write_block( + addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? } else { dev->write_regs(addr, 1, &data); } } -uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { +uint32_t Cluster::bar_read32(int logical_device_id, uint32_t addr) { PCIDevice* dev = get_pci_device(logical_device_id); uint32_t data; @@ -1197,32 +1511,39 @@ uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { } // Returns 0 if everything was OK -int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { - - +int Cluster::pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. 
msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - PCIDevice *pci_device = get_pci_device(logical_device_id); + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. std::string msg_type = "ARC_MSG"; const scoped_lock lock(*get_mutex(msg_type, pci_device->get_device_num())); - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); - uint32_t misc = bar_read32 (logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); + uint32_t misc = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); if (misc & (1 << 16)) { log_error("trigger_fw_int failed on device {}", logical_device_id); return 1; } else { - bar_write32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); + bar_write32( + logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); } if (wait_for_done) { @@ -1231,24 +1552,31 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo auto start = std::chrono::system_clock::now(); 
while (true) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); } status = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4); if ((status & 0xffff) == (msg_code & 0xff)) { if (return_3 != nullptr) { - *return_3 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + *return_3 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); } if (return_4 != nullptr) { - *return_4 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); + *return_4 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); } exit_code = (status & 0xffff0000) >> 16; break; } else if (status == MSG_ERROR_REPLY) { - log_warning(LogSiliconDriver, "On device {}, message code 0x{:x} not recognized by FW", logical_device_id, msg_code); + log_warning( + LogSiliconDriver, + "On device {}, message code 0x{:x} not recognized by FW", + logical_device_id, + msg_code); exit_code = MSG_ERROR_REPLY; break; } @@ -1259,12 +1587,16 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo return exit_code; } -int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { +int Cluster::iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff; uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; - if(peer_region_id == 3) 
region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - PCIDevice *pci_device = get_pci_device(logical_device_id); + if (peer_region_id == 3) { + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address + // space with the correct start offset + } + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. @@ -1274,8 +1606,8 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t base_size = (region_id_to_use + 1) * region_size; uint64_t limit_address = base_addr + base_size - 1; - uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 - uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 + uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 + uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 uint32_t region_ctrl_3 = 0; uint32_t base_addr_lo = base_addr & 0xffffffff; uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff; @@ -1285,43 +1617,83 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - 
pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); - } - else { - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), + ®ion_ctrl_1, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), + ®ion_ctrl_2, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), + &base_addr_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), + &base_addr_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), + &limit_address_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), + &dest_bar_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), + &dest_bar_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), + ®ion_ctrl_3, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), + &limit_address_hi, + 1); + } else { + bar_write32( + logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); bar_write32(logical_device_id, 
architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); - arc_msg(logical_device_id, 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), true, 0, 0); + arc_msg( + logical_device_id, + 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), + true, + 0, + 0); } // Print what just happened - uint32_t peer_region_start = region_id_to_use*region_size; - uint32_t peer_region_end = (region_id_to_use+1)*region_size - 1; - log_debug(LogSiliconDriver, " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", peer_region_id, peer_region_start, peer_region_end, bar_addr_64); + uint32_t peer_region_start = region_id_to_use * region_size; + uint32_t peer_region_end = (region_id_to_use + 1) * region_size - 1; + log_debug( + LogSiliconDriver, + " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", + peer_region_id, + peer_region_start, + peer_region_end, + bar_addr_64); return 0; } // Returns broken rows as bits set to 1 in 'memory' and 'logic' uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - const std::vector &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); + const std::vector& harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); uint32_t harv_noc_rows = 0; std::string harv_noc_rows_str = ""; - for (int pos=0; pos> 1; @@ -1332,36 +1704,45 @@ uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { return harv_noc_rows; } -uint32_t Cluster::get_harvested_rows (int logical_device_id) { +uint32_t Cluster::get_harvested_rows(int logical_device_id) { const char* harv_override = std::getenv("T6PY_HARVESTING_OVERRIDE"); uint32_t harv = 0xffffffff; if 
(harv_override) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - PCIDevice *pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); - log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); + PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), + true, + 0, + 0, + 1, + &harv); + log_assert( + harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); - log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv==0) ? "DISABLED":"ENABLED", harv); - + log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv == 0) ? 
"DISABLED" : "ENABLED", harv); + uint32_t memory = harv & 0x3ff; uint32_t logic = (harv >> 10) & 0x3ff; - return (memory|logic); + return (memory | logic); } -uint32_t Cluster::get_harvested_noc_rows_for_chip (int logical_device_id) { +uint32_t Cluster::get_harvested_noc_rows_for_chip(int logical_device_id) { return get_harvested_noc_rows(get_harvested_rows(logical_device_id)); } -void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeout) { +void Cluster::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); } if (arc_msg(device_id, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success) == MSG_ERROR_REPLY) { @@ -1370,7 +1751,7 @@ void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeou } } -void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); if (hugepage_map.mapping != nullptr) { return static_cast(hugepage_map.mapping) + offset; @@ -1381,13 +1762,14 @@ void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. 
inline PCIDevice* Cluster::get_pci_device(int device_id) const { - if (!m_pci_device_map.count(device_id)){ + if (!m_pci_device_map.count(device_id)) { throw std::runtime_error(fmt::format("device_id: {} attempted to be accessed, but is not enabled.", device_id)); } return m_pci_device_map.at(device_id).get(); } -std::shared_ptr Cluster::get_mutex(const std::string& tlb_name, int pci_interface_id) { +std::shared_ptr Cluster::get_mutex( + const std::string& tlb_name, int pci_interface_id) { std::string mutex_name = tlb_name + std::to_string(pci_interface_id); return hardware_resource_mutex_map.at(mutex_name); } @@ -1415,7 +1797,8 @@ uint16_t Cluster::get_sys_rack(uint32_t rack_x, uint32_t rack_y) { } bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { - return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == (curr_rptr & eth_interface_params.cmd_buf_size_mask)); + return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == + (curr_rptr & eth_interface_params.cmd_buf_size_mask)); } /* @@ -1464,35 +1847,37 @@ bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { * Other schemes may be more performant. */ - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the * ethernet core (host) command queue DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the * mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ - void Cluster::write_to_non_mmio_device( - const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, - bool broadcast, std::vector broadcast_header) { - + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast, + std::vector broadcast_header) { chip_id_t mmio_capable_chip_logical; - - if(broadcast) { + + if (broadcast) { mmio_capable_chip_logical = core.chip; - } - else { + } else { mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); } flush_non_mmio_per_chip[ndesc->get_closest_mmio_capable_chip(core.chip)] = true; if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); + log_assert( + active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), + "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); } using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words + constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words const auto target_chip = ndesc->get_chip_locations().at(core.chip); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1501,14 +1886,15 @@ void Cluster::write_to_non_mmio_device( translate_to_noc_table_coords(core.chip, core.y, core.x); std::vector erisc_command; std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); std::vector data_block; - routing_cmd_t 
*new_cmd; + routing_cmd_t* new_cmd; uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; + uint32_t timestamp = 0; // CMD_TIMESTAMP; bool use_dram; uint32_t max_block_size; @@ -1520,14 +1906,22 @@ void Cluster::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); - - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + + int& active_core_for_txn = + non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; + tt_cxy_pair remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); uint32_t full_count = 0; uint32_t offset = 0; uint32_t block_size; @@ -1537,40 +1931,55 @@ void Cluster::write_to_non_mmio_device( erisc_q_rptr[0] = erisc_q_ptrs[4]; while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); full_count++; } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); + // full = true; + // set full only if this command will make the q full. + // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
+ // As long as current command push does not fill up the queue completely, we do not want + // to poll rd pointer in every iteration. + // full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned } else { // For broadcast we prepend a 32byte header. Decrease block size (size of payload) by this amount. - block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset : max_block_size - 32 * broadcast; + block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset + : max_block_size - 32 * broadcast; // Explictly align block_size to 4 bytes, in case the input buffer is not uint32_t aligned uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size in the last block - uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size + // in the last block + uint64_t transfer_size = + std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast - uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; + uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) + : eth_interface_params.cmd_wr_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) + : eth_interface_params.cmd_wr_ack; timestamp = 0; - - if(broadcast) { + + if (broadcast) { req_flags |= eth_interface_params.cmd_broadcast; } - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint32_t host_dram_block_addr = + host_address_params.eth_routing_buffers_start + + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (req_flags & eth_interface_params.cmd_data_block) { // Copy data to sysmem or device DRAM for Block mode @@ -1579,46 +1988,60 @@ void Cluster::write_to_non_mmio_device( resp_flags |= eth_interface_params.cmd_data_block_dram; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - if(broadcast) { + if (broadcast) { // Write broadcast header to sysmem - write_to_sysmem(broadcast_header.data(), broadcast_header.size() * sizeof(uint32_t), host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + broadcast_header.data(), + broadcast_header.size() * sizeof(uint32_t), + host_dram_block_addr, + host_dram_channel, + mmio_capable_chip_logical); } // Write payload to sysmem - write_to_sysmem(data_block.data(), data_block.size() * DATA_WORD_SIZE, host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, + host_dram_channel, + mmio_capable_chip_logical); } else { uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); + write_device_memory( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + buf_address, + write_tlb); } tt_driver_atomics::sfence(); } // Send the read request - log_assert(broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. 
- - if(broadcast) { + log_assert( + broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), + "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. + + if (broadcast) { // Only specify endpoint local address for broadcast new_cmd->sys_addr = address + offset; + } else { + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); } - else { - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - } - - if(req_flags & eth_interface_params.cmd_data_block) { + + if (req_flags & eth_interface_params.cmd_data_block) { // Block mode new_cmd->data = block_size + BROADCAST_HEADER_SIZE * broadcast; - } - else { - if(size_in_bytes - offset < sizeof(uint32_t)) { + } else { + if (size_in_bytes - offset < sizeof(uint32_t)) { // Handle misalignment at the end of the buffer: // Assemble a padded uint32_t from single bytes, in case we have less than 4 bytes remaining memcpy(&new_cmd->data, static_cast(mem_ptr) + offset, size_in_bytes - offset); - } - else { - new_cmd->data = *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE); + } else { + new_cmd->data = *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE); } } @@ -1626,14 +2049,24 @@ void Cluster::write_to_non_mmio_device( if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * 
req_wr_ptr), + write_tlb); tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); offset += transfer_size; @@ -1646,10 +2079,19 @@ void Cluster::write_to_non_mmio_device( if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { active_core_for_txn++; uint32_t update_mask_for_chip = remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1; - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); + active_core_for_txn = + non_mmio_transfer_cores_customized + ? 
(active_core_for_txn & update_mask_for_chip) + : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); // active_core = (active_core & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1657,11 +2099,11 @@ void Cluster::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core + * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring + * the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { - using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1669,33 +2111,50 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ std::string empty_tlb = ""; translate_to_noc_table_coords(core.chip, core.y, core.x); - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); + const auto& mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); std::vector erisc_command; std::vector erisc_q_rptr; - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / DATA_WORD_SIZE); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / DATA_WORD_SIZE); std::vector erisc_resp_q_wptr = std::vector(1); std::vector erisc_resp_q_rptr = std::vector(1); - std::vector data_block; - routing_cmd_t *new_cmd; + routing_cmd_t* new_cmd; - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; // // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, 
eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); - read_device_memory(erisc_resp_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); + read_device_memory( + erisc_resp_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr.resize(1); @@ -1713,25 +2172,34 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base 
+ eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); } uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned block + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned block } else { block_size = offset + max_block_size > size_in_bytes ? size_in_bytes - offset : max_block_size; // Align up to 4 bytes. uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) : eth_interface_params.cmd_rd_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) : eth_interface_params.cmd_rd_data; + uint32_t req_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) + : eth_interface_params.cmd_rd_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) + : eth_interface_params.cmd_rd_data; uint32_t resp_rd_ptr = erisc_resp_q_rptr[0] & eth_interface_params.cmd_buf_size_mask; uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + resp_rd_ptr * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (use_dram && block_size > DATA_WORD_SIZE) { req_flags |= eth_interface_params.cmd_data_block_dram; @@ -1739,22 +2207,35 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ } // Send the read request - log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); + log_assert( + (req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), + "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); new_cmd->data = block_size; new_cmd->flags = req_flags; if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);; + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); + ; tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + 
write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); // If there is more data to read and this command will make the q full, set full to 1. // otherwise full stays false so that we do not poll the rd pointer in next iteration. @@ -1762,7 +2243,12 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // to poll rd pointer in every iteration. if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]), erisc_q_rptr[0])) { - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1778,13 +2264,23 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // So we have to wait for wrptr to advance, then wait for flags to be nonzero, then read data. 
do { - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_q_rptr[0] == erisc_resp_q_wptr[0]); tt_driver_atomics::lfence(); uint32_t flags_offset = 12 + sizeof(routing_cmd_t) * resp_rd_ptr; std::vector erisc_resp_flags = std::vector(1); do { - read_device_memory(erisc_resp_flags.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + flags_offset, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_flags.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + flags_offset, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_flags[0] == 0); if (erisc_resp_flags[0] == resp_flags) { @@ -1792,27 +2288,40 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ uint32_t data_offset = 8 + sizeof(routing_cmd_t) * resp_rd_ptr; if (block_size == DATA_WORD_SIZE) { std::vector erisc_resp_data = std::vector(1); - read_device_memory(erisc_resp_data.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + data_offset, DATA_WORD_SIZE, read_tlb); - if(size_in_bytes - offset < 4) { + read_device_memory( + erisc_resp_data.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + data_offset, + DATA_WORD_SIZE, + read_tlb); + if (size_in_bytes - offset < 4) { // Handle misaligned (4 bytes) data at the end of the block. 
// Only read remaining bytes into the host buffer, instead of reading the full uint32_t std::memcpy((uint8_t*)mem_ptr + offset, erisc_resp_data.data(), size_in_bytes - offset); - } - else { - *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE) = erisc_resp_data[0]; + } else { + *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE) = erisc_resp_data[0]; } } else { // Read 4 byte aligned block from device/sysmem if (use_dram) { size_buffer_to_capacity(data_block, block_size); - read_from_sysmem(data_block.data(), host_dram_block_addr, host_dram_channel, block_size, mmio_capable_chip_logical); + read_from_sysmem( + data_block.data(), + host_dram_block_addr, + host_dram_channel, + block_size, + mmio_capable_chip_logical); } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; + uint32_t buf_address = + eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); - read_device_memory(data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); + read_device_memory( + data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); } // assert(mem_ptr.size() - (offset/DATA_WORD_SIZE) >= (block_size * DATA_WORD_SIZE)); - log_assert((data_block.size() * DATA_WORD_SIZE) >= block_size, "Incorrect data size read back from sysmem/device"); + log_assert( + (data_block.size() * DATA_WORD_SIZE) >= block_size, + "Incorrect data size read back from sysmem/device"); // Account for misalignment by skipping any padding bytes in the copied data_block memcpy((uint8_t*)mem_ptr + offset, data_block.data(), std::min(block_size, size_in_bytes - offset)); } @@ -1820,40 +2329,53 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // Finally increment the rdptr for the response command q erisc_resp_q_rptr[0] = (erisc_resp_q_rptr[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - 
write_device_memory(erisc_resp_q_rptr.data(), erisc_resp_q_rptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_resp_q_rptr.data(), + erisc_resp_q_rptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); log_assert(erisc_resp_flags[0] == resp_flags, "Unexpected ERISC Response Flags."); offset += block_size; } - } void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { - if(flush_non_mmio_per_chip[chip_id]) { + if (flush_non_mmio_per_chip[chip_id]) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; auto chips_with_mmio = this->get_target_mmio_device_ids(); if (chips_with_mmio.find(chip_id) == chips_with_mmio.end()) { - log_debug(LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); + log_debug( + LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); return; } if (arch_name == tt::ARCH::WORMHOLE_B0) { std::vector erisc_txn_counters = std::vector(2); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); - //wait for all queues to be empty. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all queues to be empty. 
+ for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_q_ptrs.data(), cxy, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + cxy, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); } while (erisc_q_ptrs[0] != erisc_q_ptrs[4]); } - //wait for all write responses to come back. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all write responses to come back. + for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); + read_device_memory( + erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); } while (erisc_txn_counters[0] != erisc_txn_counters[1]); } } @@ -1861,7 +2383,6 @@ void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { } } - void Cluster::wait_for_non_mmio_flush(const chip_id_t chip_id) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; @@ -1882,39 +2403,48 @@ void Cluster::wait_for_non_mmio_flush() { } // Broadcast Functions -void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude) { +void Cluster::generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude) { // If row 0 is not explicitly excluded, exclude it here since its non-tensix rows_to_exclude.insert(0); // If row 11 is excluded, we can close the SOC grid. If not, exclude row 12 to close grid. 
- if(rows_to_exclude.find(11) == rows_to_exclude.end()) { + if (rows_to_exclude.find(11) == rows_to_exclude.end()) { rows_to_exclude.insert(12); } // If col 0 is not explicitly excluded, exclude it here since its non-tensix cols_to_exclude.insert(0); // If col 12 is excluded, we can close the SOC grid. If not, exclude col 13 to close grid. - if(cols_to_exclude.find(12) == cols_to_exclude.end()) { + if (cols_to_exclude.find(12) == cols_to_exclude.end()) { cols_to_exclude.insert(13); } std::vector> bb_x_coords = {}; std::vector> bb_y_coords = {}; // Generate starting and ending x coordinates of each bounding box/grid - for(auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { - if(x_it == std::prev(cols_to_exclude.end(), 1)) continue; - if(cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { + for (auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { + if (x_it == std::prev(cols_to_exclude.end(), 1)) { + continue; + } + if (cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and + cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { bb_x_coords.push_back({*(x_it) + 1, *(std::next(x_it, 1)) - 1}); } } - for(auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { - if(y_it == std::prev(rows_to_exclude.end(), 1)) continue; - if(rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { + for (auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { + if (y_it == std::prev(rows_to_exclude.end(), 1)) { + continue; + } + if (rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and + rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { bb_y_coords.push_back({*(y_it) + 1, *(std::next(y_it, 1)) - 1}); } } // Assemble x and y coordinates into bounding box vertices - 
for(const auto& x_pair : bb_x_coords) { - for(const auto& y_pair : bb_y_coords) { + for (const auto& x_pair : bb_x_coords) { + for (const auto& y_pair : bb_y_coords) { tt_xy_pair top_left = tt_xy_pair(x_pair.first, y_pair.first); tt_xy_pair bot_right = tt_xy_pair(x_pair.second, y_pair.second); broadcast_grids.insert({top_left, bot_right}); @@ -1922,81 +2452,94 @@ void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>>& Cluster::get_ethernet_broadcast_headers(const std::set& chips_to_exclude) { +std::unordered_map>>& Cluster::get_ethernet_broadcast_headers( + const std::set& chips_to_exclude) { // Generate headers for Ethernet Broadcast (WH) only. Each header corresponds to a unique broadcast "grid". - if(bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { + if (bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { bcast_header_cache[chips_to_exclude] = {}; - std::unordered_map>> broadcast_mask_for_target_chips_per_group = {}; + std::unordered_map>> + broadcast_mask_for_target_chips_per_group = {}; std::map, std::tuple>> broadcast_header_union_per_group = {}; chip_id_t first_mmio_chip = *(get_target_mmio_device_ids().begin()); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) == chips_to_exclude.end()) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) == chips_to_exclude.end()) { // Get shelf local physical chip id included in broadcast - chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip); - eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip); + chip_id_t physical_chip_id = ndesc->get_shelf_local_physical_chip_coords(chip); + eth_coord_t eth_coords = ndesc->get_chip_locations().at(chip); // Rack word to be set in header - uint32_t rack_word = std::get<2>(eth_coords) >> 2; + uint32_t rack_word = eth_coords.rack >> 2; // Rack byte to be set in header - uint32_t rack_byte = std::get<2>(eth_coords) 
% 4; + uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip - // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip - // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each + // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific + // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are + // connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; - if (std::get<2>(eth_coords) == 0 && std::get<3>(eth_coords) == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart. - closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip); - } - else { - // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are connected. + if (eth_coords.rack == 0 && eth_coords.shelf == 0) { + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its + // own MMIO counterpart. + closest_mmio_chip = ndesc->get_closest_mmio_capable_chip(chip); + } else { + // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are + // connected. 
closest_mmio_chip = first_mmio_chip; } - if(broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == broadcast_mask_for_target_chips_per_group.end()) { + if (broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == + broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves that contain this physical id. - if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { + // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves + // that contain this physical id. + if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. 
std::vector broadcast_mask(8, 0); - broadcast_mask.at(rack_word) |= (1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask.at(rack_word) |= (1 << eth_coords.shelf) << rack_byte; broadcast_mask.at(3) |= 1 << physical_chip_id; - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask}); + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .insert({physical_chip_id, broadcast_mask}); - } - else { + } else { // Target was seen before -> include curr rack and shelf in header - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .at(physical_chip_id) + .at(rack_word) |= static_cast(1 << eth_coords.shelf) << rack_byte; } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The number of groups after this step represent the final set of broadcast grids. - for(auto& mmio_group : broadcast_mask_for_target_chips_per_group) { - for(auto& chip : mmio_group.second) { + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The + // number of groups after this step represent the final set of broadcast grids. 
+ for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { + for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group - std::vector header_hash = {mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; - if(broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { - broadcast_header_union_per_group.insert({header_hash, std::make_tuple(mmio_group.first, chip.second)}); - } - else { + std::vector header_hash = { + mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; + if (broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { + broadcast_header_union_per_group.insert( + {header_hash, std::make_tuple(mmio_group.first, chip.second)}); + } else { // If group found, update chip header entry std::get<1>(broadcast_header_union_per_group.at(header_hash)).at(3) |= chip.second.at(3); } } } // Get all broadcast headers per MMIO group - for(const auto& header : broadcast_header_union_per_group) { + for (const auto& header : broadcast_header_union_per_group) { chip_id_t mmio_chip = std::get<0>(header.second); - if(bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { + if (bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { bcast_header_cache[chips_to_exclude].insert({mmio_chip, {}}); } bcast_header_cache[chips_to_exclude].at(mmio_chip).push_back(std::get<1>(header.second)); } // Invert headers (FW convention) - for(auto& bcast_group : bcast_header_cache[chips_to_exclude]) { - for(auto& header : bcast_group.second) { + for (auto& bcast_group : bcast_header_cache[chips_to_exclude]) { + for (auto& header : bcast_group.second) { int header_idx = 0; - for(auto& header_entry : header) { - if(header_idx == 4) break; + for (auto& header_entry : header) { + if (header_idx == 4) { + break; + } header_entry = 
~header_entry; header_idx++; } @@ -2006,14 +2549,23 @@ std::unordered_map>>& Cluster::get_ether return bcast_header_cache[chips_to_exclude]; } -void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. - PCIDevice *pci_device = get_pci_device(chip); +void Cluster::pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb) { + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet + // Broadcast for WH. + PCIDevice* pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); - while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast( + tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); pci_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -2023,155 +2575,235 @@ void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t } } -inline bool tensix_or_eth_in_broadcast(const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool tensix_or_eth_in_broadcast( + const std::set& 
cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool found_tensix_or_eth = false; - for(const auto& col : architecture_implementation->get_t6_x_locations()) { + for (const auto& col : architecture_implementation->get_t6_x_locations()) { found_tensix_or_eth |= (cols_to_exclude.find(col) == cols_to_exclude.end()); } return found_tensix_or_eth; } -inline bool valid_tensix_broadcast_grid(const std::set& rows_to_exclude, const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool valid_tensix_broadcast_grid( + const std::set& rows_to_exclude, + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool t6_bcast_rows_complete = true; bool t6_bcast_rows_empty = true; - - for(const auto& row : architecture_implementation->get_t6_y_locations()) { + + for (const auto& row : architecture_implementation->get_t6_y_locations()) { t6_bcast_rows_complete &= (rows_to_exclude.find(row) == rows_to_exclude.end()); t6_bcast_rows_empty &= (rows_to_exclude.find(row) != rows_to_exclude.end()); } return t6_bcast_rows_complete || t6_bcast_rows_empty; } - -void Cluster::ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords) { - if(use_ethernet_broadcast) { +void Cluster::ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords) { + if (use_ethernet_broadcast) { // Broadcast through ERISC core supported - std::unordered_map>>& broadcast_headers = get_ethernet_broadcast_headers(chips_to_exclude); - // Apply row and column exclusion mask explictly. 
Placing this here if we want to cache the higher level broadcast headers on future/ + std::unordered_map>>& broadcast_headers = + get_ethernet_broadcast_headers(chips_to_exclude); + // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level + // broadcast headers on future/ std::uint32_t row_exclusion_mask = 0; std::uint32_t col_exclusion_mask = 0; - for(const auto& row : rows_to_exclude) { + for (const auto& row : rows_to_exclude) { row_exclusion_mask |= 1 << row; } - for(const auto& col : cols_to_exclude) { + for (const auto& col : cols_to_exclude) { col_exclusion_mask |= 1 << (16 + col); } // Write broadcast block to device. - for(auto& mmio_group : broadcast_headers) { - for(auto& header : mmio_group.second) { - header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks + for (auto& mmio_group : broadcast_headers) { + for (auto& header : mmio_group.second) { + header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks header.at(4) |= row_exclusion_mask; header.at(4) |= col_exclusion_mask; // Write Target: x-y endpoint is a don't care. Initialize to tt_xy_pair(1, 1) - write_to_non_mmio_device(mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); + write_to_non_mmio_device( + mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); } } - } - else { + } else { // Broadcast not supported. 
Implement this at the software level as a for loop std::vector cores_to_write = {}; - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& core : get_soc_descriptor(chip).cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& core : get_soc_descriptor(chip).cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and + core.second.type != CoreType::HARVESTED) { + write_to_device( + mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); } } } } } -void Cluster::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { +void Cluster::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { if (arch_name == tt::ARCH::GRAYSKULL) { // Device FW disables broadcasts to all non tensix cores. 
std::vector dram_cores_to_write = {}; std::vector dram_rows = {0, 6}; std::vector dram_cols = {1, 4, 7, 10}; - for(const auto& row : dram_rows) { - for(const auto& col : dram_cols) { - if(rows_to_exclude.find(row) == rows_to_exclude.end() and cols_to_exclude.find(col) == cols_to_exclude.end()) { + for (const auto& row : dram_rows) { + for (const auto& col : dram_cols) { + if (rows_to_exclude.find(row) == rows_to_exclude.end() and + cols_to_exclude.find(col) == cols_to_exclude.end()) { dram_cores_to_write.push_back(tt_xy_pair(col, row)); } } } - + std::set> broadcast_grids = {}; generate_tensix_broadcast_grids_for_grayskull(broadcast_grids, rows_to_exclude, cols_to_exclude); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& dram : dram_cores_to_write) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& dram : dram_cores_to_write) { write_device_memory(mem_ptr, size_in_bytes, tt_cxy_pair(chip, dram), address, fallback_tlb); } - for(const auto& grid : broadcast_grids) { + for (const auto& grid : broadcast_grids) { pcie_broadcast_write(chip, mem_ptr, size_in_bytes, address, grid.first, grid.second, fallback_tlb); } - } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } + } else if (arch_name == tt::ARCH::BLACKHOLE) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { + log_assert( + 
!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { // When broadcast includes column zero do not exclude anything std::set unsafe_rows = {}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(9); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(9) == cols_to_exclude.end()) { + if (cols_to_exclude.find(9) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_9_bcast = cols_to_exclude; cols_to_exclude_for_col_9_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_9_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_9_bcast, + fallback_tlb, + false); } + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, 
architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } - else { + } else { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { - // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, since writing to these is unsafe - // ERISC FW does not exclude these. + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, + // since writing to these is unsafe ERISC FW does not exclude these. 
std::set unsafe_rows = {2, 3, 4, 8, 9, 10}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(5); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(5) == cols_to_exclude.end()) { + if (cols_to_exclude.find(5) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_5_bcast = cols_to_exclude; cols_to_exclude_for_col_5_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_5_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_5_bcast, + fallback_tlb, + false); } - } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } -} - -int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + 
ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); + } + } +} + +int Cluster::remote_arc_msg( + int chip, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { constexpr uint64_t ARC_RESET_SCRATCH_ADDR = 0x880030060; constexpr uint64_t ARC_RESET_MISC_CNTL_ADDR = 0x880030100; @@ -2180,18 +2812,14 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert (arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - { - write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); - } + { write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); } - { - write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); - } + { write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); } wait_for_non_mmio_flush(); uint32_t misc = 0; @@ -2213,7 +2841,11 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if (std::chrono::system_clock::now() - start > timeout_seconds) { std::stringstream ss; ss << std::hex << msg_code; - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", timeout, chip, ss.str())); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", + timeout, + 
chip, + ss.str())); } uint32_t status = 0; @@ -2239,7 +2871,8 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin return exit_code; } -void Cluster::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { +void Cluster::write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { write_buffer(mem_ptr, size, addr, channel, src_device_id); } @@ -2247,58 +2880,86 @@ void Cluster::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, u read_buffer(mem_ptr, addr, channel, size, src_device_id); } -void Cluster::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { - tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered +void Cluster::set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { + tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered std::unordered_set cores_synced = {}; std::vector barrier_val_vec = {barrier_value}; for (const auto& core : cores) { - write_to_device(barrier_val_vec.data(), barrier_val_vec.size() * sizeof(uint32_t), tt_cxy_pair(chip, core), barrier_addr, fallback_tlb); - } - tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed + write_to_device( + barrier_val_vec.data(), + barrier_val_vec.size() * sizeof(uint32_t), + tt_cxy_pair(chip, core), + barrier_addr, + fallback_tlb); + } + tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed while (cores_synced.size() != cores.size()) { - for(const auto& core : cores) { + for (const auto& core : cores) { if (cores_synced.find(core) == cores_synced.end()) { uint32_t readback_val; - 
read_from_device(&readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); + read_from_device( + &readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); if (readback_val == barrier_value) { cores_synced.insert(core); - } - else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + } else { + log_trace( + LogSiliconDriver, + "Waiting for core {} to recieve mem bar flag {} in function", + core.str(), + barrier_value); } } } } // Ensure that reads or writes after this do not get reordered. // Reordering can cause races where data gets transferred before the barrier has returned - tt_driver_atomics::mfence(); + tt_driver_atomics::mfence(); } -void Cluster::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { +void Cluster::insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); + const scoped_lock lock( + *get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } void Cluster::init_membars() { - for(const auto& chip : target_devices_in_cluster) { - if (ndesc -> is_chip_mmio_capable(chip)) { - set_membar_flag(chip, workers_per_chip.at(chip), tt_MemBarFlag::RESET, l1_address_params.tensix_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, dram_cores, 
tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); + for (const auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { + set_membar_flag( + chip, + workers_per_chip.at(chip), + tt_MemBarFlag::RESET, + l1_address_params.tensix_l1_barrier_base, + "LARGE_WRITE_TLB"); + set_membar_flag( + chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); + set_membar_flag( + chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); } } } -void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { + +void Cluster::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { const auto& all_workers = workers_per_chip.at(chip); const auto& all_eth = eth_cores; if (cores.size()) { // Insert barrier on specific cores with L1 std::unordered_set workers_to_sync = {}; std::unordered_set eth_to_sync = {}; - + for (const auto& core : cores) { if (all_workers.find(core) != all_workers.end()) { workers_to_sync.insert(core); @@ -2308,59 +2969,60 @@ void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, c log_fatal("Can only insert an L1 Memory barrier on Tensix or Ethernet cores."); } } - insert_host_to_device_barrier(chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); + insert_host_to_device_barrier( + chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, eth_to_sync, l1_address_params.eth_l1_barrier_base, fallback_tlb); } else { // Insert barrier on all cores with L1 insert_host_to_device_barrier(chip, all_workers, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, all_eth, l1_address_params.eth_l1_barrier_base, 
fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { if (cores.size()) { - for(const auto& core : cores) { - log_assert(dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); + for (const auto& core : cores) { + log_assert( + dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); } insert_host_to_device_barrier(chip, cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + if (ndesc->is_chip_mmio_capable(chip)) { if (channels.size()) { std::unordered_set dram_cores_to_sync = {}; - for(const auto& chan : channels) { + for (const auto& chan : channels) { dram_cores_to_sync.insert(get_soc_descriptor(chip).get_core_for_dram_channel(chan, 0)); } - insert_host_to_device_barrier(chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + insert_host_to_device_barrier( + chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { 
+ } else { wait_for_non_mmio_flush(); } } -void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { +void Cluster::write_to_device( + const void* mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); + if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { write_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { @@ -2368,100 +3030,118 @@ void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair co } } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); } } -void Cluster::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 
4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. std::memcpy(mem_ptr, aligned_buf.local_storage, size); } } - -void Cluster::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. 
auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } -void Cluster::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); +void Cluster::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { read_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { read_device_memory(mem_ptr, core, addr, size, fallback_tlb); } - } - else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet reads from a single chip cluster!"); + } else { + log_assert( + arch_name != tt::ARCH::BLACKHOLE, + "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet reads from a single chip cluster!"); read_from_non_mmio_device(mem_ptr, core, addr, size); } } -int Cluster::arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { +int Cluster::arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, 
+ uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "ARC messages not supported in Blackhole"); - if(ndesc -> is_chip_mmio_capable(logical_device_id)) { + if (ndesc->is_chip_mmio_capable(logical_device_id)) { return pcie_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); - } - else { + } else { return remote_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); } } -void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0, "REG_TLB"); tt_driver_atomics::sfence(); } -void Cluster::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_remote_tensix_risc_reset_to_core( + const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_non_mmio_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0); tt_driver_atomics::sfence(); } -int Cluster::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { +int Cluster::set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - return remote_arc_msg(chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); + return remote_arc_msg( + chip, 
get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); } - void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); } int msg_rt = remote_arc_msg(chip, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success, NULL); if (msg_rt == MSG_ERROR_REPLY) { @@ -2470,16 +3150,14 @@ void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { } } - -void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { - if(arch_name == tt::ARCH::GRAYSKULL) { - for (auto &device_it : m_pci_device_map) { +void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets) { + if (arch_name == tt::ARCH::GRAYSKULL) { + for (auto& device_it : m_pci_device_map) { broadcast_pcie_tensix_risc_reset(device_it.first, soft_resets); } - } - else { + } else { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; std::set chips_to_exclude = {}; std::set rows_to_exclude; std::set columns_to_exclude; @@ -2491,7 +3169,14 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption columns_to_exclude = {0, 5}; } std::string fallback_tlb = "LARGE_WRITE_TLB"; - broadcast_write_to_cluster(&valid_val, sizeof(uint32_t), 0xFFB121B0, chips_to_exclude, rows_to_exclude, columns_to_exclude, fallback_tlb); + broadcast_write_to_cluster( + &valid_val, + sizeof(uint32_t), + 0xFFB121B0, + 
chips_to_exclude, + rows_to_exclude, + columns_to_exclude, + fallback_tlb); // Ensure that reset signal is globally visible wait_for_non_mmio_flush(); } @@ -2500,22 +3185,23 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption void Cluster::set_power_state(tt_DevicePowerState device_state) { // MT Initial BH - ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { - for(auto& chip : target_devices_in_cluster) { - if(ndesc -> is_chip_mmio_capable(chip)) { + for (auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { set_pcie_power_state(device_state); } else { int exit_code = set_remote_power_state(chip, device_state); - log_assert(exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); + log_assert( + exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); } } } } void Cluster::enable_ethernet_queue(int timeout) { - for (const chip_id_t &chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { auto arch = get_soc_descriptor(chip).arch; - switch (arch) { + switch (arch) { case tt::ARCH::WORMHOLE_B0: { if (ndesc->is_chip_mmio_capable(chip)) { enable_local_ethernet_queue(chip, timeout); @@ -2524,20 +3210,17 @@ void Cluster::enable_ethernet_queue(int timeout) { } break; - case tt::ARCH::BLACKHOLE: - log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); + case tt::ARCH::BLACKHOLE: + log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); } default: { break; } } - } } -std::set Cluster::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set Cluster::get_target_remote_device_ids() { return target_remote_chips; } void Cluster::deassert_resets_and_set_power_state() { // Assert tensix resets on all chips in cluster @@ -2546,15 +3229,29 @@ void Cluster::deassert_resets_and_set_power_state() { // MT Initial BH 
- ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets - for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); - } - if(ndesc != nullptr) { - for(const chip_id_t& chip : target_devices_in_cluster) { - if(!ndesc -> is_chip_mmio_capable(chip)) { + for (auto& device_it : m_pci_device_map) { + arc_msg( + device_it.first, + 0xaa00 | + device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0, + 0); + } + if (ndesc != nullptr) { + for (const chip_id_t& chip : target_devices_in_cluster) { + if (!ndesc->is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); auto pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + remote_arc_msg( + chip, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0x0, + 0x0, + 1, + NULL, + NULL); } } enable_ethernet_queue(30); @@ -2565,11 +3262,16 @@ void Cluster::deassert_resets_and_set_power_state() { } void Cluster::verify_eth_fw() { - for(const auto& chip : target_devices_in_cluster) { + for (const auto& chip : target_devices_in_cluster) { uint32_t fw_version; std::vector fw_versions; - for (const tt_xy_pair ð_core : get_soc_descriptor(chip).ethernet_cores) { - read_from_device(&fw_version, tt_cxy_pair(chip, eth_core), l1_address_params.fw_version_addr, sizeof(uint32_t), "LARGE_READ_TLB"); + for (const tt_xy_pair& eth_core : get_soc_descriptor(chip).ethernet_cores) { + read_from_device( + &fw_version, + tt_cxy_pair(chip, eth_core), + l1_address_params.fw_version_addr, + sizeof(uint32_t), + "LARGE_READ_TLB"); 
fw_versions.push_back(fw_version); } verify_sw_fw_versions(chip, SW_VERSION, fw_versions); @@ -2577,7 +3279,7 @@ void Cluster::verify_eth_fw() { } } -void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) { +void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) { tt_version sw(sw_version), fw_first_eth_core(fw_versions.at(0)); log_info( LogSiliconDriver, @@ -2585,7 +3287,7 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std sw.str(), fw_first_eth_core.str(), device_id); - for (std::uint32_t &fw_version : fw_versions) { + for (std::uint32_t& fw_version : fw_versions) { tt_version fw(fw_version); log_assert(fw == fw_first_eth_core, "FW versions are not the same across different ethernet cores"); log_assert(sw.major == fw.major, "SW/FW major version number out of sync"); @@ -2598,14 +3300,16 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std use_ethernet_broadcast &= fw_first_eth_core >= tt_version(6, 5, 0); // Virtual coordinates can be used for broadcast headers if ERISC FW >= 6.8.0 and NOC translation is enabled // Temporarily enable this feature for 6.7.241 as well for testing. 
- use_virtual_coords_for_eth_broadcast &= (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && translation_tables_en; + use_virtual_coords_for_eth_broadcast &= + (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && + translation_tables_en; } -void Cluster::start_device(const tt_device_params &device_params) { - if(device_params.init_device) { +void Cluster::start_device(const tt_device_params& device_params) { + if (device_params.init_device) { initialize_pcie_devices(); // MT Initial BH - Ethernet firmware not present in Blackhole - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { verify_eth_fw(); } deassert_resets_and_set_power_state(); @@ -2617,7 +3321,6 @@ void Cluster::close_device() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } - void Cluster::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; } @@ -2634,24 +3337,29 @@ void Cluster::set_driver_eth_interface_params(const tt_driver_eth_interface_para eth_interface_params = eth_interface_params_; } -void Cluster::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { +void Cluster::setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { map_core_to_tlb_per_chip[logical_device_id] = mapping_function; tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t Cluster::get_num_dram_channels(std::uint32_t device_id) { - log_assert(target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), "Querying DRAM parameters for a device that does not exist."); + log_assert( + target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), + "Querying DRAM parameters for a device that does not exist."); return get_soc_descriptor(device_id).get_num_dram_channels(); } std::uint64_t 
Cluster::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { log_assert(channel < get_num_dram_channels(device_id), "Querying size for a device channel that does not exist."); - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } std::uint32_t Cluster::get_num_host_channels(std::uint32_t device_id) { auto devices = get_target_mmio_device_ids(); - log_assert(devices.find(device_id) != devices.end(), "Querying Host Address parameters for a non-mmio device or a device does not exist."); + log_assert( + devices.find(device_id) != devices.end(), + "Querying Host Address parameters for a non-mmio device or a device does not exist."); return m_pci_device_map.at(device_id)->get_num_host_mem_channels(); } @@ -2669,22 +3377,22 @@ std::uint32_t Cluster::get_numa_node_for_pcie_device(std::uint32_t device_id) { std::uint64_t Cluster::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { // TODO: Should probably be lowered to TTDevice. tt::ARCH arch = get_soc_descriptor(chip_id).arch; - if(arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } tt_version Cluster::get_ethernet_fw_version() const { log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); - log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version."); + log_assert( + eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, + "Device must be started before querying Ethernet FW version."); return eth_fw_version; } -} +} // namespace tt::umd diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp index 330eb864..eb3bda7e 100644 --- a/device/coordinate_manager.cpp +++ b/device/coordinate_manager.cpp @@ -4,9 +4,11 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "umd/device/coordinate_manager.h" + #include -#include "umd/device/coordinate_manager.h" + #include "grayskull/grayskull_coordinate_manager.h" +#include "umd/device/coordinate_manager.h" tt_physical_coords CoordinateManager::to_physical_coords(tt_logical_coords logical_coords) { return tt_physical_coords(logical_x_to_physical_x[logical_coords.x], logical_y_to_physical_y[logical_coords.y]); @@ -71,13 +73,9 @@ void CoordinateManager::clear_harvesting_structures() { virtual_y_to_logical_y.clear(); } -std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } -std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { clear_harvesting_structures(); @@ -104,14 +102,16 @@ void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { 
logical_x_to_virtual_x.resize(grid_size_x - num_harvested_x); logical_y_to_virtual_y.resize(grid_size_y - num_harvested_y); - fill_logical_to_physical_mapping(x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); + fill_logical_to_physical_mapping( + x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); fill_logical_to_virtual_mapping(physical_x_unharvested, physical_y_unharvested); } void CoordinateManager::fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, - const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { - + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); std::size_t logical_y = 0; for (size_t y = 0; y < worker_grid_size.y; y++) { @@ -130,7 +130,7 @@ void CoordinateManager::fill_logical_to_physical_mapping( auto physical_x_it = physical_x_unharvested.begin(); std::size_t logical_x = 0; - for(std::size_t x = 0; x < worker_grid_size.x; x++) { + for (std::size_t x = 0; x < worker_grid_size.x; x++) { if (x_to_harvest.find(x) == x_to_harvest.end()) { logical_x_to_physical_x[logical_x] = *physical_x_it; if (physical_x_to_logical_x.find(*physical_x_it) != physical_x_to_logical_x.end()) { @@ -145,7 +145,8 @@ void CoordinateManager::fill_logical_to_physical_mapping( } } -void CoordinateManager::fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { +void CoordinateManager::fill_logical_to_virtual_mapping( + const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); for (std::size_t y = 0; y < logical_y_to_virtual_y.size(); y++) { logical_y_to_virtual_y[y] = *physical_y_it; @@ -176,7 +177,6 @@ std::unique_ptr 
CoordinateManager::get_coordinate_manager( const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) { - switch (arch) { case tt::ARCH::GRAYSKULL: return std::make_unique(worker_grid_size, workers, harvesting_mask); diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index b51a26cc..5c9f278b 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -2,17 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "cpuset_lib.hpp" + #include +#include +#include #include "cpuset_lib.hpp" +#include "fmt/core.h" #include "logger.hpp" -#include #include "umd/device/cluster.h" -#include -#include "fmt/core.h" + namespace tt { namespace fs = std::filesystem; + namespace cpuset { ///////////////////////////////////////////////////////////////////////// @@ -21,15 +25,18 @@ namespace cpuset { // Constructor for singleton class cpu id allocator tt_cpuset_allocator::tt_cpuset_allocator() { - - m_pid = getpid(); - m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; + m_pid = getpid(); + m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? 
true : false; auto system_tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", + m_pid, + system_tid); m_enable_cpuset_allocator = true; @@ -38,86 +45,102 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_enable_cpuset_allocator &= init_get_number_of_packages(); m_enable_cpuset_allocator &= init_find_tt_pci_devices_packages_numanodes(); - if (!cpuset_allocator_enable_env){ + if (!cpuset_allocator_enable_env) { m_enable_cpuset_allocator = false; - }else{ - - bool is_cpu_supported = init_is_cpu_model_supported(); + } else { + bool is_cpu_supported = init_is_cpu_model_supported(); - if (is_cpu_supported){ + if (is_cpu_supported) { m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - }else{ + } else { m_enable_cpuset_allocator = false; } - log_debug(LogSiliconDriver,"Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} thread_id: {} ", m_enable_cpuset_allocator, m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} " + "thread_id: {} ", + m_enable_cpuset_allocator, + m_pid, + system_tid); } } // Step 1 : Initialize and perform m_topology detection -bool tt_cpuset_allocator::init_topology_init_and_load(){ - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::topology_init_and_load()"); +bool tt_cpuset_allocator::init_topology_init_and_load() { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::topology_init_and_load()"); - if (!m_enable_cpuset_allocator){ + if (!m_enable_cpuset_allocator) { return false; } - if (hwloc_topology_init(&m_topology)){ + if (hwloc_topology_init(&m_topology)) { log_warning(LogSiliconDriver, "Problem initializing topology"); return false; } - 
hwloc_topology_set_type_filter(m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. + hwloc_topology_set_type_filter( + m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. - if (hwloc_topology_load(m_topology)){ + if (hwloc_topology_load(m_topology)) { log_warning(LogSiliconDriver, "Problem loading topology"); return false; } - return true; // Success + return true; // Success } -// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and numamode. -bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ - - if (!m_enable_cpuset_allocator){ +// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and +// numamode. +bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); + log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); m_num_tt_device_by_pci_device_id_map.clear(); hwloc_obj_t pci_device_obj = NULL; const std::regex tt_device_re("tenstorrent!([0-9]+)"); - while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))){ - - if (hwloc_obj_type_is_io(pci_device_obj->type) && (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { - - std::pair device_id_revision = std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); + while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))) { + if (hwloc_obj_type_is_io(pci_device_obj->type) && + (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { + std::pair device_id_revision = + std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); 
m_num_tt_device_by_pci_device_id_map[device_id_revision] += 1; - std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); + std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); std::string pci_device_dir = fmt::format("/sys/bus/pci/devices/{}/tenstorrent/", pci_bus_id_str); int physical_device_id = -1; - log_trace(LogSiliconDriver, "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", pci_bus_id_str, m_num_tt_device_by_pci_device_id_map[device_id_revision]); + log_trace( + LogSiliconDriver, + "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", + pci_bus_id_str, + m_num_tt_device_by_pci_device_id_map[device_id_revision]); // First, get the physical_device_id of the device. - if (fs::exists(pci_device_dir)){ - for (const auto &entry : fs::directory_iterator(pci_device_dir)){ + if (fs::exists(pci_device_dir)) { + for (const auto &entry : fs::directory_iterator(pci_device_dir)) { auto entry_str = entry.path().string(); - if (std::smatch device_match; std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)){ + if (std::smatch device_match; + std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)) { physical_device_id = stoi(device_match[1]); m_all_tt_devices.push_back(physical_device_id); - log_debug(LogSiliconDriver, "Found physical_device_id: {} from file: {}", physical_device_id, entry_str); + log_debug( + LogSiliconDriver, + "Found physical_device_id: {} from file: {}", + physical_device_id, + entry_str); break; } } - if (physical_device_id == -1){ - log_warning(LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); + if (physical_device_id == -1) { + log_warning( + LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); return false; } @@ -125,19 +148,23 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ // Next, get the PackageID of the device 
and update maps. auto package_id = get_package_id_from_device(pci_device_obj, physical_device_id); - - // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this + + // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this // package and structures storing the CPU characteristics per package. if (m_package_id_to_devices_map.find(package_id) == m_package_id_to_devices_map.end()) { m_package_id_to_devices_map.insert({package_id, {}}); m_package_id_to_num_l3_per_ccx_map.insert({package_id, 0}); m_package_id_to_num_ccx_per_ccd_map.insert({package_id, 0}); } - if (package_id != -1){ + if (package_id != -1) { m_package_id_to_devices_map.at(package_id).push_back(physical_device_id); m_physical_device_id_to_package_id_map.insert({physical_device_id, package_id}); } else { - log_warning(LogSiliconDriver, "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + log_warning( + LogSiliconDriver, + "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } @@ -145,377 +172,479 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ auto numa_nodeset = get_numa_nodeset_from_device(pci_device_obj, physical_device_id); m_physical_device_id_to_numa_nodeset_map.insert({physical_device_id, numa_nodeset}); - if (numa_nodeset == 0x0){ - log_warning(LogSiliconDriver, "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + if (numa_nodeset == 0x0) { + log_warning( + LogSiliconDriver, + "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } - m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. 
+ m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. m_num_cpu_cores_allocated_per_tt_device.insert({physical_device_id, 0}); } } } - if (m_all_tt_devices.size() == 0){ - log_warning(LogSiliconDriver, "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", TENSTORRENT_VENDOR_ID); + if (m_all_tt_devices.size() == 0) { + log_warning( + LogSiliconDriver, + "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", + TENSTORRENT_VENDOR_ID); return false; } - log_debug(LogSiliconDriver,"Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", m_all_tt_devices.size()); - + log_debug( + LogSiliconDriver, + "Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", + m_all_tt_devices.size()); // Sort these 2 vectors of device_ids before we are done, since discovery can be in any order. - for (auto &p: m_package_id_to_devices_map){ + for (auto &p : m_package_id_to_devices_map) { std::sort(p.second.begin(), p.second.end()); } std::sort(m_all_tt_devices.begin(), m_all_tt_devices.end()); - return true; // Success + return true; // Success } - // Step 3 : Detect the number of packages. -bool tt_cpuset_allocator::init_get_number_of_packages(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_get_number_of_packages() { + if (!m_enable_cpuset_allocator) { return false; } m_num_packages = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_PACKAGE); - log_debug(LogSiliconDriver,"Found {} CPU packages", m_num_packages); - return m_num_packages > 0; // Success + log_debug(LogSiliconDriver, "Found {} CPU packages", m_num_packages); + return m_num_packages > 0; // Success } // Step 4 : Return true if all packages are models we want to support. Env-var can be used to ignore this check. 
-bool tt_cpuset_allocator::init_is_cpu_model_supported(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_is_cpu_model_supported() { + if (!m_enable_cpuset_allocator) { return false; } - if (m_num_packages == 0){ - log_debug(LogSiliconDriver,"init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); + if (m_num_packages == 0) { + log_debug(LogSiliconDriver, "init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); return false; } bool use_any_cpu = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SUPPORT_ANY_CPU") ? true : false; - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); // Supported CPU Models for enabling CPUSET Allocator. Keep the list small to production machines to start. - std::vector supported_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; + std::vector supported_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; // CPU Models that have L3 per CCX and 2 CCX per CCD - std::vector opt_2ccx_per_ccd_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; - for(const auto& package: m_package_id_to_devices_map) { + std::vector opt_2ccx_per_ccd_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } std::string pkg_cpu_model = hwloc_obj_get_info_by_name(package_obj, "CPUModel"); // First find out if this CPU is supported by CPUSET Allocator at all. bool has_supported_cpu = use_any_cpu ? 
true : false; - for (auto &supported_cpu_model : supported_cpu_models){ + for (auto &supported_cpu_model : supported_cpu_models) { has_supported_cpu |= (pkg_cpu_model.find(supported_cpu_model) != std::string::npos); } - log_debug(LogSiliconDriver,"Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", package_id, has_supported_cpu, pkg_cpu_model); + log_debug( + LogSiliconDriver, + "Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", + package_id, + has_supported_cpu, + pkg_cpu_model); - if (!has_supported_cpu){ + if (!has_supported_cpu) { return false; } // Then, determine if the 2CCX-PER-CCD optimization can be enabled for this CPU Model in the package. - for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models){ - if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos){ + for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models) { + if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos) { m_package_id_to_num_l3_per_ccx_map.at(package_id) = 1; m_package_id_to_num_ccx_per_ccd_map.at(package_id) = 2; } } } - return true; // Successhwloc + return true; // Successhwloc } - -// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given socket/package. -bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ - - if (!m_enable_cpuset_allocator){ +// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given +// socket/package. 
+bool tt_cpuset_allocator::init_determine_cpuset_allocations() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); - for (const auto& package : m_package_id_to_devices_map) { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto num_tt_devices_for_cpu_package = package.second.size(); - if (num_tt_devices_for_cpu_package == 0){ - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", package_id); + if (num_tt_devices_for_cpu_package == 0) { + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", + package_id); continue; } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
starting to detect allocation slots for package_id: {} ", + package_id); auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } - auto num_alloc_slots_in_package = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); - if (num_alloc_slots_in_package == 0){ - log_warning(LogSiliconDriver, "Could not find any of the alloc objects in package_id: {} for this cpu arc", package_id); + auto num_alloc_slots_in_package = + hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); + if (num_alloc_slots_in_package == 0) { + log_warning( + LogSiliconDriver, + "Could not find any of the alloc objects in package_id: {} for this cpu arc", + package_id); return false; } auto num_alloc_slots_per_tt_device = num_alloc_slots_in_package / num_tt_devices_for_cpu_package; // Above splits evenly by devices, leaves remainder unused in the example case of 3 devices but 8 slots. - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", - package_id, num_alloc_slots_in_package, num_tt_devices_for_cpu_package, num_alloc_slots_per_tt_device); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
package_id: {} num_alloc_slots_in_package: {} " + "num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", + package_id, + num_alloc_slots_in_package, + num_tt_devices_for_cpu_package, + num_alloc_slots_per_tt_device); int device_idx = 0; - for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++){ - - auto obj = hwloc_get_obj_below_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); + for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++) { + auto obj = hwloc_get_obj_below_by_type( + m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - if (obj){ - if (m_debug) print_hwloc_object(obj, 1, true); + if (obj) { + if (m_debug) { + print_hwloc_object(obj, 1, true); + } auto physical_device_id = m_package_id_to_devices_map.at(package_id).at(device_idx); // Hack for maximum number of slots per device. // if (m_physical_device_id_to_cpusets_map.at(physical_device_id).size() < 2){ m_physical_device_id_to_cpusets_map.at(physical_device_id).push_back(obj->cpuset); - int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology,obj->cpuset,HWLOC_OBJ_CORE); + int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, obj->cpuset, HWLOC_OBJ_CORE); m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) += num_cpus; // } // We're distributing allocation objects per package across TT devices, so switch to next one. - if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0){ - device_idx = (device_idx + 1) % num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to first device for that package. + if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0) { + device_idx = (device_idx + 1) % + num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to + // first device for that package. } - }else{ - log_warning(LogSiliconDriver, "init_determine_cpuset_allocations(). 
Something went wrong looking for cpuset alloc object under package"); + } else { + log_warning( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under " + "package"); return false; } } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", + package_id); } - // Summary for Debug purposes. - for (auto &physical_device_id : m_all_tt_devices){ - for (size_t device_alloc_idx=0; device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); device_alloc_idx++){ + for (auto &physical_device_id : m_all_tt_devices) { + for (size_t device_alloc_idx = 0; + device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); + device_alloc_idx++) { auto cpuset = m_physical_device_id_to_cpusets_map.at(physical_device_id).at(device_alloc_idx); auto pu_ids_vector = get_hwloc_bitmap_vector(cpuset); auto num_pu_ids = pu_ids_vector.size(); auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} device_alloc_idx: {} picked {} PU's {}", physical_device_id, package_id, device_alloc_idx, num_pu_ids, pu_ids_vector); + log_debug( + LogSiliconDriver, + "Done init_determine_cpuset_allocations(). 
Summary => for mmio physical_device_id: {} package_id: {} " + "device_alloc_idx: {} picked {} PU's {}", + physical_device_id, + package_id, + device_alloc_idx, + num_pu_ids, + pu_ids_vector); } } - return true; // Success - + return true; // Success } ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. -bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - +// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously +// allocated memory region to it. +bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: {} (pid: {} tid: {})", physical_device_id, m_pid, tid); - - if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0){ - log_fatal("bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not expected.", physical_device_id); + log_debug( + LogSiliconDriver, + "bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: " + "{} (pid: {} tid: {})", + physical_device_id, + m_pid, + tid); + + if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0) { + log_fatal( + "bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not " + "expected.", + physical_device_id); return false; } auto target_nodeset = 
m_physical_device_id_to_numa_nodeset_map.at(physical_device_id); - if (target_nodeset != 0){ - if (hwloc_set_area_membind(m_topology, addr, len, target_nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE) ){ - log_warning(LogSiliconDriver,"hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} tid: {})", - physical_device_id, get_hwloc_bitmap_vector(target_nodeset), strerror(errno), m_pid, tid); + if (target_nodeset != 0) { + if (hwloc_set_area_membind( + m_topology, + addr, + len, + target_nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE)) { + log_warning( + LogSiliconDriver, + "hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} " + "tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + strerror(errno), + m_pid, + tid); return false; - }else{ - log_debug(LogSiliconDriver,"hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", physical_device_id, get_hwloc_bitmap_vector(target_nodeset), m_pid, tid); + } else { + log_debug( + LogSiliconDriver, + "hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + m_pid, + tid); } - }else{ - log_warning(LogSiliconDriver,"bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. Skipping membind.", physical_device_id); + } else { + log_warning( + LogSiliconDriver, + "bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. 
" + "Skipping membind.", + physical_device_id); return false; } - return true; // Success + return true; // Success } int tt_cpuset_allocator::_get_num_tt_pci_devices() { - for (auto &d : m_physical_device_id_to_package_id_map) { log_trace(LogSiliconDriver, "Found physical_device_id: {} ", d.first); } return m_physical_device_id_to_package_id_map.size(); } - - - ///////////////////////////////////////////////////////////////////////// -//Helper Functions ////////////////////////////////////////////////////// +// Helper Functions ////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - -std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj){ - +std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj) { std::string pci_bus_id_str = ""; - if (hwloc_obj_type_is_io(pci_device_obj->type)) { + if (hwloc_obj_type_is_io(pci_device_obj->type)) { auto attrs = pci_device_obj->attr->pcidev; pci_bus_id_str = fmt::format("{:04x}:{:02x}:{:02x}.{:01x}", attrs.domain, attrs.bus, attrs.dev, attrs.func); } return pci_bus_id_str; - } -int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = hwloc_get_non_io_ancestor_obj(m_topology, pci_device_obj); int package_id = -1; // Keep going up until package/machine hierarchy is found, in case we don't find it right away. 
- while (package_id == -1){ - - if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)){ - if (tmp_obj->os_index != (unsigned) -1){ + while (package_id == -1) { + if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || + (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)) { + if (tmp_obj->os_index != (unsigned)-1) { package_id = tmp_obj->os_index; - }else{ - log_warning(LogSiliconDriver, "Could not find os_index of package or machine object for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "Could not find os_index of package or machine object for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); break; } - }else{ - if (tmp_obj->parent){ + } else { + if (tmp_obj->parent) { tmp_obj = tmp_obj->parent; - }else{ + } else { break; } } } - if (m_debug) print_hwloc_object(pci_device_obj, 1, true, true); - if (m_debug) print_hwloc_object(tmp_obj, 1, true, true); + if (m_debug) { + print_hwloc_object(pci_device_obj, 1, true, true); + } + if (m_debug) { + print_hwloc_object(tmp_obj, 1, true, true); + } return package_id; } -hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device( + hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { hwloc_nodeset_t nodeset = 0x0; // Currently an issue in non-EPYC machines where PCI devices are directly under Machine, and not any NumaNodes. // As quick workaround, skip this if there is only single numanode since returning 1 seems fine. 
- if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1){ + if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1) { auto numanode = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, 0); return numanode->nodeset; } auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding NumaNode.", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's " + "corresponding NumaNode.", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = pci_device_obj->parent; - while (tmp_obj && !tmp_obj->memory_arity){ + while (tmp_obj && !tmp_obj->memory_arity) { tmp_obj = tmp_obj->parent; /* no memory child, walk up */ } - if (tmp_obj && tmp_obj->nodeset){ - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found NumaNodeSet: {}", physical_device_id, pci_bus_id_str, get_hwloc_bitmap_vector(tmp_obj->nodeset)); + if (tmp_obj && tmp_obj->nodeset) { + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found " + "NumaNodeSet: {}", + physical_device_id, + pci_bus_id_str, + get_hwloc_bitmap_vector(tmp_obj->nodeset)); nodeset = tmp_obj->nodeset; - }else{ - log_warning(LogSiliconDriver, "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); } return nodeset; - } int 
tt_cpuset_allocator::_get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision) { - std::pair device_id_revision = std::make_pair(device_id, revision); if (m_num_tt_device_by_pci_device_id_map.find(device_id_revision) != m_num_tt_device_by_pci_device_id_map.end()) { return m_num_tt_device_by_pci_device_id_map.at(device_id_revision); } else { - log_warning(LogSiliconDriver, "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", device_id, revision); + log_warning( + LogSiliconDriver, + "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", + device_id, + revision); return 0; } } ///////////////////////////////////////////////////////////////////////// -//Debug Functions /////////////////////////////////////////////////////// +// Debug Functions /////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// // Get all PU ids (or numa nodes) in a vector, for legacy/back-compat/debug purposes. -std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap){ - +std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap) { std::vector indices; int index; - if (bitmap){ - hwloc_bitmap_foreach_begin(index, bitmap) - indices.push_back(index); + if (bitmap) { + hwloc_bitmap_foreach_begin(index, bitmap) indices.push_back(index); hwloc_bitmap_foreach_end(); } return indices; } -std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->cpuset); } -std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->nodeset); } - // Nicer way to print pu ids as a vector on single line. 
-void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj){ +void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj) { std::cout << " Number: " << hwloc_bitmap_weight(obj->cpuset) << " cpuset_pu_ids: " << get_hwloc_cpuset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj){ - std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); +void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj) { + std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) + << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids){ - +void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids) { char type[32], attr[1024]; hwloc_obj_type_snprintf(type, sizeof(type), obj, verbose); - printf("%*s%s", 2*depth, "", type); - if (obj->os_index != (unsigned) -1) + printf("%*s%s", 2 * depth, "", type); + if (obj->os_index != (unsigned)-1) { printf("#%u", obj->os_index); + } hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", verbose); - if (*attr) + if (*attr) { printf("(%s)", attr); - if (show_cpuids && obj->cpuset) + } + if (show_cpuids && obj->cpuset) { print_hwloc_cpuset(obj); + } printf("\n"); } - } // namespace cpuset } // namespace tt diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index a08bc7cc..01210c02 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -4,19 +4,18 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once +#include + #include -#include -#include #include +#include #include -#include - -#include "umd/device/tt_cluster_descriptor.h" // For chip_id_t +#include #include "hwloc.h" +#include "umd/device/tt_cluster_descriptor.h" // For chip_id_t using tt_cluster_description = tt_ClusterDescriptor; @@ -27,90 +26,87 @@ namespace cpuset { // CPU ID allocator for pinning threads to 
cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { - public: - - tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; - void operator=(tt_cpuset_allocator const&) = delete; - - // Bind an already allocated memory region to particular numa nodes - static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - auto& instance = tt_cpuset_allocator::get(); - return instance.bind_area_memory_nodeset(physical_device_id, addr, len); - } - - static int get_num_tt_pci_devices(){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices(); - } - - static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - } - - private: - - static tt_cpuset_allocator& get() { - static tt_cpuset_allocator instance; - return instance; - } - - tt_cpuset_allocator(); - - int TENSTORRENT_VENDOR_ID = 0x1e52; - - bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - int _get_num_tt_pci_devices(); - int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - - // Series of init functions, must be called in this order. Seperated out to support - // early exit in case of errors. 
- bool init_topology_init_and_load(); - bool init_find_tt_pci_devices_packages_numanodes(); - bool init_get_number_of_packages(); - bool init_is_cpu_model_supported(); - bool init_determine_cpuset_allocations(); - - // Helper Functions - std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); - int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - - // Debug Functions - void print_hwloc_cpuset(hwloc_obj_t &obj); - void print_hwloc_nodeset(hwloc_obj_t &obj); - void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); - std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); - std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); - std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); - hwloc_topology_t m_topology; - bool m_debug; - pid_t m_pid; - - // Items calculated by parsing system info, used by allocation algorithm: - std::map> m_package_id_to_devices_map; - std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info - std::map, int> m_num_tt_device_by_pci_device_id_map; - - std::map> m_physical_device_id_to_cpusets_map; - std::map m_physical_device_id_to_package_id_map; - - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. - int m_num_packages = 0; - std::vector m_all_tt_devices = {}; - - hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default +public: + tt_cpuset_allocator(tt_cpuset_allocator const &) = delete; + void operator=(tt_cpuset_allocator const &) = delete; + + // Bind an already allocated memory region to particular numa nodes + static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { + auto &instance = tt_cpuset_allocator::get(); + return instance.bind_area_memory_nodeset(physical_device_id, addr, len); + } - // For 2CCX-PER-CCD Optimization detection. 
- std::map m_package_id_to_num_l3_per_ccx_map; - std::map m_package_id_to_num_ccx_per_ccd_map; + static int get_num_tt_pci_devices() { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices(); + } - // Memory Binding - std::map m_physical_device_id_to_numa_nodeset_map; + static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id) { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + } - // Helper for some dynamic multi-threading. - std::map m_num_cpu_cores_allocated_per_tt_device; +private: + static tt_cpuset_allocator &get() { + static tt_cpuset_allocator instance; + return instance; + } + tt_cpuset_allocator(); + + int TENSTORRENT_VENDOR_ID = 0x1e52; + + bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len); + int _get_num_tt_pci_devices(); + int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); + + // Series of init functions, must be called in this order. Seperated out to support + // early exit in case of errors. 
+ bool init_topology_init_and_load(); + bool init_find_tt_pci_devices_packages_numanodes(); + bool init_get_number_of_packages(); + bool init_is_cpu_model_supported(); + bool init_determine_cpuset_allocations(); + + // Helper Functions + std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); + int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + + // Debug Functions + void print_hwloc_cpuset(hwloc_obj_t &obj); + void print_hwloc_nodeset(hwloc_obj_t &obj); + void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); + std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); + std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); + std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); + hwloc_topology_t m_topology; + bool m_debug; + pid_t m_pid; + + // Items calculated by parsing system info, used by allocation algorithm: + std::map> m_package_id_to_devices_map; + std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info + std::map, int> m_num_tt_device_by_pci_device_id_map; + + std::map> m_physical_device_id_to_cpusets_map; + std::map m_physical_device_id_to_package_id_map; + + bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. + int m_num_packages = 0; + std::vector m_all_tt_devices = {}; + + hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default + + // For 2CCX-PER-CCD Optimization detection. + std::map m_package_id_to_num_l3_per_ccx_map; + std::map m_package_id_to_num_ccx_per_ccd_map; + + // Memory Binding + std::map m_physical_device_id_to_numa_nodeset_map; + + // Helper for some dynamic multi-threading. 
+ std::map m_num_cpu_cores_allocated_per_tt_device; }; template diff --git a/device/grayskull/grayskull_coordinate_manager.h b/device/grayskull/grayskull_coordinate_manager.h index acecbf22..cac8b29a 100644 --- a/device/grayskull/grayskull_coordinate_manager.h +++ b/device/grayskull/grayskull_coordinate_manager.h @@ -9,8 +9,8 @@ #include "umd/device/coordinate_manager.h" class GrayskullCoordinateManager : public CoordinateManager { - public: - GrayskullCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + GrayskullCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} }; diff --git a/device/grayskull/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp index b7199873..209f1c42 100644 --- a/device/grayskull/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -4,13 +4,12 @@ #include "umd/device/grayskull_implementation.h" -#include "grayskull/host_mem_address_map.h" #include "grayskull/eth_interface.h" - +#include "grayskull/host_mem_address_map.h" #include "umd/device/cluster.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH namespace tt::umd { @@ -90,7 +89,9 @@ std::pair grayskull_implementation::get_tlb_data( } tt_driver_host_address_params grayskull_implementation::get_host_address_params() const { - return {::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, 
::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params grayskull_implementation::get_eth_interface_params() const { diff --git a/device/hugepage.cpp b/device/hugepage.cpp index e9c45d63..8883bff2 100644 --- a/device/hugepage.cpp +++ b/device/hugepage.cpp @@ -6,11 +6,11 @@ #include "umd/device/hugepage.h" -#include // for umask -#include // for O_RDWR and other constants +#include // for O_RDWR and other constants +#include // for umask -#include "logger.hpp" #include "cpuset_lib.hpp" +#include "logger.hpp" const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; @@ -20,13 +20,12 @@ std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages namespace tt::umd { -uint32_t get_num_hugepages(){ - +uint32_t get_num_hugepages() { std::string nr_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"; std::ifstream hugepages_file(nr_hugepages_path); uint32_t num_hugepages = 0; - if(hugepages_file.is_open()) { + if (hugepages_file.is_open()) { std::string value; std::getline(hugepages_file, value); num_hugepages = std::stoi(value); @@ -36,100 +35,121 @@ uint32_t get_num_hugepages(){ } return num_hugepages; - } -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { - +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id. 
- uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - uint32_t total_hugepages = get_num_hugepages(); + uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); + uint32_t num_tt_mmio_devices_for_arch = + tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + uint32_t total_hugepages = get_num_hugepages(); // This shouldn't happen on silicon machines. if (num_tt_mmio_devices_for_arch == 0) { - log_warning(LogSiliconDriver, + log_warning( + LogSiliconDriver, "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0", - device_id, revision_id); + device_id, + revision_id); return 0; } - // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups that were incomplete - // ie fewer hugepages than devices, which would partially work previously for some devices. - uint32_t num_channels_per_device_available = std::min(num_channels_per_device_target, std::max((uint32_t) 1, total_hugepages / num_tt_mmio_devices_for_arch)); + // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups + // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices. + uint32_t num_channels_per_device_available = + std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch)); - // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later on. + // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later + // on. 
if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient Hugepages/HostMemChannels per device."); + log_warning( + LogSiliconDriver, + "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient " + "Hugepages/HostMemChannels per device."); } if (total_hugepages < num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. NumHostMemChannels would be 0, bumping to 1.", - total_hugepages, num_tt_mmio_devices_for_arch, device_id, revision_id); + log_warning( + LogSiliconDriver, + "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. " + "NumHostMemChannels would be 0, bumping to 1.", + total_hugepages, + num_tt_mmio_devices_for_arch, + device_id, + revision_id); } if (num_channels_per_device_available < num_channels_per_device_target) { - log_warning(LogSiliconDriver, - "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds NumHostMemChannels. Increase Number of Hugepages.", - num_channels_per_device_available, device_id, num_channels_per_device_target); + log_warning( + LogSiliconDriver, + "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds " + "NumHostMemChannels. 
Increase Number of Hugepages.", + num_channels_per_device_available, + device_id, + num_channels_per_device_target); } - log_assert(num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, + log_assert( + num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.", - num_channels_per_device_available, g_MAX_HOST_MEM_CHANNELS); + num_channels_per_device_available, + g_MAX_HOST_MEM_CHANNELS); return num_channels_per_device_available; - } -std::string find_hugepage_dir(std::size_t pagesize) -{ - - static const std::regex hugetlbfs_mount_re(fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); +std::string find_hugepage_dir(std::size_t pagesize) { + static const std::regex hugetlbfs_mount_re( + fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); static const std::regex pagesize_re("(?:^|,)pagesize=([0-9]+)([KMGT])(?:,|$)"); std::ifstream proc_mounts("/proc/mounts"); - for (std::string line; std::getline(proc_mounts, line); ) - { - if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) - { + for (std::string line; std::getline(proc_mounts, line);) { + if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) { std::string options = mount_match[3]; - if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) - { + if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) { std::size_t mount_page_size = std::stoull(pagesize_match[1]); - switch (pagesize_match[2].str()[0]) - { - case 'T': mount_page_size <<= 10; - case 'G': mount_page_size <<= 10; - case 'M': mount_page_size <<= 10; - case 'K': mount_page_size <<= 10; + switch (pagesize_match[2].str()[0]) { + case 'T': + mount_page_size <<= 10; + case 'G': + mount_page_size <<= 10; + case 'M': + mount_page_size <<= 10; + case 'K': + mount_page_size <<= 10; } - if 
(mount_page_size == pagesize) - { + if (mount_page_size == pagesize) { return mount_match[2]; } } } } - log_warning(LogSiliconDriver, "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: {}.", hugepage_dir, pagesize); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: " + "{}.", + hugepage_dir, + pagesize); return std::string(); } -int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel) { +int open_hugepage_file(const std::string& dir, chip_id_t physical_device_id, uint16_t channel) { std::vector filename; static const char pipeline_name[] = "tenstorrent"; filename.insert(filename.end(), dir.begin(), dir.end()); - if (filename.back() != '/') filename.push_back('/'); + if (filename.back() != '/') { + filename.push_back('/'); + } // In order to limit number of hugepages while transition from shared hugepage (1 per system) to unique // hugepage per device, will share original/shared hugepage filename with physical device 0. - if (physical_device_id != 0 || channel != 0){ + if (physical_device_id != 0 || channel != 0) { std::string device_id_str = fmt::format("device_{}_", physical_device_id); filename.insert(filename.end(), device_id_str.begin(), device_id_str.end()); } @@ -139,20 +159,32 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin filename.insert(filename.end(), channel_id_str.begin(), channel_id_str.end()); } - filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator + filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator std::string filename_str(filename.begin(), filename.end()); - filename_str.erase(std::find(filename_str.begin(), filename_str.end(), '\0'), filename_str.end()); // Erase NULL terminator for printing. 
- log_debug(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", filename_str.c_str(), physical_device_id, channel); + filename_str.erase( + std::find(filename_str.begin(), filename_str.end(), '\0'), + filename_str.end()); // Erase NULL terminator for printing. + log_debug( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", + filename_str.c_str(), + physical_device_id, + channel); // Save original and set umask to unrestricted. auto old_umask = umask(0); - int fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + int fd = + open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); if (fd == -1 && errno == EACCES) { - log_warning(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", filename_str); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", + filename_str); unlink(filename.data()); - fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + fd = open( + filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); } // Restore original mask @@ -166,4 +198,4 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin return fd; } -} // namespace tt::umd +} // namespace tt::umd diff --git a/device/ioctl.h b/device/ioctl.h index 60ec7b2f..1f732cfc 100644 --- a/device/ioctl.h +++ b/device/ioctl.h @@ -4,6 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ +// clang-format off +// This file is copied from KMD, so we don't want clang formatting diff. 
+ #ifndef TTDRIVER_IOCTL_H_INCLUDED #define TTDRIVER_IOCTL_H_INCLUDED @@ -155,3 +158,4 @@ struct tenstorrent_pin_pages { }; #endif +// clang-format on diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index 25985407..15107ebc 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -9,31 +9,42 @@ #include #include -#include "umd/device/tt_cluster_descriptor.h" #include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" class tt_MockupDevice : public tt_device { - public: - tt_MockupDevice(const std::string& sdesc_path) : tt_device(sdesc_path) { +public: + tt_MockupDevice(const std::string& sdesc_path) : tt_device() { soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; } + virtual ~tt_MockupDevice() {} // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors() override { return soc_descriptor_per_chip; } + void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) override {} + void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) override {} + void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) override {} - void set_driver_eth_interface_params( - const tt_driver_eth_interface_params& eth_interface_params_) override {} + + void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) override {} + void start_device(const tt_device_params& device_params) override {} + void assert_risc_reset() override {} + void deassert_risc_reset() override {} - void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + + void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + void 
assert_risc_reset_at_core(tt_cxy_pair core) override {} + void close_device() override {} // Runtime Functions @@ -43,10 +54,13 @@ class tt_MockupDevice : public tt_device { tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) override {} + void read_from_device( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) override {} + void write_to_sysmem( const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) override {} + void read_from_sysmem( void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) override {} @@ -54,10 +68,12 @@ class tt_MockupDevice : public tt_device { const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, @@ -66,27 +82,35 @@ class tt_MockupDevice : public tt_device { void wait_for_non_mmio_flush() override {} // Misc. 
Functions to Query/Set Device State - std::unordered_map get_harvesting_masks_for_soc_descriptors() override { - return {{0, 0}}; - } + std::unordered_map get_harvesting_masks_for_soc_descriptors() override { return {{0, 0}}; } + static std::vector detect_available_device_ids() { return {0}; }; + std::set get_target_remote_device_ids() override { return target_remote_chips; } + std::map get_clocks() override { return {{0, 0}}; } + void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const override { return nullptr; } + std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; } + std::uint32_t get_num_dram_channels(std::uint32_t device_id) override { return get_soc_descriptor(device_id).get_num_dram_channels(); }; + std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return get_soc_descriptor(device_id).dram_bank_size; } + std::uint32_t get_num_host_channels(std::uint32_t device_id) override { return 1; } + std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return 0; } + std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) override { return 0; } - private: +private: std::vector archs_in_cluster = {}; std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 433fe4bf..bdf40962 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -4,27 +4,27 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include "umd/device/pci_device.hpp" + +#include // for ::open +#include // for PCI_SLOT, PCI_FUNC +#include // for ioctl +#include // for mmap, munmap +#include // for fstat +#include // for ::close + #include -#include // for memcpy +#include // for memcpy #include -#include // for ::open -#include // for ::close -#include // for ioctl -#include // for mmap, munmap -#include // for fstat 
-#include // for PCI_SLOT, PCI_FUNC - -#include "umd/device/pci_device.hpp" -#include "ioctl.h" +#include "assert.hpp" +#include "cpuset_lib.hpp" #include "ioctl.h" -#include "umd/device/tt_arch_types.h" -#include "umd/device/driver_atomics.h" +#include "logger.hpp" #include "umd/device/architecture_implementation.h" -#include "cpuset_lib.hpp" +#include "umd/device/driver_atomics.h" #include "umd/device/hugepage.h" -#include "assert.hpp" -#include "logger.hpp" +#include "umd/device/tt_arch_types.h" static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; @@ -32,25 +32,29 @@ static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; // TODO: we'll have to rethink this when KMD takes control of the inbound PCIe // TLB windows and there is no longer a pre-defined WC/UC split. -static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156 << 20) + (10 << 21) + (18 << 24); // Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC -static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; +static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188 << 21; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; // Hugepages must be 1GB in size -const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB +const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB using namespace tt; using namespace tt::umd; template static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) { - const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", - device_info.pci_domain, device_info.pci_bus, - device_info.pci_device, device_info.pci_function, attribute_name); + const auto sysfs_path = fmt::format( + "/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", + device_info.pci_domain, + device_info.pci_bus, + device_info.pci_device, + device_info.pci_function, + attribute_name); std::ifstream attribute_file(sysfs_path); std::string value_str; T value; @@ -75,8 +79,7 @@ static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribu return value; } -static PciDeviceInfo read_device_info(int fd) -{ +static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -92,11 +95,11 @@ static PciDeviceInfo read_device_info(int fd) } static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) { - if (pcie_device_id == GS_PCIE_DEVICE_ID){ + if (pcie_device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ + } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01) { return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == BH_PCIE_DEVICE_ID){ + } else if (pcie_device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } else { 
TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); @@ -122,28 +125,29 @@ inline void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) if (dest_misalignment != 0) { // Read-modify-write for the first dest element. - dp = reinterpret_cast(dest_addr - dest_misalignment); + dp = reinterpret_cast(dest_addr - dest_misalignment); copy_t tmp = *dp; auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); + std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); num_bytes -= leading_len; src = static_cast(src) + leading_len; *dp++ = tmp; } else { - dp = static_cast(dest); + dp = static_cast(dest); } // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); + const copy_t *sp = static_cast(src); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer, again RMW on the destination. auto trailing_len = num_bytes % sizeof(copy_t); @@ -166,7 +170,7 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte unsigned int src_misalignment = src_addr % sizeof(copy_t); if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); + sp = reinterpret_cast(src_addr - src_misalignment); copy_t tmp = *sp++; @@ -176,15 +180,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte dest = static_cast(dest) + leading_len; } else { - sp = static_cast(src); + sp = static_cast(src); } // Copy the source-aligned middle. copy_t *dp = static_cast(dest); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer. 
auto trailing_len = num_bytes % sizeof(copy_t); @@ -195,17 +200,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } tt::ARCH PciDeviceInfo::get_arch() const { - if (this->device_id == GS_PCIE_DEVICE_ID){ + if (this->device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; } else if (this->device_id == WH_PCIE_DEVICE_ID) { return tt::ARCH::WORMHOLE_B0; - } else if (this->device_id == BH_PCIE_DEVICE_ID){ + } else if (this->device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } return tt::ARCH::Invalid; } - /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; std::string path = "/dev/tenstorrent/"; @@ -213,7 +217,7 @@ tt::ARCH PciDeviceInfo::get_arch() const { if (!std::filesystem::exists(path)) { return device_ids; } - for (const auto& entry : std::filesystem::directory_iterator(path)) { + for (const auto &entry : std::filesystem::directory_iterator(path)) { std::string filename = entry.path().filename().string(); // TODO: this will skip any device that has a non-numeric name, which @@ -237,28 +241,29 @@ tt::ARCH PciDeviceInfo::get_arch() const { try { infos[n] = read_device_info(fd); - } catch (...) {} + } catch (...) 
{ + } close(fd); } return infos; } -PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) - : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) - , pci_device_num(pci_device_number) - , logical_id(logical_device_id) - , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)) - , info(read_device_info(pci_device_file_desc)) - , numa_node(read_sysfs(info, "numa_node")) - , revision(read_sysfs(info, "revision")) - , arch(detect_arch(info.device_id, revision)) - , architecture_implementation(tt::umd::architecture_implementation::create(arch)) -{ +PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : + device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), + pci_device_num(pci_device_number), + logical_id(logical_device_id), + pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)), + info(read_device_info(pci_device_file_desc)), + numa_node(read_sysfs(info, "numa_node")), + revision(read_sysfs(info, "revision")), + arch(detect_arch(info.device_id, revision)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; } mappings; + memset(&mappings, 0, sizeof(mappings)); mappings.query_mappings.in.output_mapping_count = 8; @@ -302,7 +307,9 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar4_wc_mapping = mappings.mapping_array[i]; } - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + log_debug( + LogSiliconDriver, + "BAR mapping id {} base {} size {}", mappings.mapping_array[i].mapping_id, (void *)mappings.mapping_array[i].mapping_base, mappings.mapping_array[i].mapping_size); @@ -317,7 +324,8 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Attempt WC mapping first so we can fall back to all-UC if it fails. 
if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); + bar0_wc = mmap( + NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); if (bar0_wc == MAP_FAILED) { bar0_wc_size = 0; bar0_wc = nullptr; @@ -334,7 +342,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar0_uc_offset = 0; } - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset); + bar0_uc = mmap( + NULL, + bar0_uc_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar0_uc_mapping.mapping_base + bar0_uc_offset); if (bar0_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num)); @@ -351,22 +365,34 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) system_reg_mapping_size = bar4_uc_mapping.mapping_size; - system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base); + system_reg_mapping = mmap( + NULL, + bar4_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_uc_mapping.mapping_base); if (system_reg_mapping == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num)); } - system_reg_start_offset = (512 - 16) * 1024*1024; - system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(arch == tt::ARCH::BLACKHOLE) { + system_reg_start_offset = (512 - 16) * 1024 * 1024; + system_reg_offset_adjust = (512 - 32) * 1024 * 1024; + } else if (arch == tt::ARCH::BLACKHOLE) { if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { throw 
std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num)); } // Using UnCachable memory mode. This is used for accessing registers on Blackhole. bar2_uc_size = bar2_uc_mapping.mapping_size; - bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base); + bar2_uc = mmap( + NULL, + bar2_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar2_uc_mapping.mapping_base); if (bar2_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num)); @@ -379,7 +405,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. // WC doesn't guarantee write ordering but has better performance. bar4_wc_size = bar4_wc_mapping.mapping_size; - bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base); + bar4_wc = mmap( + NULL, + bar4_wc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_wc_mapping.mapping_base); if (bar4_wc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num)); @@ -391,7 +423,7 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) } PCIDevice::~PCIDevice() { - for (const auto& hugepage_mapping : hugepage_mapping_per_channel) { + for (const auto &hugepage_mapping : hugepage_mapping_per_channel) { if (hugepage_mapping.mapping) { munmap(hugepage_mapping.mapping, hugepage_mapping.mapping_size); } @@ -405,8 +437,8 @@ PCIDevice::~PCIDevice() { // essential for correctness then it needs to move to the driver. 
uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); } close(pci_device_file_desc); @@ -432,8 +464,8 @@ PCIDevice::~PCIDevice() { } } -template -T* PCIDevice::get_register_address(uint32_t register_offset) { +template +T *PCIDevice::get_register_address(uint32_t register_offset) { // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole). // Should clarify this interface void *reg_mapping; @@ -446,10 +478,10 @@ T* PCIDevice::get_register_address(uint32_t register_offset) { register_offset -= bar0_uc_offset; reg_mapping = bar0_uc; } - return reinterpret_cast(static_cast(reg_mapping) + register_offset); + return reinterpret_cast(static_cast(reg_mapping) + register_offset); } -void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { +void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr) { void *dest = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -466,7 +498,7 @@ void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_ } } -void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { +void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr) { void *src = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -483,7 +515,7 @@ void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buff } if (num_bytes >= sizeof(std::uint32_t)) { - detect_hang_read(*reinterpret_cast(dest)); + detect_hang_read(*reinterpret_cast(dest)); } } @@ -496,14 
+528,14 @@ void PCIDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_ void PCIDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { volatile uint32_t *dest = get_register_address(byte_addr); - const uint32_t *src = reinterpret_cast(data); + const uint32_t *src = reinterpret_cast(data); write_regs(dest, src, word_len); } void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { const volatile uint32_t *src = get_register_address(byte_addr); - uint32_t *dest = reinterpret_cast(data); + uint32_t *dest = reinterpret_cast(data); while (word_len-- != 0) { uint32_t temp = *src++; @@ -511,29 +543,34 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { } } -void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){ - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); +void PCIDevice::write_tlb_reg( + uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size) { + log_assert( + (tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), + "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); volatile uint64_t *dest_qw = get_register_address(byte_addr); - volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr + 8); #if defined(__ARM_ARCH) || defined(__riscv) // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory + // accesses. 
Insert an explicit full memory barrier for ARM. Do the same for RISC-V. tt_driver_atomics::mfence(); #endif *dest_qw = value_lower; if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); + uint32_t *p_value_upper = reinterpret_cast(&value_upper); *dest_extra_dw = p_value_upper[0]; } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB + // register. } bool PCIDevice::is_hardware_hung() { - volatile const void *addr = reinterpret_cast(bar0_uc) + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); + volatile const void *addr = reinterpret_cast(bar0_uc) + + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - + bar0_uc_offset; + std::uint32_t scratch_data = *reinterpret_cast(addr); return (scratch_data == c_hang_read_value); } @@ -547,55 +584,94 @@ void PCIDevice::detect_hang_read(std::uint32_t data_read) { } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { auto architecture_implementation = get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } - log_trace(LogSiliconDriver, "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast = {}, ordering = {}", - tlb_index, start.x, start.y, end.x, end.y, address, multicast, (int)ordering); + log_trace( + LogSiliconDriver, + "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast " + "= {}, ordering = {}", + tlb_index, + start.x, + start.y, + end.x, + end.y, + address, + multicast, + (int)ordering); tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start); auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end); - uint32_t tlb_address = address / tlb_config.size; - uint32_t local_address = address % tlb_config.size; - uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); - uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); - - std::pair tlb_data = tt::umd::tlb_data { - .local_offset = tlb_address, - .x_end = static_cast(translated_end_coords.x), - .y_end = static_cast(translated_end_coords.y), - .x_start = 
static_cast(translated_start_coords.x), - .y_start = static_cast(translated_start_coords.y), - .mcast = multicast, - .ordering = ordering, - // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. - // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. - // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true, - }.apply_offset(tlb_config.offset); - - log_debug(LogSiliconDriver, "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} tlb_cfg_reg: 0x{:x}", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); + uint32_t tlb_address = address / tlb_config.size; + uint32_t local_address = address % tlb_config.size; + uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); + uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); + + std::pair tlb_data = + tt::umd::tlb_data{ + .local_offset = tlb_address, + .x_end = static_cast(translated_end_coords.x), + .y_end = static_cast(translated_end_coords.y), + .x_start = static_cast(translated_start_coords.x), + .y_start = static_cast(translated_start_coords.y), + .mcast = multicast, + .ordering = ordering, + // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. + // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be + // the same TLB. Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. + .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, + } + .apply_offset(tlb_config.offset); + + log_debug( + LogSiliconDriver, + "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} " + "tlb_cfg_reg: 0x{:x}", + tlb_index, + tlb_config.index_offset, + tlb_config.size / (1024 * 1024), + tlb_base, + tlb_cfg_reg); write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); - return { tlb_base + local_address, tlb_config.size - local_address }; + return {tlb_base + local_address, tlb_config.size - local_address}; } -dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { return set_dynamic_tlb(tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering); } -tt::umd::architecture_implementation* PCIDevice::get_architecture_implementation() const {return architecture_implementation.get();} +tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation() const { + return architecture_implementation.get(); +} bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { 
const size_t hugepage_size = HUGEPAGE_REGION_SIZE; @@ -605,7 +681,10 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", + hugepage_size); return false; } @@ -615,11 +694,14 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { // Support for more than 1GB host memory accessible per device, via channels. for (int ch = 0; ch < num_host_mem_channels; ch++) { - int hugepage_fd = open_hugepage_file(hugepage_dir, physical_device_id, ch); if (hugepage_fd == -1) { // Probably a permissions problem. - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", physical_device_id, ch); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", + physical_device_id, + ch); success = false; continue; } @@ -630,26 +712,43 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { log_warning(LogSiliconDriver, "Error reading hugepage file size after opening."); } - std::byte *mapping = static_cast(mmap(nullptr, hugepage_size, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); + std::byte *mapping = static_cast( + mmap(nullptr, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); close(hugepage_fd); if (mapping == MAP_FAILED) { - log_warning(LogSiliconDriver, "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).", physical_device_id, ch, num_host_mem_channels, strerror(errno)); + log_warning( + LogSiliconDriver, + "UMD: Mapping a hugepage failed. 
(device: {}, {}/{} errno: {}).", + physical_device_id, + ch, + num_host_mem_channels, + strerror(errno)); if (hugepage_st.st_size == 0) { - log_warning(LogSiliconDriver, "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough hugepages are provided."); + log_warning( + LogSiliconDriver, + "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough " + "hugepages are provided."); } - print_file_contents("/proc/cmdline");\ - print_file_contents("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. + print_file_contents("/proc/cmdline"); + print_file_contents( + "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. success = false; continue; } - // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same numanode as TT device. - if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)){ - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: {}). " - "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).", - physical_device_id, ch); + // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same + // numanode as TT device. + if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) { + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: " + "{}). " + "Hugepage allocation is not on NumaNode matching TT Device. 
Side-Effect is decreased Device->Host perf " + "(Issue #893).", + physical_device_id, + ch); } tenstorrent_pin_pages pin_pages; @@ -662,7 +761,13 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { auto fd = get_fd(); if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno)); + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed " + "(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", + physical_device_id, + ch, + strerror(errno)); munmap(mapping, hugepage_size); print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)"); print_file_contents("/proc/meminfo"); @@ -673,15 +778,19 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { hugepage_mapping_per_channel[ch] = {mapping, hugepage_size, pin_pages.out.physical_address}; - log_debug(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", physical_device_id, ch, hugepage_size, (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); + log_debug( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", + physical_device_id, + ch, + hugepage_size, + (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); } return success; } -int PCIDevice::get_num_host_mem_channels() const { - return hugepage_mapping_per_channel.size(); -} +int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { if (channel < 0 || 
hugepage_mapping_per_channel.size() <= channel) { @@ -691,10 +800,10 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { } } -void PCIDevice::print_file_contents(std::string filename, std::string hint){ - if (std::filesystem::exists(filename)){ +void PCIDevice::print_file_contents(std::string filename, std::string hint) { + if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); - if (meminfo.is_open()){ + if (meminfo.is_open()) { std::cout << std::endl << "File " << filename << " " << hint << " is: " << std::endl; std::cout << meminfo.rdbuf(); } diff --git a/device/simulation/deprecated/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp index 25026737..e7d66893 100644 --- a/device/simulation/deprecated/tt_emulation_device.cpp +++ b/device/simulation/deprecated/tt_emulation_device.cpp @@ -3,193 +3,231 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "tt_emulation_device.h" + #include +#include #include "common/logger.hpp" #include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" #include "tt_emu_zemi3_wrapper.h" - tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + // create just a default one, we do not have cluster anyway + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - log_info(tt::LogEmulationDriver, "Created Emulation Device "); + log_info(tt::LogEmulationDriver, "Created Emulation Device "); } 
tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); + ndesc.reset(); + delete tt_zebu_wrapper_inst; + log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); } - + void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); + const uint32_t size = static_cast(data.size()); + tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); } std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); + std::vector data(size); + tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - return data; + return data; } - void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); + tt_zebu_wrapper_inst->zebu_start(); + tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); + log_info(tt::LogEmulationDriver, "Started Emulation Device "); } void tt_emulation_device::deassert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); + 
tt_zebu_wrapper_inst->all_tensix_reset_deassert(); + log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); } void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_assert(); + log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); } -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); } void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); + tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); } - - void tt_emulation_device::close_device() { log_info(tt::LogEmulationDriver, "Closing Emulation Device "); tt_zebu_wrapper_inst->zebu_finish(); } -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/ ) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); + log_info(tt::LogEmulationDriver, "Starting Emulation Device "); +} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == + 
// rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // MT: Iterate through all the worker cores for bcast: + // if (get_soc_descriptor(0)->is_worker_core(core.first)) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // Emulation only broadcasts to all Tensix cores or all DRAM cores. + // differentiate which bcast pattern to use based on exclude columns + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // Detect DRAM bcast + if (get_soc_descriptor(0)->is_dram_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } else { + if (get_soc_descriptor(0)->is_worker_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } + } } - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
- // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) { + std::vector vec = base_vec; + uint32_t byte_increment = 4 * vec.size(); + for (uint32_t i = 0; i < unroll_count; ++i) { + vec[0] = i; // slot id for debug + uint64_t offset_addr = base_addr + i * byte_increment; + write_to_device(vec, core, offset_addr, tlb_to_use); } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } } -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t 
size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + std::vector byte_data(vec.size() * sizeof(uint32_t)); + std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - write(core, addr, byte_data); + write(core, addr, byte_data); } -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { + std::vector byte_data = read(core, addr, size); + // Verify that the received byte data can be converted to uint32_t + // if (byte_data.size() % sizeof(uint32_t) != 0) { + // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); + // } -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); + vec.clear(); + vec.resize(byte_data.size() / sizeof(uint32_t)); + std::memcpy(vec.data(), byte_data.data(), byte_data.size()); } void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); + return {}; } std::set 
tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); + return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { dram_address_params = dram_address_params_; } + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} +bool tt_emulation_device::noc_translation_en() { return false; } -std::map tt_emulation_device::get_clocks() { - return std::map(); +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } +std::map tt_emulation_device::get_clocks() { return std::map(); } - +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { + l1_address_params = l1_address_params_; +} diff --git a/device/simulation/deprecated/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h index b15e2aaf..8c411d07 100644 --- 
a/device/simulation/deprecated/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -9,63 +9,97 @@ #include #include #include + +#include "cluster.h" #include "tt_soc_descriptor.h" #include "tt_xy_pair.h" -#include "cluster.h" // use forward declaration here so we do not need to include tt_zebu_wrapper.h class tt_zebu_wrapper; class tt_emulation_device : public tt_device { public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); + virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care + tt_emulation_device(const std::string& sdesc_path); + virtual void start( + std::vector plusargs, + 
std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t 
base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use); // See Versim Implementation + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); + virtual std::unordered_map& get_virtual_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set get_target_mmio_device_ids(); + virtual std::set 
get_target_remote_device_ids(); + virtual ~tt_emulation_device(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map get_clocks(); +private: + tt_device_l1_address_params l1_address_params; + std::shared_ptr ndesc; + tt_device_dram_address_params dram_address_params; + // zebu wrapper, provides interface to zebu emulator device through axi and command transactors + tt_zebu_wrapper* tt_zebu_wrapper_inst = NULL; - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - + // These functions implement the "protocol" between the RTL simulation and the UMD + void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); + std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); }; - diff --git a/device/simulation/deprecated/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp index b841359f..bdd97b27 100644 --- a/device/simulation/deprecated/tt_emulation_stub.cpp +++ b/device/simulation/deprecated/tt_emulation_stub.cpp @@ -3,23 +3,21 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include #include +#include #include "common/logger.hpp" #include "tt_emulation_device.h" tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); + throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); } - tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const 
std::vector& data) {} -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} +void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} +std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { return {}; } void tt_emulation_device::start_device(const tt_device_params& device_params) {} @@ -27,52 +25,99 @@ void tt_emulation_device::deassert_risc_reset() {} void tt_emulation_device::assert_risc_reset() {} -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} void tt_emulation_device::close_device() {} -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t 
size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/) {} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) {} + +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write){}; + +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} + +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} // ------------------------- // Not sure how to implement these functions below, leaving them blank/default for now void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } -std::set tt_emulation_device::get_target_mmio_device_ids() {return {};} +std::set tt_emulation_device::get_target_mmio_device_ids() { return {}; } -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} +std::set tt_emulation_device::get_target_remote_device_ids() { return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} -std::map tt_emulation_device::get_clocks() {return std::map();} +bool tt_emulation_device::noc_translation_en() { return false; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; +} +std::unordered_map& 
tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; +} +std::map tt_emulation_device::get_clocks() { return std::map(); } +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} diff --git a/device/simulation/deprecated/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp index 7e700b2f..9504d9f6 100644 --- a/device/simulation/deprecated/tt_versim_device.cpp +++ b/device/simulation/deprecated/tt_versim_device.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 - - -#include "cluster.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include #include +#include #include #include +#include "cluster.h" +#include "common/logger.hpp" +#include "device/driver_atomics.h" #include "yaml-cpp/yaml.h" // TODO: Remove dependency on command_assembler + soc @@ -19,112 +17,134 @@ #include "device/tt_cluster_descriptor.h" namespace CA = CommandAssembler; - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; +void translate_soc_descriptor_to_ca_soc(CA::Soc& soc, const tt_SocDescriptor 
soc_descriptor) { + for (auto& core : soc_descriptor.cores) { + CA::SocNocNode node; + CA::xy_pair CA_coord(core.first.x, core.first.y); + node.noc_coord = CA_coord; + node.memory_size = core.second.l1_size; + switch (core.second.type) { + case CoreType::ARC: + node.arc = true; + break; + case CoreType::DRAM: { + node.dram = true; +#ifdef EN_DRAM_ALIAS + node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); +#endif + } break; + case CoreType::ETH: + node.eth = true; + break; + case CoreType::PCIE: + node.pcie = true; + break; + case CoreType::WORKER: + node.worker = true; + break; + case CoreType::HARVESTED: + node.harvested = true; + break; + case CoreType::ROUTER_ONLY: + node.router_only = true; + break; + default: + std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; + break; + } + soc.SetNodeProperties(node.noc_coord, node); } - soc.SetNodeProperties(node.noc_coord, node); - } } //////// // Device Versim //////// +#include + #include "device.h" #include "sim_interactive.h" -#include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + if (ndesc_path == "") { + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + } else { + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + } } -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return 
soc_descriptor_per_chip;} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); +std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } -void tt_VersimDevice::close_device() { - stop(); +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } + +void tt_VersimDevice::start_device(const tt_device_params& device_params) { + bool no_checkers = true; + std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0)->grid_size); + start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); } +void tt_VersimDevice::close_device() { stop(); } + void tt_VersimDevice::start( std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; +) { + std::cout << "Start Versim Device " << std::endl; + std::string device_descriptor_dir = "./"; - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } + std::optional vcd_suffix; + if (dump_cores.size() > 0) { + vcd_suffix = "core_dump.vcd"; + } - std::vector vcd_cores; + std::vector vcd_cores; - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. mainly bypasses arch_configs etc from llir. 
We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End + // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core + // interface. mainly bypasses arch_configs etc from llir. We can populate soc directly + // MT: have to preserve ca_soc_descriptor object since versim references it at runtime + CA::xy_pair CA_grid_size( + (soc_descriptor_per_chip.begin()->second).grid_size.x, (soc_descriptor_per_chip.begin()->second).grid_size.y); + // CA::Soc ca_soc_manager(CA_grid_size); + std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); + translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin()->second)); + // TODO: End - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); + std::cout << "Versim Device: turn_on_device "; + std::vector trisc_sizes = { + static_cast(l1_address_params.trisc0_size), + static_cast(l1_address_params.trisc1_size), + static_cast(l1_address_params.trisc2_size)}; + std::unique_ptr versim_unique = versim::turn_on_device( + CA_grid_size, + *p_ca_soc_manager_unique, + plusargs, + vcd_suffix, + dump_cores, + no_checkers, + 
l1_address_params.trisc_base, + trisc_sizes); + versim = versim_unique.release(); - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); + std::cout << "Versim Device: write info to tvm db " << std::endl; + versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); + versim::build_and_connect_tvm_phase(); - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); + versim->spin_threads(*p_ca_soc_manager_unique, false); + versim::assert_reset(*versim); - p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); + p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - std::cout << "Versim Device: Done start " << std::endl; + std::cout << "Versim Device: Done start " << std::endl; } -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} +tt_VersimDevice::~tt_VersimDevice() { ndesc.reset(); } // bool tt_VersimDevice::run() { // std::cout << "Versim Device: Run " << std::endl; @@ -136,165 +156,218 @@ tt_VersimDevice::~tt_VersimDevice () { // } void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); + std::cout << "Versim Device: Deassert risc resets start" << std::endl; + versim::handle_resetting_triscs(*versim); + std::cout << "Versim Device: Start main loop " << std::endl; + versim::startup_versim_main_loop(*versim); } -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - // This function deasserts reset on the full versim device (don't need core level granularity for versim) - deassert_risc_reset(); +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& 
soft_resets) { + // This function deasserts reset on the full versim device (don't need core level granularity for versim) + deassert_risc_reset(); } void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); + std::cout << "Pause all the cores" << std::endl; + versim::pause(*versim); - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); + std::cout << "Wait for cores to go to paused state" << std::endl; + versim::sleep_wait_for_paused(*versim); - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); + std::cout << "Assert riscv reset" << std::endl; + versim::assert_riscv_reset(*versim); } void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); + // This function asserts reset on the full versim device (don't need core level granularity for versim) + assert_risc_reset(); } -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - CommandAssembler::memory 
CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + uint32_t byte_increment = vec.size() * 4; + for (int i = 0; i < unroll_count; i++) { + vec[0] = i; // slot id for debug + write_to_device(vec, core, addr + i * byte_increment, tlb_to_use); + } } -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +void tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) { + std::vector mem_vector(mem_ptr, mem_ptr + len); + rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); +} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Write vector at target core {}, address: {}", + get_sim_time(*versim), + core.str(), + addr); + + bool aligned_32B = (soc_descriptor_per_chip.begin()->second).cores.at(core).type == CoreType::DRAM; + // MT: Remove these completely + CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::memory CA_tensor_memory(addr, vec); + + nuapi::device::write_memory_to_core(*versim, CA_target, 
CA_tensor_memory); +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); + + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } } -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips + // Do nothing, since Versim does not simulate non-mmio mapped chips } -void tt_VersimDevice::l1_membar(const chip_id_t chip, 
const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = 
nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - vec = result; + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + vec = result; } -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); + log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + memcpy(mem_ptr, result.data(), result.size() * sizeof(uint32_t)); } -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { + // No translation is performed + return; } std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } std::set 
tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { +bool versim_check_dram_core_exists( + const std::vector>& dram_core_channels, tt_xy_pair target_core) { bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; + for (const auto& dram_cores_in_channel : dram_core_channels) { + for (const auto& dram_core : dram_cores_in_channel) { + if (dram_core.x == target_core.x && dram_core.y == target_core.y) { + return true; + } } - } } return false; } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } + int tt_VersimDevice::detect_number_of_chips() { return 1; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } // Meant to breakout running functions for simulator bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); + std::cout << "Versim Device: Stop " << std::endl; + + versim::turn_off_device(*versim); + versim->shutdown(); + // Force free of all versim cores + for (auto x = 0; x < versim->grid_size.x; x++) { + for (auto y = 0; y < 
versim->grid_size.y; y++) { + delete versim->core_grid.at(x).at(y); + } } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; + std::cout << "Versim Device: Stop completed " << std::endl; + delete versim; + return true; } -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} +std::map tt_VersimDevice::get_clocks() { return std::map(); } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; @@ -305,11 +378,11 @@ void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_addres } std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); + return get_soc_descriptor(device_id)->get_num_dram_channels(); } std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id)->dram_bank_size; // Space per channel is identical for now } std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h index 05ac6b06..2c71f1be 100644 --- a/device/simulation/deprecated/tt_versim_device.h +++ b/device/simulation/deprecated/tt_versim_device.h @@ -11,42 +11,92 @@ #include "tt_xy_pair.h" class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; + +namespace nuapi { +namespace device { +template +class Simulator; } +} // namespace nuapi + +namespace versim { +struct VersimSimulatorState; +using VersimSimulator = nuapi::device::Simulator; +} // namespace versim /** * @brief Versim Backend Class, derived from 
the tt_device class * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. -*/ -class tt_VersimDevice: public tt_device -{ - public: + */ +class tt_VersimDevice : public tt_device { +public: virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path); virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); virtual void close_device(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset(); virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t 
unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + virtual void rolled_write_to_device( + std::vector& vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use); + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t size_in_bytes, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void 
read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); virtual bool noc_translation_en(); @@ -57,12 +107,13 @@ class tt_VersimDevice: public tt_device virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); static int detect_number_of_chips(); - virtual std::map get_clocks(); + virtual std::map get_clocks(); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: + +private: bool stop(); tt_device_l1_address_params l1_address_params; tt_device_dram_address_params dram_address_params; diff --git a/device/simulation/deprecated/tt_versim_stub.cpp 
b/device/simulation/deprecated/tt_versim_stub.cpp index 8cf0899b..c80e0bdd 100644 --- a/device/simulation/deprecated/tt_versim_stub.cpp +++ b/device/simulation/deprecated/tt_versim_stub.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 - -#include "cluster.h" - -#include #include +#include #include #include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); +#include "cluster.h" + +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); } -tt_VersimDevice::~tt_VersimDevice () {} +tt_VersimDevice::~tt_VersimDevice() {} std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); @@ -22,23 +21,71 @@ std::unordered_map& tt_VersimDevice::get_virtual_so } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } + int tt_VersimDevice::detect_number_of_chips() { return 0; } -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} +void tt_VersimDevice::start_device(const tt_device_params& device_params) {} + void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void 
tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t len, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void 
tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) {} + void tt_VersimDevice::wait_for_non_mmio_flush() {} -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} void tt_VersimDevice::start( std::vector plusargs, @@ -49,36 +96,48 @@ void tt_VersimDevice::start( ) {} void tt_VersimDevice::deassert_risc_reset() {} -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} + +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} + void tt_VersimDevice::assert_risc_reset() {} + void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c){}; + // void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} +std::set 
tt_VersimDevice::get_target_mmio_device_ids() { return {}; } + +std::set tt_VersimDevice::get_target_remote_device_ids() { return {}; } bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; + const std::vector>& dram_core_channels, tt_xy_pair target_core) { + return false; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { + return std::unordered_map(); +} bool tt_VersimDevice::stop() { return true; } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} + void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} +std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { return 0; } + +std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } + +std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { return 0; } -std::map tt_VersimDevice::get_clocks() {return std::map();} +std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} +std::map 
tt_VersimDevice::get_clocks() { return std::map(); } +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index 61cd55ac..086eb520 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -4,43 +4,44 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "umd/device/tt_simulation_device.h" + +#include +#include + #include +#include #include #include -#include -#include - -#include "logger.hpp" #include "assert.hpp" +#include "logger.hpp" +#include "tt_simulation_device_generated.h" #include "umd/device/driver_atomics.h" #include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/tt_simulation_device.h" -#include "tt_simulation_device_generated.h" - -flatbuffers::FlatBufferBuilder create_flatbuffer(DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_=0){ +flatbuffers::FlatBufferBuilder create_flatbuffer( + DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_ = 0) { flatbuffers::FlatBufferBuilder builder; auto data = builder.CreateVector(vec); auto core = tt_vcs_core(core_.x, core_.y); - uint64_t size = size_ == 0 ? size = vec.size()*sizeof(uint32_t) : size = size_; + uint64_t size = size_ == 0 ? 
size = vec.size() * sizeof(uint32_t) : size = size_; auto device_cmd = CreateDeviceRequestResponse(builder, rw, data, &core, addr, size); builder.Finish(device_cmd); return builder; } -void print_flatbuffer(const DeviceRequestResponse *buf){ +void print_flatbuffer(const DeviceRequestResponse* buf) { std::vector data_vec(buf->data()->begin(), buf->data()->end()); uint64_t addr = buf->address(); uint32_t size = buf->size(); tt_cxy_pair core = {0, buf->core()->x(), buf->core()->y()}; - + std::stringstream ss; ss << std::hex << reinterpret_cast(addr); std::string addr_hex = ss.str(); log_info(tt::LogEmulationDriver, "{} bytes @ address {} in core ({}, {})", size, addr_hex, core.x, core.y); - for(int i = 0; i < data_vec.size(); i++){ + for (int i = 0; i < data_vec.size(); i++) { std::ios_base::fmtflags save = std::cout.flags(); std::cout << "0x" << std::hex << std::setw(8) << std::setfill('0') << data_vec[i] << " "; std::cout.flags(save); @@ -48,14 +49,14 @@ void print_flatbuffer(const DeviceRequestResponse *buf){ std::cout << std::endl; } -tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(sdesc_path){ +tt_SimulationDevice::tt_SimulationDevice(const std::string& sdesc_path) : tt_device() { log_info(tt::LogEmulationDriver, "Instantiating simulation device"); soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; - + // Start VCS simulator in a separate process TT_ASSERT(std::getenv("TT_REMOTE_EXE"), "TT_REMOTE_EXE not set, please provide path to the VCS binary"); - uv_loop_t *loop = uv_default_loop(); + uv_loop_t* loop = uv_default_loop(); uv_process_t child_p; uv_process_options_t child_options = {0}; @@ -69,14 +70,12 @@ tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_dev log_info(tt::LogEmulationDriver, "Simulator process spawned with PID: {}", child_p.pid); } - uv_unref((uv_handle_t *) &child_p); + uv_unref((uv_handle_t*)&child_p); uv_run(loop, UV_RUN_DEFAULT); 
uv_loop_close(loop); } -tt_SimulationDevice::~tt_SimulationDevice() { - close_device(); -} +tt_SimulationDevice::~tt_SimulationDevice() { close_device(); } // Setup/Teardown Functions std::unordered_map& tt_SimulationDevice::get_virtual_soc_descriptors() { @@ -99,11 +98,11 @@ void tt_SimulationDevice::set_driver_eth_interface_params(const tt_driver_eth_in eth_interface_params = eth_interface_params_; } -void tt_SimulationDevice::start_device(const tt_device_params &device_params) { - void *buf_ptr = nullptr; +void tt_SimulationDevice::start_device(const tt_device_params& device_params) { + void* buf_ptr = nullptr; host.start_host(); - + log_info(tt::LogEmulationDriver, "Waiting for ack msg from remote..."); size_t buf_size = host.recv_from_device(&buf_ptr); auto buf = GetDeviceRequestResponse(buf_ptr); @@ -114,8 +113,9 @@ void tt_SimulationDevice::start_device(const tt_device_params &device_params) { void tt_SimulationDevice::assert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending assert_risc_reset signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); @@ -124,20 +124,25 @@ void tt_SimulationDevice::assert_risc_reset() { void tt_SimulationDevice::deassert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset' signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = 
wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); +void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + log_info( + tt::LogEmulationDriver, + "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); deassert_risc_reset(); } void tt_SimulationDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - log_info(tt::LogEmulationDriver, "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); + log_info( + tt::LogEmulationDriver, + "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); assert_risc_reset(); } @@ -149,19 +154,21 @@ void tt_SimulationDevice::close_device() { } // Runtime Functions -void tt_SimulationDevice::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +void tt_SimulationDevice::write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { log_info(tt::LogEmulationDriver, "Device writing"); std::vector data((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size_in_bytes / sizeof(uint32_t)); auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_WRITE, data, core, addr); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); - - print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print + + print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print 
host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - void *rd_resp; +void tt_SimulationDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + void* rd_resp; // Send read request auto rd_req_buf = create_flatbuffer(DEVICE_COMMAND_READ, {0}, core, addr, size); @@ -171,50 +178,49 @@ void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint size_t rd_rsp_sz = host.recv_from_device(&rd_resp); auto rd_resp_buf = GetDeviceRequestResponse(rd_resp); - if (addr != 0x40){ + if (addr != 0x40) { log_info(tt::LogEmulationDriver, "Device reading vec"); - print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam + print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam } std::memcpy(mem_ptr, rd_resp_buf->data()->data(), rd_resp_buf->data()->size() * sizeof(uint32_t)); nng_free(rd_resp, rd_rsp_sz); } void tt_SimulationDevice::wait_for_non_mmio_flush() {} + void tt_SimulationDevice::wait_for_non_mmio_flush(const chip_id_t chip) {} -void tt_SimulationDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, 
const std::unordered_set& cores) {} // Misc. Functions to Query/Set Device State std::unordered_map tt_SimulationDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } -std::vector tt_SimulationDevice::detect_available_device_ids() { - return {0}; -} +std::vector tt_SimulationDevice::detect_available_device_ids() { return {0}; } -std::set tt_SimulationDevice::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set tt_SimulationDevice::get_target_remote_device_ids() { return target_remote_chips; } -std::map tt_SimulationDevice::get_clocks() { - return {{0, 0}}; -} +std::map tt_SimulationDevice::get_clocks() { return {{0, 0}}; } -void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { return nullptr; } std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } @@ -224,12 +230,11 @@ std::uint32_t tt_SimulationDevice::get_num_dram_channels(std::uint32_t device_id } std::uint64_t tt_SimulationDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } -std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { - return 1; -} +std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { return 1; } + +std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {return 0;} +std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { return 0; } diff --git a/device/simulation/tt_simulation_host.cpp b/device/simulation/tt_simulation_host.cpp index eeee8110..7e5fe8be 100644 --- a/device/simulation/tt_simulation_host.cpp +++ b/device/simulation/tt_simulation_host.cpp @@ -2,19 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include +#include "umd/device/tt_simulation_host.hpp" #include #include -#include "logger.hpp" +#include +#include +#include +#include +#include +#include + #include "assert.hpp" -#include "umd/device/tt_simulation_host.hpp" +#include "logger.hpp" tt_SimulationHost::tt_SimulationHost() { // Initialize socket and dialer @@ -64,7 +65,7 @@ void tt_SimulationHost::start_host() { void tt_SimulationHost::send_to_device(uint8_t *buf, size_t buf_size) { int rv; log_debug(tt::LogEmulationDriver, "Sending messsage to remote.."); - + void *msg = 
nng_alloc(buf_size); std::memcpy(msg, buf, buf_size); diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 203a7a0c..ff3897f5 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -2,23 +2,25 @@ // // SPDX-License-Identifier: Apache-2.0 - #include "umd/device/tt_cluster_descriptor.h" -#include "libs/create_ethernet_map.h" #include #include -#include +#include +#include "disjoint_set.hpp" +#include "fmt/core.h" +#include "libs/create_ethernet_map.h" #include "logger.hpp" #include "yaml-cpp/yaml.h" -#include "fmt/core.h" - using namespace tt; -bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { + +bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { return this->ethernet_connections.find(local_chip) != this->ethernet_connections.end() && - this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != this->ethernet_connections.at(local_chip).end(); + this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != + this->ethernet_connections.at(local_chip).end(); } std::tuple tt_ClusterDescriptor::get_chip_and_channel_of_remote_ethernet_core( @@ -39,10 +41,14 @@ std::tuple tt_ClusterDescriptor::get_chip_and_cha } } -// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how extensively router needs to use it -std::vector> tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const { +// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how +// extensively router needs to use it +std::vector> +tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips( + const chip_id_t &first, const chip_id_t &second) const { 
std::vector> directly_connected_channels = {}; - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { + if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || + this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { return {}; } @@ -59,9 +65,7 @@ bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t chip_id) const { return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end(); } -bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { - return !is_chip_mmio_capable(chip_id); -} +bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { return !is_chip_mmio_capable(chip_id); } // given two coordinates, finds the number of hops between the two chips // it assumes that shelves are connected in x-dim and racks are connected in y-dim @@ -70,190 +74,266 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { // then once a chip on the same shelf&rack is found, // the distance from this chip to either location_a or location_b is just x&y dim difference. 
// the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference -int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const { - - log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b)); - - // eth_coord_t: x, y, rack, shelf - - int x_a = std::get<0>(location_a); - int x_b = std::get<0>(location_b); - - int y_a = std::get<1>(location_a); - int y_b = std::get<1>(location_b); - - int shelf_a = std::get<3>(location_a); - int shelf_b = std::get<3>(location_b); - - int rack_a = std::get<2>(location_a); - int rack_b = std::get<2>(location_b); - - int x_distance = std::abs(x_a - x_b); - int y_distance = std::abs(y_a - y_b); +int tt_ClusterDescriptor::get_ethernet_link_coord_distance( + const eth_coord_t &location_a, const eth_coord_t &location_b) const { + log_trace( + LogSiliconDriver, + "get_ethernet_link_coord_distance from ({}, {}, {}, {}, {}) to ({}, {}, {}, {}, {})", + location_a.cluster_id, + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.cluster_id, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf); + + if (location_a.cluster_id != location_b.cluster_id) { + return std::numeric_limits::max(); + } + + int x_distance = std::abs(location_a.x - location_b.x); + int y_distance = std::abs(location_a.y - location_b.y); // move along y-dim to exit from the shelf to go to a higher shelf - if(shelf_b > shelf_a) { + if (location_b.shelf > location_a.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != 
galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_a.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).find(location_a.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).at(location_a.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); - // for each shelf-to-shelf connection at y_a, find the distance to location_b, take min + // for each shelf-to-shelf connection at location_a.y, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && std::get<2>(exit_shelf) == rack_a, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_a.y && exit_shelf.shelf == location_a.shelf && + exit_shelf.rack == location_a.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - 
log_assert(std::get<3>(next_shelf) == (shelf_a+1) && std::get<2>(next_shelf) == rack_a, + log_assert( + next_shelf.shelf == (location_a.shelf + 1) && next_shelf.rack == location_a.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_b); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(shelf_a > shelf_b) { - + } else if (location_a.shelf > location_b.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_b.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == 
galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).find(location_b.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many") + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).at(location_b.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many") - // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min - int distance = std::numeric_limits::max(); + // for each shelf-to-shelf connection at location_b.y, find the distance to location_a, take min + int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && std::get<2>(exit_shelf) == rack_b, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_b.y && exit_shelf.shelf == location_b.shelf && + exit_shelf.rack == location_b.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(std::get<3>(next_shelf) == (shelf_b+1) && std::get<2>(next_shelf) == rack_b, + log_assert( + next_shelf.shelf == (location_b.shelf + 1) && next_shelf.rack == location_b.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = 
get_ethernet_link_coord_distance(location_b, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_a); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } // move along y-dim to exit from the shelf to go to a higher shelf - if(rack_b > rack_a) { - + if (location_b.rack > location_a.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_a.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).find(location_a.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& 
rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).at(location_a.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_a.x && exit_rack.shelf == location_a.shelf && exit_rack.rack == location_a.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && std::get<2>(next_rack) == (rack_a+1), + log_assert( + next_rack.x == location_a.x && next_rack.shelf == location_a.shelf && + next_rack.rack == (location_a.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_b); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } 
distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(rack_a > rack_b) { - + } else if (location_a.rack > location_b.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_b.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).find(location_b.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).at(location_b.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one 
rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_b.x && exit_rack.shelf == location_b.shelf && exit_rack.rack == location_b.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && std::get<2>(next_rack) == (rack_b+1), + log_assert( + next_rack.x == location_b.x && next_rack.shelf == location_b.shelf && + next_rack.rack == (location_b.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_a); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to 
({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), x_distance + y_distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + x_distance + y_distance); // on same shelf/rack, the distance is just x+y difference return x_distance + y_distance; @@ -261,14 +341,13 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo // Returns the closest mmio chip to the given chip chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t chip) { - log_debug(LogSiliconDriver, "get_closest_mmio_chip to chip{}", chip); if (this->is_chip_mmio_capable(chip)) { return chip; } - if(closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { + if (closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { return closest_mmio_chip_cache[chip]; } @@ -280,15 +359,24 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch const chip_id_t &mmio_chip = pair.first; eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip); - log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, std::get<0>(mmio_eth_coord), std::get<1>(mmio_eth_coord), std::get<2>(mmio_eth_coord), std::get<3>(mmio_eth_coord)); + log_debug( + LogSiliconDriver, + "Checking chip{} at ({}, {}, {}, {})", + mmio_chip, + mmio_eth_coord.x, + mmio_eth_coord.y, + mmio_eth_coord.rack, + 
mmio_eth_coord.shelf); int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord); + log_debug(LogSiliconDriver, "Distance from chip{} to chip{} is {}", chip, mmio_chip, distance); if (distance < min_distance) { min_distance = distance; closest_chip = mmio_chip; } } - log_assert(min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); + log_assert( + min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); log_assert(is_chip_mmio_capable(closest_chip), "Closest MMIO chip must be MMIO capable"); @@ -302,38 +390,45 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch std::string tt_ClusterDescriptor::get_cluster_descriptor_file_path() { static std::string yaml_path; static bool is_initialized = false; - if (!is_initialized){ - + if (!is_initialized) { // Cluster descriptor yaml will be created in a unique temporary directory. std::filesystem::path temp_path = std::filesystem::temp_directory_path(); std::string cluster_path_dir_template = temp_path / "umd_XXXXXX"; std::filesystem::path cluster_path_dir = mkdtemp(cluster_path_dir_template.data()); std::filesystem::path cluster_path = cluster_path_dir / "cluster_descriptor.yaml"; - if (!std::filesystem::exists(cluster_path)){ - auto val = system ( ("touch " + cluster_path.string()).c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + if (!std::filesystem::exists(cluster_path)) { + auto val = system(("touch " + cluster_path.string()).c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } } - int val = create_ethernet_map((char*)cluster_path.string().c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + int val = create_ethernet_map((char *)cluster_path.string().c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } yaml_path = cluster_path.string(); 
is_initialized = true; } return yaml_path; } -std::unique_ptr tt_ClusterDescriptor::create_from_yaml(const std::string &cluster_descriptor_file_path) { +std::unique_ptr tt_ClusterDescriptor::create_from_yaml( + const std::string &cluster_descriptor_file_path) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); std::ifstream fdesc(cluster_descriptor_file_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); + throw std::runtime_error(fmt::format( + "Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); } fdesc.close(); YAML::Node yaml = YAML::LoadFile(cluster_descriptor_file_path); tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(yaml, *desc); tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(yaml, *desc); + tt_ClusterDescriptor::merge_cluster_ids(*desc); + tt_ClusterDescriptor::fill_galaxy_connections(*desc); tt_ClusterDescriptor::load_harvesting_information(yaml, *desc); desc->enable_all_devices(); @@ -343,22 +438,31 @@ std::unique_ptr tt_ClusterDescriptor::create_from_yaml(con } std::unique_ptr tt_ClusterDescriptor::create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids) { + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); // Some users need not care about physical ids, can provide empty set. - auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; - auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. 
- auto num_available_physical_devices = physical_mmio_device_ids.size(); - auto required_physical_devices = largest_workload_logical_device_id + 1; - - log_debug(tt::LogSiliconDriver, "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} required_physical_devices: {}", - __FUNCTION__, use_physical_ids, largest_workload_logical_device_id, num_available_physical_devices, required_physical_devices); - - log_assert(!use_physical_ids || num_available_physical_devices >= required_physical_devices, + auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; + auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. + auto num_available_physical_devices = physical_mmio_device_ids.size(); + auto required_physical_devices = largest_workload_logical_device_id + 1; + + log_debug( + tt::LogSiliconDriver, + "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} " + "required_physical_devices: {}", + __FUNCTION__, + use_physical_ids, + largest_workload_logical_device_id, + num_available_physical_devices, + required_physical_devices); + + log_assert( + !use_physical_ids || num_available_physical_devices >= required_physical_devices, "Insufficient silicon devices. Workload requires device_id: {} (ie. 
{} devices) but only {} present", - largest_workload_logical_device_id, required_physical_devices, num_available_physical_devices); + largest_workload_logical_device_id, + required_physical_devices, + num_available_physical_devices); // All Grayskull devices are MMIO mapped so physical_mmio_device_ids correspond to all available devices for (auto &logical_id : logical_mmio_device_ids) { @@ -367,8 +471,10 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull desc->all_chips.insert(logical_id); eth_coord_t chip_location{logical_id, 0, 0, 0}; desc->chip_locations.insert({logical_id, chip_location}); - desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = logical_id; - log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); + desc->coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = + logical_id; + log_debug( + tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); } desc->enable_all_devices(); @@ -376,7 +482,8 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor( + YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { log_assert(connected_endpoints.IsSequence(), "Invalid YAML"); @@ -409,7 +516,13 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:"); for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { for (const auto &[chan, 
chip_and_chan] : chan_to_chip_chan_map) { - log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, std::get<0>(chip_and_chan), std::get<1>(chip_and_chan)); + log_debug( + LogSiliconDriver, + "\tchip: {}, chan: {} <--> chip: {}, chan: {}", + chip, + chan, + chip_and_chan.x, + chip_and_chan.y); } } @@ -426,52 +539,64 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto } } } +} +void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { int highest_shelf_id = 0; int highest_rack_id = 0; // shelves and racks can be connected at different chip coordinates - // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is - // this is used in get_ethernet_link_coord_distance to find the distance between two chips + // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip + // on the other shelf/rack is this is used in get_ethernet_link_coord_distance to find the distance between two + // chips for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) { - highest_shelf_id = std::max(highest_shelf_id, std::get<3>(chip_eth_coord)); - highest_rack_id = std::max(highest_rack_id, std::get<2>(chip_eth_coord)); + highest_shelf_id = std::max(highest_shelf_id, chip_eth_coord.shelf); + highest_rack_id = std::max(highest_rack_id, chip_eth_coord.rack); // iterate over all neighbors - if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { - continue; // chip has no eth connections + if (desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { + continue; // chip has no eth connections } for (const auto &[chan, chip_and_chan] : desc.ethernet_connections.at(chip_id)) { const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan); eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip); // shelves are 
connected in x-dim - if(std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) { - eth_coord_t higher_shelf_coord = std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_shelf_coord = std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_shelf_id = std::get<3>(lower_shelf_coord); - int lower_shelf_y = std::get<1>(lower_shelf_coord); + if (neighbor_eth_coord.shelf != chip_eth_coord.shelf) { + eth_coord_t higher_shelf_coord = + neighbor_eth_coord.shelf > chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_shelf_coord = + neighbor_eth_coord.shelf < chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + int lower_shelf_id = lower_shelf_coord.shelf; + int lower_shelf_y = lower_shelf_coord.y; - auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; + auto &galaxy_shelf_exit_chip_coords_per_y_dim = + desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; log_assert( - galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == galaxy_shelf_exit_chip_coords_per_y_dim.end() || - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, + galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == + galaxy_shelf_exit_chip_coords_per_y_dim.end() || + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, "Expected a single exit chip on each shelf row"); galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord = lower_shelf_coord; - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(higher_shelf_coord); + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert( + higher_shelf_coord); } // racks are connected in y-dim - if(std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) 
{ - eth_coord_t higher_rack_coord = std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_rack_coord = std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_rack_id = std::get<2>(lower_rack_coord); - int lower_rack_x = std::get<0>(lower_rack_coord); + if (neighbor_eth_coord.rack != chip_eth_coord.rack) { + eth_coord_t higher_rack_coord = + neighbor_eth_coord.rack > chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_rack_coord = + neighbor_eth_coord.rack < chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + int lower_rack_id = lower_rack_coord.rack; + int lower_rack_x = lower_rack_coord.x; - auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; + auto &galaxy_rack_exit_chip_coords_per_x_dim = + desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; log_assert( - galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == galaxy_rack_exit_chip_coords_per_x_dim.end() || - galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, + galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == + galaxy_rack_exit_chip_coords_per_x_dim.end() || + galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, "Expected a single exit chip on each rack column"); galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord = lower_rack_coord; galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].destination_chip_coords.insert(higher_rack_coord); @@ -482,23 +607,36 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every shelf (except the highest in id) is found in galaxy_shelves_exit_chip_coords_per_y_dim // this means that we expect the shelves to be connected linearly in a daisy-chain fashion. 
// shelf0->shelf1->shelf2->...->shelfN - for(int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { - log_assert(desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), - "Expected shelf {} to be connected to the next shelf", shelf_id); + for (int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { + log_assert( + desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != + desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), + "Expected shelf {} to be connected to the next shelf", + shelf_id); } // this prints the exit chip coordinates for each shelf // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[shelf, shelf_exit_chip_coords_per_y_dim] : desc.galaxy_shelves_exit_chip_coords_per_y_dim) { for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) { - log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", - shelf, y_dim, - std::get<0>(shelf_exit_chip_coords.source_chip_coord), std::get<1>(shelf_exit_chip_coords.source_chip_coord), - std::get<2>(shelf_exit_chip_coords.source_chip_coord), std::get<3>(shelf_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", + shelf, + y_dim, + shelf_exit_chip_coords.source_chip_coord.x, + shelf_exit_chip_coords.source_chip_coord.y, + shelf_exit_chip_coords.source_chip_coord.rack, + shelf_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) { // print shelf_exit_chip_coord in the format: (x, y, rack, shelf) - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, 
{}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + destination_chip_coord.rack, + destination_chip_coord.shelf); } } } @@ -506,28 +644,61 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every rack (except the highest in id) is found in galaxy_racks_exit_chip_coords_per_x_dim // this means that we expect the racks to be connected linearly in a daisy-chain fashion. // rack0->rack1->rack2->...->rackN - for(int rack_id = 0; rack_id < highest_rack_id; rack_id++) { - log_assert(desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), - "Expected rack {} to be connected to the next rack", rack_id); + for (int rack_id = 0; rack_id < highest_rack_id; rack_id++) { + log_assert( + desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != + desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), + "Expected rack {} to be connected to the next rack", + rack_id); } // this prints the exit chip coordinates for each rack // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) { for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) { - log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim, - std::get<0>(rack_exit_chip_coords.source_chip_coord), std::get<1>(rack_exit_chip_coords.source_chip_coord), - std::get<2>(rack_exit_chip_coords.source_chip_coord), std::get<3>(rack_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", + rack, + x_dim, + rack_exit_chip_coords.source_chip_coord.x, + rack_exit_chip_coords.source_chip_coord.y, + rack_exit_chip_coords.source_chip_coord.rack, + rack_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : 
rack_exit_chip_coords.destination_chip_coords) { - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + destination_chip_coord.rack, + destination_chip_coord.shelf); } } } } -void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { + DisjointSet chip_sets; + for (const auto &[chip, _] : desc.chip_locations) { + chip_sets.add_item(chip); + log_debug(LogSiliconDriver, "Adding chip {} to disjoint set", chip); + } + + for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { + for (const auto &[chan, dest_chip_chan_tuple] : chan_to_chip_chan_map) { + chip_sets.merge(chip, std::get<0>(dest_chip_chan_tuple)); + log_debug(LogSiliconDriver, "Merging chip {} and chip {}", chip, std::get<0>(dest_chip_chan_tuple)); + } + } + + for (const auto &[chip, chip_eth_coords] : desc.chip_locations) { + desc.chip_locations[chip].cluster_id = chip_sets.get_set(chip); + log_debug(LogSiliconDriver, "Chip {} belongs to cluster {}", chip, chip_sets.get_set(chip)); + } +} +void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) { chip_id_t chip_id = node->first.as(); desc.all_chips.insert(chip_id); @@ -538,19 +709,18 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y std::vector chip_rack_coords = node->second.as>(); log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4"); eth_coord_t chip_location{ - 
chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; + chip_id, chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; desc.chip_locations.insert({chip_id, chip_location}); - desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id; + desc.coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = chip_id; } - - for(const auto& chip : yaml["chips_with_mmio"]) { - if(chip.IsMap()) { + + for (const auto &chip : yaml["chips_with_mmio"]) { + if (chip.IsMap()) { const auto &chip_map = chip.as>(); const auto &chips = chip_map.begin(); desc.chips_with_mmio.insert({chips->first, chips->second}); - } - else { + } else { const auto &chip_val = chip.as(); desc.chips_with_mmio.insert({chip_val, chip_val}); } @@ -561,14 +731,14 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y LogSiliconDriver, "\tchip: {}, EthCoord(x={}, y={}, rack={}, shelf={})", chip_id, - std::get<0>(chip_location), - std::get<1>(chip_location), - std::get<2>(chip_location), - std::get<3>(chip_location)); + chip_location.x, + chip_location.y, + chip_location.rack, + chip_location.shelf); } - if (yaml["boardtype"]) { - for (const auto& chip_board_type : yaml["boardtype"].as>()) { + if (yaml["boardtype"]) { + for (const auto &chip_board_type : yaml["boardtype"].as>()) { auto &chip = chip_board_type.first; BoardType board_type; if (chip_board_type.second == "n150") { @@ -579,25 +749,28 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y board_type = BoardType::GALAXY; } else if (chip_board_type.second == "e150") { board_type = BoardType::E150; - } - else if (chip_board_type.second == "p150A") { + } else if (chip_board_type.second == "p150A") { board_type = BoardType::P150A; } else { - log_warning(LogSiliconDriver, "Unknown board type for 
chip {}. This might happen because chip is running old firmware. Defaulting to DEFAULT", chip); + log_warning( + LogSiliconDriver, + "Unknown board type for chip {}. This might happen because chip is running old firmware. " + "Defaulting to DEFAULT", + chip); board_type = BoardType::DEFAULT; } desc.chip_board_type.insert({chip, board_type}); } } else { - for (const auto& chip: desc.all_chips) { + for (const auto &chip : desc.all_chips) { desc.chip_board_type.insert({chip, BoardType::DEFAULT}); } } } void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc) { - if(yaml["harvesting"]) { - for (const auto& chip_node : yaml["harvesting"].as>()) { + if (yaml["harvesting"]) { + for (const auto &chip_node : yaml["harvesting"].as>()) { chip_id_t chip = chip_node.first; auto harvesting_info = chip_node.second; desc.noc_translation_enabled.insert({chip, harvesting_info["noc_translation"].as()}); @@ -606,9 +779,7 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::enable_all_devices() { - this->enabled_active_chips = this->all_chips; -} +void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { for (const auto &chip : this->all_chips) { @@ -618,8 +789,10 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { } } -const std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { - auto eth_connections = std::unordered_map > >(); +const std::unordered_map>> +tt_ClusterDescriptor::get_ethernet_connections() const { + auto eth_connections = std:: + unordered_map>>(); for (const auto &[chip, channel_mapping] : this->ethernet_connections) { if (this->enabled_active_chips.find(chip) != this->enabled_active_chips.end()) { @@ -635,7 +808,7 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip_locations() const { +const 
std::unordered_map &tt_ClusterDescriptor::get_chip_locations() const { static auto locations = std::unordered_map(); if (locations.empty() and !this->chip_locations.empty()) { for (auto chip_id : this->enabled_active_chips) { @@ -647,11 +820,14 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip } chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology. - // See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png - int x = std::get<0>(get_chip_locations().at(virtual_coord)); - int y = std::get<1>(get_chip_locations().at(virtual_coord)); + // See: + // https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png + int x = get_chip_locations().at(virtual_coord).x; + int y = get_chip_locations().at(virtual_coord).y; return 8 * x + y; } @@ -668,30 +844,31 @@ const std::unordered_map tt_ClusterDescriptor::get_chips_w return chips_map; } -const std::unordered_set& tt_ClusterDescriptor::get_all_chips() const { - return this->enabled_active_chips; -} +const std::unordered_set &tt_ClusterDescriptor::get_all_chips() const { return this->enabled_active_chips; } -const std::unordered_map& tt_ClusterDescriptor::get_harvesting_info() const { +const std::unordered_map &tt_ClusterDescriptor::get_harvesting_info() const { return harvesting_masks; } -const std::unordered_map& tt_ClusterDescriptor::get_noc_translation_table_en() const { +const std::unordered_map &tt_ClusterDescriptor::get_noc_translation_table_en() const { return 
noc_translation_enabled; } std::size_t tt_ClusterDescriptor::get_number_of_chips() const { return this->enabled_active_chips.size(); } int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); return this->get_ethernet_link_coord_distance(chip_locations.at(chip_a), chip_locations.at(chip_b)); } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - BoardType board_type = this->chip_board_type.at(chip_id); - return board_type; + BoardType board_type = this->chip_board_type.at(chip_id); + return board_type; } -const std::unordered_map>& tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { +const std::unordered_map> & +tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { return chips_grouped_by_closest_mmio; } diff --git a/device/tt_device.cpp b/device/tt_device.cpp new file mode 100644 index 00000000..071f6676 --- /dev/null +++ b/device/tt_device.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#ifdef TT_DEBUG_LOGGING +#define DEBUG_LOG(str) \ + do { \ + std::cout << str << std::endl; \ + } while (false) +#else +#define DEBUG_LOG(str) ((void)0) +#endif + +#include "tt_device.h" + +#include +#include +#include +#include +#include + +#include "device/tt_cluster_descriptor_types.h" +#include "yaml-cpp/yaml.h" + +//////// +// Device base +//////// +tt_device::tt_device() : soc_descriptor_per_chip({}) {} + +tt_device::~tt_device() {} + +const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { + return soc_descriptor_per_chip.at(chip_id); +} diff --git a/device/tt_silicon_driver_common.cpp b/device/tt_silicon_driver_common.cpp index 0b42f5a3..0d6c8b62 100644 --- a/device/tt_silicon_driver_common.cpp +++ b/device/tt_silicon_driver_common.cpp @@ -3,36 +3,37 @@ // SPDX-License-Identifier: Apache-2.0 #include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_xy_pair.h" + #include "umd/device/cluster.h" +#include "umd/device/tt_xy_pair.h" std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value) { std::string output; - if((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { output += "BRISC | "; } - if((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { output += "TRISC0 | "; } - if((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { output += "TRISC1 | "; } - if((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { output += "TRISC2 | "; } - if((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { + if ((value & 
TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { output += "NCRISC | "; } - if((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { output += "STAGGERED_START | "; } - if(output.empty()) { - output = "UNKNOWN"; - } else { - output.erase(output.end() - 3, output.end()); - } + if (output.empty()) { + output = "UNKNOWN"; + } else { + output.erase(output.end() - 3, output.end()); + } - return output; + return output; } diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 6d84a2b7..0aa80685 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "yaml-cpp/yaml.h" #include "umd/device/tt_soc_descriptor.h" #include + #include #include #include @@ -13,53 +13,54 @@ #include #include "fmt/core.h" +#include "utils.hpp" +#include "yaml-cpp/yaml.h" // #include "l1_address_map.h" std::string format_node(tt_xy_pair xy) { return fmt::format("{}-{}", xy.x, xy.y); } tt_xy_pair format_node(std::string str) { - int x_coord; - int y_coord; - std::regex expr("([0-9]+)[-,xX]([0-9]+)"); - std::smatch x_y_pair; - - if (std::regex_search(str, x_y_pair, expr)) { - x_coord = std::stoi(x_y_pair[1]); - y_coord = std::stoi(x_y_pair[2]); - } else { - throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); - } + int x_coord; + int y_coord; + std::regex expr("([0-9]+)[-,xX]([0-9]+)"); + std::smatch x_y_pair; + + if (std::regex_search(str, x_y_pair, expr)) { + x_coord = std::stoi(x_y_pair[1]); + y_coord = std::stoi(x_y_pair[2]); + } else { + throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); + } - tt_xy_pair xy(x_coord, y_coord); + tt_xy_pair xy(x_coord, y_coord); - return xy; + return xy; } -const char* ws = " \t\n\r\f\v"; + +const char *ws = " \t\n\r\f\v"; // trim from end of string (right) 
-inline std::string& rtrim(std::string& s, const char* t = ws) -{ +inline std::string &rtrim(std::string &s, const char *t = ws) { s.erase(s.find_last_not_of(t) + 1); return s; } // trim from beginning of string (left) -inline std::string& ltrim(std::string& s, const char* t = ws) -{ +inline std::string <rim(std::string &s, const char *t = ws) { s.erase(0, s.find_first_not_of(t)); return s; } // trim from both ends of string (right then left) -inline std::string& trim(std::string& s, const char* t = ws) -{ - return ltrim(rtrim(s, t), t); -} +inline std::string &trim(std::string &s, const char *t = ws) { return ltrim(rtrim(s, t), t); } void tt_SocDescriptor::load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml) { overlay_version = device_descriptor_yaml["features"]["overlay"]["version"].as(); - noc_translation_id_enabled = device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() : false; + noc_translation_id_enabled = + device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] + ? 
device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() + : false; packer_version = device_descriptor_yaml["features"]["packer"]["version"].as(); unpacker_version = device_descriptor_yaml["features"]["unpacker"]["version"].as(); dst_size_alignment = device_descriptor_yaml["features"]["math"]["dst_size_alignment"].as(); @@ -90,7 +91,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & } int current_dram_channel = 0; - for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); ++channel_it) { + for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); + ++channel_it) { dram_cores.push_back({}); auto &soc_dram_cores = dram_cores.at(dram_cores.size() - 1); const auto &dram_cores = (*channel_it).as>(); @@ -121,8 +123,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & std::vector worker_cores = device_descriptor_yaml["functional_workers"].as>(); std::set worker_routing_coords_x; std::set worker_routing_coords_y; - std::unordered_map routing_coord_worker_x; - std::unordered_map routing_coord_worker_y; + std::unordered_map routing_coord_worker_x; + std::unordered_map routing_coord_worker_y; for (const auto &core_string : worker_cores) { CoreDescriptor core_descriptor; core_descriptor.coord = format_node(core_string); @@ -137,12 +139,12 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & int func_x_start = 0; int func_y_start = 0; std::set::iterator it; - for (it=worker_routing_coords_x.begin(); it!=worker_routing_coords_x.end(); ++it) { + for (it = worker_routing_coords_x.begin(); it != worker_routing_coords_x.end(); ++it) { worker_log_to_routing_x[func_x_start] = *it; routing_x_to_worker_x[*it] = func_x_start; func_x_start++; } - for (it=worker_routing_coords_y.begin(); it!=worker_routing_coords_y.end(); ++it) { + for (it = 
worker_routing_coords_y.begin(); it != worker_routing_coords_y.end(); ++it) { worker_log_to_routing_y[func_y_start] = *it; routing_y_to_worker_y[*it] = func_y_start; func_y_start++; @@ -225,7 +227,8 @@ tt_virtual_coords tt_SocDescriptor::to_virtual_coords(tt_translated_coords trans tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask) { std::ifstream fdesc(device_descriptor_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); + throw std::runtime_error( + fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); } fdesc.close(); @@ -233,10 +236,12 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size auto grid_size_x = device_descriptor_yaml["grid"]["x_size"].as(); auto grid_size_y = device_descriptor_yaml["grid"]["y_size"].as(); - int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] ? - device_descriptor_yaml["physical"]["x_size"].as() : grid_size_x; - int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] ? - device_descriptor_yaml["physical"]["y_size"].as() : grid_size_y; + int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] + ? device_descriptor_yaml["physical"]["x_size"].as() + : grid_size_x; + int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] + ? 
device_descriptor_yaml["physical"]["y_size"].as() + : grid_size_y; load_core_descriptors_from_device_descriptor(device_descriptor_yaml); grid_size = tt_xy_pair(grid_size_x, grid_size_y); physical_grid_size = tt_xy_pair(physical_grid_size_x, physical_grid_size_y); @@ -251,7 +256,7 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size int tt_SocDescriptor::get_num_dram_channels() const { int num_channels = 0; - for (auto& dram_core : dram_cores) { + for (auto &dram_core : dram_cores) { if (dram_core.size() > 0) { num_channels++; } @@ -273,6 +278,22 @@ bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end(); } +std::string tt_SocDescriptor::get_soc_descriptor_path(tt::ARCH arch) { + switch (arch) { + case tt::ARCH::GRAYSKULL: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); + default: + throw std::runtime_error("Invalid architecture"); + } +} + std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::Invalid) { out << "none"; @@ -281,7 +302,7 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { } else if (arch_name == tt::ARCH::WORMHOLE_B0) { out << "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { - out << "blackhole"; //Just how many ARCH-to-string functions do we plan to have, anyway? 
+ out << "blackhole"; // Just how many ARCH-to-string functions do we plan to have, anyway? } else { out << "ArchNameSerializationNotImplemented"; } diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp index ddb088de..e9766d16 100644 --- a/device/wormhole/wormhole_coordinate_manager.cpp +++ b/device/wormhole/wormhole_coordinate_manager.cpp @@ -19,9 +19,11 @@ std::set WormholeCoordinateManager::get_y_coordinates_to_harvest(st } tt_translated_coords WormholeCoordinateManager::to_translated_coords(tt_logical_coords logical_coords) { - return tt_translated_coords(logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); + return tt_translated_coords( + logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); } tt_logical_coords WormholeCoordinateManager::to_logical_coords(tt_translated_coords translated_coords) { - return tt_logical_coords(translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); + return tt_logical_coords( + translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); } diff --git a/device/wormhole/wormhole_coordinate_manager.h b/device/wormhole/wormhole_coordinate_manager.h index 0c06d119..eda84809 100644 --- a/device/wormhole/wormhole_coordinate_manager.h +++ b/device/wormhole/wormhole_coordinate_manager.h @@ -9,16 +9,16 @@ #include "umd/device/coordinate_manager.h" class WormholeCoordinateManager : public CoordinateManager { - public: - WormholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + WormholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} 
tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask) override; private: diff --git a/device/wormhole/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp index c19e59fd..bd6e32e7 100644 --- a/device/wormhole/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -4,13 +4,12 @@ #include "umd/device/wormhole_implementation.h" -#include "wormhole/host_mem_address_map.h" -#include "wormhole/eth_interface.h" - #include "umd/device/cluster.h" +#include "wormhole/eth_interface.h" +#include "wormhole/host_mem_address_map.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH namespace tt::umd { @@ -98,7 +97,9 @@ std::pair wormhole_implementation::get_tlb_data( } tt_driver_host_address_params wormhole_implementation::get_host_address_params() const { - return {::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params wormhole_implementation::get_eth_interface_params() const { diff --git a/device/xy_pair.cpp b/device/xy_pair.cpp index 0559f31c..ff9b7f95 100644 --- a/device/xy_pair.cpp +++ b/device/xy_pair.cpp @@ -11,6 +11,7 @@ namespace tt::umd { std::string xy_pair::str() const { return fmt::format("(x={},y={})", x, y); } + std::string 
cxy_pair::str() const { return fmt::format("(chip={},x={},y={})", chip, x, y); } } // namespace tt::umd diff --git a/tests/.clang-format b/tests/.clang-format deleted file mode 100644 index 9d159247..00000000 --- a/tests/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/tests/api/cluster_descriptor_examples/blackhole_P150.yaml b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml new file mode 100644 index 00000000..06232d98 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Blackhole, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: null, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/galaxy.yaml b/tests/api/cluster_descriptor_examples/galaxy.yaml new file mode 100644 index 00000000..d2ca245c --- /dev/null +++ b/tests/api/cluster_descriptor_examples/galaxy.yaml @@ -0,0 +1,383 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, + 4: Wormhole, + 5: Wormhole, + 6: Wormhole, + 7: Wormhole, + 8: Wormhole, + 9: Wormhole, + 10: Wormhole, + 11: Wormhole, + 12: Wormhole, + 13: Wormhole, + 14: Wormhole, + 15: Wormhole, + 16: Wormhole, + 17: Wormhole, + 18: Wormhole, + 19: Wormhole, + 20: Wormhole, + 21: Wormhole, + 22: Wormhole, + 23: Wormhole, + 24: Wormhole, + 25: Wormhole, + 26: Wormhole, + 27: Wormhole, + 28: Wormhole, + 29: Wormhole, + 30: Wormhole, + 31: Wormhole, + 32: Wormhole, + 33: Wormhole, + 34: Wormhole, + 35: Wormhole, +} + +chips: { + 0: [0,3,0,0], + 1: [0,2,0,0], + 2: [0,1,0,0], + 3: [0,0,0,0], + 4: [3,6,0,1], + 5: [3,5,0,1], + 6: [2,5,0,1], + 7: [2,6,0,1], + 8: [1,6,0,1], + 9: [1,7,0,1], + 10: [2,7,0,1], + 11: [3,7,0,1], + 12: [0,7,0,1], + 13: [0,6,0,1], + 14: [0,5,0,1], + 15: [1,5,0,1], + 16: [1,4,0,1], + 17: [2,4,0,1], + 18: [3,4,0,1], + 19: [3,3,0,1], + 20: [2,3,0,1], + 21: [1,3,0,1], + 22: [1,2,0,1], + 23: [2,2,0,1], + 24: [3,2,0,1], + 25: [3,1,0,1], + 26: [2,1,0,1], + 27: [1,1,0,1], + 28: [1,0,0,1], + 29: [2,0,0,1], + 30: [3,0,0,1], + 31: [0,0,0,1], + 32: [0,1,0,1], + 33: [0,2,0,1], + 34: [0,3,0,1], + 35: [0,4,0,1], +} + +ethernet_connections: [ + [{chip: 0, chan: 6}, {chip: 11, chan: 12}], + [{chip: 0, chan: 7}, {chip: 4, chan: 12}], + [{chip: 1, chan: 6}, {chip: 5, chan: 12}], + [{chip: 1, chan: 7}, {chip: 18, chan: 12}], + [{chip: 2, chan: 6}, {chip: 19, chan: 12}], + [{chip: 2, chan: 7}, {chip: 24, chan: 12}], + [{chip: 3, chan: 6}, {chip: 25, chan: 12}], + [{chip: 3, chan: 7}, {chip: 30, chan: 12}], + [{chip: 4, chan: 0}, {chip: 11, chan: 0}], + [{chip: 4, chan: 1}, {chip: 11, chan: 1}], + [{chip: 4, 
chan: 2}, {chip: 11, chan: 2}], + [{chip: 4, chan: 3}, {chip: 11, chan: 3}], + [{chip: 4, chan: 4}, {chip: 7, chan: 12}], + [{chip: 4, chan: 5}, {chip: 7, chan: 13}], + [{chip: 4, chan: 6}, {chip: 7, chan: 14}], + [{chip: 4, chan: 7}, {chip: 7, chan: 15}], + [{chip: 4, chan: 8}, {chip: 5, chan: 8}], + [{chip: 4, chan: 9}, {chip: 5, chan: 9}], + [{chip: 4, chan: 10}, {chip: 5, chan: 10}], + [{chip: 4, chan: 11}, {chip: 5, chan: 11}], + [{chip: 5, chan: 0}, {chip: 18, chan: 0}], + [{chip: 5, chan: 1}, {chip: 18, chan: 1}], + [{chip: 5, chan: 2}, {chip: 18, chan: 2}], + [{chip: 5, chan: 3}, {chip: 18, chan: 3}], + [{chip: 5, chan: 4}, {chip: 6, chan: 12}], + [{chip: 5, chan: 5}, {chip: 6, chan: 13}], + [{chip: 5, chan: 6}, {chip: 6, chan: 14}], + [{chip: 5, chan: 7}, {chip: 6, chan: 15}], + [{chip: 6, chan: 0}, {chip: 17, chan: 0}], + [{chip: 6, chan: 1}, {chip: 17, chan: 1}], + [{chip: 6, chan: 2}, {chip: 17, chan: 2}], + [{chip: 6, chan: 3}, {chip: 17, chan: 3}], + [{chip: 6, chan: 4}, {chip: 15, chan: 12}], + [{chip: 6, chan: 5}, {chip: 15, chan: 13}], + [{chip: 6, chan: 6}, {chip: 15, chan: 14}], + [{chip: 6, chan: 7}, {chip: 15, chan: 15}], + [{chip: 6, chan: 8}, {chip: 7, chan: 8}], + [{chip: 6, chan: 9}, {chip: 7, chan: 9}], + [{chip: 6, chan: 10}, {chip: 7, chan: 10}], + [{chip: 6, chan: 11}, {chip: 7, chan: 11}], + [{chip: 7, chan: 0}, {chip: 10, chan: 0}], + [{chip: 7, chan: 1}, {chip: 10, chan: 1}], + [{chip: 7, chan: 2}, {chip: 10, chan: 2}], + [{chip: 7, chan: 3}, {chip: 10, chan: 3}], + [{chip: 7, chan: 4}, {chip: 8, chan: 12}], + [{chip: 7, chan: 5}, {chip: 8, chan: 13}], + [{chip: 7, chan: 6}, {chip: 8, chan: 14}], + [{chip: 7, chan: 7}, {chip: 8, chan: 15}], + [{chip: 8, chan: 0}, {chip: 15, chan: 0}], + [{chip: 8, chan: 1}, {chip: 15, chan: 1}], + [{chip: 8, chan: 2}, {chip: 15, chan: 2}], + [{chip: 8, chan: 3}, {chip: 15, chan: 3}], + [{chip: 8, chan: 4}, {chip: 13, chan: 12}], + [{chip: 8, chan: 5}, {chip: 13, chan: 13}], + [{chip: 8, chan: 6}, 
{chip: 13, chan: 14}], + [{chip: 8, chan: 7}, {chip: 13, chan: 15}], + [{chip: 8, chan: 8}, {chip: 9, chan: 8}], + [{chip: 8, chan: 9}, {chip: 9, chan: 9}], + [{chip: 8, chan: 10}, {chip: 9, chan: 10}], + [{chip: 8, chan: 11}, {chip: 9, chan: 11}], + [{chip: 9, chan: 4}, {chip: 12, chan: 12}], + [{chip: 9, chan: 5}, {chip: 12, chan: 13}], + [{chip: 9, chan: 6}, {chip: 12, chan: 14}], + [{chip: 9, chan: 7}, {chip: 12, chan: 15}], + [{chip: 9, chan: 12}, {chip: 10, chan: 4}], + [{chip: 9, chan: 13}, {chip: 10, chan: 5}], + [{chip: 9, chan: 14}, {chip: 10, chan: 6}], + [{chip: 9, chan: 15}, {chip: 10, chan: 7}], + [{chip: 10, chan: 12}, {chip: 11, chan: 4}], + [{chip: 10, chan: 13}, {chip: 11, chan: 5}], + [{chip: 10, chan: 14}, {chip: 11, chan: 6}], + [{chip: 10, chan: 15}, {chip: 11, chan: 7}], + [{chip: 12, chan: 8}, {chip: 13, chan: 8}], + [{chip: 12, chan: 9}, {chip: 13, chan: 9}], + [{chip: 12, chan: 10}, {chip: 13, chan: 10}], + [{chip: 12, chan: 11}, {chip: 13, chan: 11}], + [{chip: 13, chan: 0}, {chip: 14, chan: 0}], + [{chip: 13, chan: 1}, {chip: 14, chan: 1}], + [{chip: 13, chan: 2}, {chip: 14, chan: 2}], + [{chip: 13, chan: 3}, {chip: 14, chan: 3}], + [{chip: 14, chan: 8}, {chip: 35, chan: 8}], + [{chip: 14, chan: 9}, {chip: 35, chan: 9}], + [{chip: 14, chan: 10}, {chip: 35, chan: 10}], + [{chip: 14, chan: 11}, {chip: 35, chan: 11}], + [{chip: 14, chan: 12}, {chip: 15, chan: 4}], + [{chip: 14, chan: 13}, {chip: 15, chan: 5}], + [{chip: 14, chan: 14}, {chip: 15, chan: 6}], + [{chip: 14, chan: 15}, {chip: 15, chan: 7}], + [{chip: 15, chan: 8}, {chip: 16, chan: 8}], + [{chip: 15, chan: 9}, {chip: 16, chan: 9}], + [{chip: 15, chan: 10}, {chip: 16, chan: 10}], + [{chip: 15, chan: 11}, {chip: 16, chan: 11}], + [{chip: 16, chan: 0}, {chip: 21, chan: 0}], + [{chip: 16, chan: 1}, {chip: 21, chan: 1}], + [{chip: 16, chan: 2}, {chip: 21, chan: 2}], + [{chip: 16, chan: 3}, {chip: 21, chan: 3}], + [{chip: 16, chan: 4}, {chip: 35, chan: 12}], + [{chip: 16, chan: 5}, 
{chip: 35, chan: 13}], + [{chip: 16, chan: 6}, {chip: 35, chan: 14}], + [{chip: 16, chan: 7}, {chip: 35, chan: 15}], + [{chip: 16, chan: 12}, {chip: 17, chan: 4}], + [{chip: 16, chan: 13}, {chip: 17, chan: 5}], + [{chip: 16, chan: 14}, {chip: 17, chan: 6}], + [{chip: 16, chan: 15}, {chip: 17, chan: 7}], + [{chip: 17, chan: 8}, {chip: 20, chan: 8}], + [{chip: 17, chan: 9}, {chip: 20, chan: 9}], + [{chip: 17, chan: 10}, {chip: 20, chan: 10}], + [{chip: 17, chan: 11}, {chip: 20, chan: 11}], + [{chip: 17, chan: 12}, {chip: 18, chan: 4}], + [{chip: 17, chan: 13}, {chip: 18, chan: 5}], + [{chip: 17, chan: 14}, {chip: 18, chan: 6}], + [{chip: 17, chan: 15}, {chip: 18, chan: 7}], + [{chip: 18, chan: 8}, {chip: 19, chan: 8}], + [{chip: 18, chan: 9}, {chip: 19, chan: 9}], + [{chip: 18, chan: 10}, {chip: 19, chan: 10}], + [{chip: 18, chan: 11}, {chip: 19, chan: 11}], + [{chip: 19, chan: 0}, {chip: 24, chan: 0}], + [{chip: 19, chan: 1}, {chip: 24, chan: 1}], + [{chip: 19, chan: 2}, {chip: 24, chan: 2}], + [{chip: 19, chan: 3}, {chip: 24, chan: 3}], + [{chip: 19, chan: 4}, {chip: 20, chan: 12}], + [{chip: 19, chan: 5}, {chip: 20, chan: 13}], + [{chip: 19, chan: 6}, {chip: 20, chan: 14}], + [{chip: 19, chan: 7}, {chip: 20, chan: 15}], + [{chip: 20, chan: 0}, {chip: 23, chan: 0}], + [{chip: 20, chan: 1}, {chip: 23, chan: 1}], + [{chip: 20, chan: 2}, {chip: 23, chan: 2}], + [{chip: 20, chan: 3}, {chip: 23, chan: 3}], + [{chip: 20, chan: 4}, {chip: 21, chan: 12}], + [{chip: 20, chan: 5}, {chip: 21, chan: 13}], + [{chip: 20, chan: 6}, {chip: 21, chan: 14}], + [{chip: 20, chan: 7}, {chip: 21, chan: 15}], + [{chip: 21, chan: 4}, {chip: 34, chan: 12}], + [{chip: 21, chan: 5}, {chip: 34, chan: 13}], + [{chip: 21, chan: 6}, {chip: 34, chan: 14}], + [{chip: 21, chan: 7}, {chip: 34, chan: 15}], + [{chip: 21, chan: 8}, {chip: 22, chan: 8}], + [{chip: 21, chan: 9}, {chip: 22, chan: 9}], + [{chip: 21, chan: 10}, {chip: 22, chan: 10}], + [{chip: 21, chan: 11}, {chip: 22, chan: 11}], + [{chip: 
22, chan: 0}, {chip: 27, chan: 0}], + [{chip: 22, chan: 1}, {chip: 27, chan: 1}], + [{chip: 22, chan: 2}, {chip: 27, chan: 2}], + [{chip: 22, chan: 3}, {chip: 27, chan: 3}], + [{chip: 22, chan: 4}, {chip: 33, chan: 12}], + [{chip: 22, chan: 5}, {chip: 33, chan: 13}], + [{chip: 22, chan: 6}, {chip: 33, chan: 14}], + [{chip: 22, chan: 7}, {chip: 33, chan: 15}], + [{chip: 22, chan: 12}, {chip: 23, chan: 4}], + [{chip: 22, chan: 13}, {chip: 23, chan: 5}], + [{chip: 22, chan: 14}, {chip: 23, chan: 6}], + [{chip: 22, chan: 15}, {chip: 23, chan: 7}], + [{chip: 23, chan: 8}, {chip: 26, chan: 8}], + [{chip: 23, chan: 9}, {chip: 26, chan: 9}], + [{chip: 23, chan: 10}, {chip: 26, chan: 10}], + [{chip: 23, chan: 11}, {chip: 26, chan: 11}], + [{chip: 23, chan: 12}, {chip: 24, chan: 4}], + [{chip: 23, chan: 13}, {chip: 24, chan: 5}], + [{chip: 23, chan: 14}, {chip: 24, chan: 6}], + [{chip: 23, chan: 15}, {chip: 24, chan: 7}], + [{chip: 24, chan: 8}, {chip: 25, chan: 8}], + [{chip: 24, chan: 9}, {chip: 25, chan: 9}], + [{chip: 24, chan: 10}, {chip: 25, chan: 10}], + [{chip: 24, chan: 11}, {chip: 25, chan: 11}], + [{chip: 25, chan: 0}, {chip: 30, chan: 0}], + [{chip: 25, chan: 1}, {chip: 30, chan: 1}], + [{chip: 25, chan: 2}, {chip: 30, chan: 2}], + [{chip: 25, chan: 3}, {chip: 30, chan: 3}], + [{chip: 25, chan: 4}, {chip: 26, chan: 12}], + [{chip: 25, chan: 5}, {chip: 26, chan: 13}], + [{chip: 25, chan: 6}, {chip: 26, chan: 14}], + [{chip: 25, chan: 7}, {chip: 26, chan: 15}], + [{chip: 26, chan: 0}, {chip: 29, chan: 0}], + [{chip: 26, chan: 1}, {chip: 29, chan: 1}], + [{chip: 26, chan: 2}, {chip: 29, chan: 2}], + [{chip: 26, chan: 3}, {chip: 29, chan: 3}], + [{chip: 26, chan: 4}, {chip: 27, chan: 12}], + [{chip: 26, chan: 5}, {chip: 27, chan: 13}], + [{chip: 26, chan: 6}, {chip: 27, chan: 14}], + [{chip: 26, chan: 7}, {chip: 27, chan: 15}], + [{chip: 27, chan: 4}, {chip: 32, chan: 12}], + [{chip: 27, chan: 5}, {chip: 32, chan: 13}], + [{chip: 27, chan: 6}, {chip: 32, chan: 14}], 
+ [{chip: 27, chan: 7}, {chip: 32, chan: 15}], + [{chip: 27, chan: 8}, {chip: 28, chan: 8}], + [{chip: 27, chan: 9}, {chip: 28, chan: 9}], + [{chip: 27, chan: 10}, {chip: 28, chan: 10}], + [{chip: 27, chan: 11}, {chip: 28, chan: 11}], + [{chip: 28, chan: 4}, {chip: 31, chan: 12}], + [{chip: 28, chan: 5}, {chip: 31, chan: 13}], + [{chip: 28, chan: 6}, {chip: 31, chan: 14}], + [{chip: 28, chan: 7}, {chip: 31, chan: 15}], + [{chip: 28, chan: 12}, {chip: 29, chan: 4}], + [{chip: 28, chan: 13}, {chip: 29, chan: 5}], + [{chip: 28, chan: 14}, {chip: 29, chan: 6}], + [{chip: 28, chan: 15}, {chip: 29, chan: 7}], + [{chip: 29, chan: 12}, {chip: 30, chan: 4}], + [{chip: 29, chan: 13}, {chip: 30, chan: 5}], + [{chip: 29, chan: 14}, {chip: 30, chan: 6}], + [{chip: 29, chan: 15}, {chip: 30, chan: 7}], + [{chip: 31, chan: 8}, {chip: 32, chan: 8}], + [{chip: 31, chan: 9}, {chip: 32, chan: 9}], + [{chip: 31, chan: 10}, {chip: 32, chan: 10}], + [{chip: 31, chan: 11}, {chip: 32, chan: 11}], + [{chip: 32, chan: 0}, {chip: 33, chan: 0}], + [{chip: 32, chan: 1}, {chip: 33, chan: 1}], + [{chip: 32, chan: 2}, {chip: 33, chan: 2}], + [{chip: 32, chan: 3}, {chip: 33, chan: 3}], + [{chip: 33, chan: 8}, {chip: 34, chan: 8}], + [{chip: 33, chan: 9}, {chip: 34, chan: 9}], + [{chip: 33, chan: 10}, {chip: 34, chan: 10}], + [{chip: 33, chan: 11}, {chip: 34, chan: 11}], + [{chip: 34, chan: 0}, {chip: 35, chan: 0}], + [{chip: 34, chan: 1}, {chip: 35, chan: 1}], + [{chip: 34, chan: 2}, {chip: 35, chan: 2}], + [{chip: 34, chan: 3}, {chip: 35, chan: 3}], +] + +chips_with_mmio: [ + 0: 0, + 1: 1, + 2: 2, + 3: 3, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... 
+harvesting: { + 0: {noc_translation: true, harvest_mask: 1}, + 1: {noc_translation: true, harvest_mask: 1}, + 2: {noc_translation: true, harvest_mask: 4}, + 3: {noc_translation: true, harvest_mask: 8}, + 4: {noc_translation: true, harvest_mask: 0}, + 5: {noc_translation: true, harvest_mask: 0}, + 6: {noc_translation: true, harvest_mask: 0}, + 7: {noc_translation: true, harvest_mask: 0}, + 8: {noc_translation: true, harvest_mask: 0}, + 9: {noc_translation: true, harvest_mask: 0}, + 10: {noc_translation: true, harvest_mask: 0}, + 11: {noc_translation: true, harvest_mask: 0}, + 12: {noc_translation: true, harvest_mask: 0}, + 13: {noc_translation: true, harvest_mask: 0}, + 14: {noc_translation: true, harvest_mask: 0}, + 15: {noc_translation: true, harvest_mask: 0}, + 16: {noc_translation: true, harvest_mask: 0}, + 17: {noc_translation: true, harvest_mask: 0}, + 18: {noc_translation: true, harvest_mask: 0}, + 19: {noc_translation: true, harvest_mask: 0}, + 20: {noc_translation: true, harvest_mask: 0}, + 21: {noc_translation: true, harvest_mask: 0}, + 22: {noc_translation: true, harvest_mask: 0}, + 23: {noc_translation: true, harvest_mask: 0}, + 24: {noc_translation: true, harvest_mask: 0}, + 25: {noc_translation: true, harvest_mask: 0}, + 26: {noc_translation: true, harvest_mask: 0}, + 27: {noc_translation: true, harvest_mask: 0}, + 28: {noc_translation: true, harvest_mask: 0}, + 29: {noc_translation: true, harvest_mask: 0}, + 30: {noc_translation: true, harvest_mask: 0}, + 31: {noc_translation: true, harvest_mask: 0}, + 32: {noc_translation: true, harvest_mask: 0}, + 33: {noc_translation: true, harvest_mask: 0}, + 34: {noc_translation: true, harvest_mask: 0}, + 35: {noc_translation: true, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n150, + 1: n150, + 2: n150, + 3: n150, + 4: GALAXY, + 5: GALAXY, + 6: GALAXY, + 7: GALAXY, + 8: GALAXY, + 9: GALAXY, + 10: GALAXY, + 11: GALAXY, + 12: GALAXY, + 13: GALAXY, + 14: GALAXY, + 15: GALAXY, + 16: GALAXY, + 17: GALAXY, + 18: GALAXY, + 19: GALAXY, + 20: GALAXY, + 21: GALAXY, + 22: GALAXY, + 23: GALAXY, + 24: GALAXY, + 25: GALAXY, + 26: GALAXY, + 27: GALAXY, + 28: GALAXY, + 29: GALAXY, + 30: GALAXY, + 31: GALAXY, + 32: GALAXY, + 33: GALAXY, + 34: GALAXY, + 35: GALAXY, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E150.yaml b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml new file mode 100644 index 00000000..6545cdad --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E300.yaml b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml new file mode 100644 index 00000000..16a57168 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... 
+harvesting: { + 0: {noc_translation: false, harvest_mask: 514}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml new file mode 100644 index 00000000..896888d0 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml @@ -0,0 +1,41 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [0,0,0,0], + 2: [1,0,0,0], + 3: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 2, chan: 0}], + [{chip: 0, chan: 9}, {chip: 2, chan: 1}], + [{chip: 1, chan: 8}, {chip: 3, chan: 0}], + [{chip: 1, chan: 9}, {chip: 3, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, + 1: 1, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 3}, + 2: {noc_translation: true, harvest_mask: 5}, + 3: {noc_translation: true, harvest_mask: 33}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, + 2: n300, + 3: n300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N150.yaml b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml new file mode 100644 index 00000000..c2dd123a --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml @@ -0,0 +1,24 @@ +arch: { + 0: Wormhole, +} + +chips: { + 0: [0,0,0,0], +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 32}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: n150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N300.yaml b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml new file mode 100644 index 00000000..78f7822a --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml @@ -0,0 +1,30 @@ +arch: { + 0: Wormhole, + 1: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 1, chan: 0}], + [{chip: 0, chan: 9}, {chip: 1, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 5}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, +} \ No newline at end of file diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp index d15ff66d..2e9a268f 100644 --- a/tests/api/test_chip.cpp +++ b/tests/api/test_chip.cpp @@ -5,55 +5,23 @@ // This file holds Chip specific API examples. #include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" // TODO: change to tt_cluster +#include "umd/device/architecture_implementation.h" #include "umd/device/cluster.h" #include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/architecture_implementation.h" using namespace tt::umd; -inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; - // TODO: remove getting manually cluster descriptor from yaml. 
- std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - - return cluster_desc; -} - -inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &umd_cluster) { +inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr& umd_cluster) { chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); tt_xy_pair core = soc_desc.workers[0]; @@ -61,61 +29,12 @@ inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &um } inline std::unique_ptr get_cluster() { - - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. 
- yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. - std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); - - - // TODO: This would be incorporated inside SocDescriptor. - std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - - // TODO: Don't pass each of these arguments. - return std::unique_ptr(new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + return std::unique_ptr(new Cluster()); } // TODO: Once default auto TLB setup is in, check it is setup properly. @@ -123,8 +42,7 @@ TEST(ApiChipTest, ManualTLBConfiguration) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; } // Expect to throw for remote chip for any worker core @@ -150,16 +68,17 @@ TEST(ApiChipTest, ManualTLBConfiguration) { if (!is_worker_core) { return -1; } - return core.x + core.y * umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); + return core.x + + core.y * + umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); }; std::int32_t c_zero_address = 0; // Each MMIO chip has it's own set of TLBs, so needs its own configuration. - for (chip_id_t mmio_chip: umd_cluster->get_target_mmio_device_ids()) { - + for (chip_id_t mmio_chip : umd_cluster->get_target_mmio_device_ids()) { const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(mmio_chip); - for (tt_xy_pair core: soc_desc.workers) { + for (tt_xy_pair core : soc_desc.workers) { umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address); } @@ -183,8 +102,7 @@ TEST(ApiChipTest, SimpleAPIShowcase) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } chip_id_t chip_id = umd_cluster->get_cluster_description()->get_chips_with_mmio().begin()->first; @@ -198,7 +116,11 @@ TEST(ApiChipTest, SimpleAPIShowcase) { // It reads back the risc reset reg to validate TEST(ApiChipTest, DeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); - + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); @@ -218,6 +140,10 @@ TEST(ApiChipTest, DeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); @@ -236,6 +162,10 @@ TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyIllegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index c6f1285a..15aa28d2 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -13,15 +13,14 @@ #include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" - -#include "umd/device/tt_cluster_descriptor.h" #include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" // TODO: obviously we need some other way to set this up +#include "noc/noc_parameters.h" #include "src/firmware/riscv/wormhole/eth_l1_address_map.h" #include "src/firmware/riscv/wormhole/host_mem_address_map.h" #include "src/firmware/riscv/wormhole/l1_address_map.h" -#include "noc/noc_parameters.h" using namespace tt::umd; @@ -30,93 +29,13 @@ using namespace tt::umd; // N150. N300 // Galaxy -// TODO: This function should not exist, the API itself should be simple enough. 
-inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - std::unique_ptr cluster_desc; - // TODO: remove getting manually cluster descriptor from yaml. - std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - - return cluster_desc; -} - -// TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. 
std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. - yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. - std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set(detected_num_chips.begin(), detected_num_chips.end()); - - // TODO: This would be incorporated inside SocDescriptor. - std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - // TODO: Don't pass each of these arguments. 
- return std::unique_ptr( - new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + return std::unique_ptr(new Cluster()); } // TODO: Should not be wormhole specific. @@ -128,11 +47,9 @@ void setup_wormhole_remote(Cluster* umd_cluster) { // Populate address map and NOC parameters that the driver needs for remote transactions umd_cluster->set_device_l1_address_params( - { - l1_mem::address_map::L1_BARRIER_BASE, + {l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, - eth_l1_mem::address_map::FW_VERSION_ADDR - }); + eth_l1_mem::address_map::FW_VERSION_ADDR}); } } @@ -140,12 +57,12 @@ void setup_wormhole_remote(Cluster* umd_cluster) { TEST(ApiClusterTest, OpenAllChips) { std::unique_ptr umd_cluster = get_cluster(); } TEST(ApiClusterTest, SimpleIOAllChips) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } // Initialize random data. @@ -198,12 +115,12 @@ TEST(ApiClusterTest, SimpleIOAllChips) { } TEST(ApiClusterTest, RemoteFlush) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } size_t data_size = 1024; @@ -256,3 +173,61 @@ TEST(ApiClusterTest, RemoteFlush) { std::cout << "Testing whole cluster wait for remote chip flush again, should be no-op." 
<< std::endl; umd_cluster->wait_for_non_mmio_flush(); } + +TEST(ApiClusterTest, SimpleIOSpecificChips) { + std::unique_ptr umd_cluster = std::make_unique(0); + + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + + // Initialize random data. + size_t data_size = 1024; + std::vector data(data_size, 0); + for (int i = 0; i < data_size; i++) { + data[i] = i % 256; + } + + // TODO: this should be part of constructor if it is mandatory. + setup_wormhole_remote(umd_cluster.get()); + + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; + continue; + } + + std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; + + umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + } + + // Now read back the data. + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; + continue; + } + + std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; + + std::vector readback_data(data_size, 0); + umd_cluster->read_from_device(readback_data.data(), any_core_global, 0, data_size, "LARGE_READ_TLB"); + + ASSERT_EQ(data, readback_data); + } +} diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index 867388ae..a6328b95 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -5,44 +5,22 @@ #include #include -#include #include +#include +#include "disjoint_set.hpp" #include "tests/test_utils/generate_cluster_desc.hpp" - #include "umd/device/pci_device.hpp" #include "umd/device/tt_cluster_descriptor.h" // TODO: Needed for detect_arch, remove when it is part of cluster descriptor. #include "umd/device/cluster.h" - inline std::unique_ptr get_cluster_desc() { - - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - // TODO: This test requires knowledge of the device architecture, which should not be true. - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - int physical_device_id = pci_device_ids[0]; - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; // TODO: remove getting manually cluster descriptor from yaml. 
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - return cluster_desc; + return tt_ClusterDescriptor::create_from_yaml(yaml_path); } TEST(ApiClusterDescriptorTest, DetectArch) { @@ -65,11 +43,10 @@ TEST(ApiClusterDescriptorTest, DetectArch) { } TEST(ApiClusterDescriptorTest, BasicFunctionality) { - std::unique_ptr cluster_desc = get_cluster_desc(); if (cluster_desc == nullptr) { - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } std::unordered_set all_chips = cluster_desc->get_all_chips(); @@ -77,7 +54,7 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); std::unordered_set local_chips; - for (auto [chip, _]: local_chips_to_pci_device_id) { + for (auto [chip, _] : local_chips_to_pci_device_id) { local_chips.insert(chip); } std::unordered_set remote_chips; @@ -87,65 +64,58 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { } } - std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); } -// A standard disjoint set data structure to track connected components. 
-class DisjointSet { - public: - void add_item(int item) { - parent[item] = item; +TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { + for (std::string cluster_desc_yaml : { + "blackhole_P150.yaml", + "galaxy.yaml", + "grayskull_E150.yaml", + "grayskull_E300.yaml", + "wormhole_2xN300_unconnected.yaml", + "wormhole_N150.yaml", + "wormhole_N300.yaml", + }) { + std::cout << "Testing " << cluster_desc_yaml << std::endl; + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/" + cluster_desc_yaml)); + + std::unordered_set all_chips = cluster_desc->get_all_chips(); + std::unordered_map harvesting_for_chips = cluster_desc->get_harvesting_info(); + std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); + std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); + std::unordered_set local_chips; + for (auto [chip, _] : local_chips_to_pci_device_id) { + local_chips.insert(chip); } - - int get_parent(int item) { - while (parent[item] != item) { - item = parent[item]; + std::unordered_set remote_chips; + for (auto chip : all_chips) { + if (local_chips.find(chip) == local_chips.end()) { + remote_chips.insert(chip); } - return item; } - void merge(int item1, int item2) { - int parent1 = get_parent(item1); - int parent2 = get_parent(item2); - parent[parent1] = parent2; - } - - bool are_same_set(int item1, int item2) { - return get_parent(item1) == get_parent(item2); - } - - int get_num_sets() { - std::unordered_set sets; - for (auto [item, _]: parent) { - sets.insert(get_parent(item)); - } - return sets.size(); - } - - private: - std::unordered_map parent; -}; + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); + } +} -// This tests fails on a machine with multiple cards. -// It works as long as all the devices that are discoverable are connected through ethernet. 
-// Our ClusterDescriptor doesn't have a notion of multiple unconnected clusters of cards. TEST(ApiClusterDescriptorTest, SeparateClusters) { - std::unique_ptr cluster_desc = get_cluster_desc(); - - if (cluster_desc == nullptr) { - return; - } + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); auto all_chips = cluster_desc->get_all_chips(); - DisjointSet chip_clusters; + DisjointSet chip_clusters; for (auto chip : all_chips) { chip_clusters.add_item(chip); } // Merge into clusters of chips. - for (auto connection: cluster_desc->get_ethernet_connections()) { + for (auto connection : cluster_desc->get_ethernet_connections()) { chip_id_t chip = connection.first; - for (auto [channel, remote_chip_and_channel]: connection.second) { + for (auto [channel, remote_chip_and_channel] : connection.second) { chip_id_t remote_chip = std::get<0>(remote_chip_and_channel); chip_clusters.merge(chip, remote_chip); } diff --git a/tests/api/test_mockup_device.cpp b/tests/api/test_mockup_device.cpp index d687075e..bb8001ea 100644 --- a/tests/api/test_mockup_device.cpp +++ b/tests/api/test_mockup_device.cpp @@ -11,8 +11,8 @@ #include #include "device/mockup/tt_mockup_device.hpp" -#include "umd/device/tt_arch_types.h" #include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/tt_arch_types.h" namespace test::mockup_device { @@ -25,14 +25,18 @@ std::string get_env_arch_name() { } tt::ARCH get_arch_from_string(const std::string &arch_str) { - if (arch_str == "grayskull" || arch_str == "GRAYSKULL") + if (arch_str == "grayskull" || arch_str == "GRAYSKULL") { return tt::ARCH::GRAYSKULL; - if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") + } + if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") { return tt::ARCH::WORMHOLE_B0; - if (arch_str == "blackhole" || arch_str == "BLACKHOLE") + } + if (arch_str == "blackhole" || arch_str == 
"BLACKHOLE") { return tt::ARCH::BLACKHOLE; - if (arch_str == "Invalid" || arch_str == "INVALID") + } + if (arch_str == "Invalid" || arch_str == "INVALID") { return tt::ARCH::Invalid; + } throw std::runtime_error(arch_str + " is not recognized as tt::ARCH."); } @@ -41,11 +45,16 @@ std::string get_soc_descriptor_file(tt::ARCH arch) { // const std::string umd_root = get_umd_root(); switch (arch) { - case tt::ARCH::GRAYSKULL: return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - case tt::ARCH::WORMHOLE_B0: return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - case tt::ARCH::BLACKHOLE: return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); - case tt::ARCH::Invalid: throw std::runtime_error("Invalid arch not supported"); - default: throw std::runtime_error("Unsupported device architecture"); + case tt::ARCH::GRAYSKULL: + return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); + case tt::ARCH::Invalid: + throw std::runtime_error("Invalid arch not supported"); + default: + throw std::runtime_error("Unsupported device architecture"); } } diff --git a/tests/api/test_soc_descriptor_bh.cpp b/tests/api/test_soc_descriptor_bh.cpp index 5234a7c0..7007a98f 100644 --- a/tests/api/test_soc_descriptor_bh.cpp +++ b/tests/api/test_soc_descriptor_bh.cpp @@ -4,11 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - +#include "umd/device/tt_soc_descriptor.h" // Blackhole workers - x-y annotation // functional_workers: @@ -28,8 +26,8 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. 
TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -38,7 +36,7 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); } @@ -49,7 +47,8 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 2) and (2, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -65,13 +64,12 @@ TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -97,7 +95,7 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -107,13 +105,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -149,13 +146,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { // Test logical to translated coordinate translation. 
// For the full grid of logical coordinates we expect that there are no duplicates of translated coordinates. -// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are +// returned as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_translated; @@ -170,7 +166,8 @@ TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); logical_to_translated[logical_coords] = translated_coords; - // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated coordinates. + // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated + // coordinates. 
EXPECT_EQ(translated_coords_set.count(translated_coords), 0); translated_coords_set.insert(translated_coords); } @@ -196,7 +193,7 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + std::size_t num_harvested_x = test_utils::get_num_harvested(harvesting_mask); for (std::size_t x = 0; x < soc_desc.worker_grid_size.x - num_harvested_x; x++) { @@ -209,5 +206,5 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { EXPECT_EQ(translated_coords, virtual_coords); } } - } + } } diff --git a/tests/api/test_soc_descriptor_gs.cpp b/tests/api/test_soc_descriptor_gs.cpp index c697a59d..b5cabc7c 100644 --- a/tests/api/test_soc_descriptor_gs.cpp +++ b/tests/api/test_soc_descriptor_gs.cpp @@ -4,10 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" +#include "umd/device/tt_soc_descriptor.h" // Grayskull workers - x-y annotation // functional_workers: @@ -27,7 +26,6 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); // We expect full grid size since there is no harvesting. 
@@ -37,7 +35,7 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); } @@ -48,7 +46,6 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorGSTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -75,7 +72,7 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); - + // Virtual, physical and translated coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); EXPECT_EQ(physical_coords, translated_coords); @@ -85,9 +82,9 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_physical; @@ -111,7 +108,7 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -120,9 +117,9 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalVirtualMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_virtual; diff --git a/tests/api/test_soc_descriptor_wh.cpp b/tests/api/test_soc_descriptor_wh.cpp index 2e8f5367..a10afbdc 100644 --- a/tests/api/test_soc_descriptor_wh.cpp +++ b/tests/api/test_soc_descriptor_wh.cpp @@ -4,34 +4,32 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - +#include "umd/device/tt_soc_descriptor.h" // Wormhole workers - x-y annotation // functional_workers: // [ -// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, -// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, -// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, -// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, -// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, -// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, -// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, -// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, -// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, -// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, +// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, +// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, +// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, +// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, +// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, +// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, +// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, +// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, +// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, +// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, // ] // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. 
TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { - const std::size_t harvesting_mask = 0; - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -51,10 +49,10 @@ TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { - const std::size_t harvesting_mask = 1; - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -70,13 +68,12 @@ TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -96,8 +93,9 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { physical_coords_set.insert(physical_coords); } } - - // Expect that the number of physical coordinates is equal to the number of workers minus the number of harvested rows. + + // Expect that the number of physical coordinates is equal to the number of workers minus the number of + // harvested rows. EXPECT_EQ(physical_coords_set.size(), worker_grid_size.x * (worker_grid_size.y - num_harvested_y)); for (auto it : logical_to_physical) { @@ -113,13 +111,12 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -153,17 +150,18 @@ TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { // Test top left corner translation from logical to translated coordinates. TEST(SocDescriptor, SocDescriptorWHLogicalTranslatedTopLeft) { - const std::size_t translated_x_start = 18; const std::size_t translated_y_start = 18; - const tt_translated_coords expected_translated_coords = tt_translated_coords(translated_x_start, translated_y_start); + const tt_translated_coords expected_translated_coords = + tt_translated_coords(translated_x_start, translated_y_start); const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); - // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if everything is harvested. + // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if + // everything is harvested. 
for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y) - 1; harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; std::size_t num_harvested_y = test_utils::get_num_harvested(harvesting_mask); diff --git a/tests/blackhole/test_bh_common.h b/tests/blackhole/test_bh_common.h index a84b2cdd..0297b191 100644 --- a/tests/blackhole/test_bh_common.h +++ b/tests/blackhole/test_bh_common.h @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "eth_l1_address_map.h" +#include "tests/test_utils/stimulus_generators.hpp" +#include "tt_cluster_descriptor.h" +#include "tt_xy_pair.h" +#include "umd/device/cluster.h" using namespace tt::umd; @@ -16,68 +15,68 @@ namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class BlackholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. - std::unique_ptr device; + std::unique_ptr device; - BlackholeTestFixture() { + BlackholeTestFixture() {} - } - - ~BlackholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. 
- } + ~BlackholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. + } - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - // std::cout << "Setting Up Test." << std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + // std::cout << "Setting Up Test." 
<< std::endl; + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - set_params_for_remote_txn(*device); + set_params_for_remote_txn(*device); - tt_device_params default_params; - device->start_device(default_params); + tt_device_params default_params; + device->start_device(default_params); - device->deassert_risc_reset(); + device->deassert_risc_reset(); - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." << std::endl; - device->close_device(); + if (!is_test_skipped()) { + // std::cout << "Tearing Down Test." 
<< std::endl; + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index 1ac75e65..735bad0d 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -2,30 +2,41 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" #include -#include "eth_l1_address_map.h" -#include "l1_address_map.h" -#include "host_mem_address_map.h" -#include + #include +#include +#include "eth_l1_address_map.h" +#include "gtest/gtest.h" +#include "host_mem_address_map.h" +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" #include "umd/device/blackhole_implementation.h" #include "umd/device/tt_cluster_descriptor.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != std::end(tt::umd::blackhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != std::end(tt::umd::blackhole::T6_X_LOCATIONS) && - 
std::find(std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != std::end(tt::umd::blackhole::T6_Y_LOCATIONS); + bool is_eth_location = + std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != + std::end(tt::umd::blackhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != + std::end(tt::umd::blackhole::T6_X_LOCATIONS) && + std::find( + std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != + std::end(tt::umd::blackhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -61,7 +72,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -73,8 +85,15 @@ TEST(SiliconDriverBH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + 
num_host_mem_ch_per_mmio_device, + false, + true, + false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -85,81 +104,113 @@ TEST(SiliconDriverBH, CreateDestroy) { // TEST(SiliconDriverWH, Harvesting) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; - + // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// true, +// simulated_harvesting_masks); // auto sdesc_per_chip = device.get_virtual_soc_descriptors(); // ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; -// for(const auto& chip : sdesc_per_chip) { -// ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; +// for (const auto& chip : sdesc_per_chip) { +// ASSERT_EQ(chip.second.workers.size(), 48) +// << "Expected SOC descriptor with 
harvesting to have 48 workers for chip" << chip.first; // } -// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) << "Expected first chip to have harvesting mask of 30"; -// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) << "Expected second chip to have harvesting mask of 60"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) +// << "Expected first chip to have harvesting mask of 30"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) +// << "Expected second chip to have harvesting mask of 60"; // } // TEST(SiliconDriverWH, CustomSocDesc) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; // // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_1x1.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_1x1.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// false, +// simulated_harvesting_masks); // auto sdesc_per_chip 
= device.get_virtual_soc_descriptors(); - -// ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; -// for(const auto& chip : sdesc_per_chip) { + +// ASSERT_EQ(device.using_harvested_soc_descriptors(), false) +// << "SOC descriptors should not be modified when harvesting is disabled"; +// for (const auto& chip : sdesc_per_chip) { // ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; // } // } // TEST(SiliconDriverWH, HarvestingRuntime) { - -// auto get_static_tlb_index_callback = [] (tt_xy_pair target) { -// return get_static_tlb_index(target); -// }; +// auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; - -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, 
+// true, +// simulated_harvesting_masks); // set_params_for_remote_txn(device); // auto mmio_devices = device.get_target_mmio_device_ids(); - -// for(int i = 0; i < target_devices.size(); i++) { + +// for (int i = 0; i < target_devices.size(); i++) { // // Iterate over MMIO devices and only setup static TLBs for worker cores -// if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { +// if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { // auto& sdesc = device.get_virtual_soc_descriptors().at(i); -// for(auto& core : sdesc.workers) { -// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. -// device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); +// for (auto& core : sdesc.workers) { +// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. +// device.configure_tlb( +// i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); // } -// } +// } // } // device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + // tt_device_params default_params; // device.start_device(default_params); // device.deassert_risc_reset(); @@ -169,29 +220,57 @@ TEST(SiliconDriverBH, CreateDestroy) { // std::vector readback_vec = {}; // std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -// for(int i = 0; i < target_devices.size(); i++) { +// for (int i = 0; i < target_devices.size(); i++) { // std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; // std::uint32_t dynamic_write_address = 0x40000000; -// for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses -// for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); -// 
device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); -// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - +// for (int loop = 0; loop < 100; +// loop++) { // Write to each core a 100 times at different statically mapped addresses +// for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); +// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); -// test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; -// ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// test_utils::read_data_from_device( +// device, +// dynamic_readback_vec, +// tt_cxy_pair(i, core), +// dynamic_write_address, +// 40, +// "SMALL_READ_WRITE_TLB"); +// ASSERT_EQ(vector_to_write, readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// ASSERT_EQ(vector_to_write, dynamic_readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; // device.wait_for_non_mmio_flush(); - -// 
device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data -// device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data + +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); // Clear any written data +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); // Clear any written data // device.wait_for_non_mmio_flush(); // readback_vec = {}; // dynamic_readback_vec = {}; // } -// address += 0x20; // Increment by uint32_t size for each write +// address += 0x20; // Increment by uint32_t size for each write // dynamic_write_address += 0x20; // } // } @@ -199,45 +278,44 @@ TEST(SiliconDriverBH, CreateDestroy) { // } TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) 
!= mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < target_devices.size(); i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < target_devices.size(); i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -251,37 +329,35 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverBH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return 
get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + printf("MT: Static TLBs set\n"); tt_device_params default_params; @@ -292,31 +368,44 @@ TEST(SiliconDriverBH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 1; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + for (int loop = 0; loop < 1; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * 
sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverBH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -329,42 +418,68 @@ TEST(SiliconDriverBH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over 
ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } printf("Target Tensix cores completed\n"); - + // Target DRAM channel 0 constexpr int NUM_CHANNELS = 8; std::vector dram_vector_to_write = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19}; std::uint32_t address = 0x400; - for(int i = 0; i < target_devices.size(); i++) { - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for (int ch=0; ch chan = device.get_virtual_soc_descriptors().at(i).dram_cores.at(ch); tt_xy_pair subchan = 
chan.at(0); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" << subchan.y << "does not match what was written"; + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" + << subchan.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } } @@ -380,8 +495,8 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - + Cluster device = 
Cluster(num_host_mem_ch_per_mmio_device, false, true, true); + set_params_for_remote_txn(device); tt_device_params default_params; @@ -392,11 +507,18 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -407,12 +529,19 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - 
test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -427,25 +556,23 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { TEST(SiliconDriverBH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -454,24 +581,41 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - 
test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -481,38 +625,43 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -521,27 +670,43 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + 
readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } -TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole .. wait_for_non_mmio_flush() is not working as expected? +TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on + // Blackhole .. wait_for_non_mmio_flush() is not working as expected? // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -555,62 +720,95 @@ TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/e std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? 
// Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = 
device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } -TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. +TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -619,38 +817,69 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; 
i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + 
device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp index 8ff436ba..b4136807 100644 --- a/tests/emulation/test_emulation_device.cpp +++ b/tests/emulation/test_emulation_device.cpp @@ -3,10 +3,10 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" -#include "device/tt_soc_descriptor.h" #include "device/cluster.h" #include 
"device/tt_emulation_device.h" +#include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" // DEPRECATED TEST SUITE !!! @@ -22,7 +22,7 @@ TEST(EmulationDeviceGS, BasicEmuTest) { uint64_t l1_addr = 0x1000; std::vector wdata(size); std::vector rdata(size); - + try { device.start_device(default_params); @@ -31,13 +31,23 @@ TEST(EmulationDeviceGS, BasicEmuTest) { } device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), l1_addr, "l1"); test_utils::read_data_from_device(device, rdata, tt_cxy_pair(0, core), l1_addr, size, "l1"); - ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; device.deassert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); device.assert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); } catch (const std::exception &e) { std::cout << "Error: " << e.what() << std::endl; diff --git a/tests/galaxy/test_galaxy_common.cpp b/tests/galaxy/test_galaxy_common.cpp index 546c4c7f..4cff57f1 100644 --- a/tests/galaxy/test_galaxy_common.cpp +++ b/tests/galaxy/test_galaxy_common.cpp @@ -10,9 +10,18 @@ void move_data( Cluster& device, tt_multichip_core_addr sender_core, tt_multichip_core_addr receiver_core, uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, 
tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); device.write_to_device( - readback_vec.data(), readback_vec.size() * sizeof(std::uint32_t), tt_cxy_pair(receiver_core.chip, receiver_core.core), receiver_core.addr, "SMALL_READ_WRITE_TLB"); + readback_vec.data(), + readback_vec.size() * sizeof(std::uint32_t), + tt_cxy_pair(receiver_core.chip, receiver_core.core), + receiver_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited return; @@ -25,7 +34,12 @@ void broadcast_data( uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); for (const auto& receiver_core : receiver_cores) { device.write_to_device( readback_vec.data(), diff --git a/tests/galaxy/test_galaxy_common.h b/tests/galaxy/test_galaxy_common.h index 1198d0a4..01ecc704 100644 --- a/tests/galaxy/test_galaxy_common.h +++ b/tests/galaxy/test_galaxy_common.h @@ -4,37 +4,32 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once #include +#include #include #include #include -#include +#include "fmt/core.h" #include "umd/device/cluster.h" #include "umd/device/tt_xy_pair.h" -#include "fmt/core.h" - // static const std::string SOC_DESC_PATH = "./tests/soc_descs/wormhole_b0_8x10.yaml"; using namespace tt::umd; -using chip_id_t = int; -using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf struct tt_multichip_core_addr { tt_multichip_core_addr() : core{}, chip{}, addr{} {} + tt_multichip_core_addr(chip_id_t chip, tt_xy_pair core, std::uint64_t addr) 
: core(core), chip(chip), addr(addr) {} tt_xy_pair core; chip_id_t chip; std::uint64_t addr; - std::string str() const { - return fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); - } + + std::string str() const { return fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); } }; // SIMPLE DATAMOVEMENT API BASED ON UMD diff --git a/tests/galaxy/test_umd_concurrent_threads.cpp b/tests/galaxy/test_umd_concurrent_threads.cpp index 2c4f6d42..e668160f 100644 --- a/tests/galaxy/test_umd_concurrent_threads.cpp +++ b/tests/galaxy/test_umd_concurrent_threads.cpp @@ -2,22 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include -#include -#include "gtest/gtest.h" #include "common/logger.hpp" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -52,7 +51,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -70,7 +74,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t 
address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th1) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th1.data(), vector_to_write_th1.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th1.data(), + vector_to_write_th1.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -91,7 +100,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th2.data(), vector_to_write_th2.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th2.data(), + vector_to_write_th2.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -140,7 +154,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +181,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { std::uint32_t address = 0x4000000; for (const auto& chip : target_devices_th1) { for (auto& core : dram_cores) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, 
"SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -182,7 +206,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { std::uint32_t address = 0x5000000; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -217,7 +246,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -239,7 +273,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { chip_id_t mmio_chip = cluster_desc->get_chips_with_mmio().begin()->first; std::vector readback_vec = {}; std::uint32_t address = 0x0; - device.write_to_device(large_vector.data(), large_vector.size() * sizeof(std::uint32_t), tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + large_vector.data(), + large_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), + address, + "SMALL_READ_WRITE_TLB"); test_utils::read_data_from_device( device, readback_vec, @@ -257,14 +296,24 @@ 
TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(small_vector.data(), small_vector.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + small_vector.data(), + small_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(chip, core), address, small_vector.size() * 4, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(chip, core), + address, + small_vector.size() * 4, + "SMALL_READ_WRITE_TLB"); EXPECT_EQ(small_vector, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; diff --git a/tests/galaxy/test_umd_remote_api.cpp b/tests/galaxy/test_umd_remote_api.cpp index 535607b5..366ea05d 100644 --- a/tests/galaxy/test_umd_remote_api.cpp +++ b/tests/galaxy/test_umd_remote_api.cpp @@ -2,21 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include -#include "gtest/gtest.h" #include "common/logger.hpp" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" 
static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -32,7 +31,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -64,7 +68,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { for (const auto& core : target_cores) { tt_cxy_pair target_core = tt_cxy_pair(chip, core); auto start = std::chrono::high_resolution_clock::now(); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), target_core, address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + target_core, + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited auto end = std::chrono::high_resolution_clock::now(); auto duration = double(std::chrono::duration_cast(end - start).count()); @@ -72,7 +81,8 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { // std::cout << " chip " << chip << " core " << target_core.str() << " " << duration << std::endl; start = std::chrono::high_resolution_clock::now(); - test_utils::read_data_from_device(device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); end = std::chrono::high_resolution_clock::now(); duration = double(std::chrono::duration_cast(end - start).count()); // std::cout << 
" read chip " << chip << " core " << target_core.str()<< " " << duration << std::endl; @@ -145,7 +155,12 @@ void run_data_mover_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +177,11 @@ void run_data_mover_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core @@ -261,7 +280,12 @@ void run_data_broadcast_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -278,7 +302,11 @@ void run_data_broadcast_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + 
sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index ae2f8094..86416e4d 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -7,173 +7,167 @@ #include #include -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "umd/device/tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/galaxy/test_galaxy_common.h" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" #include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" namespace tt::umd::test::utils { - class WormholeGalaxyStabilityTestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static constexpr int EXPECTED_MIN_CHIPS = 32; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips < EXPECTED_MIN_CHIPS) { - skip_tests = true; - } - if(char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests = std::atoi(scale_number_of_tests_env); +private: + static int detected_num_chips; + static bool skip_tests; + 
+protected: + static constexpr int EXPECTED_MIN_CHIPS = 32; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips < EXPECTED_MIN_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - } - virtual int get_detected_num_chips() { - return detected_num_chips; - } - - virtual bool is_test_skipped() { - return skip_tests; - } + virtual int get_detected_num_chips() { return detected_num_chips; } + virtual bool is_test_skipped() { return skip_tests; } }; - int WormholeGalaxyStabilityTestFixture::detected_num_chips = -1; bool WormholeGalaxyStabilityTestFixture::skip_tests = false; uint32_t WormholeGalaxyStabilityTestFixture::scale_number_of_tests = 1; - TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { int seed = 0; - + assert(device != nullptr); - log_info(LogSiliconDriver,"Started MixedRemoteTransfers"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfers"); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *this->device, + *this->device, 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .read = 0.4}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& 
unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // 
Set to true if you want to emit the command history code to command line + false, + nullptr); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // 
UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .read = 0.1}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + nullptr); }); t1.join(); @@ -182,4 +176,4 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran t4.join(); } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/grayskull/test_silicon_driver.cpp 
b/tests/grayskull/test_silicon_driver.cpp index b2e34c70..04af85bb 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -5,13 +5,13 @@ #include #include "gtest/gtest.h" +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" #include "umd/device/cluster.h" -#include "umd/device/tt_soc_descriptor.h" #include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" #include "umd/device/wormhole_implementation.h" -#include "l1_address_map.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; @@ -19,8 +19,8 @@ TEST(SiliconDriverGS, CreateDestroySequential) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; - for(int i = 0; i < 100; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + for (int i = 0; i < 100; i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); device.start_device(default_params); device.deassert_risc_reset(); device.close_device(); @@ -33,13 +33,13 @@ TEST(SiliconDriverGS, CreateMultipleInstance) { tt_device_params default_params; default_params.init_device = false; std::unordered_map concurrent_devices = {}; - for(int i = 0; i < 100; i++) { - concurrent_devices.insert({i, new Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true)}); - concurrent_devices.at(i) -> start_device(default_params); + for (int i = 0; i < 100; i++) { + concurrent_devices.insert({i, new Cluster(num_host_mem_ch_per_mmio_device, false, true)}); + 
concurrent_devices.at(i)->start_device(default_params); } - for(auto& device : concurrent_devices) { - device.second -> close_device(); + for (auto& device : concurrent_devices) { + device.second->close_device(); delete device.second; } } @@ -48,15 +48,19 @@ TEST(SiliconDriverGS, Harvesting) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_LE(chip.second.workers.size(), 96) << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_LE(chip.second.workers.size(), 96) + << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " + << chip.first; } - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) << "Expected first chip to include simulated harvesting mask of 6"; - // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have harvesting mask of 12"; + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) + << "Expected first chip to include simulated harvesting mask of 6"; + // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have + // harvesting mask of 12"; 
device.close_device(); } @@ -65,16 +69,25 @@ TEST(SiliconDriverGS, CustomSocDesc) { std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } TEST(SiliconDriverGS, HarvestingRuntime) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -85,13 +98,13 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, 
true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -108,29 +121,59 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; std::uint32_t dynamic_write_address = 0x30000000; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + 
vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, + dynamic_readback_vec, + tt_cxy_pair(i, core), + dynamic_write_address, + 40, + "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + 
zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; dynamic_readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write dynamic_write_address += 0x20; } } @@ -138,7 +181,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) { } TEST(SiliconDriverGS, StaticTLB_RW) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -146,19 +189,20 @@ TEST(SiliconDriverGS, StaticTLB_RW) { return flat_index; }; std::set target_devices = {0}; - + uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - for(int i = 0; i < target_devices.size(); i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. - device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } device.setup_core_to_tlb_map(i, get_static_tlb_index); } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -168,36 +212,52 @@ TEST(SiliconDriverGS, StaticTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); } - ASSERT_EQ(vector_to_write, readback_vec) << 
"Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverGS, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - device.set_fallback_tlb_ordering_mode("SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + device.set_fallback_tlb_ordering_mode( + "SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -207,25 +267,40 @@ TEST(SiliconDriverGS, DynamicTLB_RW) { std::vector readback_vec = {}; float timeout_in_seconds = 10; - for(int i = 0; i < target_devices.size(); i++) { 
+ for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + 
device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -238,8 +313,8 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -249,18 +324,27 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + 
std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -272,19 +356,28 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; + } + test_utils::read_data_from_device( + 
device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -297,14 +390,14 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { device.close_device(); } -TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run - // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. - // We want to make sure the memory barrier is thread/process safe. +TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run + // Have 2 threads read and write from a single device concurrently + // All (fairly large) transactions go through a static TLB. + // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -316,13 +409,13 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - - for(int i = 0; i < target_devices.size(); i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -332,22 +425,28 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run device.start_device(default_params); device.deassert_risc_reset(); std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, 
readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } // Launch 2 thread accessing different locations of L1 and using memory barrier between write and read @@ -356,23 +455,26 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::vector vec2(25600); std::vector zeros(25600, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); readback_vec = {}; } } @@ -380,14 +482,17 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -396,11 +501,71 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct sate workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct sate workers 
readback_membar_vec = {}; } device.close_device(); } + +/** + * Copied from Wormhole unit tests. + */ +TEST(SiliconDriverGS, SysmemTestWithPcie) { + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", // test_utils::GetClusterDescYAML(), + {0}, + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + + // PCIe core is at (x=0, y=4) on Grayskull NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 4); + + // Bad API: how big is the buffer? How do we know it's big enough? + // Situation today is that there's a 1G hugepage behind it, although this is + // unclear from the API and may change in the future. + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + ASSERT_NE(sysmem, nullptr); + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + + // Buffer that we will use to read sysmem into, then write sysmem from. + std::vector buffer(test_size_bytes, 0x0); + + // Step 1: Fill sysmem with random bytes. + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); + + // Step 2: Read sysmem into buffer. + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + + // Step 3: Verify that buffer matches sysmem. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); + + // Step 4: Fill buffer with random bytes. + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); + + // Step 5: Write buffer into sysmem, overwriting what was there. 
+ cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + + // Step 5b: Read back sysmem into a throwaway buffer. The intent is to + // ensure the write has completed before we check sysmem against buffer. + std::vector throwaway(test_size_bytes, 0x0); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + + // Step 6: Verify that sysmem matches buffer. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); +} diff --git a/tests/microbenchmark/device_fixture.hpp b/tests/microbenchmark/device_fixture.hpp index 3e20679a..b4b744b8 100644 --- a/tests/microbenchmark/device_fixture.hpp +++ b/tests/microbenchmark/device_fixture.hpp @@ -2,24 +2,27 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include +#include + #include +#include +#include #include -#include #include "cluster.h" -#include "l1_address_map.h" #include "device/tt_soc_descriptor.h" +#include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" +using tt::umd::Cluster; + class uBenchmarkFixture : public ::testing::Test { - protected: +protected: void SetUp() override { // get arch name? results_csv.open("ubench_results.csv", std::ios_base::app); - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * 10 + target.x; // grid_size_x = 10 for GS/WH ????? 
something is wrong here if (flat_index == 0) { return -1; @@ -28,13 +31,19 @@ class uBenchmarkFixture : public ::testing::Test { }; std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_shared(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), "", target_devices, num_host_mem_ch_per_mmio_device, false, true); + device = std::make_shared( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device->get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device->configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } } diff --git a/tests/microbenchmark/test_rw_tensix.cpp b/tests/microbenchmark/test_rw_tensix.cpp index 274e17a7..9d1973b2 100644 --- a/tests/microbenchmark/test_rw_tensix.cpp +++ b/tests/microbenchmark/test_rw_tensix.cpp @@ -6,11 +6,11 @@ #include -#include "nanobench.h" #include "device_fixture.hpp" +#include "nanobench.h" #include "tests/test_utils/device_test_utils.hpp" -std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { +std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min = 0) { ankerl::nanobench::Rng gen(80085); std::uniform_int_distribution<> dis(min, max); // between 0 and 1MB return dis(gen); @@ -19,81 +19,119 @@ std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { TEST_F(uBenchmarkFixture, WriteAllCores32Bytes) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ")"; // Write 32 bytes through static tlbs - bench_static.title("Write 32 bytes").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench_static.title("Write 32 bytes") + .unit("writes") + .minEpochIterations(50) + 
.output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); // Write through "fallback/dynamic" tlb - bench_dynamic.title("Write 32 bytes fallback").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), bad_address, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Write 32 bytes fallback") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + bad_address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, ReadAllCores32Bytes){ +TEST_F(uBenchmarkFixture, ReadAllCores32Bytes) { std::vector readback_vec = {}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream rname; // Read through static tlbs rname << "Read from device core (" << core.x << ", " << core.y << ")"; bench_static.title("Read 32 bytes").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), 
address, 0x20, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); }); // Read through "fallback/dynamic" tlb - bench_dynamic.title("Read 32 bytes fallback").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Read 32 bytes fallback") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Write32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Write32BytesRandomAddr) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Write 32 bytes random address").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench.title("Write 32 bytes random address") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + 
vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Read32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Read32BytesRandomAddr) { std::vector readback_vec = {}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream rname; rname << "Read from device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Read 32 bytes random address").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + bench.title("Read 32 bytes random address") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); diff --git a/tests/pcie/test_pcie_device.cpp b/tests/pcie/test_pcie_device.cpp index b12d835e..02de5fe1 100644 --- a/tests/pcie/test_pcie_device.cpp +++ b/tests/pcie/test_pcie_device.cpp @@ -5,16 +5,15 @@ */ #include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "umd/device/pci_device.hpp" - TEST(PcieDeviceTest, Numa) { std::vector nodes; diff --git a/tests/simulation/device_fixture.hpp b/tests/simulation/device_fixture.hpp index 115d3ac1..4d76c308 100644 --- a/tests/simulation/device_fixture.hpp +++ b/tests/simulation/device_fixture.hpp @@ -5,15 +5,14 
@@ #pragma once #include - -#include "umd/device/tt_simulation_device.h" -#include "common/logger.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" - #include +#include #include #include -#include + +#include "common/logger.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/tt_simulation_device.h" class SimulationDeviceFixture : public ::testing::Test { protected: @@ -24,9 +23,7 @@ class SimulationDeviceFixture : public ::testing::Test { device->start_device(default_params); } - static void TearDownTestSuite() { - device->close_device(); - } + static void TearDownTestSuite() { device->close_device(); } static std::unique_ptr device; }; diff --git a/tests/simulation/test_simulation_device.cpp b/tests/simulation/test_simulation_device.cpp index 1ac6146a..3b3015e0 100644 --- a/tests/simulation/test_simulation_device.cpp +++ b/tests/simulation/test_simulation_device.cpp @@ -3,86 +3,79 @@ // SPDX-License-Identifier: Apache-2.0 #include + #include "device_fixture.hpp" #include "tests/test_utils/device_test_utils.hpp" -std::vector generate_data(uint32_t size_in_bytes){ - size_t size = size_in_bytes/sizeof(uint32_t); +std::vector generate_data(uint32_t size_in_bytes) { + size_t size = size_in_bytes / sizeof(uint32_t); std::vector data(size); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis(0, 100); - for(uint32_t i = 0; i < size; i++){ + for (uint32_t i = 0; i < size; i++) { data[i] = dis(gen); } return data; } -class LoopbackAllCoresParam : public SimulationDeviceFixture , - public ::testing::WithParamInterface {}; +class LoopbackAllCoresParam : public SimulationDeviceFixture, public ::testing::WithParamInterface {}; INSTANTIATE_TEST_SUITE_P( - LoopbackAllCores, - LoopbackAllCoresParam, - ::testing::Values( - tt_xy_pair{0, 1}, - tt_xy_pair{1, 1}, - tt_xy_pair{1, 0} - ) -); - -TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix){ - std::vector wdata = {1,2,3,4,5}; + LoopbackAllCores, 
LoopbackAllCoresParam, ::testing::Values(tt_xy_pair{0, 1}, tt_xy_pair{1, 1}, tt_xy_pair{1, 0})); + +TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix) { + std::vector wdata = {1, 2, 3, 4, 5}; std::vector rdata(wdata.size(), 0); tt_cxy_pair core = {0, GetParam()}; - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), core, 0x100, ""); - device->read_from_device(rdata.data(), core, 0x100, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), core, 0x100, ""); + device->read_from_device(rdata.data(), core, 0x100, rdata.size() * sizeof(uint32_t), ""); + ASSERT_EQ(wdata, rdata); } -bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift){ +bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift) { uint64_t addr = 0x0; std::vector wdata = generate_data(1 << byte_shift); std::vector rdata(wdata.size(), 0); - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), tt_cxy_pair{0, core}, addr, ""); - device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), tt_cxy_pair{0, core}, addr, ""); + device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size() * sizeof(uint32_t), ""); + return wdata == rdata; } -TEST_P(LoopbackAllCoresParam, LoopbackStressSize){ +TEST_P(LoopbackAllCoresParam, LoopbackStressSize) { tt_xy_pair core = GetParam(); tt_xy_pair dram = {1, 0}; if (core == dram) { - for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB + for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } else { - for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB + for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } } -TEST_F(SimulationDeviceFixture, LoopbackTwoTensix){ - std::vector wdata1 = 
{1,2,3,4,5}; - std::vector wdata2 = {6,7,8,9,10}; +TEST_F(SimulationDeviceFixture, LoopbackTwoTensix) { + std::vector wdata1 = {1, 2, 3, 4, 5}; + std::vector wdata2 = {6, 7, 8, 9, 10}; std::vector rdata1(wdata1.size()); std::vector rdata2(wdata2.size()); tt_cxy_pair core1 = {0, 0, 1}; tt_cxy_pair core2 = {0, 1, 1}; - device->write_to_device(wdata1.data(), wdata1.size()*sizeof(uint32_t), core1, 0x100, ""); - device->write_to_device(wdata2.data(), wdata2.size()*sizeof(uint32_t), core2, 0x100, ""); + device->write_to_device(wdata1.data(), wdata1.size() * sizeof(uint32_t), core1, 0x100, ""); + device->write_to_device(wdata2.data(), wdata2.size() * sizeof(uint32_t), core2, 0x100, ""); + + device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size() * sizeof(uint32_t), ""); + device->read_from_device(rdata2.data(), core2, 0x100, rdata2.size() * sizeof(uint32_t), ""); - device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size()*sizeof(uint32_t), ""); - device->read_from_device(rdata2.data(), core2, 0x100, rdata2.size()*sizeof(uint32_t), ""); - ASSERT_EQ(wdata1, rdata1); ASSERT_EQ(wdata2, rdata2); } diff --git a/tests/test_utils/device_test_utils.hpp b/tests/test_utils/device_test_utils.hpp index 136c6c5e..842e4ce5 100644 --- a/tests/test_utils/device_test_utils.hpp +++ b/tests/test_utils/device_test_utils.hpp @@ -6,15 +6,16 @@ #pragma once #include -#include +#include #include +#include #include "umd/device/cluster.h" namespace test_utils { template -static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +static void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -22,9 +23,27 @@ static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_i data_buf.resize(target_size); } -static void read_data_from_device(tt_device& device, std::vector &vec, tt_cxy_pair core, uint64_t 
addr, uint32_t size, const std::string& tlb_to_use) { +static void read_data_from_device( + tt_device& device, + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + uint32_t size, + const std::string& tlb_to_use) { size_buffer_to_capacity(vec, size); device.read_from_device(vec.data(), core, addr, size, tlb_to_use); } +inline void fill_with_random_bytes(uint8_t* data, size_t n) { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + uint64_t* data64 = reinterpret_cast(data); + std::generate_n(data64, n / 8, [&]() { return gen(); }); + + // Handle remaining bytes + for (size_t i = (n / 8) * 8; i < n; ++i) { + data[i] = static_cast(gen()); + } } + +} // namespace test_utils diff --git a/tests/test_utils/generate_cluster_desc.hpp b/tests/test_utils/generate_cluster_desc.hpp index 145f011a..539dd39f 100644 --- a/tests/test_utils/generate_cluster_desc.hpp +++ b/tests/test_utils/generate_cluster_desc.hpp @@ -7,24 +7,26 @@ #pragma once #include -#include #include +#include #include "fmt/core.h" namespace test_utils { -inline std::string GetAbsPath(std::string path_){ - // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the compiler. +inline std::string GetAbsPath(std::string path_) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. 
std::filesystem::path current_file_path = std::filesystem::path(__FILE__); std::filesystem::path umd_root; if (current_file_path.is_absolute()) { umd_root = current_file_path.parent_path().parent_path().parent_path(); } else { - std::filesystem::path umd_root_relative = std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); umd_root = std::filesystem::canonical(umd_root_relative); } std::filesystem::path abs_path = umd_root / path_; return abs_path.string(); } -} // namespace test_utils +} // namespace test_utils diff --git a/tests/test_utils/soc_desc_test_utils.hpp b/tests/test_utils/soc_desc_test_utils.hpp index 30fb90d2..884a3504 100644 --- a/tests/test_utils/soc_desc_test_utils.hpp +++ b/tests/test_utils/soc_desc_test_utils.hpp @@ -15,4 +15,4 @@ static std::size_t get_num_harvested(std::size_t harvesting_mask) { return __builtin_popcount(harvesting_mask); } -} +} // namespace test_utils diff --git a/tests/test_utils/stimulus_generators.hpp b/tests/test_utils/stimulus_generators.hpp index 3773d7de..025284bc 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -4,18 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - - +#include #include #include #include #include -#include #include -#include +#include + +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_xy_pair.h" /* Sizes: * Distribution (including min/max) @@ -40,7 +39,6 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; - enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < @@ -50,7 +48,7 @@ 
template < class DISTRIBUTION_T, typename GENERATOR_T = std::mt19937> class ConstrainedTemplateTemplateGenerator { - public: +public: ConstrainedTemplateTemplateGenerator( int seed, DISTRIBUTION_T const& distribution, @@ -62,24 +60,17 @@ class ConstrainedTemplateTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -template < - typename SAMPLE_T, - typename UNCONSTRAINED_SAMPLE_T, - class DISTRIBUTION_T, - typename GENERATOR_T = std::mt19937> +template class ConstrainedTemplateGenerator { - public: +public: ConstrainedTemplateGenerator( - int seed, - DISTRIBUTION_T const& distribution, - std::function constrain) : + int seed, DISTRIBUTION_T const& distribution, std::function constrain) : generator(seed), distribution(distribution), constrain(constrain) {} SAMPLE_T generate() { @@ -87,14 +78,14 @@ class ConstrainedTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -using DefaultTransferTypeGenerator = ConstrainedTemplateTemplateGenerator; +using DefaultTransferTypeGenerator = + ConstrainedTemplateTemplateGenerator; using address_t = uint32_t; using destination_t = tt_cxy_pair; @@ -107,6 +98,7 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; + struct read_transfer_sample_t { destination_t destination; address_t address; @@ -115,7 +107,8 @@ struct read_transfer_sample_t { // (payload.data(), destination, address, size, tlb_to_use); }; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = + std::tuple>; template < template @@ -130,7 +123,8 @@ template < struct WriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = 
ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; WriteCommandGenerator( destination_generator_t const& destination_generator, @@ -159,7 +153,8 @@ template < struct WriteEpochCmdCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using last_cmd_generator_t = ConstrainedTemplateGenerator; using ordered_generator_t = ConstrainedTemplateGenerator; @@ -196,8 +191,10 @@ template < typename GENERATOR_T = std::mt19937> struct RolledWriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; - using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using address_generator_t = + ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using unroll_count_generator_t = ConstrainedTemplateTemplateGenerator; RolledWriteCommandGenerator( @@ -229,7 +226,8 @@ template < struct ReadCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; ReadCommandGenerator( destination_generator_t const& destination_generator, @@ -239,8 +237,6 @@ struct ReadCommandGenerator { address_generator(address_generator), size_generator(size_generator) {} - - destination_generator_t destination_generator; address_generator_t address_generator; size_generator_t size_generator; @@ -265,12 +261,14 @@ template < typename GENERATOR_T = std::mt19937> class TestGenerator { - using transfer_type_generator_t = DefaultTransferTypeGenerator; // 
ConstrainedTemplateTemplateGenerator; - using write_command_generator_t = WriteCommandGenerator; - using read_command_generator_t = ReadCommandGenerator; - - public: + // ConstrainedTemplateTemplateGenerator; + using transfer_type_generator_t = DefaultTransferTypeGenerator; + using write_command_generator_t = + WriteCommandGenerator; + using read_command_generator_t = + ReadCommandGenerator; + +public: TestGenerator( int seed, transfer_type_generator_t const& transfer_type_distribution, @@ -279,13 +277,10 @@ class TestGenerator { generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - read_command_generator(read_command_generator) - { - } + read_command_generator(read_command_generator) {} // Generate a sample (transfer type, size, destination, address) based on custom distributions remote_transfer_sample_t generate_sample() { - // Randomly select a transfer type RemoteTransferType transfer_type = transfer_type_distribution.generate(); assert(transfer_type < 4 && transfer_type >= 0); @@ -294,22 +289,26 @@ class TestGenerator { destination_t const& destination = write_command_generator.destination_generator.generate(); address_t const& address = write_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = write_command_generator.size_generator.generate(); - return {transfer_type, write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_WRITE_TLB"}}; + return { + transfer_type, + write_transfer_sample_t{ + .destination = destination, + .address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = 
read_command_generator.size_generator.generate(); - return {transfer_type, read_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_READ_TLB"}}; + return { + transfer_type, + read_transfer_sample_t{ + .destination = destination, + .address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_READ_TLB"}}; } break; default: @@ -317,7 +316,7 @@ class TestGenerator { }; } - private: +private: std::mt19937 generator; transfer_type_generator_t transfer_type_distribution; @@ -331,15 +330,32 @@ struct transfer_type_weights_t { double read; }; - -static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; -static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -template +static auto address_aligner = [](address_t addr) -> address_t { + addr = (((addr - 1) / 32) + 1) * 32; + assert(addr % 32 == 0); + return addr; +}; +static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 4) + 1) * 4; + assert(size > 0); + assert(size % 4 == 0); + return size; +}; +static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +template static auto passthrough_constrainer = [](T const& t) -> T { return t; }; -static 
inline std::vector generate_core_index_locations(tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { +static inline std::vector generate_core_index_locations( + tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { std::vector core_index_to_location = {}; for (chip_id_t chip : cluster_desc.get_all_chips()) { @@ -360,16 +376,19 @@ static void print_command(remote_transfer_sample_t const& command) { case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; - default: throw std::runtime_error("Invalid transfer type"); + default: + throw std::runtime_error("Invalid transfer type"); }; } @@ -379,12 +398,9 @@ int bytes_to_words(int num_bytes) { } static inline void dispatch_remote_transfer_command( - Cluster &driver, - remote_transfer_sample_t const& command, - std::vector &payload) { - + Cluster& driver, 
remote_transfer_sample_t const& command, std::vector& payload) { RemoteTransferType transfer_type = std::get<0>(command); - auto resize_payload = [](std::vector &payload, int size_in_bytes) { + auto resize_payload = [](std::vector& payload, int size_in_bytes) { payload.resize(bytes_to_words(size_in_bytes)); }; @@ -392,28 +408,37 @@ static inline void dispatch_remote_transfer_command( case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.write_to_device( + payload.data(), + bytes_to_words(command_args.size_in_bytes), + command_args.destination, + command_args.address, + command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.read_from_device( + payload.data(), + command_args.destination, + command_args.address, + command_args.size_in_bytes, + command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); }; } - static void print_command_executable_code(remote_transfer_sample_t const& command) { - auto emit_payload_resize_string = [](int size_bytes, int size_word) { std::cout << "payload.resize(((" << size_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; }; auto emit_bytes_to_words_len_string = [](std::string const& var_name, int 
size_in_bytes, int size_word) { - std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; + std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" + << std::endl; }; std::cout << "{" << std::endl; @@ -421,19 +446,25 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" + << command_args.tlb_to_use << "\");" << std::endl; + // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, + // command_args.tlb_to_use, false, false); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair 
const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); + std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " + << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; + // driver.read_from_device(payload.data(), command_args.destination, command_args.address, + // command_args.size, command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); @@ -450,32 +481,36 @@ static void print_command_history_executable_code(std::vector class WRITE_DEST_DISTR_T, - template class WRITE_ADDR_DISTR_T, +template < + template + class WRITE_DEST_DISTR_T, + template + class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, - template class WRITE_SIZE_DISTR_T, + template + class WRITE_SIZE_DISTR_T, - template class READ_DEST_DISTR_T, - template class READ_ADDR_DISTR_T, - class READ_SIZE_DISTR_OUT_T, - template class READ_SIZE_DISTR_T -> + template + class READ_DEST_DISTR_T, + template + class READ_ADDR_DISTR_T, + class READ_SIZE_DISTR_OUT_T, + template + class READ_SIZE_DISTR_T> void RunMixedTransfers( - Cluster& device, + Cluster& device, int num_samples, int seed, transfer_type_weights_t const& transfer_type_weights, - WriteCommandGenerator const& 
write_command_generator, - ReadCommandGenerator const& read_command_generator, - + WriteCommandGenerator const& + write_command_generator, + ReadCommandGenerator const& + read_command_generator, + bool record_command_history = false, - std::vector *command_history = nullptr -) { + std::vector* command_history = nullptr) { SCOPED_TRACE("RunMixedTransfers"); auto test_generator = TestGenerator( seed, @@ -490,7 +525,7 @@ void RunMixedTransfers( if (record_command_history) { assert(command_history != nullptr); - assert(command_history->size() == 0); // only support passing in empty command histories + assert(command_history->size() == 0); // only support passing in empty command histories command_history->reserve(num_samples); } std::vector payload = {}; @@ -513,16 +548,17 @@ void RunMixedTransfers( } } - -static ConstrainedTemplateTemplateGenerator get_default_address_generator(int seed, address_t start, address_t end) { +static ConstrainedTemplateTemplateGenerator +get_default_address_generator(int seed, address_t start, address_t end) { auto const& address_distribution = std::uniform_int_distribution(start, end); - return ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); + return ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); } - -static ConstrainedTemplateTemplateGenerator get_default_full_dram_dest_generator(int seed, Cluster *device) { +static ConstrainedTemplateTemplateGenerator +get_default_full_dram_dest_generator(int seed, Cluster* device) { assert(device != nullptr); - tt_ClusterDescriptor *cluster_desc = device->get_cluster_description(); + tt_ClusterDescriptor* cluster_desc = device->get_cluster_description(); tt_SocDescriptor const& soc_desc = device->get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -536,19 +572,23 @@ static WriteCommandGenerator< std::uniform_int_distribution, 
std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_write_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> +build_dummy_write_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0 , std::uniform_int_distribution(0,0), address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return WriteCommandGenerator(dest_generator, addr_generator, write_size_generator); } @@ -557,24 +597,25 @@ static ReadCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_read_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> 
+build_dummy_read_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return ReadCommandGenerator(dest_generator, addr_generator, read_size_generator); - } -template< +template < template class ADDR_GENERATOR_T, typename ADDR_DISTR_T, @@ -583,10 +624,9 @@ template< template class READ_SIZE_GENERATOR_T, template - class UNROLL_COUNT_GENERATOR_T -> + class UNROLL_COUNT_GENERATOR_T> void RunMixedTransfersUniformDistributions( - Cluster& device, + Cluster& device, int num_samples, int seed, @@ -597,11 +637,10 @@ void RunMixedTransfersUniformDistributions( float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, - + bool record_command_history = false, - std::vector *command_history = nullptr -) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::vector* command_history = nullptr) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = 
device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -609,21 +648,30 @@ void RunMixedTransfersUniformDistributions( seed, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [&core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_size_distribution, transfer_size_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, read_size_distribution, transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, write_size_distribution, transfer_size_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, read_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); + seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { + return last_epoch_cmd; + }); auto ordered_generator = ConstrainedTemplateGenerator( - seed + 3, std::bernoulli_distribution(percent_not_remote_ordered), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); + seed + 3, + std::bernoulli_distribution(percent_not_remote_ordered), + 
[](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( seed + 4, unroll_count_distribution, [](int unroll_count) -> int { return unroll_count; }); RunMixedTransfers( - device, + device, num_samples, seed, @@ -631,12 +679,9 @@ void RunMixedTransfersUniformDistributions( WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), - - record_command_history, - command_history - ); + record_command_history, + command_history); } - } // namespace tt::umd::test::utils diff --git a/tests/unit_test_main.cpp b/tests/unit_test_main.cpp index ff89a889..c48ceb23 100644 --- a/tests/unit_test_main.cpp +++ b/tests/unit_test_main.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" - #include "gtest_initializer.hpp" int main(int argc, char **argv) { - initialize_gtest(argc, argv); - return RUN_ALL_TESTS(); + initialize_gtest(argc, argv); + return RUN_ALL_TESTS(); } diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 791586c9..c85f84c5 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -1,41 +1,40 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include #include -#include +#include -#include "gtest/gtest.h" -#include "umd/device/cluster.h" #include "eth_l1_address_map.h" -#include "l1_address_map.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" - +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/cluster.h" #include "umd/device/tt_cluster_descriptor.h" #include "umd/device/wormhole_implementation.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; -inline void fill_with_random_bytes(uint8_t* data, size_t n) -{ - static std::random_device rd; - static std::mt19937 gen(rd()); - static std::uniform_int_distribution dis(0, 255); - - std::generate(data, data + n, [&]() { return dis(gen); }); -} - void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != std::cend(tt::umd::wormhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && - std::find(std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); + bool 
is_eth_location = + std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != + std::cend(tt::umd::wormhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != + std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && + std::find( + std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != + std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -74,7 +73,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -86,8 +86,15 @@ TEST(SiliconDriverWH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -101,16 +108,18 @@ TEST(SiliconDriverWH, 
Harvesting) { std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_EQ(chip.second.workers.size(), 48) + << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; } - for(int i = 0; i < num_devices; i++){ - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); + for (int i = 0; i < num_devices; i++) { + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) + << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); } } @@ -120,11 +129,20 @@ TEST(SiliconDriverWH, CustomSocDesc) { uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + 
test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } @@ -143,22 +161,22 @@ TEST(SiliconDriverWH, HarvestingRuntime) { uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - + for(int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } - } + } } device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -177,13 +195,13 @@ TEST(SiliconDriverWH, HarvestingRuntime) { device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - + test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - + device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data device.wait_for_non_mmio_flush(); @@ -199,46 +217,44 @@ TEST(SiliconDriverWH, HarvestingRuntime) { #endif TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set 
target_devices = get_target_devices(); int num_devices = target_devices.size(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < num_devices; i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < num_devices; i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -252,38 +268,34 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverWH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); 
auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -292,31 +304,45 @@ TEST(SiliconDriverWH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // Write to each core a 100 times at 
different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + // Barrier to ensure that all writes over ethernet were commited + device.wait_for_non_mmio_flush(); test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. 
All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -328,20 +354,34 @@ TEST(SiliconDriverWH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + // Write to each core a 100 times at different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + // Barrier to ensure that all writes over ethernet were commited device.wait_for_non_mmio_flush(); - 
device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.wait_for_non_mmio_flush(); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -354,8 +394,8 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); + set_params_for_remote_txn(device); tt_device_params default_params; @@ -366,11 +406,18 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from 
core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -381,12 +428,19 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; 
readback_vec = {}; } address += 0x20; @@ -401,28 +455,26 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { TEST(SiliconDriverWH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. // Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - - for(int i = 0; i < target_devices.size(); i++) { + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -432,24 +484,41 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - 
test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -459,38 +528,43 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -499,28 +573,42 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + 
readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } - TEST(SiliconDriverWH, BroadcastWrite) { // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -534,40 +622,71 @@ TEST(SiliconDriverWH, BroadcastWrite) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) 
continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 
vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { @@ -575,20 +694,22 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + 
device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -597,43 +718,73 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), 
zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + 
"LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } - /** * This is a basic DMA test -- not using the PCIe controller's DMA engine, but * rather using the ability of the NOC to access the host system bus via traffic @@ -658,58 +809,132 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { TEST(SiliconDriverWH, SysmemTestWithPcie) { auto target_devices = get_target_devices(); - Cluster device(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), - tt_ClusterDescriptor::get_cluster_descriptor_file_path(), - target_devices, - 1, // one "host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster( + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes - set_params_for_remote_txn(device); - device.start_device(tt_device_params{}); // no special parameters + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters - // PCIe core is at (x=0, y=3) on Wormhole NOC0. const chip_id_t mmio_chip_id = 0; - const size_t PCIE_X = 0; // NOC0 - const size_t PCIE_Y = 3; // NOC0 - const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE_X, PCIE_Y); + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + // PCIe core is at (x=0, y=3) on Wormhole NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + // Bad API: how big is the buffer? How do we know it's big enough? 
// Situation today is that there's a 1G hugepage behind it, although this is // unclear from the API and may change in the future. - uint8_t *sysmem = (uint8_t*)device.host_dma_address(0, 0, 0); + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); ASSERT_NE(sysmem, nullptr); // This is the address inside the Wormhole PCIe block that is mapped to the // system bus. In Wormhole, this is a fixed address, 0x8'0000'0000. // The driver should have mapped this address to the bottom of sysmem. - uint64_t base_address = device.get_pcie_base_addr_from_device(mmio_chip_id); + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); // Buffer that we will use to read sysmem into, then write sysmem from. std::vector buffer(test_size_bytes, 0x0); // Step 1: Fill sysmem with random bytes. - fill_with_random_bytes(sysmem, test_size_bytes); + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); // Step 2: Read sysmem into buffer. - device.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); // Step 3: Verify that buffer matches sysmem. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); // Step 4: Fill buffer with random bytes. - fill_with_random_bytes(&buffer[0], test_size_bytes); + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); // Step 5: Write buffer into sysmem, overwriting what was there. - device.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); // Step 5b: Read back sysmem into a throwaway buffer. The intent is to // ensure the write has completed before we check sysmem against buffer. 
std::vector throwaway(test_size_bytes, 0x0); - device.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); // Step 6: Verify that sysmem matches buffer. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); } + +/** + * Same idea as above, but with four channels of sysmem and random addresses. + * The hardware mechanism is too slow to sweep the entire range. + */ +TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { + const size_t num_channels = 2; // ideally 4, but CI seems to have 2... + auto target_devices = get_target_devices(); + + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t ONE_GIG = 1 << 30; + const size_t num_tests = 0x20000; // runs in a reasonable amount of time + + // PCIe core is at (x=0, y=3) on Wormhole NOC0. 
+ ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + + const uint64_t ALIGNMENT = sizeof(uint32_t); + auto generate_aligned_address = [&](uint64_t lo, uint64_t hi) -> uint64_t { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis(lo / ALIGNMENT, hi / ALIGNMENT); + return dis(gen) * ALIGNMENT; + }; + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + for (size_t channel = 0; channel < num_channels; ++channel) { + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); + ASSERT_NE(sysmem, nullptr); + + test_utils::fill_with_random_bytes(sysmem, ONE_GIG); + + uint64_t lo = (ONE_GIG * channel); + uint64_t hi = (lo + ONE_GIG) - 1; + + if (channel == 3) { + // TODO: I thought everything past 0xffff'dddd was registers or + // something, but a) I don't know what's actually there, and b) + // the unusable range seems to be bigger than that... so + // restricting to 0x8'f000'0000. + hi &= ~0x0fff'ffffULL; + } + + for (size_t i = 0; i < num_tests; ++i) { + uint64_t address = generate_aligned_address(lo, hi); + uint64_t noc_addr = base_address + address; + uint64_t sysmem_address = address - lo; + + ASSERT_GE(address, lo) << "Address too low"; + ASSERT_LE(address, hi) << "Address too high"; + ASSERT_EQ(address % ALIGNMENT, 0) << "Address not properly aligned"; + + uint32_t value = 0; + cluster.read_from_device(&value, PCIE_CORE, noc_addr, sizeof(uint32_t), "REG_TLB"); + + uint32_t expected = *reinterpret_cast(&sysmem[sysmem_address]); + ASSERT_EQ(value, expected) << fmt::format("Mismatch at address {:#x}", address); + } + } +} diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 16f1d101..26978a2b 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -2,58 +2,51 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include +#include #include #include 
#include -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "umd/device/tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "test_wh_common.h" - -#include -#include +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" namespace tt::umd::test::utils { class WormholeNebulaX2TestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static constexpr int EXPECTED_NUM_CHIPS = 2; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips != EXPECTED_NUM_CHIPS) { - skip_tests = true; +private: + static int detected_num_chips; + static bool skip_tests; + +protected: + static constexpr int EXPECTED_NUM_CHIPS = 2; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips != EXPECTED_NUM_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - if(char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests 
= std::atoi(scale_number_of_tests_env); - } - } - virtual int get_detected_num_chips() { - return detected_num_chips; - } + virtual int get_detected_num_chips() { return detected_num_chips; } - virtual bool is_test_skipped() { - return skip_tests; - } + virtual bool is_test_skipped() { return skip_tests; } }; int WormholeNebulaX2TestFixture::detected_num_chips = -1; @@ -63,28 +56,29 @@ uint32_t WormholeNebulaX2TestFixture::scale_number_of_tests = 1; TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersMediumSmall"); std::vector command_history; try { assert(device != nullptr); RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) 
{ print_command_history_executable_code(command_history); } @@ -93,88 +87,92 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - 
std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command 
line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history3); }); t1.join(); @@ -186,154 +184,155 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersLarge"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersLarge"); assert(device != nullptr); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.15, .read 
= 0.15}, - - std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x10000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 300000), + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4) { int seed = 0; - log_info(LogSiliconDriver,"Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); + log_info(LogSiliconDriver, "Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); assert(device != nullptr); std::vector command_history; auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - + seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); try { RunMixedTransfers( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLMS"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLMS"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& 
read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - 
std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history 
code to command line + false, + &command_history3); }); t1.join(); @@ -345,85 +344,80 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWritesSmallReads) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(1000000, 30000000), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, + std::uniform_int_distribution(1000000, 30000000), + [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? x : 4)); + }); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); - std::thread write_cmds_thread1([&](){ + std::thread write_cmds_thread1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread write_cmds_thread2([&](){ + std::thread write_cmds_thread2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads1([&](){ + std::thread read_cmd_threads1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read = 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads2([&](){ + std::thread read_cmd_threads2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read 
= 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); write_cmds_thread1.join(); write_cmds_thread2.join(); read_cmd_threads1.join(); read_cmd_threads2.join(); - } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/wormhole/test_wh_common.h b/tests/wormhole/test_wh_common.h index e96ad803..fe76e3c2 100644 --- a/tests/wormhole/test_wh_common.h +++ b/tests/wormhole/test_wh_common.h @@ -5,80 +5,77 @@ */ #pragma once -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" -#include "umd/device/tt_xy_pair.h" #include "eth_l1_address_map.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_xy_pair.h" namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class WormholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. 
- std::unique_ptr device; + std::unique_ptr device; - WormholeTestFixture() { + WormholeTestFixture() {} - } + ~WormholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. + } - ~WormholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. - } + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - // std::cout << "Setting Up Test." 
<< std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + set_params_for_remote_txn(*device); - set_params_for_remote_txn(*device); + tt_device_params default_params; + device->start_device(default_params); - tt_device_params default_params; - device->start_device(default_params); + device->deassert_risc_reset(); - device->deassert_risc_reset(); - - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." << std::endl; - device->close_device(); + if (!is_test_skipped()) { + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils