diff --git a/.clang-format b/.clang-format index 5caa7153..c84154ce 100644 --- a/.clang-format +++ b/.clang-format @@ -1,147 +1,26 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 +BasedOnStyle: Google +ColumnLimit: 120 +IndentWidth: 4 + +# This will make access modifiers (public/protected/private) sit on the same indentation as `class` keyword +AccessModifierOffset: -4 + +# Arguments, parameters and construction initializer are broken as following: +# - Try to fit everything into single line (controlled by ColumnLimit). +# - If it doesn't fit, break immediately after open bracket (in case of arguments and parameters) +# or after colon in case of constructor initializers. +# - Try to fit everything else into the second line. +# - If it doesn't fit in second line, then each argument, parameter or initializer will sit in its own line. AlignAfterOpenBracket: AlwaysBreak -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false -BraceWrapping: -# AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true 
-BreakConstructorInitializersBeforeComma: false + +# When constructor initializers exist in the constructor definition, leave the colon as last thing on the original +# line instead of putting it on the next line. BreakConstructorInitializers: AfterColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 4 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Right -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - 
ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Cpp11 -TabWidth: 4 -UseTab: Never -... +# Disallow single statements after if/else/for/while/do without curly braces. +InsertBraces: true + +# Separate definition blocks, including classes, structs, enums, and functions. +SeparateDefinitionBlocks: Always diff --git a/.gersemirc b/.gersemirc new file mode 100644 index 00000000..145d24c0 --- /dev/null +++ b/.gersemirc @@ -0,0 +1,6 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/BlankSpruce/gersemi/master/gersemi/configuration.schema.json + +definitions: [cmake] +indent: 4 +line_length: 80 +list_expansion: favour-expansion diff --git a/.github/workflows/build-device.yml b/.github/workflows/build-device.yml index 17a0f6ca..0175724c 100644 --- a/.github/workflows/build-device.yml +++ b/.github/workflows/build-device.yml @@ -1,4 +1,4 @@ -# Builds umd_device. +# Builds device. # Build is performed on all supported OS versions. 
name: Build Target @@ -16,7 +16,7 @@ on: type: number env: - BUILD_TARGET: umd_device + BUILD_TARGET: device BUILD_OUTPUT_DIR: ./build LIB_OUTPUT_DIR: ./build/lib DEPS_OUTPUT_DIR: ./build/_deps @@ -34,7 +34,7 @@ jobs: {runs-on: ubuntu-20.04, docker-image: tt-umd-ci-ubuntu-20.04}, ] - name: Build umd_device for any arch on ${{ matrix.build.runs-on }} + name: Build device for any arch on ${{ matrix.build.runs-on }} runs-on: ${{ matrix.build.runs-on }} container: image: ghcr.io/${{ github.repository }}/${{ matrix.build.docker-image }}:latest diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml index e582e0e3..985b9785 100644 --- a/.github/workflows/on-pr.yml +++ b/.github/workflows/on-pr.yml @@ -16,3 +16,23 @@ jobs: uses: ./.github/workflows/build-device.yml with: timeout: 10 + + pre-commit: + name: Run Pre-commit Hooks and Propose Fixes + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history so 'origin/main' is available + ref: ${{ github.head_ref }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.11 + + - name: Run Pre-commit and Fix Issues + uses: pre-commit/action@v3.0.1 + with: + extra_args: "--from-ref origin/main --to-ref HEAD" diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 2053ee8e..98b2a526 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -70,6 +70,7 @@ jobs: env: ARCH_NAME: ${{ inputs.arch }} + LD_LIBRARY_PATH: ./build/lib steps: - name: Cleanup tt-umd dir, and change directory as if we were in a github.repository diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..819c7d75 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: +- repo: https://github.com/BlankSpruce/gersemi + rev: 0.16.2 + hooks: + - id: gersemi +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v17.0.6 + hooks: 
+ - id: clang-format diff --git a/CMakeLists.txt b/CMakeLists.txt index 0664960e..b809100d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ include(cmake/compilers.cmake) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) # This also impacts dependencies brought in through CPM if (DEFINED ENV{CMAKE_C_COMPILER} AND DEFINED ENV{CMAKE_CXX_COMPILER}) message(STATUS "Setting C and C++ compiler from environment variables") @@ -19,88 +20,54 @@ else() FIND_AND_SET_CLANG17() endif() -project(umd_device) +project( + umd + VERSION 0.1.0 + DESCRIPTION "Tenstorrent User Mode Driver" + HOMEPAGE_URL "https://github.com/tenstorrent/tt-umd" + LANGUAGES + CXX +) +list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) CHECK_COMPILERS() +include(check_libcpp) +include(GNUInstallDirs) set(MASTER_PROJECT OFF) if(PROJECT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(MASTER_PROJECT ON) - message("-- UMD: Building as master project") endif() if(MASTER_PROJECT) + message(STATUS "UMD: Building as master project") if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting build type to 'Release' as none was specified.") - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Release build is the default" FORCE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE STRING + "Release build is the default" + FORCE + ) endif() - option(ENABLE_ASAN "Enable build with AddressSanitizer" OFF) - message(STATUS "Build with ASAN: ${ENABLE_ASAN}") - - set(SANITIZER_ENABLED ${ENABLE_ASAN}) - - option(ENABLE_MSAN "Enable build with MemorySanitizer" OFF) - message(STATUS "Build with MSAN: ${ENABLE_MSAN}") - - if(SANITIZER_ENABLED AND ENABLE_MSAN) - message(FATAL_ERROR "Multiple sanitizers are not supported") - elseif(ENABLE_MSAN) - set(SANITIZER_ENABLED ${ENABLE_MSAN}) - endif() - - option(ENABLE_TSAN "Enable build with ThreadSanitizer" OFF) - message(STATUS "Build with TSAN: ${ENABLE_TSAN}") - - if(SANITIZER_ENABLED AND ENABLE_TSAN) - message(FATAL_ERROR 
"Multiple sanitizers are not supported") - elseif(ENABLE_TSAN) - set(SANITIZER_ENABLED ${ENABLE_TSAN}) - endif() - - option(ENABLE_UBSAN "Enable build with UndefinedBehaviorSanitizer" OFF) - message(STATUS "Build with UBSAN: ${ENABLE_UBSAN}") - - if(SANITIZER_ENABLED AND ENABLE_UBSAN) - message(FATAL_ERROR "Multiple sanitizers are not supported") - endif() - - unset(SANITIZER_ENABLED) - - add_library(compiler_flags INTERFACE) - target_compile_options( - compiler_flags - INTERFACE -DFMT_HEADER_ONLY - $<$:-fsanitize=address> - $<$:-fsanitize=memory> - $<$:-fsanitize=thread> - $<$:-fsanitize=undefined>) - - add_library(linker_flags INTERFACE) - target_link_options( - linker_flags - INTERFACE - $<$:-fsanitize=address> - $<$:-fsanitize=memory> - $<$:-fsanitize=thread> - $<$:-fsanitize=undefined>) - - target_link_libraries(compiler_flags INTERFACE linker_flags) + include(sanitizer_options) endif() + message(STATUS "UMD build type: ${CMAKE_BUILD_TYPE}") -set(CMAKE_CXX_FLAGS_RELEASE "-O3") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DDEBUG=DEBUG") -set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG=DEBUG") -include(${PROJECT_SOURCE_DIR}/cmake/dependencies.cmake) +add_compile_definitions( + $<$:DEBUG> + $<$:DEBUG> +) -add_library(umd_common_directories INTERFACE) -target_include_directories(umd_common_directories INTERFACE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/device) +include(dependencies) -add_subdirectory(${PROJECT_SOURCE_DIR}/device) +add_subdirectory(device) option(TT_UMD_BUILD_TESTS "Enables build of tt_umd tests" OFF) if(TT_UMD_BUILD_TESTS) - add_subdirectory(${PROJECT_SOURCE_DIR}/tests) + add_subdirectory(tests) endif(TT_UMD_BUILD_TESTS) +include(packaging) diff --git a/README.md b/README.md index 12e8f3ab..208f7a06 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,7 @@ sudo ./llvm.sh 17 To build `libdevice.so`: ``` cmake -B build -G Ninja -ninja -C build -# or -ninja umd_device -C build +cmake --build build ``` Tests are build separatelly for each architecture. 
@@ -31,7 +29,7 @@ You also need to configure cmake to enable tests, hence the need to run cmake co To build tests: ``` cmake -B build -G Ninja -DTT_UMD_BUILD_TESTS=ON -ninja umd_tests -C build +cmake --build build ``` To build with GCC, set these environment variables before invoking `cmake`: @@ -40,12 +38,78 @@ export CMAKE_C_COMPILER=/usr/bin/gcc export CMAKE_CXX_COMPILER=/usr/bin/g++ ``` -## As a submodule/external project -If your project has CMake support, simply add this repo as a subdirectory: +## Build debian dev package +``` +cmake --build build --target package + +# Generates umd-dev-x.y.z-Linux.deb +``` + +# Integration +UMD can be consumed by downstream projects in multiple ways. + +## From Source (CMake) +You can link `libdevice.so` by linking against the `umd::device` target. + +### Using CPM Package Manager +``` +CPMAddPackage( + NAME umd + GITHUB_REPOSITORY tenstorrent/tt-umd + GIT_TAG v0.1.0 + VERSION 0.1.0 +) +``` + +### As a submodule/external project ``` add_subdirectory() ``` -You can then use `libdevice.so` by linking against the `umd_device` target wheverever is needed. + +## From Prebuilt Binaries + +### Ubuntu ``` -target_link_libraries(tt_metal PUBLIC umd_device) +apt install ./umd-dev-x.y.z-Linux.deb ``` + +# Pre-commit Hook Integration for Formatting and Linting + +As part of maintaining consistent code formatting across the project, we have integrated the [pre-commit](https://pre-commit.com/) framework into our workflow. The pre-commit hooks will help automatically check and format code before commits are made, ensuring that we adhere to the project's coding standards. + +## What is Pre-commit? + +Pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. 
It helps catch common issues early by running a set of hooks before code is committed, automating tasks like: + +- Formatting code (e.g., fixing trailing whitespace, enforcing end-of-file newlines) +- Running linters (e.g., `clang-format`, `black`, `flake8`) +- Checking for merge conflicts or other common issues. + +For more details on pre-commit, you can visit the [official documentation](https://pre-commit.com/). + +## How to Set Up Pre-commit Locally + +To set up pre-commit on your local machine, follow these steps: + +1. **Install Pre-commit**: + Ensure you have Python installed, then run: + ```bash + pip install pre-commit + ``` +2. **Install the Git Hook Scripts**: + In your local repository, run the following command to install the pre-commit hooks: + ```bash + pre-commit install + ``` + This command will configure your local Git to run the defined hooks automatically before each commit. +3. **Run Pre-commit Hooks Manually**: + You can also run the hooks manually against all files at any time with: + ```bash + pre-commit run --all-files + ``` +## Why You Should Use Pre-commit +By setting up pre-commit locally, you can help maintain the quality of the codebase and ensure that commits consistently meet the project's formatting standards. This saves time during code reviews and reduces the likelihood of code formatting issues slipping into the repository. + +Since the hooks run automatically before each commit, you don't need to remember to manually format or check your code, making it easier to maintain consistency. + +We strongly encourage all developers to integrate pre-commit into their workflow. 
diff --git a/cmake/check_libcpp.cmake b/cmake/check_libcpp.cmake new file mode 100644 index 00000000..f96cb85d --- /dev/null +++ b/cmake/check_libcpp.cmake @@ -0,0 +1,18 @@ +# Only perform the check if Clang is the compiler +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag( + "-stdlib=libc++" + HAS_LIBCPP + ) + + if(HAS_LIBCPP) + message(STATUS "libc++ is available") + else() + message( + FATAL_ERROR + "libc++ was not detected! Please ensure that libc++ is installed and available." + ) + endif() +endif() diff --git a/cmake/compilers.cmake b/cmake/compilers.cmake index c919b395..2e208a0b 100644 --- a/cmake/compilers.cmake +++ b/cmake/compilers.cmake @@ -15,6 +15,11 @@ function(CHECK_COMPILERS) message(STATUS "Checking compilers") if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -stdlib=libc++" + CACHE STRING + "CXX FLAGS for clang" + ) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "17.0.0" OR CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL "18.0.0") message(WARNING "Only Clang-17 is tested right now") endif() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 74337e83..a8443120 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -1,4 +1,3 @@ - set(ENV{CPM_SOURCE_CACHE} "${PROJECT_SOURCE_DIR}/.cpmcache") include(${PROJECT_SOURCE_DIR}/cmake/CPM.cmake) @@ -27,8 +26,13 @@ CPMAddPackage( "YAML_BUILD_SHARED_LIBS OFF" ) -if (yaml-cpp_ADDED) - set_target_properties(yaml-cpp PROPERTIES DEBUG_POSTFIX "") +if(yaml-cpp_ADDED) + set_target_properties( + yaml-cpp + PROPERTIES + DEBUG_POSTFIX + "" + ) endif() ############################################################################################################################ @@ -60,8 +64,6 @@ CPMAddPackage( OPTIONS "FLATBUFFERS_BUILD_FLATC OFF" "FLATBUFFERS_BUILD_TESTS OFF" - "FLATBUFFERS_INSTALL OFF" - "FLATBUFFERS_BUILD_FLATLIB OFF" "FLATBUFFERS_SKIP_MONSTER_EXTRA ON" "FLATBUFFERS_STRICT_MODE ON" ) @@ 
-87,16 +89,10 @@ CPMAddPackage( GIT_TAG 11.0.1 ) -if(NOT MASTER_PROJECT) - set(nng_include_dir ${nanomsg_SOURCE_DIR}/include PARENT_SCOPE) - set(flatbuffers_include_dir ${flatbuffers_SOURCE_DIR}/include PARENT_SCOPE) - set(libuv_include_dir ${libuv_SOURCE_DIR}/include PARENT_SCOPE) -endif() - ############################################################################################################################ # nanobench (for uBenchmarking) ############################################################################################################################ -if (MASTER_PROJECT) +if(MASTER_PROJECT) CPMAddPackage( NAME nanobench GITHUB_REPOSITORY martinus/nanobench diff --git a/cmake/packaging.cmake b/cmake/packaging.cmake new file mode 100644 index 00000000..7cac50a9 --- /dev/null +++ b/cmake/packaging.cmake @@ -0,0 +1,31 @@ +include(CMakePackageConfigHelpers) + +write_basic_package_version_file( + ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +# Configure the Config file +configure_package_config_file( + ${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in + ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} +) + +# Install the Config and ConfigVersion files +install( + FILES + ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake + ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} +) + +set(CPACK_PACKAGE_NAME "${PROJECT_NAME}-dev") +set(CPACK_GENERATOR "DEB") +set(CPACK_PACKAGE_VENDOR "Tenstorrent, Inc.") +set(CPACK_DEBIAN_PACKAGE_MAINTAINER "support@tenstorrent.com") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Tenstorrent User Mode Driver") +#set(CPACK_DEBIAN_PACKAGE_DEPENDS "") + +include(CPack) diff --git a/cmake/sanitizer_options.cmake b/cmake/sanitizer_options.cmake new file mode 100644 index 00000000..b0fd8412 --- /dev/null +++ 
b/cmake/sanitizer_options.cmake @@ -0,0 +1,46 @@ +include_guard(GLOBAL) + +option(ENABLE_ASAN "Enable build with AddressSanitizer" OFF) +message(STATUS "Build with ASAN: ${ENABLE_ASAN}") + +set(SANITIZER_ENABLED ${ENABLE_ASAN}) + +option(ENABLE_MSAN "Enable build with MemorySanitizer" OFF) +message(STATUS "Build with MSAN: ${ENABLE_MSAN}") + +if(SANITIZER_ENABLED AND ENABLE_MSAN) + message(FATAL_ERROR "Multiple sanitizers are not supported") +elseif(ENABLE_MSAN) + set(SANITIZER_ENABLED ${ENABLE_MSAN}) +endif() + +option(ENABLE_TSAN "Enable build with ThreadSanitizer" OFF) +message(STATUS "Build with TSAN: ${ENABLE_TSAN}") + +if(SANITIZER_ENABLED AND ENABLE_TSAN) + message(FATAL_ERROR "Multiple sanitizers are not supported") +elseif(ENABLE_TSAN) + set(SANITIZER_ENABLED ${ENABLE_TSAN}) +endif() + +option(ENABLE_UBSAN "Enable build with UndefinedBehaviorSanitizer" OFF) +message(STATUS "Build with UBSAN: ${ENABLE_UBSAN}") + +if(SANITIZER_ENABLED AND ENABLE_UBSAN) + message(FATAL_ERROR "Multiple sanitizers are not supported") +endif() + +unset(SANITIZER_ENABLED) + +add_compile_options( + $<$:-fsanitize=address> + $<$:-fsanitize=memory> + $<$:-fsanitize=thread> + $<$:-fsanitize=undefined> +) +add_link_options( + $<$:-fsanitize=address> + $<$:-fsanitize=memory> + $<$:-fsanitize=thread> + $<$:-fsanitize=undefined> +) diff --git a/cmake/umdConfig.cmake.in b/cmake/umdConfig.cmake.in new file mode 100644 index 00000000..9f1696ea --- /dev/null +++ b/cmake/umdConfig.cmake.in @@ -0,0 +1,14 @@ + +# @PROJECT_NAME@Config.cmake.in +@PACKAGE_INIT@ + +# Set package as found +set(@PROJECT_NAME@_FOUND TRUE) + +# Include the exported targets +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") + +# Set the directory containing the CMake files for the project +get_filename_component(@PROJECT_NAME@_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + +message(STATUS "Found @PROJECT_NAME@ at ${@PROJECT_NAME@_CMAKE_DIR}") diff --git a/cmake/x86-linux-clang-17-toolchain.cmake 
b/cmake/x86-linux-clang-17-toolchain.cmake new file mode 100644 index 00000000..27c08fd7 --- /dev/null +++ b/cmake/x86-linux-clang-17-toolchain.cmake @@ -0,0 +1,9 @@ +set(CMAKE_SYSTEM_NAME Linux) + +set(CMAKE_CXX_COMPILER clang++-17 CACHE STRING "C++ compiler") +set(CMAKE_C_COMPILER clang-17 CACHE STRING "C compiler") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -stdlib=libc++" + CACHE STRING + "CXX FLAGS for clang" +) diff --git a/device/.clang-format b/device/.clang-format new file mode 100644 index 00000000..9d159247 --- /dev/null +++ b/device/.clang-format @@ -0,0 +1,2 @@ +DisableFormat: true +SortIncludes: false diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index f8a18b42..8f2f821e 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,3 +1,4 @@ +set(POSITION_INDEPENDENT_CODE ON) set(UMD_DEVICE_SRCS architecture_implementation.cpp @@ -15,18 +16,82 @@ set(UMD_DEVICE_SRCS wormhole/wormhole_implementation.cpp pcie/pci_device.cpp ) -add_library(umd_device SHARED ${UMD_DEVICE_SRCS}) -target_link_libraries(umd_device - PUBLIC yaml-cpp::yaml-cpp umd_common_directories nng uv compiler_flags - PRIVATE hwloc rt Boost::interprocess fmt + +add_library(device SHARED ${UMD_DEVICE_SRCS}) +add_library(${PROJECT_NAME}::device ALIAS device) +add_library(${PROJECT_NAME}_device ALIAS device) # For legacy I guess + +target_include_directories( + device + PUBLIC + $ + $ + $ + $ +) + +# flatbuffers is public - exposed to tt_metal by tt_simulation_device_generated.h +# nng is public - exposed to tt_metal by tt_simulation_host.hpp +target_link_libraries( + device + PUBLIC + nng + flatbuffers + uv + PRIVATE + hwloc + rt + Boost::interprocess + fmt::fmt-header-only + yaml-cpp::yaml-cpp +) + +install( + TARGETS + device + EXPORT ${PROJECT_NAME}Targets + ARCHIVE + DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY + DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT dev ) -target_include_directories(umd_device PUBLIC - ${flatbuffers_SOURCE_DIR}/include - 
${nanomsg_SOURCE_DIR}/include - ${libuv_SOURCE_DIR}/include + +install( + EXPORT ${PROJECT_NAME}Targets + FILE ${PROJECT_NAME}Targets.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + NAMESPACE ${PROJECT_NAME}:: +) + +# Add a custom command to copy the library to build/lib +add_custom_command( + TARGET device + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy $ ${CMAKE_BINARY_DIR}/lib/ + COMMENT "Copying device library to build/lib" +) + +# No separation of public and private header files +# I can only assume everything is public +install( + DIRECTORY + ${PROJECT_SOURCE_DIR}/device + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/device + FILES_MATCHING + PATTERN + "*.h" + PATTERN + "*.hpp" ) -set_target_properties(umd_device PROPERTIES - OUTPUT_NAME device - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - POSITION_INDEPENDENT_CODE ON +install( + DIRECTORY + ${PROJECT_SOURCE_DIR}/common + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/common + FILES_MATCHING + PATTERN + "*.h" + PATTERN + "*.hpp" ) diff --git a/device/architecture.h b/device/architecture.h deleted file mode 100644 index 67a97bf7..00000000 --- a/device/architecture.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -namespace tt::umd { - -/** - * @brief architecture Enums - */ -enum class architecture { - jawbridge = 0, - grayskull = 1, - wormhole = 2, - wormhole_b0 = 3, - blackhole = 4, - invalid = 0xFF, -}; - -} // namespace tt::umd diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index d55d3e29..7cd1dac8 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -10,12 +10,11 @@ namespace tt::umd { -std::unique_ptr architecture_implementation::create(architecture architecture) { +std::unique_ptr architecture_implementation::create(tt::ARCH architecture) { switch (architecture) { - case architecture::blackhole: return std::make_unique(); - case architecture::grayskull: return std::make_unique(); - case architecture::wormhole: - case architecture::wormhole_b0: return std::make_unique(); + case tt::ARCH::BLACKHOLE: return std::make_unique(); + case tt::ARCH::GRAYSKULL: return std::make_unique(); + case tt::ARCH::WORMHOLE_B0: return std::make_unique(); default: return nullptr; } } diff --git a/device/architecture_implementation.h b/device/architecture_implementation.h index b807b334..f05d27f6 100644 --- a/device/architecture_implementation.h +++ b/device/architecture_implementation.h @@ -11,9 +11,9 @@ #include #include -#include "device/architecture.h" #include "device/tlb.h" #include "device/xy_pair.h" +#include "device/tt_arch_types.h" namespace tt::umd { @@ -21,7 +21,7 @@ class architecture_implementation { public: virtual ~architecture_implementation() = default; - virtual architecture get_architecture() const = 0; + virtual tt::ARCH get_architecture() const = 0; virtual uint32_t get_arc_message_arc_get_harvesting() const = 0; virtual uint32_t get_arc_message_arc_go_busy() const = 0; virtual uint32_t get_arc_message_arc_go_long_idle() const = 0; @@ -63,7 +63,7 @@ class architecture_implementation { virtual std::optional> 
describe_tlb(std::int32_t tlb_index) const = 0; virtual std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0; - static std::unique_ptr create(architecture architecture); + static std::unique_ptr create(tt::ARCH architecture); }; } // namespace tt::umd diff --git a/device/blackhole/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h index 282e622d..7686abef 100644 --- a/device/blackhole/blackhole_implementation.h +++ b/device/blackhole/blackhole_implementation.h @@ -169,7 +169,7 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97; class blackhole_implementation : public architecture_implementation { public: - architecture get_architecture() const override { return architecture::blackhole; } + tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; } uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(blackhole::arc_message_type::ARC_GET_HARVESTING); } diff --git a/device/grayskull/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h index 5af254a6..0d520439 100644 --- a/device/grayskull/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -172,7 +172,7 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; class grayskull_implementation : public architecture_implementation { public: - architecture get_architecture() const override { return architecture::grayskull; } + tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; } uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(grayskull::arc_message_type::ARC_GET_HARVESTING); } diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index 17312174..c6c3dcc2 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -87,7 +87,7 @@ class tt_MockupDevice : public tt_device { void* host_dma_address(std::uint64_t offset, chip_id_t 
src_device_id, uint16_t channel) const override { return nullptr; } - std::uint64_t get_pcie_base_addr_from_device() const override { return 0; } + std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; } std::uint32_t get_num_dram_channels(std::uint32_t device_id) override { return get_soc_descriptor(device_id)->get_num_dram_channels(); }; diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index a5d2d404..f7ff3331 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -87,11 +87,6 @@ static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) return tt::ARCH::GRAYSKULL; } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID){ - // TODO: did we ship any of these? I've never seen one. Can we stop - // having an ARCH for it if they don't exist? - TT_THROW("Wormhole is not supported. Please use Wormhole B0 instead."); - return tt::ARCH::WORMHOLE; } else if (pcie_device_id == BH_PCIE_DEVICE_ID){ return tt::ARCH::BLACKHOLE; } else { @@ -249,7 +244,7 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) , numa_node(read_sysfs(info, "numa_node")) , revision(read_sysfs(info, "revision")) , arch(detect_arch(info.device_id, revision)) - , architecture_implementation(tt::umd::architecture_implementation::create(static_cast(arch))) + , architecture_implementation(tt::umd::architecture_implementation::create(arch)) { struct { tenstorrent_query_mappings query_mappings; diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index d4ea3233..44df970a 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -181,8 +181,8 @@ void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_ return nullptr; } -std::uint64_t 
tt_SimulationDevice::get_pcie_base_addr_from_device() const { - if(arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0) { +std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { + if(arch_name == tt::ARCH::WORMHOLE_B0) { return 0x800000000; } else if (arch_name == tt::ARCH::BLACKHOLE) { diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index 8af6c434..63632450 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -56,7 +56,7 @@ class tt_SimulationDevice: public tt_device { virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; - virtual std::uint64_t get_pcie_base_addr_from_device() const; + virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); diff --git a/device/tt_arch_types.h b/device/tt_arch_types.h index 8344db6b..8a7c5dba 100644 --- a/device/tt_arch_types.h +++ b/device/tt_arch_types.h @@ -6,22 +6,15 @@ #pragma once -// defines of tt_arch_types -#include "device/architecture.h" - namespace tt { -// TODO: why do we have ARCH and architecture? This is a mess. Can we have just one? -// Can we get rid of the entries that (for all practical purposes) do not exist? 
/** * @brief ARCH Enums */ enum class ARCH { - JAWBRIDGE = static_cast(tt::umd::architecture::jawbridge), - GRAYSKULL = static_cast(tt::umd::architecture::grayskull), - WORMHOLE = static_cast(tt::umd::architecture::wormhole), - WORMHOLE_B0 = static_cast(tt::umd::architecture::wormhole_b0), - BLACKHOLE = static_cast(tt::umd::architecture::blackhole), - Invalid = static_cast(tt::umd::architecture::invalid), + GRAYSKULL = 1, + WORMHOLE_B0 = 2, + BLACKHOLE = 3, + Invalid = 0xFF, }; } diff --git a/device/tt_device.cpp b/device/tt_device.cpp index 2417fa15..9d974936 100644 --- a/device/tt_device.cpp +++ b/device/tt_device.cpp @@ -27,6 +27,6 @@ tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({} tt_device::~tt_device() { } -const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id){ +const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); } diff --git a/device/tt_device.h b/device/tt_device.h index 07d507ba..9ae64a3a 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -282,9 +282,10 @@ class tt_device /** * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). * + * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. 
*/ - virtual void setup_core_to_tlb_map(std::function mapping_function) { + virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } @@ -583,16 +584,15 @@ class tt_device return nullptr; } - virtual std::uint64_t get_pcie_base_addr_from_device() const { + virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const { throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n"); return 0; } - const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id); + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - bool tlbs_init = false; protected: std::unordered_map soc_descriptor_per_chip = {}; @@ -615,7 +615,6 @@ class tt_SiliconDevice: public tt_device * @param ndesc_path Network Descriptor specifying the network topology of the system. * @param target_devices Devices to target. * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). - * @param dynamic_tlb_config_ Map specifying tlb name to tlb index mapping. * @param skip_driver_allocs * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
@@ -633,7 +632,7 @@ class tt_SiliconDevice: public tt_device virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(std::function mapping_function); + virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); virtual void start_device(const tt_device_params &device_params); virtual void assert_risc_reset(); @@ -661,7 +660,7 @@ class tt_SiliconDevice: public tt_device /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size */ - std::optional> get_tlb_data_from_target(const tt_xy_pair& target); + std::optional> get_tlb_data_from_target(const tt_cxy_pair& target); /** * This API allows you to write directly to device memory that is addressable by a static TLB */ @@ -693,7 +692,7 @@ class tt_SiliconDevice: public tt_device virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; - virtual std::uint64_t get_pcie_base_addr_from_device() const; + virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t 
harvested_rows); @@ -705,6 +704,8 @@ class tt_SiliconDevice: public tt_device virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; + // TODO: This should be accessible through public API, probably to be moved to tt_device. + PCIDevice *get_pci_device(int device_id) const; // Destructor virtual ~tt_SiliconDevice (); @@ -760,7 +761,6 @@ class tt_SiliconDevice: public tt_device int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - PCIDevice *get_pci_device(int pci_intf_id) const; std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); @@ -815,7 +815,11 @@ class tt_SiliconDevice: public tt_device std::map> tlb_config_map = {}; std::set all_target_mmio_devices; std::unordered_map> host_channel_size; - std::function map_core_to_tlb; + + // Note that these maps hold only entries for local PCIe chips.
+ std::unordered_map> map_core_to_tlb_per_chip = {}; + std::unordered_map tlbs_init_per_chip = {}; + std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 084c8f73..98de517e 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -42,7 +42,6 @@ #include "device/cpuset_lib.hpp" #include "device/driver_atomics.h" -#include "device/architecture.h" #include "device/architecture_implementation.h" #include "device/tlb.h" #include "device/tt_arch_types.h" @@ -284,7 +283,7 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); - if (arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id); // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); @@ -418,7 +417,7 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str } // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes. 
- auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); + auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); @@ -436,7 +435,7 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str use_virtual_coords_for_eth_broadcast = false; } - if(arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0) { + if(arch_name == tt::ARCH::WORMHOLE_B0) { const auto& harvesting_masks = ndesc -> get_harvesting_info(); const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en(); @@ -502,7 +501,7 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str } simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id]; } - else if(arch_name == tt::ARCH::WORMHOLE_B0 || arch_name == tt::ARCH::WORMHOLE) { + else if(arch_name == tt::ARCH::WORMHOLE_B0) { log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(), "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. 
Actual Harvested Rows : {} Simulated Harvested Rows : {}", harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); @@ -517,7 +516,7 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str populate_cores(); // MT: Initial BH - skip this for BH - if(arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0) { + if(arch_name == tt::ARCH::WORMHOLE_B0) { remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size()); for (const auto &logical_mmio_chip_id : target_mmio_device_ids) { const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id); @@ -583,7 +582,7 @@ std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, tmp = tmp >> 1; row_coordinate++; } - if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0) { // For Wormhole, we always remove the last few rows in the SOC descriptor in case of harvesting for (int i = 0; i < row_coordinates_to_remove.size(); i++) { row_coordinates_to_remove[i] = worker_grid_rows - i; @@ -647,8 +646,8 @@ void tt_SiliconDevice::check_pcie_device_initialized(int device_id) { throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); } } - else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { - if (device_arch != tt::ARCH::WORMHOLE && device_arch != tt::ARCH::WORMHOLE_B0) { + else if (arch_name == tt::ARCH::WORMHOLE_B0) { + if (device_arch != tt::ARCH::WORMHOLE_B0) { throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); } } @@ -914,7 +913,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str())); } - if (!tlbs_init || !map_core_to_tlb) { + if (!tlbs_init_per_chip[target.chip] || !map_core_to_tlb_per_chip[target.chip]) { throw 
std::runtime_error("TLBs not initialized"); } @@ -924,7 +923,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("No write-combined mapping for BAR0"); } - auto tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + auto tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); if (!tlb_data.has_value()) { @@ -946,8 +945,8 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init) { - tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + if(tlbs_init_per_chip[target.chip]) { + tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } @@ -987,8 +986,8 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init) { - tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + if(tlbs_init_per_chip[target.chip]) { + tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); @@ -1165,13 +1164,13 @@ tt_SiliconDevice::~tt_SiliconDevice () { dynamic_tlb_ordering_modes.clear(); } -std::optional> tt_SiliconDevice::get_tlb_data_from_target(const tt_xy_pair& target) { +std::optional> tt_SiliconDevice::get_tlb_data_from_target(const tt_cxy_pair& target) { std::int32_t tlb_index = 0; std::optional> tlb_data; - if (tlbs_init) { - tlb_index = map_core_to_tlb(target); - auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); + if 
(tlbs_init_per_chip[target.chip]) { + tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); + auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); tlb_data = architecture_implementation->describe_tlb(tlb_index); } return tlb_data; @@ -1420,7 +1419,7 @@ int tt_SiliconDevice::test_setup_interface () { ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 0 : 1; return ret_val; } - else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { + else if (arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; PCIDevice *dev = m_pci_device_map.begin()->second.get(); @@ -1587,7 +1586,7 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ // Returns broken rows as bits set to 1 in 'memory' and 'logic' uint32_t tt_SiliconDevice::get_harvested_noc_rows(uint32_t harvesting_mask) { - auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); + auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); const std::vector &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); uint32_t harv_noc_rows = 0; std::string harv_noc_rows_str = ""; @@ -2114,7 +2113,7 @@ void tt_SiliconDevice::wait_for_connected_non_mmio_flush(const chip_id_t chip_id return; } - if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { std::vector erisc_txn_counters = std::vector(2); std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); @@ -2387,7 +2386,7 @@ void tt_SiliconDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t } } else if (arch_name == tt::ARCH::BLACKHOLE) { - auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); + auto architecture_implementation = 
tt::umd::architecture_implementation::create(arch_name); if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); if(cols_to_exclude.find(0) == cols_to_exclude.end()) { @@ -2415,7 +2414,7 @@ void tt_SiliconDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t } } else { - auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); + auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); if(cols_to_exclude.find(0) == cols_to_exclude.end()) { @@ -2790,7 +2789,6 @@ void tt_SiliconDevice::enable_ethernet_queue(int timeout) { auto arch = get_soc_descriptor(chip).arch; switch (arch) { - case tt::ARCH::WORMHOLE: case tt::ARCH::WORMHOLE_B0: { if (ndesc->is_chip_mmio_capable(chip)) { enable_local_ethernet_queue(chip, timeout); @@ -2880,7 +2878,7 @@ void tt_SiliconDevice::start_device(const tt_device_params &device_params) { if(device_params.init_device) { initialize_pcie_devices(); // MT Initial BH - Ethernet firmware not present in Blackhole - if(arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { + if(arch_name == tt::ARCH::WORMHOLE_B0) { verify_eth_fw(); } deassert_resets_and_set_power_state(); @@ -2909,9 +2907,9 @@ void tt_SiliconDevice::set_driver_eth_interface_params(const tt_driver_eth_inter eth_interface_params = eth_interface_params_; } -void tt_SiliconDevice::setup_core_to_tlb_map(std::function mapping_function) { - map_core_to_tlb = mapping_function; - tlbs_init = 
true; +void tt_SiliconDevice::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + map_core_to_tlb_per_chip[logical_device_id] = mapping_function; + tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t tt_SiliconDevice::get_num_dram_channels(std::uint32_t device_id) { @@ -2939,11 +2937,13 @@ std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t devi return get_pci_device(device_id)->get_numa_node(); } -std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device() const { - if(arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0) { +std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { + // TODO: Should probably be lowered to TTDevice. + tt::ARCH arch = get_soc_descriptor(chip_id).arch; + if(arch == tt::ARCH::WORMHOLE_B0) { return 0x800000000; } - else if (arch_name == tt::ARCH::BLACKHOLE) { + else if (arch == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; } @@ -2953,7 +2953,7 @@ std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device() const { } tt_version tt_SiliconDevice::get_ethernet_fw_version() const { - log_assert(arch_name == tt::ARCH::WORMHOLE or arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); + log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version."); return eth_fw_version; } diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index d69a47c8..7ff961e5 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -216,14 +216,10 @@ bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { } std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { - if (arch_name == tt::ARCH::JAWBRIDGE) { - out << "jawbridge"; - } else if (arch_name == tt::ARCH::Invalid) { + if (arch_name == tt::ARCH::Invalid) { out << "none"; } else if (arch_name == tt::ARCH::GRAYSKULL) { out << "grayskull"; - } else if (arch_name == tt::ARCH::WORMHOLE) { - out << "wormhole"; } else if (arch_name == tt::ARCH::WORMHOLE_B0) { out << "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index a34b8e6e..cb4acede 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -31,12 +31,8 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); static inline std::string get_arch_str(const tt::ARCH arch_name){ std::string arch_name_str; - if (arch_name == tt::ARCH::JAWBRIDGE) { - arch_name_str = "jawbridge"; - } else if (arch_name == tt::ARCH::GRAYSKULL) { + if (arch_name == tt::ARCH::GRAYSKULL) { arch_name_str = "grayskull"; - } else if (arch_name == tt::ARCH::WORMHOLE) { - 
arch_name_str = "wormhole"; } else if (arch_name == tt::ARCH::WORMHOLE_B0) { arch_name_str = "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { @@ -51,13 +47,9 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){ static inline tt::ARCH get_arch_name(const std::string &arch_str){ tt::ARCH arch; - if ((arch_str == "jawbridge") || (arch_str == "JAWBRIDGE")) { - arch = tt::ARCH::JAWBRIDGE; - } else if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { + if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { arch = tt::ARCH::GRAYSKULL; - } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE")){ - arch = tt::ARCH::WORMHOLE; - } else if ((arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ + } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ arch = tt::ARCH::WORMHOLE_B0; } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")){ arch = tt::ARCH::BLACKHOLE; diff --git a/device/wormhole/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h index bb6dfbba..79fbb8b1 100644 --- a/device/wormhole/wormhole_implementation.h +++ b/device/wormhole/wormhole_implementation.h @@ -206,7 +206,7 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; class wormhole_implementation : public architecture_implementation { public: - architecture get_architecture() const override { return architecture::wormhole; } + tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; } uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(wormhole::arc_message_type::ARC_GET_HARVESTING); } diff --git a/src/.clang-format b/src/.clang-format new file mode 100644 index 00000000..9d159247 --- /dev/null +++ b/src/.clang-format @@ -0,0 +1,2 @@ +DisableFormat: true +SortIncludes: false diff --git a/tests/.clang-format b/tests/.clang-format new file mode 100644 index 00000000..9d159247 --- /dev/null 
+++ b/tests/.clang-format @@ -0,0 +1,2 @@ +DisableFormat: true +SortIncludes: false diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6c4a09f5..e59049c7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,35 +1,54 @@ - # Tests currently depend on ARCH_NAME for compile time include paths if(NOT DEFINED ENV{ARCH_NAME}) - message(FATAL_ERROR "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole") -elseif($ENV{ARCH_NAME} STREQUAL "grayskull") + message( + FATAL_ERROR + "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole" + ) +endif(NOT DEFINED ENV{ARCH_NAME}) + +add_library(test_common INTERFACE) +target_link_libraries( + test_common + INTERFACE + umd_device + gtest_main + gtest + pthread + fmt::fmt-header-only +) +target_include_directories( + test_common + INTERFACE + ${PROJECT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +) +if($ENV{ARCH_NAME} STREQUAL "grayskull") message(STATUS "UMD: Building for Grayskull") - target_include_directories(umd_common_directories INTERFACE - ${PROJECT_SOURCE_DIR}/device/grayskull - ${PROJECT_SOURCE_DIR}/src/firmware/riscv/grayskull + target_include_directories( + test_common + INTERFACE + ${PROJECT_SOURCE_DIR}/device/grayskull + ${PROJECT_SOURCE_DIR}/src/firmware/riscv/grayskull ) elseif($ENV{ARCH_NAME} STREQUAL "wormhole_b0") message(STATUS "UMD: Building for Wormhole") - target_include_directories(umd_common_directories INTERFACE - ${PROJECT_SOURCE_DIR}/device/wormhole - ${PROJECT_SOURCE_DIR}/src/firmware/riscv/wormhole + target_include_directories( + test_common + INTERFACE + ${PROJECT_SOURCE_DIR}/device/wormhole + ${PROJECT_SOURCE_DIR}/src/firmware/riscv/wormhole ) elseif($ENV{ARCH_NAME} STREQUAL "blackhole") message(STATUS "UMD: Building for Blackhole") - target_include_directories(umd_common_directories INTERFACE - ${PROJECT_SOURCE_DIR}/device/blackhole - ${PROJECT_SOURCE_DIR}/src/firmware/riscv/blackhole + target_include_directories( + test_common + INTERFACE + 
${PROJECT_SOURCE_DIR}/device/blackhole + ${PROJECT_SOURCE_DIR}/src/firmware/riscv/blackhole ) endif() -add_library(test_common INTERFACE) -target_link_libraries(test_common INTERFACE umd_device gtest_main gtest pthread fmt) -target_include_directories(test_common INTERFACE - ${PROJECT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} -) - -if (MASTER_PROJECT) +if(MASTER_PROJECT) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/microbenchmark) endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/api) @@ -38,9 +57,21 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/simulation) if($ENV{ARCH_NAME} STREQUAL "wormhole_b0") add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/wormhole) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/galaxy) - add_custom_target(umd_unit_tests DEPENDS unit_tests_wormhole unit_tests_glx) + add_custom_target( + umd_unit_tests + DEPENDS + unit_tests_wormhole + unit_tests_glx + ) else() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/$ENV{ARCH_NAME}) endif() -add_custom_target(umd_tests DEPENDS umd_unit_tests simulation_tests test_pcie_device api_tests) +add_custom_target( + umd_tests + DEPENDS + umd_unit_tests + simulation_tests + test_pcie_device + api_tests +) diff --git a/tests/api/CMakeLists.txt b/tests/api/CMakeLists.txt index 08c11ffe..ce569112 100644 --- a/tests/api/CMakeLists.txt +++ b/tests/api/CMakeLists.txt @@ -1,6 +1,7 @@ set(API_TESTS_SRCS - test_cluster.cpp + test_chip.cpp test_cluster_descriptor.cpp + test_cluster.cpp ) add_executable(api_tests ${API_TESTS_SRCS}) diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp new file mode 100644 index 00000000..5c10970e --- /dev/null +++ b/tests/api/test_chip.cpp @@ -0,0 +1,172 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file holds Chip specific API examples. 
+ +#include +#include "fmt/xchar.h" + +#include +#include +#include +#include + +#include "tests/test_utils/generate_cluster_desc.hpp" + +// TODO: change to tt_cluster +#include "device/tt_device.h" +#include "device/tt_cluster_descriptor.h" + +// TODO: write this test to work with Chip not whole Cluster. +using Cluster = tt_SiliconDevice; + +inline std::unique_ptr get_cluster_desc() { + // TODO: This should not be needed. And could be part of the cluster descriptor probably. + // Note that cluster descriptor holds logical ids of chips. + // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. + // You have to see if physical PCIe is GS before constructing a cluster descriptor. + std::vector pci_device_ids = PCIDevice::enumerate_devices(); + std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + + tt::ARCH device_arch = tt::ARCH::GRAYSKULL; + if (!pci_device_ids.empty()) { + // TODO: This should be removed from the API, the driver itself should do it. + int physical_device_id = pci_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + device_arch = pci_device.get_arch(); + } + + // TODO: Make this test work on a host system without any tt devices. + if (pci_device_ids.empty()) { + std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; + return nullptr; + } + + // TODO: remove getting manually cluster descriptor from yaml. + std::string yaml_path = test_utils::GetClusterDescYAML(); + std::unique_ptr cluster_desc; + if (device_arch == tt::ARCH::GRAYSKULL) { + cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids); + } else { + cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); + } + + return cluster_desc; +} + +inline std::unique_ptr get_cluster() { + + // TODO: This should not be needed. And could be part of the cluster descriptor probably. 
+ // Note that cluster descriptor holds logical ids of chips. + // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. + // You have to see if physical PCIe is GS before constructing a cluster descriptor. + std::vector pci_device_ids = PCIDevice::enumerate_devices(); + std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + + tt::ARCH device_arch = tt::ARCH::GRAYSKULL; + if (!pci_device_ids.empty()) { + // TODO: This should be removed from the API, the driver itself should do it. + int physical_device_id = pci_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + device_arch = pci_device.get_arch(); + } + + // TODO: Make this test work on a host system without any tt devices. + if (pci_device_ids.empty()) { + std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; + return nullptr; + } + + // TODO: remove getting manually cluster descriptor from yaml. + std::string yaml_path = test_utils::GetClusterDescYAML(); + // TODO: Remove the need to do this, allow default constructor to construct with all chips. + std::unique_ptr cluster_desc = get_cluster_desc(); + std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); + + // TODO: make this unordered vs set conversion not needed. + std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); + + + // TODO: This would be incorporated inside SocDescriptor. 
+ std::string soc_path; + if (device_arch == tt::ARCH::GRAYSKULL) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); + } else if (device_arch == tt::ARCH::WORMHOLE_B0) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); + } else if (device_arch == tt::ARCH::BLACKHOLE) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); + } else { + throw std::runtime_error("Unsupported architecture"); + } + + + // TODO: Don't pass each of these arguments. + return std::unique_ptr(new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set)); +} + +// TODO: Once default auto TLB setup is in, check it is setup properly. +TEST(ApiChipTest, ManualTLBConfiguration) { + std::unique_ptr umd_cluster = get_cluster(); + + // Expect to throw for remote chip for any worker core + auto remote_chips = umd_cluster->get_target_remote_device_ids(); + if (!remote_chips.empty()) { + chip_id_t any_remote_chip = *remote_chips.begin(); + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_remote_chip); + tt_xy_pair core = soc_desc.workers[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_remote_chip, core)), std::runtime_error); + } + + // Expect to throw for non configured mmio chip. + chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); + tt_xy_pair core = soc_desc.workers[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core)), std::runtime_error); + + // TODO: This should be part of TTDevice interface, not Cluster or Chip. + // Configure TLBs. + std::function get_static_tlb_index = [&](tt_xy_pair core) -> int { + // TODO: Make this per arch. 
+ bool is_worker_core = soc_desc.is_worker_core(core); + if (!is_worker_core) { + return -1; + } + return core.x + core.y * umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); + }; + + std::int32_t c_zero_address = 0; + + // Each MMIO chip has its own set of TLBs, so needs its own configuration. + for (chip_id_t mmio_chip: umd_cluster->get_target_mmio_device_ids()) { + + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(mmio_chip); + for (tt_xy_pair core: soc_desc.workers) { + umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address); + } + + umd_cluster->setup_core_to_tlb_map(mmio_chip, get_static_tlb_index); + } + + // Expect not to throw for now configured mmio chip, same one as before. + EXPECT_NO_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core))); + + // Expect to throw for non worker cores. + tt_xy_pair dram_core = soc_desc.dram_cores[0][0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, dram_core)), std::runtime_error); + if (!soc_desc.ethernet_cores.empty()) { + tt_xy_pair eth_core = soc_desc.ethernet_cores[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, eth_core)), std::runtime_error); + } +} + +// TODO: Move to test_chip +TEST(ApiChipTest, SimpleAPIShowcase) { + std::unique_ptr umd_cluster = get_cluster(); + chip_id_t chip_id = *umd_cluster->get_all_chips_in_cluster().begin(); + + // TODO: In future, will be accessed through tt::umd::Chip api. + umd_cluster->get_pcie_base_addr_from_device(chip_id); +} diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index 41faea8a..f104a997 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -1,24 +1,29 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file holds Cluster specific API examples.
#include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" // TODO: change to tt_cluster -#include "device/tt_device.h" #include "device/tt_cluster_descriptor.h" +#include "device/tt_device.h" // TODO: obviously we need some other way to set this up -#include "src/firmware/riscv/wormhole/host_mem_address_map.h" -#include "src/firmware/riscv/wormhole/noc/noc_parameters.h" #include "src/firmware/riscv/wormhole/eth_interface.h" -#include "src/firmware/riscv/wormhole/l1_address_map.h" #include "src/firmware/riscv/wormhole/eth_l1_address_map.h" +#include "src/firmware/riscv/wormhole/host_mem_address_map.h" +#include "src/firmware/riscv/wormhole/l1_address_map.h" +#include "src/firmware/riscv/wormhole/noc/noc_parameters.h" // TODO: do proper renaming. using Cluster = tt_SiliconDevice; @@ -30,20 +35,19 @@ using Cluster = tt_SiliconDevice; // TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. // Note that cluster descriptor holds logical ids of chips. // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); tt::ARCH device_arch = tt::ARCH::GRAYSKULL; if (!pci_device_ids.empty()) { // TODO: This should be removed from the API, the driver itself should do it. 
int physical_device_id = pci_device_ids[0]; // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); + PCIDevice pci_device(physical_device_id, 0); device_arch = pci_device.get_arch(); } @@ -67,20 +71,19 @@ inline std::unique_ptr get_cluster_desc() { // TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. // Note that cluster descriptor holds logical ids of chips. // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); tt::ARCH device_arch = tt::ARCH::GRAYSKULL; if (!pci_device_ids.empty()) { // TODO: This should be removed from the API, the driver itself should do it. int physical_device_id = pci_device_ids[0]; // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); + PCIDevice pci_device(physical_device_id, 0); device_arch = pci_device.get_arch(); } @@ -97,14 +100,13 @@ inline std::unique_ptr get_cluster() { std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); + std::set detected_num_chips_set(detected_num_chips.begin(), detected_num_chips.end()); - // TODO: This would be incorporated inside SocDescriptor. 
std::string soc_path; if (device_arch == tt::ARCH::GRAYSKULL) { soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE || device_arch == tt::ARCH::WORMHOLE_B0) { + } else if (device_arch == tt::ARCH::WORMHOLE_B0) { soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); } else if (device_arch == tt::ARCH::BLACKHOLE) { soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); @@ -112,35 +114,60 @@ inline std::unique_ptr get_cluster() { throw std::runtime_error("Unsupported architecture"); } - // TODO: Don't pass each of these arguments. - return std::unique_ptr(new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set)); + return std::unique_ptr( + new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set)); } // TODO: Should not be wormhole specific. // TODO: Offer default setup for what you can. 
void setup_wormhole_remote(Cluster* umd_cluster) { - if (!umd_cluster->get_target_remote_device_ids().empty() && umd_cluster->get_soc_descriptor(*umd_cluster->get_all_chips_in_cluster().begin()).arch == tt::ARCH::WORMHOLE_B0) { - + if (!umd_cluster->get_target_remote_device_ids().empty() && + umd_cluster->get_soc_descriptor(*umd_cluster->get_all_chips_in_cluster().begin()).arch == + tt::ARCH::WORMHOLE_B0) { // Populate address map and NOC parameters that the driver needs for remote transactions - umd_cluster->set_driver_host_address_params({host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, host_mem::address_map::ETH_ROUTING_BUFFERS_START}); - - umd_cluster->set_driver_eth_interface_params({NOC_ADDR_LOCAL_BITS, NOC_ADDR_NODE_ID_BITS, ETH_RACK_COORD_WIDTH, CMD_BUF_SIZE_MASK, MAX_BLOCK_SIZE, - REQUEST_CMD_QUEUE_BASE, RESPONSE_CMD_QUEUE_BASE, CMD_COUNTERS_SIZE_BYTES, REMOTE_UPDATE_PTR_SIZE_BYTES, - CMD_DATA_BLOCK, CMD_WR_REQ, CMD_WR_ACK, CMD_RD_REQ, CMD_RD_DATA, CMD_BUF_SIZE, CMD_DATA_BLOCK_DRAM, ETH_ROUTING_DATA_BUFFER_ADDR, - REQUEST_ROUTING_CMD_QUEUE_BASE, RESPONSE_ROUTING_CMD_QUEUE_BASE, CMD_BUF_PTR_MASK, CMD_ORDERED, CMD_BROADCAST}); - - umd_cluster->set_device_l1_address_params({l1_mem::address_map::NCRISC_FIRMWARE_BASE, l1_mem::address_map::FIRMWARE_BASE, - l1_mem::address_map::TRISC0_SIZE, l1_mem::address_map::TRISC1_SIZE, l1_mem::address_map::TRISC2_SIZE, - l1_mem::address_map::TRISC_BASE, l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); - + umd_cluster->set_driver_host_address_params( + {host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, host_mem::address_map::ETH_ROUTING_BUFFERS_START}); + + umd_cluster->set_driver_eth_interface_params( + {NOC_ADDR_LOCAL_BITS, + NOC_ADDR_NODE_ID_BITS, + ETH_RACK_COORD_WIDTH, + CMD_BUF_SIZE_MASK, + MAX_BLOCK_SIZE, + REQUEST_CMD_QUEUE_BASE, + RESPONSE_CMD_QUEUE_BASE, + CMD_COUNTERS_SIZE_BYTES, + REMOTE_UPDATE_PTR_SIZE_BYTES, + CMD_DATA_BLOCK, + CMD_WR_REQ, 
+ CMD_WR_ACK, + CMD_RD_REQ, + CMD_RD_DATA, + CMD_BUF_SIZE, + CMD_DATA_BLOCK_DRAM, + ETH_ROUTING_DATA_BUFFER_ADDR, + REQUEST_ROUTING_CMD_QUEUE_BASE, + RESPONSE_ROUTING_CMD_QUEUE_BASE, + CMD_BUF_PTR_MASK, + CMD_ORDERED, + CMD_BROADCAST}); + + umd_cluster->set_device_l1_address_params( + {l1_mem::address_map::NCRISC_FIRMWARE_BASE, + l1_mem::address_map::FIRMWARE_BASE, + l1_mem::address_map::TRISC0_SIZE, + l1_mem::address_map::TRISC1_SIZE, + l1_mem::address_map::TRISC2_SIZE, + l1_mem::address_map::TRISC_BASE, + l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } } // This test should be one line only. -TEST(ApiClusterTest, OpenAllChips) { - std::unique_ptr umd_cluster = get_cluster(); -} +TEST(ApiClusterTest, OpenAllChips) { std::unique_ptr umd_cluster = get_cluster(); } TEST(ApiClusterTest, SimpleIOAllChips) { std::unique_ptr cluster_desc = get_cluster_desc(); @@ -166,7 +193,7 @@ TEST(ApiClusterTest, SimpleIOAllChips) { // TODO: figure out if core locations should contain chip_id tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global (chip_id, any_core); + tt_cxy_pair any_core_global(chip_id, any_core); if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; @@ -184,7 +211,7 @@ TEST(ApiClusterTest, SimpleIOAllChips) { // TODO: figure out if core locations should contain chip_id tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global (chip_id, any_core); + tt_cxy_pair any_core_global(chip_id, any_core); if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; @@ -201,7 +228,6 @@ TEST(ApiClusterTest, SimpleIOAllChips) { } TEST(ApiClusterTest, RemoteFlush) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); @@ -221,7 +247,7 @@ TEST(ApiClusterTest, RemoteFlush) { // TODO: figure out if core locations should contain chip_id tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global (chip_id, any_core); + tt_cxy_pair any_core_global(chip_id, any_core); if (!cluster_desc->is_chip_remote(chip_id)) { std::cout << "Chip " << chip_id << " skipped because it is not a remote chip." << std::endl; @@ -246,13 +272,13 @@ TEST(ApiClusterTest, RemoteFlush) { chip_id_t any_remote_chip = *umd_cluster->get_target_remote_device_ids().begin(); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_remote_chip); tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global (any_remote_chip, any_core); + tt_cxy_pair any_core_global(any_remote_chip, any_core); if (soc_desc.arch != tt::ARCH::WORMHOLE_B0) { std::cout << "Skipping whole cluster wait because it is not a wormhole_b0 chip." << std::endl; return; } std::cout << "Writing to chip " << any_remote_chip << " core " << any_core.str() << std::endl; - umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); std::cout << "Testing whole cluster wait for remote chip flush." << std::endl; umd_cluster->wait_for_non_mmio_flush(); diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index d5df1bc0..c7b313d9 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 #include @@ -46,7 +49,7 @@ inline std::unique_ptr get_cluster_desc() { return cluster_desc; } -TEST(ApiTest, DetectArch) { +TEST(ApiClusterDescriptorTest, DetectArch) { // TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal. tt::ARCH arch = detect_arch(); diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index 346c80b2..f6b2985e 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -91,10 +91,9 @@ TEST(SiliconDriverGS, HarvestingRuntime) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - device.setup_core_to_tlb_map(get_static_tlb_index); - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -154,9 +153,8 @@ TEST(SiliconDriverGS, StaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - - device.setup_core_to_tlb_map(get_static_tlb_index); tt_device_params default_params; device.start_device(default_params); @@ -324,10 +322,9 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - device.setup_core_to_tlb_map(get_static_tlb_index); - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 00eb9455..722be47b 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -230,10 +230,9 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } - } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); + } } - - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params); @@ -290,10 +289,10 @@ TEST(SiliconDriverWH, StaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params); @@ -436,9 +435,9 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params); @@ -681,9 +680,10 @@ TEST(SiliconDriverWH, SysmemTestWithPcie) { device.start_device(tt_device_params{}); // no special parameters // PCIe core is at (x=0, y=3) on Wormhole NOC0. + const chip_id_t mmio_chip_id = 0; const size_t PCIE_X = 0; // NOC0 const size_t PCIE_Y = 3; // NOC0 - const tt_cxy_pair PCIE_CORE(0, PCIE_X, PCIE_Y); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE_X, PCIE_Y); const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. // Bad API: how big is the buffer? How do we know it's big enough? @@ -695,7 +695,7 @@ TEST(SiliconDriverWH, SysmemTestWithPcie) { // This is the address inside the Wormhole PCIe block that is mapped to the // system bus. In Wormhole, this is a fixed address, 0x8'0000'0000. // The driver should have mapped this address to the bottom of sysmem. - uint64_t base_address = device.get_pcie_base_addr_from_device(); + uint64_t base_address = device.get_pcie_base_addr_from_device(mmio_chip_id); // Buffer that we will use to read sysmem into, then write sysmem from. std::vector buffer(test_size_bytes, 0x0);