Merge branch 'main' into mbahnas/vit_trace_2cq

tenstorrent · Sep 12, 2024 · 72171e7 · 72171e7
2 parents 181f15a + 6d3424c
commit 72171e7
Show file tree

Hide file tree

Showing 206 changed files with 12,259 additions and 10,294 deletions.
diff --git a/.github/actions/docker-run/action.yml b/.github/actions/docker-run/action.yml
@@ -60,7 +60,6 @@ runs:
           ${{ inputs.docker_opts }}
           -e LOGURU_LEVEL=${{ env.LOGURU_LEVEL }}
           -e PYTHONPATH=/usr/app
-          -e ARCH_NAME=${{ inputs.docker_image_arch }}
           ${{ inputs.device }}
         run: |
           cp -r /github_workspace/* /usr/app/

diff --git a/.github/actions/install-metal-deps/dependencies.json b/.github/actions/install-metal-deps/dependencies.json
@@ -4,15 +4,13 @@
     "build-essential=12.8ubuntu1.1",
     "python3.8-venv",
     "libhwloc-dev",
-    "graphviz",
-    "patchelf"
+    "graphviz"
   ],
   "ubuntu-22.04": [
     "software-properties-common",
     "build-essential",
     "python3.10-venv",
     "libhwloc-dev",
-    "graphviz",
-    "patchelf"
+    "graphviz"
   ]
 }
diff --git a/.github/workflows/_build-wheels-impl.yaml b/.github/workflows/_build-wheels-impl.yaml
@@ -18,9 +18,7 @@ jobs:
   build-wheel:
     runs-on: ${{ inputs.os }}
     env:
-      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
       ARCH_NAME: ${{ inputs.arch }}
-      TT_METAL_CREATE_STATIC_LIB: 1
     steps:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
         with:
@@ -43,9 +41,6 @@ jobs:
         run: |
           pip config set global.extra-index-url https://download.pytorch.org/whl/cpu
           pip install -r tt_metal/python_env/requirements-dev.txt
-      - name: Set up dynamic env vars for build
-        run: |
-          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
       - name: Use g++ as umd compiler for ubuntu 22.04
         if: ${{ inputs.os == 'ubuntu-22.04' }}
         run: |
@@ -54,11 +49,12 @@ jobs:
         if: ${{ inputs.from-precompiled }}
         with:
           arch: ${{ inputs.arch }}
+      - name: Set precompiled dir for precompile builds
+        if: ${{ inputs.from-precompiled }}
+        # TT_FROM_PRECOMPILED_DIR env variable allows us to not re-run the full C++ build and instead
+        # rely on the artifact that was already compiled. We point it to where the repo is.
+        run: echo "TT_FROM_PRECOMPILED_DIR=${{ github.workspace }}" >> $GITHUB_ENV
       - name: Build Python package distribution
-        env:
-          # TT_FROM_PRECOMPILED env variable allows us to not re-run the full C++ build and instead
-          # rely on the artifact that was already compiled.
-          TT_FROM_PRECOMPILED: ${{ inputs.from-precompiled && 'True' || 'False' }}
         run: python -m build
       - name: Upload distribution as artifact
         uses: actions/upload-artifact@v4

diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml
@@ -19,6 +19,7 @@ on:
       - "(T3K) T3000 model perf tests"
       - "(Single-card) Model perf tests"
       - "(Single-card) Device perf tests"
+      - "Nightly fast dispatch tests"
     types:
       - completed
 

diff --git a/.github/workflows/_build-and-test-wheels-impl.yaml b/.github/workflows/_test-wheels-impl.yaml
@@ -1,4 +1,4 @@
-name: "[internal] Python wheels build and test impl"
+name: "[internal] Python wheels test impl"
 
 on:
   workflow_call:
@@ -8,21 +8,20 @@ on:
         default: True
         type: boolean
 
+# Since pre-compiled assets are only built on ubuntu-20.04, we force tests
+# to run only on ubuntu-20.04 when from-precompiled is true.
+#
+# Otherwise (from-precompiled is false), we run across both 20.04 and 22.04,
+# as previous wheel builds should have produced assets for both.
+#
+# I chose the more heavy-handed approach because:
+# - This should all go away soon, once we have more OSes and more Docker up
+#   and running, so that we can matrix properly across more stuff.
+# - Though it provides less flexibility to caller workflows, we want to be
+#   pretty strict with the matrix, and it doesn't change often.
+
 jobs:
-  build-wheels:
-    strategy:
-      matrix:
-        # Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
-        # The full 22.04 flow can be tested without precompiled
-        os: ${{ fromJson(inputs.from-precompiled && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }}
-        arch: [grayskull, wormhole_b0]
-    uses: ./.github/workflows/_build-wheels-impl.yaml
-    with:
-      os: ${{ matrix.os }}
-      arch: ${{ matrix.arch }}
-      from-precompiled: ${{ inputs.from-precompiled }}
   test-wheels-host:
-    needs: build-wheels
     strategy:
       matrix:
         os: ${{ fromJson(inputs.from-precompiled && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }}
@@ -48,9 +47,9 @@ jobs:
           cd tests/end_to_end_tests
           pytest -c conftest.py . -m eager_host_side
   test-wheels-silicon:
-    needs: build-wheels
     strategy:
       matrix:
+        # We only have this for non-Docker silicon runners right now
         os: [ubuntu-20.04]
         runner-hw-info: [
           {arch: grayskull, type: E150},

diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml
@@ -17,9 +17,23 @@ jobs:
   static-checks:
     uses: ./.github/workflows/all-static-checks.yaml
     secrets: inherit
-  build-and-test-wheels:
+  build-wheels:
     needs: build-artifact
-    uses: ./.github/workflows/_build-and-test-wheels-impl.yaml
+    strategy:
+      matrix:
+        # Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
+        # The full 22.04 flow can be tested without precompiled
+        os: [ubuntu-20.04]
+        arch: [grayskull, wormhole_b0]
+    uses: ./.github/workflows/_build-wheels-impl.yaml
+    with:
+      os: ${{ matrix.os }}
+      arch: ${{ matrix.arch }}
+      from-precompiled: true
+    secrets: inherit
+  test-wheels:
+    needs: build-wheels
+    uses: ./.github/workflows/_test-wheels-impl.yaml
     with:
       from-precompiled: true
     secrets: inherit
@@ -64,7 +78,7 @@ jobs:
       runner-label: ${{ matrix.test-group.runner-label }}
   # Fast Dispatch Unit Tests
   fast-dispatch-unit-tests:
-    needs: build-and-test-wheels
+    needs: build-wheels
     secrets: inherit
     strategy:
       fail-fast: false
@@ -80,7 +94,7 @@ jobs:
       runner-label: ${{ matrix.test-group.runner-label }}
   # TTNN FD Unit tests
   ttnn-unit-tests:
-    needs: build-and-test-wheels
+    needs: build-wheels
     secrets: inherit
     strategy:
       fail-fast: false
@@ -96,7 +110,7 @@ jobs:
       runner-label: ${{ matrix.test-group.runner-label }}
   # FD Model Tests
   models-unit-tests:
-    needs: build-and-test-wheels
+    needs: build-wheels
     secrets: inherit
     strategy:
       fail-fast: false

diff --git a/.github/workflows/blackhole-post-commit.yaml b/.github/workflows/blackhole-post-commit.yaml
@@ -65,7 +65,7 @@ jobs:
 #     uses: ./.github/workflows/run-profiler-regression.yaml
 #     secrets: inherit
 #   build-and-test-wheels:
-#     uses: ./.github/workflows/_build-and-test-wheels-impl.yaml
+#     uses: Check all-post-commit yaml for directions
 #     secrets: inherit
 #   build-docs:
 #     needs: build-artifact

diff --git a/.github/workflows/build-and-test-wheels.yaml b/.github/workflows/build-and-test-wheels.yaml
@@ -15,9 +15,23 @@ jobs:
     if: ${{ github.event_name == 'workflow_dispatch' && inputs.from-precompiled }}
     uses: ./.github/workflows/build-artifact.yaml
     secrets: inherit
-  build-and-test-wheels:
+  build-wheels:
     needs: build-artifact
     if: ${{ always() }}
-    uses: ./.github/workflows/_build-and-test-wheels-impl.yaml
+    strategy:
+      matrix:
+        # Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
+        # The full 22.04 flow can be tested without precompiled
+        os: ${{ fromJson((github.event_name == 'schedule' || inputs.from-precompiled) && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }}
+        arch: [grayskull, wormhole_b0]
+    uses: ./.github/workflows/_build-wheels-impl.yaml
+    with:
+      os: ${{ matrix.os }}
+      arch: ${{ matrix.arch }}
+      from-precompiled: ${{ inputs.from-precompiled }}
+  test-wheels:
+    needs: build-wheels
+    if: ${{ always() }}
+    uses: ./.github/workflows/_test-wheels-impl.yaml
     with:
       from-precompiled: ${{ github.event_name == 'workflow_dispatch' && inputs.from-precompiled }}
diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
@@ -56,20 +56,15 @@ jobs:
         ]
     name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
     env:
-      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
-      ARCH_NAME: ${{ inputs.arch }}
       LOGURU_LEVEL: INFO
+      # may not need this
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
     runs-on:
       - ${{ inputs.runner-label }}
       - cloud-virtual-machine
       - in-service
     steps:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
-      - name: Set up dynamic env vars for build
-        run: |
-          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
-          echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
       - uses: actions/download-artifact@v4
         with:
           name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
@@ -149,3 +149,9 @@ jobs:
           cd $TT_METAL_HOME
           export PYTHONPATH=$TT_METAL_HOME
           ${{ matrix.test-group.cmd }}
+      - uses: ./.github/actions/upload-artifact-with-job-uuid
+        if: ${{ !cancelled() }}
+        with:
+          path: |
+            generated/test_reports/
+          prefix: "test_reports_"
diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml
@@ -44,13 +44,12 @@ jobs:
       matrix:
         os: ["ubuntu-20.04"]
         test-group: [
-          {name: model, cmd: ./tests/scripts/run_python_model_tests.sh},
+          {name: model},
         ]
     name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
     env:
-      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
-      ARCH_NAME: ${{ inputs.arch }}
       LOGURU_LEVEL: INFO
+      # may not need this
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
     runs-on:
       - ${{ inputs.runner-label }}
@@ -71,7 +70,7 @@ jobs:
           run_args: |
             WHEEL_FILENAME=$(ls -1 *.whl)
             pip3 install --user $WHEEL_FILENAME
-            ${{ matrix.test-group.cmd }}
+            source tests/scripts/run_python_model_tests.sh && run_python_model_tests_${{ inputs.arch }}
       - uses: ./.github/actions/slack-report
         if: ${{ failure() }}
         with:

diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         test-group: [
           { name: "t3k ttmetal tests", arch: wormhole_b0, cmd: run_t3000_ttmetal_tests, timeout: 30, owner_id: ULMEPM2MA}, #Sean Nijjar
-          { name: "t3k ttnn tests", arch: wormhole_b0, cmd: run_t3000_ttnn_tests, timeout: 120, owner_id: UAFM0F6FM}, #Akhmed Rakhmati
+          { name: "t3k ttnn tests", arch: wormhole_b0, cmd: run_t3000_ttnn_tests, timeout: 120, owner_id: UBHPP2NDP}, #Joseph Chu
           { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 30, owner_id: UBHPP2NDP}, #Joseph Chu
           { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 30, owner_id: U053W15B6JF}, #Djordje Ivanovic
           { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz

diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml
@@ -63,9 +63,8 @@ jobs:
             cmd: ./tests/scripts/run_ttnn_examples.sh
     name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
     env:
-      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
-      ARCH_NAME: ${{ inputs.arch }}
       LOGURU_LEVEL: INFO
+      # may not need this
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
     runs-on:
       - ${{ inputs.runner-label }}

diff --git a/.gitignore b/.gitignore
@@ -117,8 +117,12 @@ ttnn/tutorials/DiT/
 
 compile_commands.json
 
-# rpath_check
-ttnn/ttnn/.rpath_checked*
+# files to ensure that we can use the wheel with runtime artifacts
+ttnn/build
+ttnn/ttnn/build
+ttnn/runtime
+ttnn/ttnn/runtime
+.ttnn_runtime_artifacts/
 
 # exclude packages brought in from CPM
 .cpmcache

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -86,11 +86,7 @@ if(ENABLE_BUILD_TIME_TRACE)
 endif()
 
 # Default to building everything as a shared lib
-if($ENV{TT_METAL_CREATE_STATIC_LIB})
-    option(BUILD_SHARED_LIBS "Create shared library" OFF)
-else()
-    option(BUILD_SHARED_LIBS "Create shared library" ON)
-endif()
+option(BUILD_SHARED_LIBS "Create shared libraries" ON)
 message(STATUS "Build shared libs: ${BUILD_SHARED_LIBS}")
 
 option(ENABLE_ASAN "Enable build with AddressSanitizer" OFF)

diff --git a/CODEOWNERS b/CODEOWNERS
@@ -92,14 +92,14 @@ tests/scripts/run_performance.sh @tt-rkim
 
 # TTNN
 ttnn/ @eyonland @patrickroberts @yan-zaretskiy @cfjchu @xanderchin @TT-BrianLiu @ayerofieiev-tt @dmakoviichuk-tt
+ttnn/ttnn/library_tweaks.py @ayerofieiev-tt @dmakoviichuk-tt @tt-rkim
 ttnn/**/kernels/ # Removes the owners above from owning kernels unless specified afterwards
-ttnn/setup.py @ayerofieiev-tt @dmakoviichuk-tt @tt-rkim
 ttnn/**/CMakeLists.txt @ayerofieiev-tt @dmakoviichuk-tt @yan-zaretskiy
 ttnn/cpp/ttnn/tensor/ @patrickroberts @yan-zaretskiy @eyonland @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
 ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/ccl/ @SeanNijjar
 ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/program_cache.*pp @eyonland @cfjchu @xanderchin
 ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh*/ @razorback3 @dongjin-na @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
-ttnn/cpp/ttnn/deprecated/tt_lib/csrc/ @TT-BrianLiu @tt-aho @mywoodstock @eyonland @ayerofieiev-tt @razorback3 @dongjin-na
+ttnn/cpp/ttnn/deprecated/tt_lib/csrc/ @TT-BrianLiu @mywoodstock @eyonland @ayerofieiev-tt @razorback3 @dongjin-na
 ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp @eyonland @cfjchu @xanderchin
 ttnn/cpp/ttnn/deprecated/tt_lib/fallback_ops @tt-aho
 
@@ -117,6 +117,7 @@ ttnn/cpp/ttnn/operations/embedding/ @tarafdarTT @tt-aho @TT-BrianLiu
 ttnn/cpp/ttnn/operations/embedding_backward/ @TT-BrianLiu @yan-zaretskiy
 tests/ttnn/ @eyonland @patrickroberts @yan-zaretskiy @cfjchu @xanderchin @TT-BrianLiu @ayerofieiev-tt @dmakoviichuk-tt @razorback3 @dongjin-na
 tests/sweep_framework/ @xanderchin @jdesousa-TT @sjameelTT
+
 # models
 /models/ @tt-rkim @uaydonat
 /models/*/**

diff --git a/ErrorMessageBestPractices.md b/ErrorMessageBestPractices.md
@@ -1,5 +1,5 @@
 # Guidelines for Writing Effective Error Messages ✍️
-Clear and informative error messages are crucial for debugging and maintenance. A well-crafted error message can save hours of troubleshooting and make our codebase more user-friendly, especially for those less familiar with the system. 
+Clear and informative error messages are crucial for debugging and maintenance. A well-crafted error message can save hours of troubleshooting and make our codebase more user-friendly, especially for those less familiar with the system.
 
 A well-written error message provides the following information to the user:
 * What happened and why?
@@ -17,7 +17,7 @@ TT_FATAL(input_shape.rank() == 3, "Invalid input tensor dimensions.");
 ```
 Write:
 ```cpp
-TT_FATAL(input_shape.rank() == 3, fmt::format("Invalid input tensor: expected 3 dimensions, but found {}.", input_shape.rank()));
+TT_FATAL(input_shape.rank() == 3, "Invalid input tensor: expected 3 dimensions, but found {}.", input_shape.rank());
 ```
 ### 2. Explain the Issue
 Provide a brief explanation of why the error occurred or why the condition is important. This helps users understand the context of the error.
@@ -53,15 +53,15 @@ TT_FATAL(head_size % TILE_WIDTH != 0, "Head size is invalid.");
 ```
 Write:
 ```cpp
-TT_FATAL(head_size % TILE_WIDTH != 0, fmt::format("Invalid head size: {}. The head size must be a multiple of tile width ({}). Please adjust the dimensions accordingly.", head_size, TILE_WIDTH));
+TT_FATAL(head_size % TILE_WIDTH != 0, "Invalid head size: {}. The head size must be a multiple of tile width ({}). Please adjust the dimensions accordingly.", head_size, TILE_WIDTH);
 ```
 
 ## Good Example
 This message clearly states the problem, includes the actual value of head_size, and offers guidance on how to fix it.
 ```cpp
 TT_FATAL(head_size % TILE_WIDTH == 0,
-         fmt::format("Invalid head size: {}. The head size must be a multiple of the tile width ({}). Please adjust the dimensions accordingly.", 
-                     head_size, TILE_WIDTH));
+         "Invalid head size: {}. The head size must be a multiple of the tile width ({}). Please adjust the dimensions accordingly.",
+         head_size, TILE_WIDTH);
 ```
 
 ## Style recommendations