#0: Merge branch 'main' of https://github.com/tenstorrent/tt-metal into qwen-decode
tenstorrent · Nov 14, 2024 · 04f449d · 04f449d
2 parents 1377f6a + ce6ff4c
commit 04f449d
Show file tree

Hide file tree

Showing 431 changed files with 22,522 additions and 2,292 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+*tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tt-train/sources/examples/nano_gpt/data/shakespeare.txt filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml
@@ -168,6 +168,20 @@ jobs:
     with:
       arch: ${{ matrix.test-group.arch }}
       runner-label: ${{ matrix.test-group.runner-label }}
+  # Runs the tt-train C++ test workflow once per runner label listed in the matrix.
+  tt-train-cpp-unit-tests:
+    needs: build-artifact
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        test-group:
+          - { arch: wormhole_b0, runner-label: N150 }
+          - { arch: wormhole_b0, runner-label: N300 }
+    uses: ./.github/workflows/tt-train-post-commit.yaml
+    with:
+      arch: ${{ matrix.test-group.arch }}
+      runner-label: ${{ matrix.test-group.runner-label }}
   profiler-regression:
     needs: build-artifact-profiler
     uses: ./.github/workflows/run-profiler-regression.yaml

diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
@@ -124,6 +124,7 @@ jobs:
             -v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
             -v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
             -e ARCH_NAME=${{ matrix.arch }}
+            -e CARGO_HOME=${{ github.workspace }}/.cargo
             -w ${{ github.workspace }}
           run: |
             set -eu # basic shell hygiene
@@ -136,7 +137,7 @@ jobs:
             # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache
             ccache -z
 
-            build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-tests --build-programming-examples --enable-ccache"
+            build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-all --enable-ccache"
             echo "${{ inputs.tracy }}"
             if [ "${{ inputs.tracy }}" = "true" ]; then
               build_command="$build_command --enable-profiler"
@@ -150,7 +151,7 @@ jobs:
           cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
           echo '```' >> $GITHUB_STEP_SUMMARY
       - name: 'Tar files'
-        run: tar -cvf ttm_${{ matrix.arch }}.tar build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools runtime
+        run: tar -cvhf ttm_${{ matrix.arch }}.tar build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train runtime  # -h archives symlink targets instead of the links
       - name: 'Upload Artifact'
         uses: actions/upload-artifact@v4
         with:

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -43,9 +43,8 @@ jobs:
             -e ARCH_NAME=${{ matrix.arch }}
           docker_os_arch: ${{ matrix.build.os }}-amd64
           run_args: |
-            nice -n 19 cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build.type }} -DCMAKE_CXX_COMPILER=${{ matrix.build.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.build.c_compiler }} -G Ninja -DTT_METAL_BUILD_TESTS=ON -DTTNN_BUILD_TESTS=ON -DTT_UMD_BUILD_TESTS=ON
-            nice -n 19 cmake --build build
-
+            build_command="./build_metal.sh --build-type ${{ matrix.build.type }} --cxx-compiler-path ${{ matrix.build.cxx_compiler }} --c-compiler-path ${{ matrix.build.c_compiler }} --build-tests --build-programming-examples --disable-unity-builds"
+            nice -n 19 $build_command
       - name: Check disk space
         run: |
           df -h

diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml
@@ -22,7 +22,7 @@ jobs:
           { name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
           { name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
           { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich
-          { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich
+          # { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich  # FIXME issue #14934
           { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
           { name: "t3k resnet tests", arch: wormhole_b0, cmd: run_t3000_resnet_tests, timeout: 30, owner_id: U013121KDH9}, #Austin Ho
         ]

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
+          { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests && run_t3000_ccl_reduce_scatter_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
@@ -45,13 +46,25 @@ jobs:
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        id: download-profiler-artifact
+        if: ${{ matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+        continue-on-error: true
+      - name: Download build artifact
+        id: download-artifact
+        if: ${{ !matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}
       - name: Extract files
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run model perf regression tests
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
         shell: bash {0}
         timeout-minutes: ${{ matrix.test-group.timeout }}
         run: |
@@ -63,18 +76,51 @@ jobs:
           env python models/perf/merge_perf_results.py
       - name: Check perf report exists
         id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
         run: |
-          ls -hal
-          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
-      - name: Upload perf report
-        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
+          if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+            found_reports=false
+            if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
+              echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
+              echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
+              found_reports=true
+            fi
+            if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
+              echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
+              echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
+              found_reports=true
+            fi
+            if [ "$found_reports" = false ]; then
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
+          else
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
+          fi
+      - name: Upload Models perf report
+        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy}}
         uses: actions/upload-artifact@v4
         with:
           name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
           path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
+      - name: Upload CCL perf report
+        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy}}
+        uses: actions/upload-artifact@v4
+        with:
+          name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
+          path: |
+            ${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
+            ${{ steps.check-perf-report.outputs.perf_report_filename_reduce_scatter }}
       - uses: ./.github/actions/slack-report
         if: ${{ failure() }}
         with:

diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
     with:
       arch: '["wormhole_b0"]'
     secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
   t3000-model-perf-tests:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
     secrets: inherit
     uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
diff --git a/.github/workflows/tt-train-post-commit-wrapper.yaml b/.github/workflows/tt-train-post-commit-wrapper.yaml
@@ -0,0 +1,29 @@
+# Entry-point workflow: calls the static-check and build-artifact workflows, then
+# runs tt-train-post-commit.yaml once for every entry of the test-group matrix.
+name: "[post-commit] tt-train C++ tests"
+
+on:
+  workflow_call:
+  workflow_dispatch:
+
+jobs:
+  static-checks:
+    secrets: inherit
+    uses: ./.github/workflows/all-static-checks.yaml
+  build-artifact:
+    secrets: inherit
+    uses: ./.github/workflows/build-artifact.yaml
+  # Depends on build-artifact only; static-checks is not listed in `needs`.
+  tt-train-cpp-unit-tests:
+    needs: build-artifact
+    strategy:
+      fail-fast: false
+      matrix:
+        test-group:
+          - { arch: wormhole_b0, runner-label: N150 }
+          - { arch: wormhole_b0, runner-label: N300 }
+    uses: ./.github/workflows/tt-train-post-commit.yaml
+    with:
+      arch: ${{ matrix.test-group.arch }}
+      runner-label: ${{ matrix.test-group.runner-label }}
+    secrets: inherit
diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml
@@ -0,0 +1,86 @@
+# Reusable (workflow_call) and manually dispatchable workflow that runs the
+# tt-train C++ tests with ctest from build/tt-train on the requested runner.
+name: "[internal] tt-train C++ tests impl"
+
+on:
+  workflow_call:
+    inputs:
+      arch:
+        required: true
+        type: string
+      runner-label:
+        required: true
+        type: string
+      # Minutes allowed for the test step (used as its timeout-minutes).
+      timeout:
+        required: false
+        type: number
+        default: 20
+  workflow_dispatch:
+    inputs:
+      arch:
+        required: true
+        type: choice
+        options:
+          - wormhole_b0
+      runner-label:
+        required: true
+        type: choice
+        options:
+          - N150
+          - N300
+      timeout:
+        required: false
+        type: number
+        default: 20
+
+jobs:
+  models:
+    strategy:
+      # Do not fail-fast because we need to ensure all tests go to completion
+      # so we try not to get hanging machines
+      fail-fast: false
+      matrix:
+        test-group: [
+          {name: tt-train, cmd: ctest --no-tests=error --output-on-failure},
+        ]
+    name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
+    env:
+      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
+      ARCH_NAME: ${{ inputs.arch }}
+      LOGURU_LEVEL: INFO
+      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
+      TEST_DATA_DIR: ${{ github.workspace }}/tt-train/tests/test_data
+    runs-on:
+      - ${{ inputs.runner-label }}
+      - cloud-virtual-machine
+      - in-service
+    steps:
+      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
+      - name: Set up dynamic env vars for build
+        run: |
+          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
+      - uses: ./.github/actions/prepare-metal-run
+        with:
+          arch: ${{ inputs.arch }}
+      - name: ${{ matrix.test-group.name }} tests
+        timeout-minutes: ${{ inputs.timeout }}
+        run: |
+          source ${{ github.workspace }}/python_env/bin/activate
+          export PYTHONPATH=$TT_METAL_HOME
+          cd $TT_METAL_HOME
+          cp ./build/tt-train/3rd_party/wandb-cpp/libwandbcpp.so build/lib/
+          # Repoint build_Release paths recorded in *.tcl / *.cmake files at this checkout's build dir.
+          # The -name tests are grouped so -type f and -exec apply to both patterns, not just *.cmake.
+          find ./build -type f \( -name "*.tcl" -o -name "*.cmake" \) -exec sed -i "s|/home/ubuntu/[^/]*/_work/tt-metal/tt-metal/build_Release|${TT_METAL_HOME}/build|g" {} +
+          cd $TT_METAL_HOME/build/tt-train
+          ldd tests/ttml_tests || true
+          ${{ matrix.test-group.cmd }}
+      - uses: ./.github/actions/slack-report
+        if: ${{ failure() }}
+        with:
+          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
+          owner: U07ASPTGJTS # Denys
+      - name: Generate system logs on failure
+        uses: ./.github/actions/generate-system-logs
+        if: ${{ failure() }}
diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
@@ -312,12 +312,16 @@ on:
           - conv2d.full.conv2d_sharding
           - conv2d.full.conv2d_sliding_window
           - conv2d.short.conv2d_short_sweep
+          - pooling.global_avg_pool2d
+          - pooling.max_pool2d
           - max_pool2d.short.max_pool2d_short_sweep
           - max_pool2d.full.max_pool2d_params
           - max_pool2d.full.max_pool2d_large_dims
           - transformer.concatenate_heads.concatenate_heads
           - transformer.split_query_key_value_and_split_heads.split_query_key_value_and_split_heads
           - transformer.split_query_key_value_and_split_heads.split_query_key_value_and_split_heads_kv_input
+          - transformer.attention_softmax.attention_softmax
+          - transformer.attention_softmax.attention_softmax_
           - data_movement.stack.stack_pytorch2
           - data_movement.repeat.repeat_pytorch2
           - data_movement.split.split_pytorch2

diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ coremodel/model/release/
 
 pipegen.yaml
 device_desc.yaml
+cluster_descriptor.yaml
 .umd/
 /clean
 *coverage.txt

diff --git a/.gitmodules b/.gitmodules
@@ -28,3 +28,9 @@
 [submodule "tt_metal/third_party/tt_llk_blackhole"]
 	path = tt_metal/third_party/tt_llk_blackhole
 	url = https://github.com/tenstorrent/tt-llk-bh.git
+[submodule "tokenizers-cpp"]
+	path = tt-train/3rd_party/tokenizers-cpp
+	url = https://github.com/mlc-ai/tokenizers-cpp.git
+[submodule "3rd_party/wandb-cpp"]
+	path = tt-train/3rd_party/wandb-cpp
+	url = https://github.com/yhisaki/wandb-cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -203,6 +203,9 @@ target_link_libraries(
         numa
 )
 
+if(NOT DEFINED ENV{ARCH_NAME}) # Stop configuring early: ARCH_NAME is read right after this guard.
+    message(FATAL_ERROR "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole")
+endif()
 string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
 add_compile_definitions(ARCH_${ARCH_NAME_DEF})
 add_compile_options(
@@ -331,3 +334,7 @@ add_custom_target(
 )
 
 include(packaging)
+
+if(BUILD_TT_TRAIN)
+    add_subdirectory(tt-train)
+endif()
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -23,16 +23,15 @@ MANIFEST.in @tt-rkim
 setup.py @tt-rkim
 pyproject.toml @tt-rkim @TT-billteng
 requirements*.txt @tt-rkim @TT-billteng @ttmchiou
-setup_hugepages.py @tt-rkim @TT-billteng
+setup_hugepages.py @tt-rkim
 
-scripts/docker @TT-billteng
-scripts/build_scripts/ @tt-rkim @vtangTT @TT-billteng
-cmake/ @tt-rkim @vtangTT @TT-billteng @afuller-TT
-build_metal.sh @tt-rkim @vtangTT @TT-billteng
+scripts/build_scripts/ @tt-rkim @vtangTT
+cmake/ @tt-rkim @vtangTT @afuller-TT
+build_metal.sh @tt-rkim @vtangTT
 
 Makefile @tt-rkim
-/CMakeLists.txt @tt-rkim @vtangTT @TT-billteng @blozano-tt @afuller-TT
-tests/CMakeLists.txt @tt-rkim @vtangTT @TT-billteng @blozano-tt @afuller-TT
+/CMakeLists.txt @tt-rkim @vtangTT @blozano-tt @afuller-TT
+tests/CMakeLists.txt @tt-rkim @vtangTT @blozano-tt @afuller-TT
 
 # Testing scripts and infra
 
@@ -176,9 +175,13 @@ tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py @esmalTT
 tests/ttnn/integration_tests/unet @esmalTT @uaydonat @mywoodstock
 tests/nightly/wh_b0_only_eth/experimental/functional_unet @esmalTT @uaydonat @mywoodstock
 scripts/profiler/ @mo-tenstorrent
-scripts/docker @ttmchiou @TT-billteng @tt-rkim
+scripts/docker @ttmchiou @tt-rkim
 
-dockerfile @ttmchiou @TT-billteng @tt-rkim
+dockerfile @ttmchiou @tt-rkim
 
 tt_metal/CMakeLists.txt @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @blozano-tt
 ttnn/CMakeLists.txt @ayerofieiev-tt @dmakoviichuk-tt @yan-zaretskiy
+
+
+# tt-train
+tt-train/** @dmakoviichuk-tt @rfurko-tt
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		*tokenizer.json filter=lfs diff=lfs merge=lfs -text
		tt-train/sources/examples/nano_gpt/data/shakespeare.txt filter=lfs diff=lfs merge=lfs -text