Skip to content

Commit

Permalink
Merge branch 'main' into llama3/sharded-residual
Browse files Browse the repository at this point in the history
  • Loading branch information
yieldthought authored Nov 17, 2024
2 parents 8ceb336 + 7a99cc8 commit 5151767
Show file tree
Hide file tree
Showing 497 changed files with 27,712 additions and 4,140 deletions.
3 changes: 0 additions & 3 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ Checks: >
-bugprone-unchecked-optional-access,
-bugprone-unhandled-self-assignment,
-bugprone-unused-raii,
-bugprone-use-after-move,
-cert-env33-c,
-cert-err09-cpp,
-cert-err33-c,
Expand Down Expand Up @@ -99,7 +98,6 @@ Checks: >
-hicpp-named-parameter,
-hicpp-no-array-decay,
-hicpp-no-malloc,
-hicpp-no-malloc,
-hicpp-noexcept-move,
-hicpp-signed-bitwise,
-hicpp-special-member-functions,
Expand Down Expand Up @@ -155,7 +153,6 @@ Checks: >
-performance-inefficient-string-concatenation,
-performance-inefficient-vector-operation,
-performance-move-const-arg,
-performance-move-const-arg,
-performance-move-constructor-init,
-performance-no-int-to-ptr,
-performance-noexcept-move-constructor,
Expand Down
20 changes: 20 additions & 0 deletions .github/workflows/all-post-commit-workflows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,26 @@ jobs:
with:
arch: ${{ matrix.test-group.arch }}
runner-label: ${{ matrix.test-group.runner-label }}
code-analysis:
needs: build-docker-image-2204
uses: ./.github/workflows/code-analysis.yaml
secrets: inherit
with:
os: ubuntu-22.04-amd64
tt-train-cpp-unit-tests:
needs: build-artifact
secrets: inherit
strategy:
fail-fast: false
matrix:
test-group: [
{ arch: wormhole_b0, runner-label: N150 },
{ arch: wormhole_b0, runner-label: N300 },
]
uses: ./.github/workflows/tt-train-post-commit.yaml
with:
arch: ${{ matrix.test-group.arch }}
runner-label: ${{ matrix.test-group.runner-label }}
profiler-regression:
needs: build-artifact-profiler
uses: ./.github/workflows/run-profiler-regression.yaml
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-and-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ jobs:
run: |
source ${{ github.workspace }}/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type post_commit --dispatch-mode slow
./tests/scripts/run_tunneler_tests.sh --machine-type ${{ inputs.runner-label }}
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/build-artifact.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ jobs:
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
-e ARCH_NAME=${{ matrix.arch }}
-e CARGO_HOME=${{ github.workspace }}/.cargo
-w ${{ github.workspace }}
run: |
set -eu # basic shell hygiene
Expand All @@ -136,7 +137,7 @@ jobs:
# NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache
ccache -z
build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-tests --build-programming-examples --enable-ccache"
build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-all --enable-ccache"
echo "${{ inputs.tracy }}"
if [ "${{ inputs.tracy }}" = "true" ]; then
build_command="$build_command --enable-profiler"
Expand All @@ -150,7 +151,7 @@ jobs:
cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
- name: 'Tar files'
run: tar -cvf ttm_${{ matrix.arch }}.tar build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools runtime
# Pack the build outputs into one tarball for upload. -h makes tar follow
# symlinks and store the files they point to. Each path is listed exactly once
# so the shared objects under ttnn/ttnn are not archived twice.
run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib build/programming_examples build/test build/tools build/tt-train build/data runtime
- name: 'Upload Artifact'
uses: actions/upload-artifact@v4
with:
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/build-wrapper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ on:

permissions:
actions: read
contents: read
contents: write
pages: write
id-token: write
packages: write
pull-requests: write

jobs:
static-checks:
Expand Down
45 changes: 43 additions & 2 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ jobs:
runs-on: ${{ matrix.build.runs-on }}
name: ${{ matrix.build.type }} ${{ matrix.build.cxx_compiler }} ${{ matrix.arch }} ${{ matrix.build.os }}
steps:
- name: Verify ccache availability
shell: bash
run: |
if [ ! -d "/mnt/MLPerf/ccache" ]; then
echo "::error title=ccache-mlperf-not-mounted::NFS drive is not mounted; build machine not properly provisioned."
exit 1
fi
if [ ! -d "$HOME/.ccache-ci" ]; then
echo "::error title=ccache-not-provisioned::Ccache is not properly provisioned."
exit 1
fi
# NOTE(review): the action reference on this line was mangled by e-mail
# obfuscation in the scraped page ("[email protected]"). It is presumably the
# checkout-with-submodule-lfs action pinned at v2.0.0 -- confirm against the repository.
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Set up dynamic env vars for build
run: |
Expand All @@ -41,11 +52,41 @@ jobs:
docker_image_arch: ${{ inputs.arch }}
docker_opts: |
-e ARCH_NAME=${{ matrix.arch }}
--group-add 1457
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-e CCACHE_DIR=/home/ubuntu/.ccache
-v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
docker_os_arch: ${{ matrix.build.os }}-amd64
run_args: |
nice -n 19 cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build.type }} -DCMAKE_CXX_COMPILER=${{ matrix.build.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.build.c_compiler }} -G Ninja -DTT_METAL_BUILD_TESTS=ON -DTTNN_BUILD_TESTS=ON -DTT_UMD_BUILD_TESTS=ON
nice -n 19 cmake --build build
set -eu # basic shell hygiene
set -x
# /tmp is a tmpfs; more efficient than persisted storage
mkdir -p /tmp/ccache
export CCACHE_TEMPDIR=/tmp/ccache
ccache --version
ccache --show-config
ccache --show-stats
# Zero out the stats so we can see how we did this build
# NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache
ccache -z
build_command="./build_metal.sh --build-type ${{ matrix.build.type }} --cxx-compiler-path ${{ matrix.build.cxx_compiler }} --c-compiler-path ${{ matrix.build.c_compiler }} --build-tests --build-programming-examples --disable-unity-builds --enable-ccache"
nice -n 19 $build_command
ccache --show-stats
mkdir out
ccache -s > out/ccache.stats
cat out/ccache.stats
- name: Publish Ccache summary
run: |
cat out/ccache.stats
echo '## CCache Summary' >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
cat out/ccache.stats >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
- name: Check disk space
run: |
df -h
Expand Down
105 changes: 105 additions & 0 deletions .github/workflows/code-analysis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Configures and builds the `clang-tidy` CMake preset inside the CI docker image.
# Can be called from another workflow (workflow_call) or started by hand
# (workflow_dispatch); both entry points accept the same optional `os` input.
name: "Code analysis"

on:
  workflow_call:
    inputs:
      os:
        required: false
        type: string
        default: "ubuntu-22.04-amd64"
  workflow_dispatch:
    inputs:
      os:
        required: false
        type: string
        default: "ubuntu-22.04-amd64"

jobs:
  # Delegates to the docker-artifact workflow for the requested OS.
  build-docker-image:
    uses: ./.github/workflows/build-docker-artifact.yaml
    secrets: inherit
    with:
      os: ${{ inputs.os }}

  clang-tidy:
    needs: build-docker-image
    env:
      ARCH_NAME: wormhole_b0
    runs-on:
      - build
      - in-service
    steps:
      # Fail fast with an annotated error when the runner lacks the ccache
      # directories that are bind-mounted into the container further down.
      - name: Verify ccache availability
        shell: bash
        run: |
          if [ ! -d "/mnt/MLPerf/ccache" ]; then
            echo "::error title=ccache-mlperf-not-mounted::NFS drive is not mounted; build machine not properly provisioned."
            exit 1
          fi
          if [ ! -d "$HOME/.ccache-ci" ]; then
            echo "::error title=ccache-not-provisioned::Ccache is not properly provisioned."
            exit 1
          fi
      # NOTE(review): the action reference below was mangled by e-mail
      # obfuscation in the scraped page ("[email protected]"). It is presumably
      # checkout-with-submodule-lfs pinned at v2.0.0 -- confirm against the repository.
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      # Export the checkout path and the runner's uid/gid; the uid/gid are
      # passed to `docker run -u` below.
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
          echo "RUNNER_UID=$(id -u)" >> "$GITHUB_ENV"
          echo "RUNNER_GID=$(id -g)" >> "$GITHUB_ENV"
      - name: Update submodules
        run: |
          git submodule update --init --recursive
      # NOTE(review): presumably this action is what sets
      # TT_METAL_DOCKER_IMAGE_TAG, which the next steps read -- confirm.
      - name: Generate docker tag
        id: generate-docker-tag
        uses: ./.github/actions/generate-docker-tag
        with:
          image: ${{ inputs.os }}
      - name: Docker login
        uses: docker/login-action@v3
        with:
          registry: https://ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Pull docker image
        run: docker pull ${{ env.TT_METAL_DOCKER_IMAGE_TAG }}
      - name: Analyze code with clang-tidy
        uses: addnab/docker-run-action@v3
        with:
          image: ${{ env.TT_METAL_DOCKER_IMAGE_TAG }}
          options: |
            --rm
            --tmpfs /tmp
            -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }}
            --group-add 1457
            -v ${{ github.workspace }}:${{ github.workspace }}
            -v /etc/passwd:/etc/passwd:ro
            -v /etc/shadow:/etc/shadow:ro
            -v /etc/bashrc:/etc/bashrc:ro
            -v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
            -v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
            -e ARCH_NAME=${{ env.ARCH_NAME }}
            -e CARGO_HOME=${{ github.workspace }}/.cargo
            -w ${{ github.workspace }}
          run: |
            set -eu # basic shell hygiene
            # /tmp is a tmpfs; more efficient than persisted storage
            mkdir -p /tmp/ccache
            export CCACHE_TEMPDIR=/tmp/ccache
            # Zero out the stats so we can see how we did this build
            # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache
            ccache -z
            cmake --preset clang-tidy
            # cmake -B .build/clang-tidy -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_CLANG_TIDY=clang-tidy-17 -DTT_UNITY_BUILDS=FALSE -DCMAKE_DISABLE_PRECOMPILE_HEADERS=TRUE -DENABLE_CCACHE=TRUE -DTT_METAL_BUILD_TESTS=TRUE -DTTNN_BUILD_TESTS=TRUE -DBUILD_PROGRAMMING_EXAMPLES=TRUE -DBUILD_TT_TRAIN=TRUE
            nice -n 19 cmake --build --preset clang-tidy
            # -p: do not abort (set -e) when `out` is left over in a reused workspace
            mkdir -p out
            ccache -s > out/ccache.stats
      # Render the stats file written inside the container (the workspace is
      # bind-mounted) as a fenced block in the job summary.
      - name: Publish Ccache summary
        run: |
          echo '## CCache Summary' >> "$GITHUB_STEP_SUMMARY"
          echo '```' >> "$GITHUB_STEP_SUMMARY"
          cat out/ccache.stats >> "$GITHUB_STEP_SUMMARY"
          echo '```' >> "$GITHUB_STEP_SUMMARY"
33 changes: 25 additions & 8 deletions .github/workflows/t3000-model-perf-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
{ name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
{ name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
{ name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests && run_t3000_ccl_reduce_scatter_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
Expand Down Expand Up @@ -80,12 +80,21 @@ jobs:
run: |
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
else
found_reports=false
if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ "$found_reports" = false ]; then
echo "No CCL perf report found for today."
exit 1
fi
Expand All @@ -98,12 +107,20 @@ jobs:
exit 1
fi
fi
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
- name: Upload Models perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Upload CCL perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path: |
${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
${{ steps.check-perf-report.outputs.perf_report_filename_reduce_scatter }}
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
Expand Down
27 changes: 27 additions & 0 deletions .github/workflows/tt-train-post-commit-wrapper.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Wrapper workflow: static checks, a build artifact, then the tt-train C++
# unit tests fanned out over the runner types listed in the matrix.
name: "[post-commit] tt-train C++ tests"

on:
  workflow_call:
  workflow_dispatch:

jobs:
  static-checks:
    uses: ./.github/workflows/all-static-checks.yaml
    secrets: inherit

  build-artifact:
    uses: ./.github/workflows/build-artifact.yaml
    secrets: inherit

  # One invocation of the reusable test workflow per matrix entry; a failure
  # on one runner type does not cancel the others (fail-fast: false).
  tt-train-cpp-unit-tests:
    needs: build-artifact
    uses: ./.github/workflows/tt-train-post-commit.yaml
    secrets: inherit
    strategy:
      fail-fast: false
      matrix:
        test-group:
          - arch: wormhole_b0
            runner-label: N150
          - arch: wormhole_b0
            runner-label: N300
    with:
      arch: ${{ matrix.test-group.arch }}
      runner-label: ${{ matrix.test-group.runner-label }}
Loading

0 comments on commit 5151767

Please sign in to comment.