Skip to content

Commit

Permalink
#0: Merge branch 'main' of https://github.com/tenstorrent/tt-metal in…
Browse files Browse the repository at this point in the history
…to qwen-decode
  • Loading branch information
sraizada-tt committed Nov 14, 2024
2 parents 1377f6a + ce6ff4c commit 04f449d
Show file tree
Hide file tree
Showing 431 changed files with 22,522 additions and 2,292 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Route large data files through Git LFS: the filter, diff and merge drivers are
# all set to "lfs", and "-text" unsets the text attribute for these paths.
*tokenizer.json filter=lfs diff=lfs merge=lfs -text
tt-train/sources/examples/nano_gpt/data/shakespeare.txt filter=lfs diff=lfs merge=lfs -text
14 changes: 14 additions & 0 deletions .github/workflows/all-post-commit-workflows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,20 @@ jobs:
with:
arch: ${{ matrix.test-group.arch }}
runner-label: ${{ matrix.test-group.runner-label }}
tt-train-cpp-unit-tests:
needs: build-artifact
secrets: inherit
strategy:
fail-fast: false
matrix:
test-group: [
{ arch: wormhole_b0, runner-label: N150 },
{ arch: wormhole_b0, runner-label: N300 },
]
uses: ./.github/workflows/tt-train-post-commit.yaml
with:
arch: ${{ matrix.test-group.arch }}
runner-label: ${{ matrix.test-group.runner-label }}
profiler-regression:
needs: build-artifact-profiler
uses: ./.github/workflows/run-profiler-regression.yaml
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/build-artifact.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ jobs:
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
-e ARCH_NAME=${{ matrix.arch }}
-e CARGO_HOME=${{ github.workspace }}/.cargo
-w ${{ github.workspace }}
run: |
set -eu # basic shell hygiene
Expand All @@ -136,7 +137,7 @@ jobs:
# NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache
ccache -z
build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-tests --build-programming-examples --enable-ccache"
build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --build-all --enable-ccache"
echo "${{ inputs.tracy }}"
if [ "${{ inputs.tracy }}" = "true" ]; then
build_command="$build_command --enable-profiler"
Expand All @@ -150,7 +151,7 @@ jobs:
cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
- name: 'Tar files'
run: tar -cvf ttm_${{ matrix.arch }}.tar build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools runtime
# Every path is listed exactly once so no member is archived twice; -h stores
# the files that symlinks point to instead of the links themselves.
run: tar -cvhf ttm_${{ matrix.arch }}.tar build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train runtime
- name: 'Upload Artifact'
uses: actions/upload-artifact@v4
with:
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,8 @@ jobs:
-e ARCH_NAME=${{ matrix.arch }}
docker_os_arch: ${{ matrix.build.os }}-amd64
run_args: |
nice -n 19 cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build.type }} -DCMAKE_CXX_COMPILER=${{ matrix.build.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.build.c_compiler }} -G Ninja -DTT_METAL_BUILD_TESTS=ON -DTTNN_BUILD_TESTS=ON -DTT_UMD_BUILD_TESTS=ON
nice -n 19 cmake --build build
build_command="./build_metal.sh --build-type ${{ matrix.build.type }} --cxx-compiler-path ${{ matrix.build.cxx_compiler }} --c-compiler-path ${{ matrix.build.c_compiler }} --build-tests --build-programming-examples --disable-unity-builds"
nice -n 19 $build_command
- name: Check disk space
run: |
df -h
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/t3000-frequent-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
{ name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
{ name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich
# { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # FIXME issue #14934
{ name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
{ name: "t3k resnet tests", arch: wormhole_b0, cmd: run_t3000_resnet_tests, timeout: 30, owner_id: U013121KDH9}, #Austin Ho
]
Expand Down
62 changes: 54 additions & 8 deletions .github/workflows/t3000-model-perf-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:
{ name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
{ name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests && run_t3000_ccl_reduce_scatter_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
Expand All @@ -45,13 +46,25 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
- name: Download profiler build artifact
id: download-profiler-artifact
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
Expand All @@ -63,18 +76,51 @@ jobs:
env python models/perf/merge_perf_results.py
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
run: |
ls -hal
export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
found_reports=false
if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ "$found_reports" = false ]; then
echo "No CCL perf report found for today."
exit 1
fi
else
if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
else
echo "No Models perf report found for today."
exit 1
fi
fi
- name: Upload Models perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Upload CCL perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path: |
${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
${{ steps.check-perf-report.outputs.perf_report_filename_reduce_scatter }}
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
Expand Down
8 changes: 7 additions & 1 deletion .github/workflows/t3000-model-perf-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-model-perf-tests:
needs: build-artifact
needs: [build-artifact, build-artifact-profiler]
secrets: inherit
uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
27 changes: 27 additions & 0 deletions .github/workflows/tt-train-post-commit-wrapper.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Entry-point workflow for the tt-train C++ tests: runs the static checks,
# builds the artifact, then fans the test workflow out over the runner types.
name: "[post-commit] tt-train C++ tests"

on:
  workflow_call:
  workflow_dispatch:

jobs:
  static-checks:
    secrets: inherit
    uses: ./.github/workflows/all-static-checks.yaml

  build-artifact:
    secrets: inherit
    uses: ./.github/workflows/build-artifact.yaml

  tt-train-cpp-unit-tests:
    # Only the build is a prerequisite; static-checks runs independently.
    needs: build-artifact
    uses: ./.github/workflows/tt-train-post-commit.yaml
    secrets: inherit
    strategy:
      # Let every runner type finish even if another one fails.
      fail-fast: false
      matrix:
        test-group:
          - arch: wormhole_b0
            runner-label: N150
          - arch: wormhole_b0
            runner-label: N300
    with:
      arch: ${{ matrix.test-group.arch }}
      runner-label: ${{ matrix.test-group.runner-label }}
81 changes: 81 additions & 0 deletions .github/workflows/tt-train-post-commit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Implementation workflow for the tt-train C++ tests. It can be called from
# other workflows or dispatched manually, and runs ctest inside build/tt-train
# on the runner selected by `runner-label`.
name: "[internal] tt-train C++ tests impl"

on:
  workflow_call:
    inputs:
      # Architecture name exported to the job as ARCH_NAME.
      arch:
        required: true
        type: string
      # First label used to select the runner (combined with the fixed labels below).
      runner-label:
        required: true
        type: string
      # Time limit, in minutes, for the test step.
      timeout:
        required: false
        type: number
        default: 20
  workflow_dispatch:
    inputs:
      arch:
        required: true
        type: choice
        options:
          - wormhole_b0
      runner-label:
        required: true
        type: choice
        options:
          - N150
          - N300
      timeout:
        required: false
        type: number
        default: 20

jobs:
  models:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        test-group: [
          {name: tt-train, cmd: ctest --no-tests=error --output-on-failure},
        ]
    name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ inputs.arch }}
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
      TEST_DATA_DIR: ${{ github.workspace }}/tt-train/tests/test_data
    runs-on:
      - ${{ inputs.runner-label }}
      - cloud-virtual-machine
      - in-service
    steps:
      # NOTE(review): action name/version restored from a garbled page scrape;
      # confirm it matches the reference used by the other workflows.
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
      - uses: ./.github/actions/prepare-metal-run
        with:
          arch: ${{ inputs.arch }}
      - name: ${{ matrix.test-group.name }} tests
        timeout-minutes: ${{ inputs.timeout }}
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          export PYTHONPATH=$TT_METAL_HOME
          cd $TT_METAL_HOME
          # Put the wandb-cpp library in build/lib, which is on LD_LIBRARY_PATH.
          cp ./build/tt-train/3rd_party/wandb-cpp/libwandbcpp.so build/lib/
          # Point absolute build-machine paths at this checkout's build directory.
          # The name tests are grouped so that -type f and -exec apply to both
          # *.tcl and *.cmake; without the parentheses -exec only ran for *.cmake.
          find ./build -type f \( -name "*.tcl" -o -name "*.cmake" \) -exec sed -i "s|/home/ubuntu/[^/]*/_work/tt-metal/tt-metal/build_Release|${TT_METAL_HOME}/build|g" {} +
          cd $TT_METAL_HOME/build/tt-train
          # Diagnostic only: print the test binary's shared-library resolution.
          ldd tests/ttml_tests || true
          ${{ matrix.test-group.cmd }}
      - uses: ./.github/actions/slack-report
        if: ${{ failure() }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
          owner: U07ASPTGJTS # Denys
      - name: Generate system logs on failure
        uses: ./.github/actions/generate-system-logs
        if: ${{ failure() }}
4 changes: 4 additions & 0 deletions .github/workflows/ttnn-run-sweeps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -312,12 +312,16 @@ on:
- conv2d.full.conv2d_sharding
- conv2d.full.conv2d_sliding_window
- conv2d.short.conv2d_short_sweep
- pooling.global_avg_pool2d
- pooling.max_pool2d
- max_pool2d.short.max_pool2d_short_sweep
- max_pool2d.full.max_pool2d_params
- max_pool2d.full.max_pool2d_large_dims
- transformer.concatenate_heads.concatenate_heads
- transformer.split_query_key_value_and_split_heads.split_query_key_value_and_split_heads
- transformer.split_query_key_value_and_split_heads.split_query_key_value_and_split_heads_kv_input
- transformer.attention_softmax.attention_softmax
- transformer.attention_softmax.attention_softmax_
- data_movement.stack.stack_pytorch2
- data_movement.repeat.repeat_pytorch2
- data_movement.split.split_pytorch2
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ coremodel/model/release/

pipegen.yaml
device_desc.yaml
cluster_descriptor.yaml
.umd/
/clean
*coverage.txt
Expand Down
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,9 @@
[submodule "tt_metal/third_party/tt_llk_blackhole"]
path = tt_metal/third_party/tt_llk_blackhole
url = https://github.com/tenstorrent/tt-llk-bh.git
[submodule "tokenizers-cpp"]
path = tt-train/3rd_party/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp.git
[submodule "3rd_party/wandb-cpp"]
path = tt-train/3rd_party/wandb-cpp
url = https://github.com/yhisaki/wandb-cpp
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ target_link_libraries(
numa
)

# Stop configuration early when the ARCH_NAME environment variable is missing,
# since it is used right below to derive the ARCH_<NAME> compile definition.
if(NOT DEFINED ENV{ARCH_NAME})
    message(
        FATAL_ERROR
        "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole"
    )
endif()
string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
add_compile_definitions(ARCH_${ARCH_NAME_DEF})
add_compile_options(
Expand Down Expand Up @@ -331,3 +334,7 @@ add_custom_target(
)

include(packaging)

if(BUILD_TT_TRAIN)
add_subdirectory(tt-train)
endif()
21 changes: 12 additions & 9 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,15 @@ MANIFEST.in @tt-rkim
setup.py @tt-rkim
pyproject.toml @tt-rkim @TT-billteng
requirements*.txt @tt-rkim @TT-billteng @ttmchiou
setup_hugepages.py @tt-rkim @TT-billteng
setup_hugepages.py @tt-rkim

scripts/docker @TT-billteng
scripts/build_scripts/ @tt-rkim @vtangTT @TT-billteng
cmake/ @tt-rkim @vtangTT @TT-billteng @afuller-TT
build_metal.sh @tt-rkim @vtangTT @TT-billteng
scripts/build_scripts/ @tt-rkim @vtangTT
cmake/ @tt-rkim @vtangTT @afuller-TT
build_metal.sh @tt-rkim @vtangTT

Makefile @tt-rkim
/CMakeLists.txt @tt-rkim @vtangTT @TT-billteng @blozano-tt @afuller-TT
tests/CMakeLists.txt @tt-rkim @vtangTT @TT-billteng @blozano-tt @afuller-TT
/CMakeLists.txt @tt-rkim @vtangTT @blozano-tt @afuller-TT
tests/CMakeLists.txt @tt-rkim @vtangTT @blozano-tt @afuller-TT

# Testing scripts and infra

Expand Down Expand Up @@ -176,9 +175,13 @@ tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py @esmalTT
tests/ttnn/integration_tests/unet @esmalTT @uaydonat @mywoodstock
tests/nightly/wh_b0_only_eth/experimental/functional_unet @esmalTT @uaydonat @mywoodstock
scripts/profiler/ @mo-tenstorrent
scripts/docker @ttmchiou @TT-billteng @tt-rkim
scripts/docker @ttmchiou @tt-rkim

dockerfile @ttmchiou @TT-billteng @tt-rkim
dockerfile @ttmchiou @tt-rkim

tt_metal/CMakeLists.txt @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @blozano-tt
ttnn/CMakeLists.txt @ayerofieiev-tt @dmakoviichuk-tt @yan-zaretskiy


# tt-train
tt-train/** @dmakoviichuk-tt @rfurko-tt
Loading

0 comments on commit 04f449d

Please sign in to comment.