Skip to content

Commit

Permalink
#0: Refactored TG frequent pipeline. Fixes to demo pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
mtairum committed Dec 11, 2024
1 parent 26a7580 commit 0069a14
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tg-demo-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ jobs:
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
source ${{ github.workspace }}/tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}'
./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}
19 changes: 11 additions & 8 deletions .github/workflows/tg-frequent-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,23 @@ jobs:
fail-fast: false
matrix:
test-group: [
{
name: "TG frequent tests",
arch: wormhole_b0,
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-functional"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tg_device --dispatch-mode ""'
},
{ name: "TG Llama3 frequent tests", arch: wormhole_b0, model: llama3, timeout: 90, owner_id: U06F3ER8X9A}, # Stuti Raizada
{ name: "TG Llama3-70B (old) frequent tests", arch: wormhole_b0, model: llama3-70b-old, timeout: 90, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "TG resnet50 frequent tests", arch: wormhole_b0, model: resnet50, timeout: 90},
{ name: "TG unit/distributed frequent tests", arch: wormhole_b0, model: unit, timeout: 90},
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
runs-on: ${{ matrix.test-group.runs-on }}
runs-on:
- arch-wormhole_b0
- config-tg
- in-service
- bare-metal
- pipeline-functional
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- name: Set up dynamic env vars for build
Expand All @@ -40,4 +43,4 @@ jobs:
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}
79 changes: 54 additions & 25 deletions tests/scripts/tg/run_tg_frequent_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@ run_tg_llama3_tests() {

echo "LOG_METAL: Running run_tg_llama3_tests"

# Llama3.1-70B
# Llama3.2-1B
llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/
# Llama3.2-3B
llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/
# Llama3.1-8B
llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/
# Llama3.2-11B
llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/
# Llama3.1-70B
llama70b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/
# Llama3.1-8B
# llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/
# Llama3.2-1B
# llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/
# Llama3.2-3B
# llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/
# Llama3.2-11B
# llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/

# Run the Llama3 model tests for each selected weights directory (currently only the 70B weights)
# for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b" "$llama70b"; do
for llama_dir in "$llama70b"; do
LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full ; fail+=$?
LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full --timeout=1800 ; fail+=$?
echo "LOG_METAL: Llama3 tests for $llama_dir completed"
done

Expand All @@ -34,26 +35,39 @@ run_tg_llama3_tests() {
}

#######################################
# Run the TG frequent test suite followed by the tests of one model group.
#
# Globals:
#   fail (written) - set to 1 as soon as any pytest invocation exits non-zero.
#                    A value already present on entry that contains a non-zero
#                    digit is also treated as a failure.
# Arguments:
#   $1 - model group: llama3-70b-old, llama3, resnet50 or unit
# Returns:
#   1 if $1 is not a known model group (the common suite has already run by
#   then); otherwise the status of the final run_tg_llama3_tests call.
# Exits:
#   1 if any recorded pytest status was non-zero.
#
# NOTE(review): this listing appears to come from a diff view, so the
# unconditional pytest runs at the top and the trailing run_tg_llama3_tests
# call may be pre-refactor lines that duplicate the per-model branches.
# Confirm against the real file before relying on them.
#######################################
run_tg_tests() {
  local model_group="${1:-}"

  # Add tests here
  echo "LOG_METAL: running run_tg_frequent_tests"

  # Failures are recorded as a flag rather than by appending "$?" to a string:
  # appended statuses such as "0" then "9" form "09", which the numeric test
  # rejects as invalid octal and therefore silently reports as "no failure".
  pytest -n auto tests/ttnn/distributed/test_data_parallel_example_TG.py --timeout=900 || fail=1
  pytest -n auto tests/ttnn/distributed/test_multidevice_TG.py --timeout=900 || fail=1
  pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace_TG.py --timeout=900 || fail=1
  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_mlp_galaxy.py --timeout=300 || fail=1
  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py --timeout=480 || fail=1
  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_decoder_galaxy.py --timeout=600 || fail=1
  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_model_galaxy_ci.py --timeout=800 || fail=1
  pytest -n auto models/demos/tg/resnet50/tests/test_resnet50_performant.py || fail=1
  pytest -n auto tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py --timeout=300 || fail=1

  if [[ "$model_group" == "llama3-70b-old" ]]; then
    echo "LOG_METAL: running llama3_70b (old) run_tg_frequent_tests"
    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_mlp_galaxy.py --timeout=300 || fail=1
    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py --timeout=480 || fail=1
    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_decoder_galaxy.py --timeout=600 || fail=1
    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_model_galaxy_ci.py --timeout=800 || fail=1

  elif [[ "$model_group" == "llama3" ]]; then
    echo "LOG_METAL: running Llama3 run_tg_frequent_tests"
    run_tg_llama3_tests

  elif [[ "$model_group" == "resnet50" ]]; then
    echo "LOG_METAL: running resnet50 run_tg_frequent_tests"
    pytest -n auto models/demos/tg/resnet50/tests/test_resnet50_performant.py || fail=1

  elif [[ "$model_group" == "unit" ]]; then
    echo "LOG_METAL: running unit/distributed run_tg_frequent_tests"
    pytest -n auto tests/ttnn/distributed/test_data_parallel_example_TG.py --timeout=900 || fail=1
    pytest -n auto tests/ttnn/distributed/test_multidevice_TG.py --timeout=900 || fail=1
    pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace_TG.py --timeout=900 || fail=1
    pytest -n auto tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py --timeout=300 || fail=1

  else
    echo "LOG_METAL: Unknown model type: $model_group"
    return 1
  fi

  # Textual check: any non-zero digit means a failure was recorded. This also
  # copes with a string-built value left in `fail` by other code (e.g. "0139").
  if [[ "${fail:-0}" =~ [1-9] ]]; then
    echo "LOG_METAL: run_tg_frequent_tests failed"
    exit 1
  fi

  # Run llama3 tests
  run_tg_llama3_tests
}

main() {
Expand All @@ -67,11 +81,26 @@ main() {
exit 1
fi

# Parse the arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
model=$2
shift
;;
*)
echo "Unknown option: $1"
exit 1
;;
esac
shift
done

# Run all tests
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME

run_tg_tests
run_tg_tests "$model"
}

# Script entry point: forward all command-line arguments, unmodified, to main.
main "$@"

0 comments on commit 0069a14

Please sign in to comment.