#0: Refactored TG frequent pipeline. Fixes to demo pipeline

tenstorrent · Dec 11, 2024 · 0069a14 · 0069a14
1 parent 26a7580
commit 0069a14
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 34 deletions.
diff --git a/.github/workflows/tg-demo-tests-impl.yaml b/.github/workflows/tg-demo-tests-impl.yaml
@@ -41,4 +41,4 @@ jobs:
           source ${{ github.workspace }}/python_env/bin/activate
           cd $TT_METAL_HOME
           export PYTHONPATH=$TT_METAL_HOME
-          source ${{ github.workspace }}/tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}'
+          ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}
diff --git a/.github/workflows/tg-frequent-tests-impl.yaml b/.github/workflows/tg-frequent-tests-impl.yaml
@@ -9,20 +9,23 @@ jobs:
       fail-fast: false
       matrix:
         test-group: [
-          {
-            name: "TG frequent tests",
-            arch: wormhole_b0,
-            runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-functional"],
-            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tg_device --dispatch-mode ""'
-          },
+          { name: "TG Llama3 frequent tests", arch: wormhole_b0, model: llama3, timeout: 90, owner_id: U06F3ER8X9A}, # Stuti Raizada
+          { name: "TG Llama3-70B (old) frequent tests", arch: wormhole_b0, model: llama3-70b-old, timeout: 90, owner_id: U03FJB5TM5Y}, #Colman Glagovich
+          { name: "TG resnet50 frequent tests", arch: wormhole_b0, model: resnet50, timeout: 90},
+          { name: "TG unit/distributed frequent tests", arch: wormhole_b0, model: unit, timeout: 90},
         ]
     name: ${{ matrix.test-group.name }}
     env:
       TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
       ARCH_NAME: ${{ matrix.test-group.arch }}
       LOGURU_LEVEL: INFO
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
-    runs-on: ${{ matrix.test-group.runs-on }}
+    runs-on:
+      - arch-wormhole_b0
+      - config-tg
+      - in-service
+      - bare-metal
+      - pipeline-functional
     steps:
       - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
       - name: Set up dynamic env vars for build
@@ -40,4 +43,4 @@ jobs:
           source ${{ github.workspace }}/python_env/bin/activate
           cd $TT_METAL_HOME
           export PYTHONPATH=$TT_METAL_HOME
-          ${{ matrix.test-group.cmd }}
+          ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tg_device --dispatch-mode "" --model ${{ matrix.test-group.model }}
diff --git a/tests/scripts/tg/run_tg_frequent_tests.sh b/tests/scripts/tg/run_tg_frequent_tests.sh
@@ -7,20 +7,21 @@ run_tg_llama3_tests() {
 
   echo "LOG_METAL: Running run_tg_llama3_tests"
 
-  # Llama3.1-70B
+  # Llama3.2-1B
+  llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/
+  # Llama3.2-3B
+  llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/
+  # Llama3.1-8B
+  llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/
+  # Llama3.2-11B
+  llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/
+  # Llama3.1-70B
   llama70b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/
-  # Llama3.1-8B
-  # llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/
-  # Llama3.2-1B
-  # llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/
-  # Llama3.2-3B
-  # llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/
-  # Llama3.2-11B
-  # llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/
 
   # Run the Llama3 model tests for each weights directory in the loop list (currently only 70B)
+  # for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b" "$llama70b"; do
   for llama_dir in "$llama70b"; do
-    LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full ; fail+=$?
+    LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full --timeout=1800 ; fail+=$?
     echo "LOG_METAL: Llama3 tests for $llama_dir completed"
   done
 
@@ -34,26 +35,39 @@ run_tg_llama3_tests() {
 }
 
 run_tg_tests() {
-  # Add tests here
-  echo "LOG_METAL: running run_tg_frequent_tests"
-
-  pytest -n auto tests/ttnn/distributed/test_data_parallel_example_TG.py --timeout=900 ; fail+=$?
-  pytest -n auto tests/ttnn/distributed/test_multidevice_TG.py --timeout=900 ; fail+=$?
-  pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace_TG.py --timeout=900 ; fail+=$?
-  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_mlp_galaxy.py --timeout=300 ; fail+=$?
-  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py --timeout=480 ; fail+=$?
-  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_decoder_galaxy.py --timeout=600 ; fail+=$?
-  pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_model_galaxy_ci.py --timeout=800 ; fail+=$?
-  pytest -n auto models/demos/tg/resnet50/tests/test_resnet50_performant.py ; fail+=$?
-  pytest -n auto tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py --timeout=300 ; fail+=$?
+
+  if [[ "$1" == "llama3-70b-old" ]]; then
+    echo "LOG_METAL: running llama3_70b (old) run_tg_frequent_tests"
+    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_mlp_galaxy.py --timeout=300 ; fail+=$?
+    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py --timeout=480 ; fail+=$?
+    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_decoder_galaxy.py --timeout=600 ; fail+=$?
+    pytest -n auto models/demos/tg/llama3_70b/tests/test_llama_model_galaxy_ci.py --timeout=800 ; fail+=$?
+
+  elif [[ "$1" == "llama3" ]]; then
+    echo "LOG_METAL: running Llama3 run_tg_frequent_tests"
+    run_tg_llama3_tests
+
+  elif [[ "$1" == "resnet50" ]]; then
+    echo "LOG_METAL: running resnet50 run_tg_frequent_tests"
+    pytest -n auto models/demos/tg/resnet50/tests/test_resnet50_performant.py ; fail+=$?
+
+  elif [[ "$1" == "unit" ]]; then
+    echo "LOG_METAL: running unit/distributed run_tg_frequent_tests"
+    pytest -n auto tests/ttnn/distributed/test_data_parallel_example_TG.py --timeout=900 ; fail+=$?
+    pytest -n auto tests/ttnn/distributed/test_multidevice_TG.py --timeout=900 ; fail+=$?
+    pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace_TG.py --timeout=900 ; fail+=$?
+    pytest -n auto tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py --timeout=300 ; fail+=$?
+
+  else
+    echo "LOG_METAL: Unknown model type: $1"
+    return 1
+  fi
 
   if [[ $fail -ne 0 ]]; then
     echo "LOG_METAL: run_tg_frequent_tests failed"
     exit 1
   fi
 
-  # Run llama3 tests
-  run_tg_llama3_tests
 }
 
 main() {
@@ -67,11 +81,26 @@ main() {
     exit 1
   fi
 
+  # Parse the arguments
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      --model)
+        model=$2
+        shift
+        ;;
+      *)
+        echo "Unknown option: $1"
+        exit 1
+        ;;
+    esac
+    shift
+  done
+
   # Run all tests
   cd $TT_METAL_HOME
   export PYTHONPATH=$TT_METAL_HOME
 
-  run_tg_tests
+  run_tg_tests "$model"
 }
 
 main "$@"