Skip to content

Commit

Permalink
[Llama3] Add test-accuracy to CI (#15778)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtairum authored Dec 10, 2024
1 parent a275320 commit 4bcc79b
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 5 deletions.
1 change: 1 addition & 0 deletions .github/workflows/t3000-frequent-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
{ name: "t3k llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
{ name: "t3k llama3 accuracy tests", arch: wormhole_b0, cmd: run_t3000_llama3_accuracy_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
{ name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich
# { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # FIXME issue #14934
{ name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
Expand Down
8 changes: 4 additions & 4 deletions models/demos/llama3/PERF.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,16 @@ This configuration uses bfp4 MLP FF1+FF3 for all models.
|-------|--------|-----------|-----------|---------------|
| 1b | N150 | 79 | 98 | 90.5 |
| 1b | N300 | 81 | 98 | 101.7 |
| 1b | T3K | 81 | 98 | 97.5 |
| 1b | T3K | 81 | 98 | 96.8 |
| 3b | N150 | 85 | 96 | 49.0 |
| 3b | N300 | 88 | 97 | 56.9 |
| 3b | T3K | 88 | 97 | 54.5 |
| 8b | N150 | 86 | 98 | 28.4 |
| 8b | N300 | 84 | 98 | 38.6 |
| 8b | T3K | 84 | 98 | 52.6 |
| 8b | T3K | 84 | 97 | 52.6 |
| 11b | N300 | 86 | 97 | 38.6 |
| 11b | T3K | 84 | 98 | 52.6 |
| 70b | T3K | 95 | 100 | 14.3 |
| 70b | T3K | 94 | 100 | 14.3 |

## LlamaOptimizations.accuracy

Expand All @@ -40,4 +40,4 @@ This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model.
| 8b | T3K | 88 | 97 | 49.9 |
| 11b | N300 | 90 | 97 | 33.8 |
| 11b | T3K | 88 | 97 | 52.6 |
| 70b | T3K | 95 | 100 | 14.5 |
| 70b | T3K | 94 | 100 | 14.5 |
14 changes: 14 additions & 0 deletions tests/scripts/single_card/run_single_card_demo_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ run_common_func_tests() {
# Qwen7B
QWEN_DIR=/mnt/MLPerf/tt_dnn-models/qwen/Qwen2-7B-Instruct WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml FAKE_DEVICE=N150 pytest -n auto models/demos/qwen/demo/demo.py -k instruct --timeout 420; fail+=$?

# Llama3 Accuracy tests
# Llama3.2-1B
llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/
# Llama3.2-3B
llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/
# Llama3.1-8B (11B weights are the same)
llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/

# Run Llama3 accuracy tests for 1B, 3B, 8B weights
for llama_dir in "$llama1b" "$llama3b" "$llama8b"; do
LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/tests/test_llama_accuracy.py -k perf --timeout 420; fail+=$?
echo "LOG_METAL: Llama3 accuracy tests for $llama_dir completed"
done

#VGG11/VGG16
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/vgg/demo/demo.py --timeout 600; fail+=$?

Expand Down
39 changes: 38 additions & 1 deletion tests/scripts/t3000/run_t3000_frequent_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ run_t3000_llama3_tests() {
# Run test model for llama3 - 1B, 3B, 8B and 11B weights
for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do
LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full ; fail+=$?
# LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model_prefill.py ; fail+=$? # FIXME Issue #14843
LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model_prefill.py ; fail+=$?
echo "LOG_METAL: Llama3 tests for $llama_dir completed"
done

Expand Down Expand Up @@ -96,6 +96,40 @@ run_t3000_llama3_70b_tests() {
fi
}

run_t3000_llama3_accuracy_tests() {
  # Run the Llama3 accuracy suite (test_llama_accuracy.py -k perf) against
  # every supported weight checkpoint on a T3000 machine, logging the total
  # wall-clock time. Each pytest exit status is appended to $fail, and the
  # function exits 1 if any run failed (matching the file's other runners).
  fail=0
  start_time=$(date +%s)

  echo "LOG_METAL: Running run_t3000_llama3_accuracy_tests"

  wh_arch_yaml=wormhole_b0_80_arch_eth_dispatch.yaml

  # Weight checkpoint directories under the shared MLPerf model store.
  llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/    # Llama3.2-1B
  llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/    # Llama3.2-3B
  llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/        # Llama3.1-8B
  llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/     # Llama3.2-11B
  llama70b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/  # Llama3.1-70B

  # Accuracy runs for 1B, 3B, 8B, 11B and 70B weights, in that order.
  accuracy_weight_dirs=("$llama1b" "$llama3b" "$llama8b" "$llama11b" "$llama70b")

  for llama_dir in "${accuracy_weight_dirs[@]}"; do
    LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_accuracy.py -k perf ; fail+=$?
    echo "LOG_METAL: Llama3 accuracy tests for $llama_dir completed"
  done

  # Report elapsed time for CI log scraping.
  duration=$(( $(date +%s) - start_time ))
  echo "LOG_METAL: run_t3000_llama3_accuracy_tests $duration seconds to complete"
  if [[ $fail -ne 0 ]]; then
    exit 1
  fi
}

run_t3000_llama3.2-11b-vision_freq_tests() {
# Record the start time
fail=0
Expand Down Expand Up @@ -277,6 +311,9 @@ run_t3000_tests() {
# Run llama3-70b tests
run_t3000_llama3_70b_tests

# Run llama3 accuracy tests
run_t3000_llama3_accuracy_tests

# Run Llama3.2-11B Vision tests
run_t3000_llama3.2-11b-vision_freq_tests

Expand Down

0 comments on commit 4bcc79b

Please sign in to comment.