
#8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler)

- enable the timeout mechanism by default when using xdist; use the '--metal-timeout' flag to enable it when not running under xdist (see the usage sketch after this list)
- increase GH actions timeout for xdist (review)
- get timings of each test and set global timeout to 5 mins (review)
- add custom timeouts to nightly + t3k pipelines + post-commit (review)
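
How the opt-in works in practice (a hedged sketch; the test path is illustrative, while the flag and xdist behavior come from this commit's conftest.py changes):

    # Under pytest-xdist the timeout/reset mechanism is now on by default:
    pytest -n auto tests/ttnn/integration_tests

    # Without xdist, opt in explicitly; the hooks only check that the
    # option is not None, so any value enables it:
    pytest --metal-timeout=1 tests/ttnn/integration_tests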
vtangTT authored and TT-billteng committed Jun 28, 2024
1 parent d954e76 commit d525b17
Showing 20 changed files with 238 additions and 120 deletions.
@@ -26,7 +26,7 @@ jobs:
{ name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 },
{ name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 },
]
name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
env:
2 changes: 1 addition & 1 deletion .github/workflows/perf-models.yaml
@@ -52,7 +52,7 @@ jobs:
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
-timeout-minutes: 30
+timeout-minutes: 40
run: |
source ${{ github.workspace }}/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
3 changes: 2 additions & 1 deletion .github/workflows/t3000-demo-tests.yaml
@@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
test-group: [
{ name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 40, owner_id: U044T8U8DEF}, #Johanna Rock
{ name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U044T8U8DEF}, #Johanna Rock
{ name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini
{ name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum
@@ -46,6 +46,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
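Note on the 'shell: bash {0}' line added above and in the t3000 workflows that follow: a plain 'run:' step on a Linux runner executes with 'bash -e {0}', so the first failing command aborts the step; overriding the shell to 'bash {0}' drops '-e', presumably so a failing test command still lets the reworked scripts' own failure accounting and the new timeout/reset teardown run to completion.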
1 change: 1 addition & 0 deletions .github/workflows/t3000-frequent-tests.yaml
@@ -42,6 +42,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
9 changes: 5 additions & 4 deletions .github/workflows/t3000-model-perf-tests.yaml
@@ -17,10 +17,10 @@ jobs:
fail-fast: false
matrix:
test-group: [
{ name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 60, owner_id: S07AJBTLX2L}, #Model Falcon
{ name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 60, owner_id: S07AJBTLX2L}, # Model Falcon
{ name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: S07AJBTLX2L}, #Model Falcon
{ name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: S07AJBTLX2L}, # Model Falcon
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
@@ -52,6 +52,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
1 change: 1 addition & 0 deletions .github/workflows/t3000-unit-tests.yaml
@@ -43,6 +43,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
59 changes: 29 additions & 30 deletions conftest.py
@@ -85,8 +85,6 @@ def device(request, device_params):
import tt_lib as ttl

device_id = request.config.getoption("device_id")

-request.node.device_ids = [device_id]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)]

num_devices = ttl.device.GetNumPCIeDevices()
@@ -108,9 +106,7 @@ def pcie_devices(request, device_params):

num_devices = ttl.device.GetNumPCIeDevices()
device_ids = [i for i in range(num_devices)]

-request.node.device_ids = device_ids
-request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
+request.node.pci_ids = device_ids

# Get only physical devices
devices = ttl.device.CreateDevices(device_ids, **device_params)
@@ -129,8 +125,6 @@ def all_devices(request, device_params):

num_devices = ttl.device.GetNumAvailableDevices()
device_ids = [i for i in range(num_devices)]

-request.node.device_ids = device_ids
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]

# Get only physical devices
@@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
@@ -183,8 +176,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_pcie_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_pcie_devices_requested]
-request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]]
+request.node.pci_ids = device_ids[:num_pcie_devices_requested]

device_mesh = ttnn.open_device_mesh(
ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params
@@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
@@ -334,13 +325,18 @@ def pytest_addoption(parser):
)
parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
parser.addoption(
"--metal-cleanup",
"--metal-timeout",
action="store",
default=None,
help="Enable process timeout",
)


+@pytest.fixture
+def input_path(request):
+return request.config.getoption("--input-path")


def pytest_generate_tests(metafunc):
"""
This is not a standard docstring.
@@ -473,25 +469,28 @@ def pytest_runtest_makereport(item, call):
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
yield
-metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
-if metal_cleanup_enabled is not None:
+metal_timeout_enabled = item.config.getoption("--metal-timeout")
+using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+if metal_timeout_enabled is not None or using_xdist:
report = item.stash[phase_report_key]
test_failed = report.get("call", None) and report["call"].failed
if test_failed:
logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
# reset_tensix(set(item.pci_ids))
reset_tensix()
logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}")
reset_tensix(set(item.pci_ids))


# This is overriding the timer setup hook from pytest-timeout
# If --metal-timeout is passed, we define a new timeout method that spawns a timer process
# At timeout, the process kills its parent (the test process) and then itself
@pytest.hookimpl(tryfirst=True)
def pytest_timeout_set_timer(item, settings):
metal_timeout_enabled = item.config.getoption("--metal-cleanup")
if metal_timeout_enabled is not None:
metal_timeout_enabled = item.config.getoption("--metal-timeout")
using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))

if metal_timeout_enabled is not None or using_xdist:
parent_pid = os.getpid()
logger.info(f"Metal timeout {settings.timeout} seconds")
logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}")

def get_parent_status():
try:
@@ -501,12 +500,15 @@ def get_parent_status():
return parent.status()

def run_timer(settings):
logger.info(f"Timer started for {item.nodeid}")
dead_status = ["zombie", "dead", "already dead"]
timeout = settings.timeout
-while get_parent_status() not in dead_status and timeout > 0:
-time.sleep(1)
-timeout -= 1
-if get_parent_status() != "already dead":
+parent_status = "running"
+while parent_status not in dead_status and timeout > 0:
+time.sleep(5)
+timeout -= 5
+parent_status = get_parent_status()
+if parent_status != "already dead":
logger.info(f"Timing out test case")
os.kill(parent_pid, signal.SIGKILL)
logger.info(f"Killing timer")
@@ -519,13 +521,12 @@ def cancel():
metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True)
item.cancel_timeout = cancel
metal_timer.start()
-# logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}")
return True


# This is a hook used in pytest-xdist to handle when a worker crashes out
# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and
-# then it should get cleaned up by the controller through this fixture :fingers_crossed:
+# then it should get cleaned up by the controller through this fixture
@pytest.hookimpl(tryfirst=True)
def pytest_handlecrashitem(crashitem, report, sched):
reset_tensix()
@@ -542,10 +543,9 @@ def reset_tensix(tt_open_devices=None):
smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
else:
tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
-check_smi = run_process_and_get_result("tt-smi-metal -h")
-logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
+check_smi_metal = run_process_and_get_result("tt-smi-metal -h")
logger.info(f"Running reset for pci devices: {tt_open_devices_str}")
-if check_smi.returncode > 0:
+if check_smi_metal.returncode > 0:
logger.info(f"Test failed - resetting {arch} with tt-smi")
smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}")
else:
@@ -555,5 +555,4 @@

@pytest.hookimpl(tryfirst=True)
def pytest_xdist_auto_num_workers(config):
logger.info("getting num of xdist workers")
return 1
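
The timer hook above swaps pytest-timeout's in-process timer for a separate watchdog process that polls the test process and SIGKILLs it once the deadline passes. For intuition, the same pattern reads like this as a shell sketch (illustrative only; not code from this commit):

    # Poll the parent every 5 s; SIGKILL it if the deadline expires first.
    # 'kill -0' merely tests that the target process still exists.
    watchdog() {
      local parent=$1 deadline=$2
      while kill -0 "$parent" 2>/dev/null && (( deadline > 0 )); do
        sleep 5
        deadline=$((deadline - 5))
      done
      if kill -0 "$parent" 2>/dev/null; then
        kill -9 "$parent"   # the Python hook does os.kill(parent_pid, signal.SIGKILL)
      fi
    }
    watchdog $$ 300 &       # 300 s mirrors the global pytest timeout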
2 changes: 1 addition & 1 deletion pytest.ini
@@ -1,7 +1,7 @@
[pytest]
timeout = 300
minversion = 7.2
-addopts = --import-mode=importlib -vs -rA
+addopts = --import-mode=importlib -vvs -rA --durations=0
empty_parameter_set_mark = skip
markers =
post_commit: mark tests to run on post-commit
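
Two details here: '--durations=0' makes pytest report the duration of every test rather than only the slowest, which is the "get timings of each test" item from the commit message, and the 'timeout = 300' line above (unchanged in this hunk) is the 5-minute global timeout that message refers to.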
25 changes: 13 additions & 12 deletions tests/scripts/run_performance.sh
@@ -1,6 +1,6 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
@@ -11,19 +11,19 @@ run_perf_models_other() {
local tt_arch=$1
local test_marker=$2

-env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker

-env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker

-env pytest models/demos/ttnn_falcon7b/tests -m $test_marker
+env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker

# Separate calls since we can't mix switching between number of cqs
-env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
-env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker
+env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
+env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker

-env pytest tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker

-env pytest models/demos/metal_BERT_large_11/tests -m $test_marker
+env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
@@ -33,13 +33,13 @@ run_perf_models_llm_javelin() {
local tt_arch=$1
local test_marker=$2

-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker

if [ "$tt_arch" == "wormhole_b0" ]; then
-env pytest models/demos/mamba/tests -m $test_marker --timeout=360
+env pytest -n auto models/demos/mamba/tests -m $test_marker --timeout=360
fi

-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360

## Merge all the generated reports
env python models/perf/merge_perf_results.py
@@ -50,14 +50,15 @@ run_perf_models_cnn_javelin() {
local test_marker=$2

# Run tests
-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480
#env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
}

run_device_perf_models() {
+set -eo pipefail
local test_marker=$1

env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600
2 changes: 1 addition & 1 deletion tests/scripts/run_tests.sh
@@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() {
./tests/scripts/run_python_api_unit_tests.sh
else
if [[ $tt_arch == "wormhole_b0" ]]; then
-pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
+pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
else
echo "API tests are not available for fast dispatch because they're already covered in post-commit"
fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_common_models.sh
@@ -1,12 +1,17 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running common models for archs"

-env pytest tests/nightly/common_models/
+env pytest -n auto tests/nightly/common_models/ ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi
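
A note on the 'fail=0 ... ; fail+=$?' pattern that replaces 'set -eo pipefail' in these nightly scripts: because 'fail' is never declared as an integer, '+=' appends each exit status as a string, and '[[ $fail -ne 0 ]]' then evaluates that string arithmetically, which is nonzero exactly when some suite failed (pytest's exit codes 0-5 keep the digits valid for bash arithmetic). The generic shape, with placeholder command names:

    fail=0
    run_suite_one ; fail+=$?    # keep going even if this suite fails
    run_suite_two ; fail+=$?
    # "000" still evaluates to 0; any nonzero status makes the test true
    if [[ $fail -ne 0 ]]; then
      exit 1
    fi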
11 changes: 8 additions & 3 deletions tests/scripts/single_card/nightly/run_gs_only.sh
@@ -1,14 +1,19 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running model nightly tests for GS only"

-env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$?

-env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_ttnn.sh
@@ -1,12 +1,17 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running ttnn nightly tests for GS only"

-env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal"
+env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi