From d954e7640847df125ce14d51b334c18270b70843 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Wed, 22 May 2024 18:53:34 +0000 Subject: [PATCH] #8729: pytest xdist multiprocess reset mechanism - use custom pyhook from pytest-xdist to run cleanup after timeout - add a new process based timeout method to pytest-timeout - add reset after test fail, use '--metal-timeout' to enable reset mechanism - use pytest-xdist fixture to determine # of workers through '-n auto' - expose get_associated_mmio_device to python and only reset opened tt devices on fail --- conftest.py | 493 ++++++++++++++--------- tt_eager/tt_lib/csrc/tt_lib_bindings.cpp | 4 + tt_metal/host_api.hpp | 2 + tt_metal/tt_metal.cpp | 4 + 4 files changed, 307 insertions(+), 196 deletions(-) diff --git a/conftest.py b/conftest.py index f872bce2998..c6339ee3ae1 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,11 @@ from operator import contains, eq, getitem from pathlib import Path import json +import copy +import multiprocess +import signal +import time +import psutil from loguru import logger @@ -70,191 +75,6 @@ def get_tt_cache_path_(model_version, model_subdir="", default_dir=""): return get_tt_cache_path_ -ALL_ARCHS = set( - [ - "grayskull", - "wormhole_b0", - ] -) - - -def pytest_addoption(parser): - parser.addoption( - "--tt-arch", - choices=[*ALL_ARCHS], - default=os.environ.get("ARCH_NAME", "grayskull"), - help="Target arch, ex. grayskull, wormhole_b0", - ) - parser.addoption( - "--device-id", - type=int, - default=0, - help="Target device id", - ) - parser.addoption( - "--input-method", - action="store", - choices=["json", "cli"], - default=None, - help="Choose input method: 1) json or 2) cli", - ) - parser.addoption( - "--input-path", - action="store", - default="", - help="Path to json file with inputs", - ) - parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") - - -def pytest_generate_tests(metafunc): - """ - This is not a standard docstring. - - We will explain the non-standard fixtures that pytest_generate_tests is - creating here. - - silicon_arch_name and silicon_arch_ - ---------------------------------------------- - - This is how tests should be requesting accelerator architecture names. - Tests which aim to run on silicon should request a silicon_arch_name - fixture. Just that single fixture will parametrize the test to run on the - provided architecture name from the command line through the --tt-arch - option. The value of the fixture will be the string value of the - architecture name. For example, - - @pytest.mark.post_commit - def test_model_silicon(silicon_arch_name): - # silicon_arch_name will be one of grayskull, wormhole_b0 etc. - run_model_on_silicon(silicon_arch_name) - ... - - If you want to restrict a test to only a specific architecture, you can - provide an additional fixture in the form of silicon_arch_. This - will limit the range of possible values for silicon_arch_name to only be - ARCH_NAME. - - @pytest.mark.post_commit - def test_model_silicon_grayskull_only( - silicon_arch_name, - silicon_arch_grayskull, - ): - # silicon_arch_name can only be grayskull or empty - run_model_on_silicon(silicon_arch_name) - ... - - If --tt-arch specifies an architecture that's not ARCH_NAME, the test will - be skipped. We ensure skipping by providing an empty list parametrization - for silicon_arch_name, and with the empty_parameter_set_mark config option - for pytest, will skip any tests with an empty list parametrization. - - Note that you must provide silicon_arch_name as a fixture if you want to - use the silicon_arch_ fixture. - - Note that if tests want to use the ARCH value from the API, tests should - create their own separate fixture which will convert the string value - provided from silicon_arch_name into ARCH. We keep it as strings here - because these fixtures will be used in tests which do not have access to - any Python APIs. - """ - - tt_arch = metafunc.config.getoption("--tt-arch") - - silicon_arch_specific_fixture_name_to_avail_archs = { - "silicon_arch_grayskull": set( - [ - "grayskull", - ] - ), - "silicon_arch_wormhole_b0": set( - [ - "wormhole_b0", - ] - ), - } - - check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_arch_fixtures = tuple( - filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) - ) - is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 - get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_archs = ALL_ARCHS.intersection( - *map( - get_archs_for_silicon_arch_specific_fixture, - test_requested_silicon_arch_fixtures, - ) - ) - - available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS - matches_user_requested_silicon_arch = partial(eq, tt_arch) - available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) - - uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames - - # sanity - if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: - raise Exception( - f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" - ) - - if uses_silicon_arch: - metafunc.parametrize("silicon_arch_name", available_archs, scope="session") - for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: - # The values of these arch-specific fixtures should not be used in - # the test function, so use any parameters, like [True] - metafunc.parametrize(test_requested_silicon_arch_fixture, [True], scope="session") - - input_method = metafunc.config.getoption("--input-method") - if input_method == "json": - json_path = metafunc.config.getoption("--input-path") - if not json_path: - raise ValueError("Please provide a valid JSON path using --input-path option.") - with open(json_path, "r") as f: - data = json.load(f) - metafunc.parametrize("user_input", [data]) - elif input_method == "cli": - cli_input = metafunc.config.getoption("--cli-input") - if not cli_input: - raise ValueError("Please provide input using --cli-input option.") - metafunc.parametrize("user_input", [[cli_input]]) - - -# Report stashing to get outcomes etc -phase_report_key = pytest.StashKey() - - -@pytest.hookimpl(tryfirst=True, hookwrapper=True) -def pytest_runtest_makereport(item, call): - # execute all other hooks to obtain the report object - outcome = yield - rep = outcome.get_result() - - # store test results for each phase of a call, which can - # be "setup", "call", "teardown" - item.stash.setdefault(phase_report_key, {})[rep.when] = rep - - -@pytest.fixture(scope="function") -def reset_tensix(request, silicon_arch_name): - yield - - report = request.node.stash[phase_report_key] - - test_failed = ("call" not in report) or report["call"].failed - - if test_failed: - logger.debug("Test failed - resetting with smi") - if silicon_arch_name == "grayskull": - result = run_process_and_get_result("tt-smi -tr all") - elif silicon_arch_name == "wormhole_b0": - result = run_process_and_get_result("tt-smi -wr all") - else: - raise Exception(f"Unrecognized arch for tensix-reset: {silicon_arch_name}") - assert result.returncode == 0, "Tensix reset script raised error" - - @pytest.fixture(scope="function") def device_params(request): return getattr(request, "param", {}) @@ -266,6 +86,9 @@ def device(request, device_params): device_id = request.config.getoption("device_id") + request.node.device_ids = [device_id] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] + num_devices = ttl.device.GetNumPCIeDevices() assert device_id < num_devices, "CreateDevice not supported for non-mmio device" device = ttl.device.CreateDevice(device_id=device_id, **device_params) @@ -284,9 +107,13 @@ def pcie_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumPCIeDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -301,9 +128,13 @@ def all_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumAvailableDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -316,6 +147,7 @@ def all_devices(request, device_params): @pytest.fixture(scope="function") def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_device_ids() try: @@ -323,6 +155,9 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -330,8 +165,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -342,6 +175,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par @pytest.fixture(scope="function") def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_pcie_device_ids() try: @@ -349,6 +183,9 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_pcie_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params ) @@ -356,8 +193,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -368,6 +203,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic @pytest.fixture(scope="function") def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl if ttnn.get_num_devices() < 8: pytest.skip() @@ -377,6 +213,9 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -384,8 +223,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -453,6 +290,270 @@ def tracy_profile(): profiler.disable() -@pytest.fixture -def input_path(request): - return request.config.getoption("--input-path") +############################### +# Modifying pytest hooks +############################### +ALL_ARCHS = set( + [ + "grayskull", + "wormhole_b0", + ] +) + + +def pytest_addoption(parser): + parser.addoption( + "--tt-arch", + choices=[*ALL_ARCHS], + default=os.environ.get("ARCH_NAME", "grayskull"), + help="Target arch, ex. grayskull, wormhole_b0", + ) + parser.addoption( + "--pipeline-type", + default="", + help="Only `models_device_performance_bare_metal` should run `pytest_runtest_teardown`", + ) + parser.addoption( + "--device-id", + type=int, + default=0, + help="Target device id", + ) + parser.addoption( + "--input-method", + action="store", + choices=["json", "cli"], + default=None, + help="Choose input method: 1) json or 2) cli", + ) + parser.addoption( + "--input-path", + action="store", + default="", + help="Path to json file with inputs", + ) + parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") + parser.addoption( + "--metal-cleanup", + action="store", + default=None, + help="Enable process timeout", + ) + + +def pytest_generate_tests(metafunc): + """ + This is not a standard docstring. + + We will explain the non-standard fixtures that pytest_generate_tests is + creating here. + + silicon_arch_name and silicon_arch_ + ---------------------------------------------- + + This is how tests should be requesting accelerator architecture names. + Tests which aim to run on silicon should request a silicon_arch_name + fixture. Just that single fixture will parametrize the test to run on the + provided architecture name from the command line through the --tt-arch + option. The value of the fixture will be the string value of the + architecture name. For example, + + @pytest.mark.post_commit + def test_model_silicon(silicon_arch_name): + # silicon_arch_name will be one of grayskull, wormhole_b0 etc. + run_model_on_silicon(silicon_arch_name) + ... + + If you want to restrict a test to only a specific architecture, you can + provide an additional fixture in the form of silicon_arch_. This + will limit the range of possible values for silicon_arch_name to only be + ARCH_NAME. + + @pytest.mark.post_commit + def test_model_silicon_grayskull_only( + silicon_arch_name, + silicon_arch_grayskull, + ): + # silicon_arch_name can only be grayskull or empty + run_model_on_silicon(silicon_arch_name) + ... + + If --tt-arch specifies an architecture that's not ARCH_NAME, the test will + be skipped. We ensure skipping by providing an empty list parametrization + for silicon_arch_name, and with the empty_parameter_set_mark config option + for pytest, will skip any tests with an empty list parametrization. + + Note that you must provide silicon_arch_name as a fixture if you want to + use the silicon_arch_ fixture. + + Note that if tests want to use the ARCH value from the API, tests should + create their own separate fixture which will convert the string value + provided from silicon_arch_name into ARCH. We keep it as strings here + because these fixtures will be used in tests which do not have access to + any Python APIs. + """ + + tt_arch = metafunc.config.getoption("--tt-arch") + + silicon_arch_specific_fixture_name_to_avail_archs = { + "silicon_arch_grayskull": set( + [ + "grayskull", + ] + ), + "silicon_arch_wormhole_b0": set( + [ + "wormhole_b0", + ] + ), + } + + check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_arch_fixtures = tuple( + filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) + ) + is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 + get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_archs = ALL_ARCHS.intersection( + *map( + get_archs_for_silicon_arch_specific_fixture, + test_requested_silicon_arch_fixtures, + ) + ) + + available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS + matches_user_requested_silicon_arch = partial(eq, tt_arch) + available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) + + uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames + + # sanity + if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: + raise Exception( + f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" + ) + + if uses_silicon_arch: + metafunc.parametrize("silicon_arch_name", available_archs) + for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: + # The values of these arch-specific fixtures should not be used in + # the test function, so use any parameters, like [True] + metafunc.parametrize(test_requested_silicon_arch_fixture, [True]) + + input_method = metafunc.config.getoption("--input-method") + if input_method == "json": + json_path = metafunc.config.getoption("--input-path") + if not json_path: + raise ValueError("Please provide a valid JSON path using --input-path option.") + with open(json_path, "r") as f: + data = json.load(f) + metafunc.parametrize("user_input", [data]) + elif input_method == "cli": + cli_input = metafunc.config.getoption("--cli-input") + if not cli_input: + raise ValueError("Please provide input using --cli-input option.") + metafunc.parametrize("user_input", [[cli_input]]) + + +# Report stashing to get outcomes etc +phase_report_key = pytest.StashKey() + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item, call): + # execute all other hooks to obtain the report object + outcome = yield + rep = outcome.get_result() + + # store test results for each phase of a call, which can + # be "setup", "call", "teardown" + item.stash.setdefault(phase_report_key, {})[rep.when] = rep + + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_teardown(item, nextitem): + yield + metal_cleanup_enabled = item.config.getoption("--metal-cleanup") + if metal_cleanup_enabled is not None: + report = item.stash[phase_report_key] + test_failed = report.get("call", None) and report["call"].failed + if test_failed: + logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}") + # reset_tensix(set(item.pci_ids)) + reset_tensix() + + +# This is overriding the timer setup hook from pytest-timeout +# If --metal-timeout is passed, we define a new timeout method that spawns a timer process +# At timeout, the process kills it's parent (the test process) and then itself +@pytest.hookimpl(tryfirst=True) +def pytest_timeout_set_timer(item, settings): + metal_timeout_enabled = item.config.getoption("--metal-cleanup") + if metal_timeout_enabled is not None: + parent_pid = os.getpid() + logger.info(f"Metal timeout {settings.timeout} seconds") + + def get_parent_status(): + try: + parent = psutil.Process(parent_pid) + except: + return "already dead" + return parent.status() + + def run_timer(settings): + dead_status = ["zombie", "dead", "already dead"] + timeout = settings.timeout + while get_parent_status() not in dead_status and timeout > 0: + time.sleep(1) + timeout -= 1 + if get_parent_status() != "already dead": + logger.info(f"Timing out test case") + os.kill(parent_pid, signal.SIGKILL) + logger.info(f"Killing timer") + os._exit(1) + + def cancel(): + logger.info(f"Cancelling timer") + metal_timer.terminate() + + metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True) + item.cancel_timeout = cancel + metal_timer.start() + # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}") + return True + + +# This is a hook used in pytest-xdist to handle when a worker crashes out +# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and +# then it should get cleaned up by the controller through this fixture :fingers_crossed: +@pytest.hookimpl(tryfirst=True) +def pytest_handlecrashitem(crashitem, report, sched): + reset_tensix() + + +def reset_tensix(tt_open_devices=None): + metal_env = copy.deepcopy(os.environ) + arch = metal_env.get("ARCH_NAME") + if arch != "grayskull" and arch != "wormhole_b0": + raise Exception(f"Unrecognized arch for tensix-reset: {arch}") + + if tt_open_devices is None: + logger.info(f"Running reset with reset script: /opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + else: + tt_open_devices_str = ",".join([str(i) for i in tt_open_devices]) + check_smi = run_process_and_get_result("tt-smi-metal -h") + logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}") + logger.info(f"Running reset for pci devices: {tt_open_devices_str}") + if check_smi.returncode > 0: + logger.info(f"Test failed - resetting {arch} with tt-smi") + smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") + else: + smi_reset_result = run_process_and_get_result(f"tt-smi-metal -r {tt_open_devices_str}") + logger.info(f"tt-smi reset status: {smi_reset_result.returncode}") + + +@pytest.hookimpl(tryfirst=True) +def pytest_xdist_auto_num_workers(config): + logger.info("getting num of xdist workers") + return 1 diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 915276dfa39..d03205be995 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -136,6 +136,10 @@ void DeviceModule(py::module &m_device) { Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted. )doc"); + m_device.def("GetPCIeDeviceID", &GetPCIeDeviceID, R"doc( + Returns associated mmio device of give device id. + )doc"); + m_device.def("SetDefaultDevice", &AutoFormat::SetDefaultDevice, R"doc( Sets the default device to use for ops when inputs aren't on device. diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index a2a490c345c..5572bad808e 100644 --- a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -55,6 +55,8 @@ size_t GetNumAvailableDevices(); */ size_t GetNumPCIeDevices(); +chip_id_t GetPCIeDeviceID(chip_id_t device_id); + /** * Instantiates a device object. * diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index a1ec68fb1bb..dc529c1256e 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -730,6 +730,10 @@ size_t GetNumPCIeDevices() { #endif } +chip_id_t GetPCIeDeviceID(chip_id_t device_id){ + return tt::Cluster::instance().get_associated_mmio_device(device_id); +} + Device *CreateDevice( chip_id_t device_id, const uint8_t num_hw_cqs,