From d954e7640847df125ce14d51b334c18270b70843 Mon Sep 17 00:00:00 2001
From: Vincent Tang <vtang@tenstorrent.com>
Date: Wed, 22 May 2024 18:53:34 +0000
Subject: [PATCH] #8729: pytest xdist multiprocess reset mechanism

- use custom pyhook from pytest-xdist to run cleanup after timeout
- add a new process based timeout method to pytest-timeout
- add reset after test fail, use '--metal-timeout' to enable reset mechanism
- use pytest-xdist fixture to determine # of workers through '-n auto'
- expose get_associated_mmio_device to python and only reset opened tt devices on fail
---
 conftest.py                              | 493 ++++++++++++++---------
 tt_eager/tt_lib/csrc/tt_lib_bindings.cpp |   4 +
 tt_metal/host_api.hpp                    |   2 +
 tt_metal/tt_metal.cpp                    |   4 +
 4 files changed, 307 insertions(+), 196 deletions(-)

diff --git a/conftest.py b/conftest.py
index f872bce2998..c6339ee3ae1 100644
--- a/conftest.py
+++ b/conftest.py
@@ -12,6 +12,11 @@
 from operator import contains, eq, getitem
 from pathlib import Path
 import json
+import copy
+import multiprocess
+import signal
+import time
+import psutil
 
 from loguru import logger
 
@@ -70,191 +75,6 @@ def get_tt_cache_path_(model_version, model_subdir="", default_dir=""):
     return get_tt_cache_path_
 
 
-ALL_ARCHS = set(
-    [
-        "grayskull",
-        "wormhole_b0",
-    ]
-)
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--tt-arch",
-        choices=[*ALL_ARCHS],
-        default=os.environ.get("ARCH_NAME", "grayskull"),
-        help="Target arch, ex. grayskull, wormhole_b0",
-    )
-    parser.addoption(
-        "--device-id",
-        type=int,
-        default=0,
-        help="Target device id",
-    )
-    parser.addoption(
-        "--input-method",
-        action="store",
-        choices=["json", "cli"],
-        default=None,
-        help="Choose input method: 1) json or 2) cli",
-    )
-    parser.addoption(
-        "--input-path",
-        action="store",
-        default="",
-        help="Path to json file with inputs",
-    )
-    parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
-
-
-def pytest_generate_tests(metafunc):
-    """
-    This is not a standard docstring.
-
-    We will explain the non-standard fixtures that pytest_generate_tests is
-    creating here.
-
-    silicon_arch_name and silicon_arch_<ARCH_NAME>
-    ----------------------------------------------
-
-    This is how tests should be requesting accelerator architecture names.
-    Tests which aim to run on silicon should request a silicon_arch_name
-    fixture. Just that single fixture will parametrize the test to run on the
-    provided architecture name from the command line through the --tt-arch
-    option. The value of the fixture will be the string value of the
-    architecture name. For example,
-
-    @pytest.mark.post_commit
-    def test_model_silicon(silicon_arch_name):
-        # silicon_arch_name will be one of grayskull, wormhole_b0 etc.
-        run_model_on_silicon(silicon_arch_name)
-        ...
-
-    If you want to restrict a test to only a specific architecture, you can
-    provide an additional fixture in the form of silicon_arch_<ARCH_NAME>. This
-    will limit the range of possible values for silicon_arch_name to only be
-    ARCH_NAME.
-
-    @pytest.mark.post_commit
-    def test_model_silicon_grayskull_only(
-        silicon_arch_name,
-        silicon_arch_grayskull,
-    ):
-        # silicon_arch_name can only be grayskull or empty
-        run_model_on_silicon(silicon_arch_name)
-        ...
-
-    If --tt-arch specifies an architecture that's not ARCH_NAME, the test will
-    be skipped. We ensure skipping by providing an empty list parametrization
-    for silicon_arch_name, and with the empty_parameter_set_mark config option
-    for pytest, will skip any tests with an empty list parametrization.
-
-    Note that you must provide silicon_arch_name as a fixture if you want to
-    use the silicon_arch_<ARCH_NAME> fixture.
-
-    Note that if tests want to use the ARCH value from the API, tests should
-    create their own separate fixture which will convert the string value
-    provided from silicon_arch_name into ARCH. We keep it as strings here
-    because these fixtures will be used in tests which do not have access to
-    any Python APIs.
-    """
-
-    tt_arch = metafunc.config.getoption("--tt-arch")
-
-    silicon_arch_specific_fixture_name_to_avail_archs = {
-        "silicon_arch_grayskull": set(
-            [
-                "grayskull",
-            ]
-        ),
-        "silicon_arch_wormhole_b0": set(
-            [
-                "wormhole_b0",
-            ]
-        ),
-    }
-
-    check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs)
-    test_requested_silicon_arch_fixtures = tuple(
-        filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames)
-    )
-    is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0
-    get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs)
-    test_requested_silicon_archs = ALL_ARCHS.intersection(
-        *map(
-            get_archs_for_silicon_arch_specific_fixture,
-            test_requested_silicon_arch_fixtures,
-        )
-    )
-
-    available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS
-    matches_user_requested_silicon_arch = partial(eq, tt_arch)
-    available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs))
-
-    uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames
-
-    # sanity
-    if is_test_requesting_specific_silicon_archs and not uses_silicon_arch:
-        raise Exception(
-            f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture"
-        )
-
-    if uses_silicon_arch:
-        metafunc.parametrize("silicon_arch_name", available_archs, scope="session")
-        for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures:
-            # The values of these arch-specific fixtures should not be used in
-            # the test function, so use any parameters, like [True]
-            metafunc.parametrize(test_requested_silicon_arch_fixture, [True], scope="session")
-
-    input_method = metafunc.config.getoption("--input-method")
-    if input_method == "json":
-        json_path = metafunc.config.getoption("--input-path")
-        if not json_path:
-            raise ValueError("Please provide a valid JSON path using --input-path option.")
-        with open(json_path, "r") as f:
-            data = json.load(f)
-        metafunc.parametrize("user_input", [data])
-    elif input_method == "cli":
-        cli_input = metafunc.config.getoption("--cli-input")
-        if not cli_input:
-            raise ValueError("Please provide input using --cli-input option.")
-        metafunc.parametrize("user_input", [[cli_input]])
-
-
-# Report stashing to get outcomes etc
-phase_report_key = pytest.StashKey()
-
-
-@pytest.hookimpl(tryfirst=True, hookwrapper=True)
-def pytest_runtest_makereport(item, call):
-    # execute all other hooks to obtain the report object
-    outcome = yield
-    rep = outcome.get_result()
-
-    # store test results for each phase of a call, which can
-    # be "setup", "call", "teardown"
-    item.stash.setdefault(phase_report_key, {})[rep.when] = rep
-
-
-@pytest.fixture(scope="function")
-def reset_tensix(request, silicon_arch_name):
-    yield
-
-    report = request.node.stash[phase_report_key]
-
-    test_failed = ("call" not in report) or report["call"].failed
-
-    if test_failed:
-        logger.debug("Test failed - resetting with smi")
-        if silicon_arch_name == "grayskull":
-            result = run_process_and_get_result("tt-smi -tr all")
-        elif silicon_arch_name == "wormhole_b0":
-            result = run_process_and_get_result("tt-smi -wr all")
-        else:
-            raise Exception(f"Unrecognized arch for tensix-reset: {silicon_arch_name}")
-        assert result.returncode == 0, "Tensix reset script raised error"
-
-
 @pytest.fixture(scope="function")
 def device_params(request):
     return getattr(request, "param", {})
@@ -266,6 +86,9 @@ def device(request, device_params):
 
     device_id = request.config.getoption("device_id")
 
+    request.node.device_ids = [device_id]
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)]
+
     num_devices = ttl.device.GetNumPCIeDevices()
     assert device_id < num_devices, "CreateDevice not supported for non-mmio device"
     device = ttl.device.CreateDevice(device_id=device_id, **device_params)
@@ -284,9 +107,13 @@ def pcie_devices(request, device_params):
     import tt_lib as ttl
 
     num_devices = ttl.device.GetNumPCIeDevices()
+    device_ids = [i for i in range(num_devices)]
+
+    request.node.device_ids = device_ids
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
 
     # Get only physical devices
-    devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params)
+    devices = ttl.device.CreateDevices(device_ids, **device_params)
 
     yield [devices[i] for i in range(num_devices)]
 
@@ -301,9 +128,13 @@ def all_devices(request, device_params):
     import tt_lib as ttl
 
     num_devices = ttl.device.GetNumAvailableDevices()
+    device_ids = [i for i in range(num_devices)]
+
+    request.node.device_ids = device_ids
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
 
     # Get only physical devices
-    devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params)
+    devices = ttl.device.CreateDevices(device_ids, **device_params)
 
     yield [devices[i] for i in range(num_devices)]
 
@@ -316,6 +147,7 @@ def all_devices(request, device_params):
 @pytest.fixture(scope="function")
 def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     import ttnn
+    import tt_lib as ttl
 
     device_ids = ttnn.get_device_ids()
     try:
@@ -323,6 +155,9 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par
     except (ValueError, AttributeError):
         num_devices_requested = len(device_ids)
 
+    request.node.device_ids = device_ids[:num_devices_requested]
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]
+
     device_mesh = ttnn.open_device_mesh(
         ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params
     )
@@ -330,8 +165,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par
     logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
     yield device_mesh
 
-    import tt_lib as ttl
-
     for device in device_mesh.get_devices():
         ttl.device.DumpDeviceProfiler(device)
 
@@ -342,6 +175,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par
 @pytest.fixture(scope="function")
 def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     import ttnn
+    import tt_lib as ttl
 
     device_ids = ttnn.get_pcie_device_ids()
     try:
@@ -349,6 +183,9 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
     except (ValueError, AttributeError):
         num_pcie_devices_requested = len(device_ids)
 
+    request.node.device_ids = device_ids[:num_pcie_devices_requested]
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]]
+
     device_mesh = ttnn.open_device_mesh(
         ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params
     )
@@ -356,8 +193,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
     logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
     yield device_mesh
 
-    import tt_lib as ttl
-
     for device in device_mesh.get_devices():
         ttl.device.DumpDeviceProfiler(device)
 
@@ -368,6 +203,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
 @pytest.fixture(scope="function")
 def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     import ttnn
+    import tt_lib as ttl
 
     if ttnn.get_num_devices() < 8:
         pytest.skip()
@@ -377,6 +213,9 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device
     except (ValueError, AttributeError):
         num_devices_requested = len(device_ids)
 
+    request.node.device_ids = device_ids[:num_devices_requested]
+    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]
+
     device_mesh = ttnn.open_device_mesh(
         ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params
     )
@@ -384,8 +223,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device
     logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
     yield device_mesh
 
-    import tt_lib as ttl
-
     for device in device_mesh.get_devices():
         ttl.device.DumpDeviceProfiler(device)
 
@@ -453,6 +290,270 @@ def tracy_profile():
     profiler.disable()
 
 
-@pytest.fixture
-def input_path(request):
-    return request.config.getoption("--input-path")
+###############################
+# Modifying pytest hooks
+###############################
+ALL_ARCHS = set(
+    [
+        "grayskull",
+        "wormhole_b0",
+    ]
+)
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--tt-arch",
+        choices=[*ALL_ARCHS],
+        default=os.environ.get("ARCH_NAME", "grayskull"),
+        help="Target arch, ex. grayskull, wormhole_b0",
+    )
+    parser.addoption(
+        "--pipeline-type",
+        default="",
+        help="Only `models_device_performance_bare_metal` should run `pytest_runtest_teardown`",
+    )
+    parser.addoption(
+        "--device-id",
+        type=int,
+        default=0,
+        help="Target device id",
+    )
+    parser.addoption(
+        "--input-method",
+        action="store",
+        choices=["json", "cli"],
+        default=None,
+        help="Choose input method: 1) json or 2) cli",
+    )
+    parser.addoption(
+        "--input-path",
+        action="store",
+        default="",
+        help="Path to json file with inputs",
+    )
+    parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
+    parser.addoption(
+        "--metal-cleanup",
+        action="store",
+        default=None,
+        help="Enable process timeout",
+    )
+
+
+def pytest_generate_tests(metafunc):
+    """
+    This is not a standard docstring.
+
+    We will explain the non-standard fixtures that pytest_generate_tests is
+    creating here.
+
+    silicon_arch_name and silicon_arch_<ARCH_NAME>
+    ----------------------------------------------
+
+    This is how tests should be requesting accelerator architecture names.
+    Tests which aim to run on silicon should request a silicon_arch_name
+    fixture. Just that single fixture will parametrize the test to run on the
+    provided architecture name from the command line through the --tt-arch
+    option. The value of the fixture will be the string value of the
+    architecture name. For example,
+
+    @pytest.mark.post_commit
+    def test_model_silicon(silicon_arch_name):
+        # silicon_arch_name will be one of grayskull, wormhole_b0 etc.
+        run_model_on_silicon(silicon_arch_name)
+        ...
+
+    If you want to restrict a test to only a specific architecture, you can
+    provide an additional fixture in the form of silicon_arch_<ARCH_NAME>. This
+    will limit the range of possible values for silicon_arch_name to only be
+    ARCH_NAME.
+
+    @pytest.mark.post_commit
+    def test_model_silicon_grayskull_only(
+        silicon_arch_name,
+        silicon_arch_grayskull,
+    ):
+        # silicon_arch_name can only be grayskull or empty
+        run_model_on_silicon(silicon_arch_name)
+        ...
+
+    If --tt-arch specifies an architecture that's not ARCH_NAME, the test will
+    be skipped. We ensure skipping by providing an empty list parametrization
+    for silicon_arch_name, and with the empty_parameter_set_mark config option
+    for pytest, will skip any tests with an empty list parametrization.
+
+    Note that you must provide silicon_arch_name as a fixture if you want to
+    use the silicon_arch_<ARCH_NAME> fixture.
+
+    Note that if tests want to use the ARCH value from the API, tests should
+    create their own separate fixture which will convert the string value
+    provided from silicon_arch_name into ARCH. We keep it as strings here
+    because these fixtures will be used in tests which do not have access to
+    any Python APIs.
+    """
+
+    tt_arch = metafunc.config.getoption("--tt-arch")
+
+    silicon_arch_specific_fixture_name_to_avail_archs = {
+        "silicon_arch_grayskull": set(
+            [
+                "grayskull",
+            ]
+        ),
+        "silicon_arch_wormhole_b0": set(
+            [
+                "wormhole_b0",
+            ]
+        ),
+    }
+
+    check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs)
+    test_requested_silicon_arch_fixtures = tuple(
+        filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames)
+    )
+    is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0
+    get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs)
+    test_requested_silicon_archs = ALL_ARCHS.intersection(
+        *map(
+            get_archs_for_silicon_arch_specific_fixture,
+            test_requested_silicon_arch_fixtures,
+        )
+    )
+
+    available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS
+    matches_user_requested_silicon_arch = partial(eq, tt_arch)
+    available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs))
+
+    uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames
+
+    # sanity
+    if is_test_requesting_specific_silicon_archs and not uses_silicon_arch:
+        raise Exception(
+            f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture"
+        )
+
+    if uses_silicon_arch:
+        metafunc.parametrize("silicon_arch_name", available_archs)
+        for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures:
+            # The values of these arch-specific fixtures should not be used in
+            # the test function, so use any parameters, like [True]
+            metafunc.parametrize(test_requested_silicon_arch_fixture, [True])
+
+    input_method = metafunc.config.getoption("--input-method")
+    if input_method == "json":
+        json_path = metafunc.config.getoption("--input-path")
+        if not json_path:
+            raise ValueError("Please provide a valid JSON path using --input-path option.")
+        with open(json_path, "r") as f:
+            data = json.load(f)
+        metafunc.parametrize("user_input", [data])
+    elif input_method == "cli":
+        cli_input = metafunc.config.getoption("--cli-input")
+        if not cli_input:
+            raise ValueError("Please provide input using --cli-input option.")
+        metafunc.parametrize("user_input", [[cli_input]])
+
+
+# Report stashing to get outcomes etc
+phase_report_key = pytest.StashKey()
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    # execute all other hooks to obtain the report object
+    outcome = yield
+    rep = outcome.get_result()
+
+    # store test results for each phase of a call, which can
+    # be "setup", "call", "teardown"
+    item.stash.setdefault(phase_report_key, {})[rep.when] = rep
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_teardown(item, nextitem):
+    yield
+    metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
+    if metal_cleanup_enabled is not None:
+        report = item.stash[phase_report_key]
+        test_failed = report.get("call", None) and report["call"].failed
+        if test_failed:
+            logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
+            # reset_tensix(set(item.pci_ids))
+            reset_tensix()
+
+
+# This is overriding the timer setup hook from pytest-timeout
+# If --metal-timeout is passed, we define a new timeout method that spawns a timer process
+# At timeout, the process kills it's parent (the test process) and then itself
+@pytest.hookimpl(tryfirst=True)
+def pytest_timeout_set_timer(item, settings):
+    metal_timeout_enabled = item.config.getoption("--metal-cleanup")
+    if metal_timeout_enabled is not None:
+        parent_pid = os.getpid()
+        logger.info(f"Metal timeout {settings.timeout} seconds")
+
+        def get_parent_status():
+            try:
+                parent = psutil.Process(parent_pid)
+            except:
+                return "already dead"
+            return parent.status()
+
+        def run_timer(settings):
+            dead_status = ["zombie", "dead", "already dead"]
+            timeout = settings.timeout
+            while get_parent_status() not in dead_status and timeout > 0:
+                time.sleep(1)
+                timeout -= 1
+            if get_parent_status() != "already dead":
+                logger.info(f"Timing out test case")
+                os.kill(parent_pid, signal.SIGKILL)
+            logger.info(f"Killing timer")
+            os._exit(1)
+
+        def cancel():
+            logger.info(f"Cancelling timer")
+            metal_timer.terminate()
+
+        metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True)
+        item.cancel_timeout = cancel
+        metal_timer.start()
+        # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}")
+    return True
+
+
+# This is a hook used in pytest-xdist to handle when a worker crashes out
+# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and
+# then it should get cleaned up by the controller through this fixture :fingers_crossed:
+@pytest.hookimpl(tryfirst=True)
+def pytest_handlecrashitem(crashitem, report, sched):
+    reset_tensix()
+
+
+def reset_tensix(tt_open_devices=None):
+    metal_env = copy.deepcopy(os.environ)
+    arch = metal_env.get("ARCH_NAME")
+    if arch != "grayskull" and arch != "wormhole_b0":
+        raise Exception(f"Unrecognized arch for tensix-reset: {arch}")
+
+    if tt_open_devices is None:
+        logger.info(f"Running reset with reset script: /opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
+        smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
+    else:
+        tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
+        check_smi = run_process_and_get_result("tt-smi-metal -h")
+        logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
+        logger.info(f"Running reset for pci devices: {tt_open_devices_str}")
+        if check_smi.returncode > 0:
+            logger.info(f"Test failed - resetting {arch} with tt-smi")
+            smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}")
+        else:
+            smi_reset_result = run_process_and_get_result(f"tt-smi-metal -r {tt_open_devices_str}")
+    logger.info(f"tt-smi reset status: {smi_reset_result.returncode}")
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_xdist_auto_num_workers(config):
+    logger.info("getting num of xdist workers")
+    return 1
diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp
index 915276dfa39..d03205be995 100644
--- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp
+++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp
@@ -136,6 +136,10 @@ void DeviceModule(py::module &m_device) {
         Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted.
     )doc");
 
+    m_device.def("GetPCIeDeviceID", &GetPCIeDeviceID, R"doc(
+        Returns associated mmio device of give device id.
+    )doc");
+
     m_device.def("SetDefaultDevice", &AutoFormat::SetDefaultDevice, R"doc(
         Sets the default device to use for ops when inputs aren't on device.
 
diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp
index a2a490c345c..5572bad808e 100644
--- a/tt_metal/host_api.hpp
+++ b/tt_metal/host_api.hpp
@@ -55,6 +55,8 @@ size_t GetNumAvailableDevices();
  */
 size_t GetNumPCIeDevices();
 
+chip_id_t GetPCIeDeviceID(chip_id_t device_id);
+
 /**
  * Instantiates a device object.
  *
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index a1ec68fb1bb..dc529c1256e 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -730,6 +730,10 @@ size_t GetNumPCIeDevices() {
 #endif
 }
 
+chip_id_t GetPCIeDeviceID(chip_id_t device_id){
+    return tt::Cluster::instance().get_associated_mmio_device(device_id);
+}
+
 Device *CreateDevice(
     chip_id_t device_id,
     const uint8_t num_hw_cqs,