From dfd04fef2cd9b95e35f2a95dbcfec5683b09d5b0 Mon Sep 17 00:00:00 2001 From: Eivind Jahren Date: Thu, 26 Sep 2024 15:42:23 +0200 Subject: [PATCH] Fix flaky kill_before_submit tests --- .../unit_tests/scheduler/test_lsf_driver.py | 60 +++++++++---------- .../unit_tests/scheduler/test_slurm_driver.py | 60 +++++++++---------- 2 files changed, 59 insertions(+), 61 deletions(-) diff --git a/tests/ert/unit_tests/scheduler/test_lsf_driver.py b/tests/ert/unit_tests/scheduler/test_lsf_driver.py index 558e45c94dd..76249cd7876 100644 --- a/tests/ert/unit_tests/scheduler/test_lsf_driver.py +++ b/tests/ert/unit_tests/scheduler/test_lsf_driver.py @@ -6,7 +6,6 @@ import re import stat import string -import sys import time from contextlib import ExitStack as does_not_raise from pathlib import Path @@ -38,7 +37,7 @@ parse_bhist, parse_bjobs, ) -from tests.ert.utils import poll +from tests.ert.utils import poll, wait_until from .conftest import mock_bin @@ -1212,11 +1211,7 @@ def mock_poll_once_by_bhist(*args, **kwargs): assert driver._bhist_cache and job_id in driver._bhist_cache -@pytest.mark.integration_test -@pytest.mark.flaky(rerun=10) -async def test_that_kill_before_submit_is_finished_works( - tmp_path, monkeypatch, caplog, pytestconfig -): +async def test_that_kill_before_submit_is_finished_works(tmp_path, monkeypatch, caplog): """This test asserts that it is possible to issue a kill command to a realization right after it has been submitted (as in driver.submit()). @@ -1228,18 +1223,6 @@ async def test_that_kill_before_submit_is_finished_works( """ os.chdir(tmp_path) - if pytestconfig.getoption("lsf"): - # Allow more time when tested on a real compute cluster to avoid false positives. - job_kill_window = 10 - test_grace_time = 20 - elif sys.platform.startswith("darwin"): - # Mitigate flakiness on low-power test nodes - job_kill_window = 5 - test_grace_time = 10 - else: - job_kill_window = 2 - test_grace_time = 4 - bin_path = tmp_path / "bin" bin_path.mkdir() monkeypatch.setenv("PATH", f"{bin_path}:{os.environ['PATH']}") @@ -1253,6 +1236,31 @@ async def test_that_kill_before_submit_is_finished_works( caplog.set_level(logging.DEBUG) driver = LsfDriver(bsub_cmd="slow_bsub") + job_path = bin_path / "job.sh" + job_path.write_text( + dedent( + f"""\ + #!/bin/bash + + do_stop=0 + + function handle() + {{ + echo "killed" > {tmp_path}/was_killed + do_stop=1 + exit + }} + trap handle SIGTERM + while [[ $do_stop == 0 ]] + do + sleep 0.1 + done + """ + ), + encoding="utf-8", + ) + job_path.chmod(job_path.stat().st_mode | stat.S_IEXEC) + # Allow submit and kill to be interleaved by asyncio by issuing # submit() in its own asyncio Task: asyncio.create_task( @@ -1260,9 +1268,7 @@ async def test_that_kill_before_submit_is_finished_works( # The sleep is the time window in which we can kill the job before # the unwanted finish message appears on disk. 0, - "sh", - "-c", - f"sleep {job_kill_window}; touch {tmp_path}/survived", + str(job_path), ) ) await asyncio.sleep(0.01) # Allow submit task to start executing @@ -1280,12 +1286,4 @@ async def finished(iens: int, returncode: int): await poll(driver, {0}, finished=finished) assert "ERROR" not in str(caplog.text) - # In case the return value of the killed job is correct but the submitted - # shell script is still running for whatever reason, a file called - # "survived" will appear on disk. Wait for it, and then ensure it is not - # there. - assert test_grace_time > job_kill_window, "Wrong test setup" - await asyncio.sleep(test_grace_time) - assert not Path( - "survived" - ).exists(), "The process children of the job should also have been killed" + wait_until((tmp_path / "was_killed").exists, timeout=10) diff --git a/tests/ert/unit_tests/scheduler/test_slurm_driver.py b/tests/ert/unit_tests/scheduler/test_slurm_driver.py index 49a41632899..a47f538b74f 100644 --- a/tests/ert/unit_tests/scheduler/test_slurm_driver.py +++ b/tests/ert/unit_tests/scheduler/test_slurm_driver.py @@ -4,16 +4,16 @@ import random import stat import string -import sys from contextlib import ExitStack as does_not_raise from pathlib import Path +from textwrap import dedent import pytest from hypothesis import given from hypothesis import strategies as st from ert.scheduler import SlurmDriver -from tests.ert.utils import poll +from tests.ert.utils import poll, wait_until from .conftest import mock_bin @@ -350,24 +350,9 @@ async def test_submit_with_num_cpu(pytestconfig, job_name): assert Path("test").read_text(encoding="utf-8") == "test\n" -@pytest.mark.flaky(reruns=3) -async def test_kill_before_submit_is_finished( - tmp_path, monkeypatch, caplog, pytestconfig -): +async def test_kill_before_submit_is_finished(tmp_path, monkeypatch, caplog): os.chdir(tmp_path) - if pytestconfig.getoption("slurm"): - # Allow more time when tested on a real compute cluster to avoid false positives. - job_kill_window = 5 - test_grace_time = 10 - elif sys.platform.startswith("darwin"): - # Mitigate flakiness on low-power test nodes - job_kill_window = 5 - test_grace_time = 10 - else: - job_kill_window = 1 - test_grace_time = 2 - bin_path = tmp_path / "bin" bin_path.mkdir() monkeypatch.setenv("PATH", f"{bin_path}:{os.environ['PATH']}") @@ -381,6 +366,31 @@ async def test_kill_before_submit_is_finished( caplog.set_level(logging.DEBUG) driver = SlurmDriver(sbatch_cmd="slow_sbatch") + job_path = bin_path / "job.sh" + job_path.write_text( + dedent( + f"""\ + #!/bin/bash + + do_stop=0 + + function handle() + {{ + echo "killed" > {tmp_path}/was_killed + do_stop=1 + exit + }} + trap handle SIGTERM + while [[ $do_stop == 0 ]] + do + sleep 0.1 + done + """ + ), + encoding="utf-8", + ) + job_path.chmod(job_path.stat().st_mode | stat.S_IEXEC) + # Allow submit and kill to be interleaved by asyncio by issuing # submit() in its own asyncio Task: asyncio.create_task( @@ -388,9 +398,7 @@ async def test_kill_before_submit_is_finished( # The sleep is the time window in which we can kill the job before # the unwanted finish message appears on disk. 0, - "sh", - "-c", - f"sleep {job_kill_window}; touch {tmp_path}/survived", + str(job_path), ) ) await asyncio.sleep(0.01) # Allow submit task to start executing @@ -403,12 +411,4 @@ async def finished(iens: int, returncode: int): await poll(driver, {0}, finished=finished) - # In case the return value of the killed job is correct but the submitted - # shell script is still running for whatever reason, a file called - # "survived" will appear on disk. Wait for it, and then ensure it is not - # there. - assert test_grace_time > job_kill_window, "Wrong test setup" - await asyncio.sleep(test_grace_time) - assert not Path( - "survived" - ).exists(), "The process children of the job should also have been killed" + wait_until((tmp_path / "was_killed").exists, timeout=10)