From a12045f311ac75e5df1f3c33b75738eb57297843 Mon Sep 17 00:00:00 2001
From: Håvard Berland
Date: Wed, 20 Sep 2023 12:54:01 +0200
Subject: [PATCH] Count memory usage for grandchildren in job_dispatch

job_dispatch/job_runner only picked up the memory usage of its direct
child. When the direct child is merely a wrapper around potentially
memory-hungry processes, the memory usage reported by job_dispatch is
essentially garbage.

This PR lets the job runner scan all of its descendant processes
(children, grandchildren, and so on) when estimating memory usage for a
started forward model.

NB: This counts RSS (resident set size). When summing over multiple
processes, the RSS sum counts shared memory pages multiple times, so
the sum is likely an overestimate of the actual memory load imposed.
This is probably a minor problem, as the memory consumption numbers of
interest stem from big datasets that are not shared among processes.
---
 src/_ert_job_runner/job.py              |  7 ++++-
 tests/unit_tests/job_runner/test_job.py | 41 +++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/_ert_job_runner/job.py b/src/_ert_job_runner/job.py
index 4a1fd64f94c..9c913b13eeb 100644
--- a/src/_ert_job_runner/job.py
+++ b/src/_ert_job_runner/job.py
@@ -117,7 +117,12 @@ def ensure_file_handles_closed():
         max_memory_usage = 0
         while exit_code is None:
             try:
-                memory = process.memory_info().rss
+                memory = process.memory_info().rss + sum(
+                    [
+                        child.memory_info().rss
+                        for child in process.children(recursive=True)
+                    ]
+                )
             except (NoSuchProcess, AccessDenied, ZombieProcess):
                 # In case of a process that has died and is in some transitional
                 # state, we ignore any failures. Only seen on OSX thus far.
diff --git a/tests/unit_tests/job_runner/test_job.py b/tests/unit_tests/job_runner/test_job.py
index 4440cf7293a..3ccdec7297c 100644
--- a/tests/unit_tests/job_runner/test_job.py
+++ b/tests/unit_tests/job_runner/test_job.py
@@ -33,6 +33,47 @@ def test_run_with_process_failing(
         next(run)
 
 
+@pytest.mark.usefixtures("use_tmpdir")
+def test_memory_usage_counts_grandchildren():
+    scriptname = "recursive_memory_hog.py"
+    with open(scriptname, "w", encoding="utf-8") as script:
+        script.write(
+            """#!/usr/bin/env python
+import os
+import sys
+import time
+
+counter = int(sys.argv[1])
+numbers = list(range(int(1e6)))
+if counter > 0:
+    parent = os.fork()
+    if not parent:
+        os.execv(sys.argv[0], [sys.argv[0], str(counter - 1)])
+time.sleep(0.3)"""  # Too short a sleep makes the test faster but flaky
+        )
+    executable = os.path.realpath(scriptname)
+    os.chmod(scriptname, stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)
+
+    def max_memory_per_subprocess_layer(layers: int) -> int:
+        job = Job(
+            {
+                "executable": executable,
+                "argList": [str(layers)],
+            },
+            0,
+        )
+        job.MEMORY_POLL_PERIOD = 0.01
+        max_seen = 0
+        for status in job.run():
+            if isinstance(status, Running):
+                max_seen = max(max_seen, status.max_memory_usage)
+        return max_seen
+
+    max_seens = [max_memory_per_subprocess_layer(layers) for layers in range(3)]
+    assert max_seens[0] < max_seens[1]
+    assert max_seens[1] < max_seens[2]
+
+
 @pytest.mark.usefixtures("use_tmpdir")
 def test_run_fails_using_exit_bash_builtin():
     job = Job(
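
A note on the RSS caveat above: PSS (proportional set size) divides each
shared page among the processes that map it, so a PSS sum avoids the
double counting. The sketch below is not part of the patch; the helper
names tree_rss/tree_pss are made up for illustration, and psutil exposes
PSS via memory_full_info() on Linux only.

# Hypothetical helpers, assuming psutil is available. tree_rss() mirrors
# what this patch does: summing RSS over a process and its descendants,
# which counts shared pages once per process. tree_pss() sums PSS instead,
# splitting shared pages among the processes that map them (Linux only).
import psutil


def tree_rss(pid: int) -> int:
    # Sum RSS over the process and all its descendants; may overestimate
    # the real load, since shared pages are counted multiple times.
    process = psutil.Process(pid)
    total = process.memory_info().rss
    for child in process.children(recursive=True):
        try:
            total += child.memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # Descendants may vanish while we iterate; skip them.
            pass
    return total


def tree_pss(pid: int) -> int:
    # Linux-only: PSS avoids the double counting, at the cost of a slower
    # /proc/<pid>/smaps read per process, and may raise AccessDenied for
    # processes owned by other users.
    process = psutil.Process(pid)
    total = process.memory_full_info().pss
    for child in process.children(recursive=True):
        try:
            total += child.memory_full_info().pss
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return total

For the test in this patch, where every layer execs a fresh interpreter
and builds its own million-integer list, the two sums should be close;
the gap widens when descendants share large mappings. Note also that
children(recursive=True) only finds processes whose parent chain is
intact: a descendant that daemonizes and reparents to init drops out of
the tree and is no longer counted.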