From 129fa027c1af010a299b92a01e0c66f1c6793531 Mon Sep 17 00:00:00 2001
From: David Evans
Date: Thu, 8 Feb 2024 10:04:22 +0000
Subject: [PATCH] Reset cumulative metrics after job restart

---
 jobrunner/lib/docker_stats.py |  1 +
 jobrunner/record_stats.py     | 11 ++++++++
 tests/test_local_executor.py  |  1 +
 tests/test_record_stats.py    | 48 ++++++++++++++++++++++++++++++++---
 4 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/jobrunner/lib/docker_stats.py b/jobrunner/lib/docker_stats.py
index daf712e3..f05150ad 100644
--- a/jobrunner/lib/docker_stats.py
+++ b/jobrunner/lib/docker_stats.py
@@ -30,6 +30,7 @@ def get_container_stats(timeout=DEFAULT_TIMEOUT):
         removeprefix(row["Name"], "os-job-"): {
             "cpu_percentage": float(row["CPUPerc"].rstrip("%")),
             "memory_used": _parse_size(row["MemUsage"].split()[0]),
+            "container_id": row["Container"],
         }
         for row in data
         if row["Name"].startswith("os-job-")
diff --git a/jobrunner/record_stats.py b/jobrunner/record_stats.py
index 3614f7e5..cd1ca2d3 100644
--- a/jobrunner/record_stats.py
+++ b/jobrunner/record_stats.py
@@ -209,6 +209,16 @@ def update_job_metrics(job, raw_metrics, duration_s, runtime_s):
 
     job_metrics = read_job_metrics(job.id)
 
+    # If the job has been restarted so it's now running in a new container then we need
+    # to zero out all the previous stats.
+    if (
+        # This check is only needed for smooth deployment as previous metrics dicts
+        # won't have the container_id populated yet
+        "container_id" in job_metrics
+        and job_metrics["container_id"] != raw_metrics["container_id"]
+    ):
+        job_metrics = defaultdict(float)
+
     cpu = raw_metrics["cpu_percentage"]
     mem_mb = raw_metrics["memory_used"] / (1024.0 * 1024.0)
 
@@ -220,6 +230,7 @@
     job_metrics["mem_mb_cumsum"] += duration_s * mem_mb
     job_metrics["mem_mb_mean"] = job_metrics["mem_mb_cumsum"] / runtime_s
     job_metrics["mem_mb_peak"] = max(job_metrics["mem_mb_peak"], mem_mb)
+    job_metrics["container_id"] = raw_metrics["container_id"]
 
     write_job_metrics(job.id, job_metrics)
 
diff --git a/tests/test_local_executor.py b/tests/test_local_executor.py
index 994b9b9f..5f7e5922 100644
--- a/tests/test_local_executor.py
+++ b/tests/test_local_executor.py
@@ -272,6 +272,7 @@ def test_execute_metrics(docker_cleanup, job_definition, tmp_work_dir, db):
         "mem_mb_cumsum",
         "mem_mb_mean",
         "mem_mb_peak",
+        "container_id",
     ]
 
 
diff --git a/tests/test_record_stats.py b/tests/test_record_stats.py
index 8cc7dad1..c4b9ca66 100644
--- a/tests/test_record_stats.py
+++ b/tests/test_record_stats.py
@@ -67,6 +67,7 @@ def test_record_tick_trace(db, freezer, monkeypatch):
         running_job.id: {
             "cpu_percentage": 50.0,
             "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
         }
     }
 
@@ -191,7 +192,11 @@
     # 50%/100m for 1s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 50, "memory_used": 100 * mb},
+        {
+            "cpu_percentage": 50,
+            "memory_used": 100 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=1.0,
         runtime_s=1.0,
     )
@@ -206,12 +211,17 @@
         "mem_mb_mean": 100.0,
         "mem_mb_peak": 100,
         "mem_mb_sample": 100,
+        "container_id": "a0b1c2d3",
     }
 
     # 100%/1000m for 1s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 100, "memory_used": 1000 * mb},
+        {
+            "cpu_percentage": 100,
+            "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=1.0,
         runtime_s=2.0,
     )
@@ -226,12 +236,17 @@
         "mem_mb_mean": 550.0,
         "mem_mb_peak": 1000,
         "mem_mb_sample": 1000,
+        "container_id": "a0b1c2d3",
     }
 
     # 100%/1000m for 8s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 100, "memory_used": 1000 * mb},
+        {
+            "cpu_percentage": 100,
+            "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=8.0,
         runtime_s=10.0,
     )
@@ -246,4 +261,31 @@
         "mem_mb_mean": 910.0,
         "mem_mb_peak": 1000,
         "mem_mb_sample": 1000,
+        "container_id": "a0b1c2d3",
+    }
+
+    # Job has been restarted (note reset `runtime_s` and new container_id)
+    record_stats.update_job_metrics(
+        job,
+        {
+            "cpu_percentage": 50,
+            "memory_used": 100 * mb,
+            "container_id": "e4f5a6b7",
+        },
+        duration_s=1.0,
+        runtime_s=1.0,
+    )
+
+    # Metrics should be reset as a result of the container_id changing
+    metrics = record_stats.read_job_metrics(job.id)
+    assert metrics == {
+        "cpu_cumsum": 50.0,
+        "cpu_mean": 50.0,
+        "cpu_peak": 50,
+        "cpu_sample": 50,
+        "mem_mb_cumsum": 100.0,
+        "mem_mb_mean": 100.0,
+        "mem_mb_peak": 100,
+        "mem_mb_sample": 100,
+        "container_id": "e4f5a6b7",
     }