From 129fa027c1af010a299b92a01e0c66f1c6793531 Mon Sep 17 00:00:00 2001
From: David Evans
Date: Thu, 8 Feb 2024 10:04:22 +0000
Subject: [PATCH] Reset cumulative metrics after job restart

---
 jobrunner/lib/docker_stats.py |  1 +
 jobrunner/record_stats.py     | 11 ++++++++
 tests/test_local_executor.py  |  1 +
 tests/test_record_stats.py    | 48 ++++++++++++++++++++++++++++++++---
 4 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/jobrunner/lib/docker_stats.py b/jobrunner/lib/docker_stats.py
index daf712e3..f05150ad 100644
--- a/jobrunner/lib/docker_stats.py
+++ b/jobrunner/lib/docker_stats.py
@@ -30,6 +30,7 @@ def get_container_stats(timeout=DEFAULT_TIMEOUT):
         removeprefix(row["Name"], "os-job-"): {
             "cpu_percentage": float(row["CPUPerc"].rstrip("%")),
             "memory_used": _parse_size(row["MemUsage"].split()[0]),
+            "container_id": row["Container"],
         }
         for row in data
         if row["Name"].startswith("os-job-")
diff --git a/jobrunner/record_stats.py b/jobrunner/record_stats.py
index 3614f7e5..cd1ca2d3 100644
--- a/jobrunner/record_stats.py
+++ b/jobrunner/record_stats.py
@@ -209,6 +209,16 @@ def update_job_metrics(job, raw_metrics, duration_s, runtime_s):
 
     job_metrics = read_job_metrics(job.id)
 
+    # If the job has been restarted so it's now running in a new container then we need
+    # to zero out all the previous stats.
+    if (
+        # This check is only needed for smooth deployment as previous metrics dicts
+        # won't have the container_id populated yet
+        "container_id" in job_metrics
+        and job_metrics["container_id"] != raw_metrics["container_id"]
+    ):
+        job_metrics = defaultdict(float)
+
     cpu = raw_metrics["cpu_percentage"]
     mem_mb = raw_metrics["memory_used"] / (1024.0 * 1024.0)
 
@@ -220,6 +230,7 @@
     job_metrics["mem_mb_cumsum"] += duration_s * mem_mb
     job_metrics["mem_mb_mean"] = job_metrics["mem_mb_cumsum"] / runtime_s
     job_metrics["mem_mb_peak"] = max(job_metrics["mem_mb_peak"], mem_mb)
+    job_metrics["container_id"] = raw_metrics["container_id"]
 
     write_job_metrics(job.id, job_metrics)
 
diff --git a/tests/test_local_executor.py b/tests/test_local_executor.py
index 994b9b9f..5f7e5922 100644
--- a/tests/test_local_executor.py
+++ b/tests/test_local_executor.py
@@ -272,6 +272,7 @@ def test_execute_metrics(docker_cleanup, job_definition, tmp_work_dir, db):
         "mem_mb_cumsum",
         "mem_mb_mean",
         "mem_mb_peak",
+        "container_id",
     ]
 
 
diff --git a/tests/test_record_stats.py b/tests/test_record_stats.py
index 8cc7dad1..c4b9ca66 100644
--- a/tests/test_record_stats.py
+++ b/tests/test_record_stats.py
@@ -67,6 +67,7 @@ def test_record_tick_trace(db, freezer, monkeypatch):
         running_job.id: {
             "cpu_percentage": 50.0,
             "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
         }
     }
 
@@ -191,7 +192,11 @@
     # 50%/100m for 1s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 50, "memory_used": 100 * mb},
+        {
+            "cpu_percentage": 50,
+            "memory_used": 100 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=1.0,
         runtime_s=1.0,
     )
@@ -206,12 +211,17 @@
         "mem_mb_mean": 100.0,
         "mem_mb_peak": 100,
         "mem_mb_sample": 100,
+        "container_id": "a0b1c2d3",
     }
 
     # 100%/1000m for 1s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 100, "memory_used": 1000 * mb},
+        {
+            "cpu_percentage": 100,
+            "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=1.0,
         runtime_s=2.0,
     )
@@ -226,12 +236,17 @@
         "mem_mb_mean": 550.0,
         "mem_mb_peak": 1000,
         "mem_mb_sample": 1000,
+        "container_id": "a0b1c2d3",
     }
 
     # 100%/1000m for 8s
     record_stats.update_job_metrics(
         job,
-        {"cpu_percentage": 100, "memory_used": 1000 * mb},
+        {
+            "cpu_percentage": 100,
+            "memory_used": 1000 * mb,
+            "container_id": "a0b1c2d3",
+        },
         duration_s=8.0,
         runtime_s=10.0,
     )
@@ -246,4 +261,31 @@
         "mem_mb_mean": 910.0,
         "mem_mb_peak": 1000,
         "mem_mb_sample": 1000,
+        "container_id": "a0b1c2d3",
+    }
+
+    # Job has been restarted (note reset `runtime_s` and new container_id)
+    record_stats.update_job_metrics(
+        job,
+        {
+            "cpu_percentage": 50,
+            "memory_used": 100 * mb,
+            "container_id": "e4f5a6b7",
+        },
+        duration_s=1.0,
+        runtime_s=1.0,
+    )
+
+    # Metrics should be reset as a result of the container_id changing
+    metrics = record_stats.read_job_metrics(job.id)
+    assert metrics == {
+        "cpu_cumsum": 50.0,
+        "cpu_mean": 50.0,
+        "cpu_peak": 50,
+        "cpu_sample": 50,
+        "mem_mb_cumsum": 100.0,
+        "mem_mb_mean": 100.0,
+        "mem_mb_peak": 100,
+        "mem_mb_sample": 100,
+        "container_id": "e4f5a6b7",
     }