Skip to content

Commit

Permalink
Reset cumulative metrics after job restart
Browse files: browse the repository at this point in the history
  • Loading branch information
evansd committed Feb 8, 2024
1 parent 89dce75 commit 129fa02
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 3 deletions.
1 change: 1 addition & 0 deletions jobrunner/lib/docker_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def get_container_stats(timeout=DEFAULT_TIMEOUT):
removeprefix(row["Name"], "os-job-"): {
"cpu_percentage": float(row["CPUPerc"].rstrip("%")),
"memory_used": _parse_size(row["MemUsage"].split()[0]),
"container_id": row["Container"],
}
for row in data
if row["Name"].startswith("os-job-")
Expand Down
11 changes: 11 additions & 0 deletions jobrunner/record_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,16 @@ def update_job_metrics(job, raw_metrics, duration_s, runtime_s):

job_metrics = read_job_metrics(job.id)

# If the job has been restarted so it's now running in a new container then we need
# to zero out all the previous stats.
if (
# This check is only needed for smooth deployment as previous metrics dicts
# won't have the container_id populated yet
"container_id" in job_metrics
and job_metrics["container_id"] != raw_metrics["container_id"]
):
job_metrics = defaultdict(float)

cpu = raw_metrics["cpu_percentage"]
mem_mb = raw_metrics["memory_used"] / (1024.0 * 1024.0)

Expand All @@ -220,6 +230,7 @@ def update_job_metrics(job, raw_metrics, duration_s, runtime_s):
job_metrics["mem_mb_cumsum"] += duration_s * mem_mb
job_metrics["mem_mb_mean"] = job_metrics["mem_mb_cumsum"] / runtime_s
job_metrics["mem_mb_peak"] = max(job_metrics["mem_mb_peak"], mem_mb)
job_metrics["container_id"] = raw_metrics["container_id"]

write_job_metrics(job.id, job_metrics)

Expand Down
1 change: 1 addition & 0 deletions tests/test_local_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def test_execute_metrics(docker_cleanup, job_definition, tmp_work_dir, db):
"mem_mb_cumsum",
"mem_mb_mean",
"mem_mb_peak",
"container_id",
]


Expand Down
48 changes: 45 additions & 3 deletions tests/test_record_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_record_tick_trace(db, freezer, monkeypatch):
running_job.id: {
"cpu_percentage": 50.0,
"memory_used": 1000 * mb,
"container_id": "a0b1c2d3",
}
}

Expand Down Expand Up @@ -191,7 +192,11 @@ def test_update_job_metrics(db):
# 50%/100m for 1s
record_stats.update_job_metrics(
job,
{"cpu_percentage": 50, "memory_used": 100 * mb},
{
"cpu_percentage": 50,
"memory_used": 100 * mb,
"container_id": "a0b1c2d3",
},
duration_s=1.0,
runtime_s=1.0,
)
Expand All @@ -206,12 +211,17 @@ def test_update_job_metrics(db):
"mem_mb_mean": 100.0,
"mem_mb_peak": 100,
"mem_mb_sample": 100,
"container_id": "a0b1c2d3",
}

# 100%/1000m for 1s
record_stats.update_job_metrics(
job,
{"cpu_percentage": 100, "memory_used": 1000 * mb},
{
"cpu_percentage": 100,
"memory_used": 1000 * mb,
"container_id": "a0b1c2d3",
},
duration_s=1.0,
runtime_s=2.0,
)
Expand All @@ -226,12 +236,17 @@ def test_update_job_metrics(db):
"mem_mb_mean": 550.0,
"mem_mb_peak": 1000,
"mem_mb_sample": 1000,
"container_id": "a0b1c2d3",
}

# 100%/1000m for 8s
record_stats.update_job_metrics(
job,
{"cpu_percentage": 100, "memory_used": 1000 * mb},
{
"cpu_percentage": 100,
"memory_used": 1000 * mb,
"container_id": "a0b1c2d3",
},
duration_s=8.0,
runtime_s=10.0,
)
Expand All @@ -246,4 +261,31 @@ def test_update_job_metrics(db):
"mem_mb_mean": 910.0,
"mem_mb_peak": 1000,
"mem_mb_sample": 1000,
"container_id": "a0b1c2d3",
}

# Job has been restarted (note reset `runtime_s` and new container_id)
record_stats.update_job_metrics(
job,
{
"cpu_percentage": 50,
"memory_used": 100 * mb,
"container_id": "e4f5a6b7",
},
duration_s=1.0,
runtime_s=1.0,
)

# Metrics should be reset as a result of the container_id changing
metrics = record_stats.read_job_metrics(job.id)
assert metrics == {
"cpu_cumsum": 50.0,
"cpu_mean": 50.0,
"cpu_peak": 50,
"cpu_sample": 50,
"mem_mb_cumsum": 100.0,
"mem_mb_mean": 100.0,
"mem_mb_peak": 100,
"mem_mb_sample": 100,
"container_id": "e4f5a6b7",
}

0 comments on commit 129fa02

Please sign in to comment.