From a23e343a5da7e4adf7f97906a3fc82fd769e5d27 Mon Sep 17 00:00:00 2001 From: bloodearnest Date: Wed, 3 Jan 2024 12:37:31 +0000 Subject: [PATCH] Add StatusCode.STALE_CODELISTS Previously, the handling of a stale codelists meant an INTERNAL_ERROR was raised, which needlessly alerted tech-support. This change adds a new StatusCode to track this specific error. This allows us to to avoid re-using INTERNAL_ERROR, and also make it easier to measure stale codelist frequency. In updating the state diagram, I realised we were missing a CREATED->SUCCEEDED transition, so added that too. --- DEVELOPERS.md | 4 +++- jobrunner/create_or_update_jobs.py | 26 +++++++++++++++------- jobrunner/models.py | 2 ++ tests/test_create_or_update_jobs.py | 34 +++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 368b7863..955d886e 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -55,6 +55,8 @@ graph TD CREATED --> WAITING_ON_REBOOT CREATED --> WAITING_DB_MAINTENANCE CREATED --> WAITING_PAUSED + CREATED --> STALE_CODELISTS + CREATED --> SUCCEEDED WAITING_ON_DEPENDENCIES --> WAITING_ON_WORKERS WAITING_ON_DEPENDENCIES --> WAITING_ON_REBOOT WAITING_ON_DEPENDENCIES --> WAITING_DB_MAINTENANCE @@ -86,7 +88,7 @@ graph TD classDef blocking fill:#ffdf75,color:#7d661c,stroke-width:2px,stroke:#997d23; class LEGEND_BLOCKED,WAITING_ON_WORKERS,WAITING_ON_REBOOT,WAITING_PAUSED,WAITING_DB_MAINTENANCE blocking - class LEGEND_ERROR,INTERNAL_ERROR,UNMATCHED_PATTERNS,DEPENDENCY_FAILED,NONZERO_EXIT error + class LEGEND_ERROR,INTERNAL_ERROR,UNMATCHED_PATTERNS,DEPENDENCY_FAILED,NONZERO_EXIT,STALE_CODELISTS error ``` diff --git a/jobrunner/create_or_update_jobs.py b/jobrunner/create_or_update_jobs.py index f3e35e14..9534ec62 100644 --- a/jobrunner/create_or_update_jobs.py +++ b/jobrunner/create_or_update_jobs.py @@ -36,6 +36,10 @@ class JobRequestError(Exception): pass +class StaleCodelistError(JobRequestError): + pass + + class NothingToDoError(JobRequestError): pass @@ -62,10 +66,10 @@ def create_or_update_jobs(job_request): JobRequestError, ) as e: log.info(f"JobRequest failed:\n{e}") - create_failed_job(job_request, e) + create_job_from_exception(job_request, e) except Exception: log.exception("Uncaught error while creating jobs") - create_failed_job(job_request, JobRequestError("Internal error")) + create_job_from_exception(job_request, JobRequestError("Internal error")) else: if job_request.cancelled_actions: log.debug("Cancelling actions: %s", job_request.cancelled_actions) @@ -312,12 +316,12 @@ def assert_codelists_ok(job_request, new_jobs): # Codelists are out of date; fail the entire job request if any job # requires database access if job.requires_db: - raise JobRequestError( + raise StaleCodelistError( f"Codelists are out of date (required by action {job.action})" ) -def create_failed_job(job_request, exception): +def create_job_from_exception(job_request, exception): """ Sometimes we want to say to the job-server (and the user): your JobRequest was broken so we weren't able to create any jobs for it. But the only way @@ -328,19 +332,25 @@ def create_failed_job(job_request, exception): This is a bit of a hack, but it keeps the sync protocol simple. """ + action = "__error__" + error = exception + state = State.FAILED + status_message = str(exception) + # Special case for the NothingToDoError which we treat as a success if isinstance(exception, NothingToDoError): state = State.SUCCEEDED code = StatusCode.SUCCEEDED - status_message = "All actions have already run" action = job_request.requested_actions[0] error = None + # StaleCodelistError is a failure but not an INTERNAL_ERROR + elif isinstance(exception, StaleCodelistError): + code = StatusCode.STALE_CODELISTS else: - state = State.FAILED code = StatusCode.INTERNAL_ERROR + # include exception name in message to aid debugging status_message = f"{type(exception).__name__}: {exception}" - action = "__error__" - error = exception + now = time.time() job = Job( job_request_id=job_request.id, diff --git a/jobrunner/models.py b/jobrunner/models.py index 0b53450f..84ae5c8f 100644 --- a/jobrunner/models.py +++ b/jobrunner/models.py @@ -74,6 +74,7 @@ class StatusCode(Enum): UNMATCHED_PATTERNS = "unmatched_patterns" INTERNAL_ERROR = "internal_error" KILLED_BY_ADMIN = "killed_by_admin" + STALE_CODELISTS = "stale_codelists" @property def is_final_code(self): @@ -93,6 +94,7 @@ def __lt__(self, other): StatusCode.UNMATCHED_PATTERNS, StatusCode.INTERNAL_ERROR, StatusCode.KILLED_BY_ADMIN, + StatusCode.STALE_CODELISTS, ] diff --git a/tests/test_create_or_update_jobs.py b/tests/test_create_or_update_jobs.py index 8b67943f..c4bbddf1 100644 --- a/tests/test_create_or_update_jobs.py +++ b/tests/test_create_or_update_jobs.py @@ -9,7 +9,8 @@ from jobrunner.create_or_update_jobs import ( JobRequestError, NothingToDoError, - create_failed_job, + StaleCodelistError, + create_job_from_exception, create_jobs, create_or_update_jobs, validate_job_request, @@ -305,10 +306,10 @@ def create_jobs_with_project_file(job_request, project_file): return create_jobs(job_request) -def test_create_failed_job(db): +def test_create_job_from_exception(db): job_request = job_request_factory_raw() - create_failed_job(job_request, Exception("test")) + create_job_from_exception(job_request, Exception("test")) job = find_one(Job, job_request_id=job_request.id) @@ -329,15 +330,15 @@ def test_create_failed_job(db): assert spans[1].events[0].attributes["exception.message"] == "test" -def test_create_failed_job_nothing_to_do(db): +def test_create_job_from_exception_nothing_to_do(db): job_request = job_request_factory_raw() - create_failed_job(job_request, NothingToDoError()) + create_job_from_exception(job_request, NothingToDoError("nothing to do")) job = find_one(Job, job_request_id=job_request.id) assert job.state == State.SUCCEEDED assert job.status_code == StatusCode.SUCCEEDED - assert job.status_message == "All actions have already run" + assert job.status_message == "nothing to do" assert job.action == job_request.requested_actions[0] spans = get_trace("jobs") @@ -348,6 +349,25 @@ def test_create_failed_job_nothing_to_do(db): assert spans[1].status.status_code == trace.StatusCode.UNSET +def test_create_job_from_exception_stale_codelist(db): + job_request = job_request_factory_raw() + + create_job_from_exception(job_request, StaleCodelistError("stale")) + job = find_one(Job, job_request_id=job_request.id) + + assert job.state == State.FAILED + assert job.status_code == StatusCode.STALE_CODELISTS + assert job.status_message == "stale" + assert job.action == "__error__" + + spans = get_trace("jobs") + + assert spans[0].name == "STALE_CODELISTS" + assert spans[0].status.status_code == trace.StatusCode.ERROR + assert spans[1].name == "JOB" + assert spans[1].status.status_code == trace.StatusCode.ERROR + + @pytest.mark.parametrize( "requested_action,expect_error", [("generate_cohort", True), ("analyse_data", True), ("standalone_action", False)], @@ -369,7 +389,7 @@ def test_create_or_update_jobs_with_out_of_date_codelists( # The error reports the action that needed the up-to-date codelists, even if that # wasn't the action explicitly requested with pytest.raises( - JobRequestError, + StaleCodelistError, match=re.escape( "Codelists are out of date (required by action generate_cohort)" ),