Skip to content

Commit

Permalink
better error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
PietroPasotti committed Nov 11, 2024
1 parent fb0e19d commit 8fea869
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 27 deletions.
53 changes: 29 additions & 24 deletions src/cosl/coordinated_workers/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ class WorkerError(Exception):
"""Base class for exceptions raised by this module."""


class NoReadinessCheckEndpointConfiguredError(Exception):
"""Internal error when readiness check endpoint is missing."""


class ServiceEndpointStatus(Enum):
"""Status of the worker service managed by pebble."""

Expand Down Expand Up @@ -255,19 +259,26 @@ def status(self) -> ServiceEndpointStatus:
# we really don't want this code to raise errors, so we blanket catch all.
try:
services = self._container.get_services(*layer.services.keys())
running_status = {name: svc.is_running() for name, svc in services.items()}
if not all(running_status.values()):
if any(running_status.values()):
starting_services = tuple(
if services:
# avoid vacuous quantification
running_status = {name: svc.is_running() for name, svc in services.items()}

some_stopped = not all(running_status.values())
if some_stopped:
stopped = tuple(
name for name, running in running_status.items() if not running
)
logger.info(
f"Some services which should be running are not: {starting_services}."
)
return ServiceEndpointStatus.starting
logger.info(f"Some services which should be running are not: {stopped}.")
return ServiceEndpointStatus.down

logger.info("All services are down.")
return ServiceEndpointStatus.down
logger.info("All pebble services up.")
# so far as pebble knows all services are up, now let's see if
# the readiness endpoint confirm that
return self.check_readiness()

except NoReadinessCheckEndpointConfiguredError:
# assume up
return ServiceEndpointStatus.up

except Exception:
logger.exception(
Expand All @@ -276,20 +287,11 @@ def status(self) -> ServiceEndpointStatus:
)
return ServiceEndpointStatus.down

if not self._readiness_check_endpoint:
logger.warning("no readiness check endpoint configured: assuming workload is DOWN.")
return ServiceEndpointStatus.down

return self.check_readiness()

def check_readiness(self) -> ServiceEndpointStatus:
"""If the user has configured a readiness check endpoint, GET it and check the workload status."""
check_endpoint = self._readiness_check_endpoint
if not check_endpoint:
raise WorkerError(
"cannot check readiness without a readiness_check_endpoint configured. "
"Pass one to Worker on __init__."
)
raise NoReadinessCheckEndpointConfiguredError()

try:
with urllib.request.urlopen(check_endpoint(self)) as response:
Expand Down Expand Up @@ -703,10 +705,13 @@ def restart(self):
# set result to status; will retry unless it's up
attempt.retry_state.set_result(self.status is ServiceEndpointStatus.up)

except WorkerError:
# unable to check worker readiness: no readiness_check_endpoint configured.
# this status is already set on the unit so no need to log it
pass
except NoReadinessCheckEndpointConfiguredError:
# collect_unit_status will surface this to the user
logger.warning(
"could not check worker service readiness: no check endpoint configured. "
"Pass one to the Worker."
)
return True

except Exception:
logger.exception("unexpected error while attempting to determine worker status")
Expand Down
7 changes: 5 additions & 2 deletions tests/test_coordinated_workers/test_worker_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
from scenario import Container, Context, ExecOutput, Relation, State

from cosl.coordinated_workers.interface import ClusterProviderAppData
from cosl.coordinated_workers.worker import Worker, WorkerError
from cosl.coordinated_workers.worker import (
NoReadinessCheckEndpointConfiguredError,
Worker,
)


@pytest.fixture(params=[True, False])
Expand Down Expand Up @@ -240,7 +243,7 @@ def test_access_readiness_no_endpoint_raises():
)

# THEN calling .check_readiness raises
with pytest.raises(WorkerError):
with pytest.raises(NoReadinessCheckEndpointConfiguredError):
worker.check_readiness() # noqa


Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# See LICENSE file for licensing details.

[tox]
envlist = lint, static, unit
envlist = fetch-libs, lint, static, unit
isolated_build=true

[vars]
Expand Down

0 comments on commit 8fea869

Please sign in to comment.