diff --git a/src/charm.py b/src/charm.py index df8183ff..b0a51de6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,7 +10,7 @@ import socket import subprocess from pathlib import Path -from typing import Dict, Optional, cast +from typing import Dict, Optional, Tuple, TypedDict, cast from urllib.parse import urlparse import yaml @@ -46,6 +46,7 @@ from lightkube.core.client import Client from lightkube.core.exceptions import ApiError as LightkubeApiError from lightkube.resources.core_v1 import PersistentVolumeClaim, Pod +from ops import CollectStatusEvent, StoredState from ops.charm import ActionEvent, CharmBase from ops.main import main from ops.model import ( @@ -54,6 +55,7 @@ MaintenanceStatus, ModelError, OpenedPort, + StatusBase, WaitingStatus, ) from ops.pebble import Error as PebbleError @@ -91,6 +93,27 @@ class ConfigError(Exception): pass +class CompositeStatus(TypedDict): + """Per-component status holder.""" + + # These are going to go into stored state, so we must use marshallable objects. + # They are passed to StatusBase.from_name(). + retention_size: Tuple[str, str] + k8s_patch: Tuple[str, str] + config: Tuple[str, str] + + +def to_tuple(status: StatusBase) -> Tuple[str, str]: + """Convert a StatusBase to tuple, so it is marshallable into StoredState.""" + return status.name, status.message + + +def to_status(tpl: Tuple[str, str]) -> StatusBase: + """Convert a tuple to a StatusBase, so it could be used natively with ops.""" + name, message = tpl + return StatusBase.from_name(name, message) + + @trace_charm( tracing_endpoint="tempo", extra_types=[ @@ -104,9 +127,21 @@ class ConfigError(Exception): class PrometheusCharm(CharmBase): """A Juju Charm for Prometheus.""" + _stored = StoredState() + def __init__(self, *args): super().__init__(*args) + # Prometheus has a mix of pull and push statuses. We need stored state for push statuses. + # https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022 + self._stored.set_default( + status=CompositeStatus( + retention_size=to_tuple(ActiveStatus()), + k8s_patch=to_tuple(ActiveStatus()), + config=to_tuple(ActiveStatus()), + ) + ) + self._name = "prometheus" self._port = 9090 self.container = self.unit.get_container(self._name) @@ -191,6 +226,17 @@ def __init__(self, *args): self.framework.observe(self.alertmanager_consumer.on.cluster_changed, self._configure) self.framework.observe(self.resources_patch.on.patch_failed, self._on_k8s_patch_failed) self.framework.observe(self.on.validate_configuration_action, self._on_validate_config) + self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) + + def _on_collect_unit_status(self, event: CollectStatusEvent): + # "Pull" statuses + retention_time = self.model.config.get("metrics_retention_time", "") + if not self._is_valid_timespec(retention_time): + event.add_status(BlockedStatus(f"Invalid time spec : {retention_time}")) + + # "Push" statuses + for status in self._stored.status.values(): + event.add_status(to_status(status)) def set_ports(self): """Open necessary (and close no longer needed) workload ports.""" @@ -379,7 +425,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent): self._configure(event) def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent): - self.unit.status = BlockedStatus(cast(str, event.message)) + self._stored.status["k8s_patch"] = to_tuple(BlockedStatus(cast(str, event.message))) def _on_server_cert_changed(self, _): self._update_cert() @@ -476,13 +522,21 @@ def _configure(self, _): ), } - if not self.resources_patch.is_ready(): - if isinstance(self.unit.status, ActiveStatus) or self.unit.status.message == "": - self.unit.status = WaitingStatus("Waiting for resource limit patch to apply") + # "is_ready" is a racy check, so we do it once here (instead of in collect-status) + if self.resources_patch.is_ready(): + self._stored.status["k8s_patch"] = to_tuple(ActiveStatus()) + else: + if isinstance(to_status(self._stored.status["k8s_patch"]), ActiveStatus): + self._stored.status["k8s_patch"] = to_tuple( + WaitingStatus("Waiting for resource limit patch to apply") + ) return - if not self.container.can_connect(): - self.unit.status = MaintenanceStatus("Configuring Prometheus") + # "can_connect" is a racy check, so we do it once here (instead of in collect-status) + if self.container.can_connect(): + self._stored.status["config"] = to_tuple(ActiveStatus()) + else: + self._stored.status["config"] = to_tuple(MaintenanceStatus("Configuring Prometheus")) return if self._is_cert_available() and not self._is_tls_ready(): @@ -508,19 +562,23 @@ def _configure(self, _): ) except ConfigError as e: logger.error("Failed to generate configuration: %s", e) - self.unit.status = BlockedStatus(str(e)) + self._stored.status["config"] = to_tuple(BlockedStatus(str(e))) return except PebbleError as e: logger.error("Failed to push updated config/alert files: %s", e) - self.unit.status = early_return_statuses["push_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["push_fail"]) return + else: + self._stored.status["config"] = to_tuple(ActiveStatus()) try: layer_changed = self._update_layer() except (TypeError, PebbleError) as e: logger.error("Failed to update prometheus service: %s", e) - self.unit.status = early_return_statuses["layer_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["layer_fail"]) return + else: + self._stored.status["config"] = to_tuple(ActiveStatus()) try: output, err = self._promtool_check_config() @@ -528,12 +586,14 @@ def _configure(self, _): logger.error( "Invalid prometheus configuration. Stdout: %s Stderr: %s", output, err ) - self.unit.status = early_return_statuses["config_invalid"] + self._stored.status["config"] = to_tuple(early_return_statuses["config_invalid"]) return except PebbleError as e: logger.error("Failed to validate prometheus config: %s", e) - self.unit.status = early_return_statuses["validation_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["validation_fail"]) return + else: + self._stored.status["config"] = to_tuple(ActiveStatus()) try: # If a config is invalid then prometheus would exit immediately. @@ -547,8 +607,10 @@ def _configure(self, _): self._prometheus_layer.to_dict(), e, ) - self.unit.status = early_return_statuses["restart_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["restart_fail"]) return + else: + self._stored.status["config"] = to_tuple(ActiveStatus()) # We only need to reload if pebble didn't replan (if pebble replanned, then new config # would be picked up on startup anyway). @@ -556,21 +618,14 @@ def _configure(self, _): reloaded = self._prometheus_client.reload_configuration() if not reloaded: logger.error("Prometheus failed to reload the configuration") - self.unit.status = early_return_statuses["cfg_load_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_fail"]) return if reloaded == "read_timeout": - self.unit.status = early_return_statuses["cfg_load_timeout"] + self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_timeout"]) return logger.info("Prometheus configuration reloaded") - - if ( - isinstance(self.unit.status, BlockedStatus) - and self.unit.status not in early_return_statuses.values() - ): - return - - self.unit.status = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) def _on_pebble_ready(self, event) -> None: """Pebble ready hook. @@ -681,7 +736,9 @@ def _generate_command(self) -> str: except ValueError as e: logger.warning(e) - self.unit.status = BlockedStatus(f"Invalid retention size: {e}") + self._stored.status["retention_size"] = to_tuple( + BlockedStatus(f"Invalid retention size: {e}") + ) else: # `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB" @@ -692,15 +749,20 @@ def _generate_command(self) -> str: self._get_pvc_capacity(), ratio ) except ValueError as e: - self.unit.status = BlockedStatus(f"Error calculating retention size: {e}") + self._stored.status["retention_size"] = to_tuple( + BlockedStatus(f"Error calculating retention size: {e}") + ) except LightkubeApiError as e: - self.unit.status = BlockedStatus( - "Error calculating retention size " - f"(try running `juju trust` on this application): {e}" + self._stored.status["retention_size"] = to_tuple( + BlockedStatus( + "Error calculating retention size " + f"(try running `juju trust` on this application): {e}" + ) ) else: logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100) args.append(f"--storage.tsdb.retention.size={capacity}") + self._stored.status["retention_size"] = to_tuple(ActiveStatus()) command = ["/bin/prometheus"] + args @@ -807,9 +869,7 @@ def _is_valid_timespec(self, timeval: str) -> bool: timespec_re = re.compile( r"^((([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?|0)$" ) - if not (matched := timespec_re.search(timeval)): - self.unit.status = BlockedStatus(f"Invalid time spec : {timeval}") - + matched = timespec_re.search(timeval) return bool(matched) def _percent_string_to_ratio(self, percentage: str) -> float: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9bc8a3b0..9df7ae06 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -216,6 +216,7 @@ def test_configuration_reload(self, trigger_configuration_reload, *unused): def test_configuration_reload_success(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = True self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @k8s_resource_multipatch @@ -224,6 +225,7 @@ def test_configuration_reload_success(self, trigger_configuration_reload, *unuse def test_configuration_reload_error(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = False self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) @k8s_resource_multipatch @@ -232,6 +234,7 @@ def test_configuration_reload_error(self, trigger_configuration_reload, *unused) def test_configuration_reload_read_timeout(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = "read_timeout" self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, MaintenanceStatus) @@ -287,6 +290,14 @@ def test_default_maximum_retention_size_is_80_percent(self, *unused): plan = self.harness.get_container_pebble_plan("prometheus") self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB") + # AND WHEN the config option is set and then unset + self.harness.update_config({"maximum_retention_size": "50%"}) + self.harness.update_config(unset={"maximum_retention_size"}) + + # THEN the pebble plan is back to 80% + plan = self.harness.get_container_pebble_plan("prometheus") + self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB") + @k8s_resource_multipatch @patch("lightkube.core.client.GenericSyncClient") def test_multiplication_factor_applied_to_pvc_capacity(self, *unused): @@ -294,16 +305,46 @@ def test_multiplication_factor_applied_to_pvc_capacity(self, *unused): # GIVEN a capacity limit in binary notation (k8s notation) self.mock_capacity.return_value = "1Gi" - # AND a multiplication factor as a config option - self.harness.update_config({"maximum_retention_size": "50%"}) - # WHEN the charm starts self.harness.begin_with_initial_hooks() self.harness.container_pebble_ready("prometheus") - # THEN the pebble plan the adjusted capacity + for set_point, read_back in [("0%", "0GB"), ("50%", "0.5GB"), ("100%", "1GB")]: + with self.subTest(limit=set_point): + # WHEN a limit is set + self.harness.update_config({"maximum_retention_size": set_point}) + + # THEN the pebble plan the adjusted capacity + plan = self.harness.get_container_pebble_plan("prometheus") + self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), read_back) + + @k8s_resource_multipatch + @patch("lightkube.core.client.GenericSyncClient") + def test_invalid_retention_size_config_option_string(self, *unused): + # GIVEN a running charm with default values + self.mock_capacity.return_value = "1Gi" + self.harness.begin_with_initial_hooks() + self.harness.container_pebble_ready("prometheus") + self.harness.evaluate_status() + self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) + + # WHEN the config option is set to an invalid string + self.harness.update_config({"maximum_retention_size": "42"}) + + # THEN cli arg is unspecified and the unit is blocked plan = self.harness.get_container_pebble_plan("prometheus") - self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.5GB") + self.assertIsNone(cli_arg(plan, "--storage.tsdb.retention.size")) + self.harness.evaluate_status() + self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) + + # AND WHEN the config option is corrected + self.harness.update_config({"maximum_retention_size": "42%"}) + + # THEN cli arg is updated and the unit is goes back to active + plan = self.harness.get_container_pebble_plan("prometheus") + self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.42GB") + self.harness.evaluate_status() + self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @prom_multipatch @@ -654,6 +695,7 @@ def test_ca_file(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) container = self.harness.charm.unit.get_container("prometheus") self.assertEqual(container.pull("/etc/prometheus/job1-ca.crt").read(), "CA 1") @@ -690,6 +732,7 @@ def test_no_tls_config(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @k8s_resource_multipatch @@ -725,6 +768,7 @@ def test_tls_config_missing_cert(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) @k8s_resource_multipatch @@ -760,4 +804,5 @@ def test_tls_config_missing_key(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) diff --git a/tests/unit/test_charm_status.py b/tests/unit/test_charm_status.py index b7aae061..e0517e67 100644 --- a/tests/unit/test_charm_status.py +++ b/tests/unit/test_charm_status.py @@ -58,6 +58,7 @@ def test_unit_is_active_if_deployed_without_relations_or_config(self, *unused): # WHEN no config is provided or relations created # THEN the unit goes into active state + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) # AND pebble plan is not empty @@ -90,6 +91,7 @@ def test_unit_is_blocked_if_reload_configuration_fails(self, *unused): # WHEN no config is provided or relations created # THEN the unit goes into blocked state + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) # AND pebble plan is not empty diff --git a/tests/unit/test_remote_write.py b/tests/unit/test_remote_write.py index b97e1b55..3a1fc31d 100644 --- a/tests/unit/test_remote_write.py +++ b/tests/unit/test_remote_write.py @@ -241,6 +241,7 @@ def test_port_is_set(self, *unused): self.harness.get_relation_data(rel_id, self.harness.charm.unit.name), {"remote_write": json.dumps({"url": "http://fqdn:9090/api/v1/write"})}, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) @k8s_resource_multipatch