From 4ca83b670f72f964e60c0ce69bd0f66b47016aaa Mon Sep 17 00:00:00 2001
From: Leon <82407168+sed-i@users.noreply.github.com>
Date: Mon, 29 Jan 2024 08:37:36 -0500
Subject: [PATCH] Set ActiveStatus when retention size config option is valid (#561)

* Fix status for retention config option
* fetch-lib
* Refactor status setting

Co-authored-by: Ben Hoyt
---
 src/charm.py                    | 122 ++++++++++++++++++++++++--------
 tests/unit/test_charm.py        |  55 ++++++++++++--
 tests/unit/test_charm_status.py |   2 +
 tests/unit/test_remote_write.py |   1 +
 4 files changed, 144 insertions(+), 36 deletions(-)

diff --git a/src/charm.py b/src/charm.py
index df8183ff..b0a51de6 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -10,7 +10,7 @@
 import socket
 import subprocess
 from pathlib import Path
-from typing import Dict, Optional, cast
+from typing import Dict, Optional, Tuple, TypedDict, cast
 from urllib.parse import urlparse

 import yaml
@@ -46,6 +46,7 @@
 from lightkube.core.client import Client
 from lightkube.core.exceptions import ApiError as LightkubeApiError
 from lightkube.resources.core_v1 import PersistentVolumeClaim, Pod
+from ops import CollectStatusEvent, StoredState
 from ops.charm import ActionEvent, CharmBase
 from ops.main import main
 from ops.model import (
@@ -54,6 +55,7 @@
     MaintenanceStatus,
     ModelError,
     OpenedPort,
+    StatusBase,
     WaitingStatus,
 )
 from ops.pebble import Error as PebbleError
@@ -91,6 +93,27 @@ class ConfigError(Exception):
     pass


+class CompositeStatus(TypedDict):
+    """Per-component status holder."""
+
+    # These are going to go into stored state, so we must use marshallable objects.
+    # They are passed to StatusBase.from_name().
+    retention_size: Tuple[str, str]
+    k8s_patch: Tuple[str, str]
+    config: Tuple[str, str]
+
+
+def to_tuple(status: StatusBase) -> Tuple[str, str]:
+    """Convert a StatusBase to tuple, so it is marshallable into StoredState."""
+    return status.name, status.message
+
+
+def to_status(tpl: Tuple[str, str]) -> StatusBase:
+    """Convert a tuple to a StatusBase, so it could be used natively with ops."""
+    name, message = tpl
+    return StatusBase.from_name(name, message)
+
+
 @trace_charm(
     tracing_endpoint="tempo",
     extra_types=[
@@ -104,9 +127,21 @@ class ConfigError(Exception):
 class PrometheusCharm(CharmBase):
     """A Juju Charm for Prometheus."""

+    _stored = StoredState()
+
     def __init__(self, *args):
         super().__init__(*args)

+        # Prometheus has a mix of pull and push statuses. We need stored state for push statuses.
+        # https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022
+        self._stored.set_default(
+            status=CompositeStatus(
+                retention_size=to_tuple(ActiveStatus()),
+                k8s_patch=to_tuple(ActiveStatus()),
+                config=to_tuple(ActiveStatus()),
+            )
+        )
+
         self._name = "prometheus"
         self._port = 9090
         self.container = self.unit.get_container(self._name)
@@ -191,6 +226,17 @@ def __init__(self, *args):
         self.framework.observe(self.alertmanager_consumer.on.cluster_changed, self._configure)
         self.framework.observe(self.resources_patch.on.patch_failed, self._on_k8s_patch_failed)
         self.framework.observe(self.on.validate_configuration_action, self._on_validate_config)
+        self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status)
+
+    def _on_collect_unit_status(self, event: CollectStatusEvent):
+        # "Pull" statuses
+        retention_time = self.model.config.get("metrics_retention_time", "")
+        if not self._is_valid_timespec(retention_time):
+            event.add_status(BlockedStatus(f"Invalid time spec : {retention_time}"))
+
+        # "Push" statuses
+        for status in self._stored.status.values():
+            event.add_status(to_status(status))

     def set_ports(self):
         """Open necessary (and close no longer needed) workload ports."""
@@ -379,7 +425,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent):
         self._configure(event)

     def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent):
-        self.unit.status = BlockedStatus(cast(str, event.message))
+        self._stored.status["k8s_patch"] = to_tuple(BlockedStatus(cast(str, event.message)))

     def _on_server_cert_changed(self, _):
         self._update_cert()
@@ -476,13 +522,21 @@ def _configure(self, _):
             ),
         }

-        if not self.resources_patch.is_ready():
-            if isinstance(self.unit.status, ActiveStatus) or self.unit.status.message == "":
-                self.unit.status = WaitingStatus("Waiting for resource limit patch to apply")
+        # "is_ready" is a racy check, so we do it once here (instead of in collect-status)
+        if self.resources_patch.is_ready():
+            self._stored.status["k8s_patch"] = to_tuple(ActiveStatus())
+        else:
+            if isinstance(to_status(self._stored.status["k8s_patch"]), ActiveStatus):
+                self._stored.status["k8s_patch"] = to_tuple(
+                    WaitingStatus("Waiting for resource limit patch to apply")
+                )
             return

-        if not self.container.can_connect():
-            self.unit.status = MaintenanceStatus("Configuring Prometheus")
+        # "can_connect" is a racy check, so we do it once here (instead of in collect-status)
+        if self.container.can_connect():
+            self._stored.status["config"] = to_tuple(ActiveStatus())
+        else:
+            self._stored.status["config"] = to_tuple(MaintenanceStatus("Configuring Prometheus"))
             return

         if self._is_cert_available() and not self._is_tls_ready():
@@ -508,19 +562,23 @@ def _configure(self, _):
             )
         except ConfigError as e:
             logger.error("Failed to generate configuration: %s", e)
-            self.unit.status = BlockedStatus(str(e))
+            self._stored.status["config"] = to_tuple(BlockedStatus(str(e)))
             return
         except PebbleError as e:
             logger.error("Failed to push updated config/alert files: %s", e)
-            self.unit.status = early_return_statuses["push_fail"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["push_fail"])
             return
+        else:
+            self._stored.status["config"] = to_tuple(ActiveStatus())

         try:
             layer_changed = self._update_layer()
         except (TypeError, PebbleError) as e:
             logger.error("Failed to update prometheus service: %s", e)
-            self.unit.status = early_return_statuses["layer_fail"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["layer_fail"])
             return
+        else:
+            self._stored.status["config"] = to_tuple(ActiveStatus())

         try:
             output, err = self._promtool_check_config()
@@ -528,12 +586,14 @@ def _configure(self, _):
                 logger.error(
                     "Invalid prometheus configuration. Stdout: %s Stderr: %s", output, err
                 )
-                self.unit.status = early_return_statuses["config_invalid"]
+                self._stored.status["config"] = to_tuple(early_return_statuses["config_invalid"])
                 return
         except PebbleError as e:
             logger.error("Failed to validate prometheus config: %s", e)
-            self.unit.status = early_return_statuses["validation_fail"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["validation_fail"])
             return
+        else:
+            self._stored.status["config"] = to_tuple(ActiveStatus())

         try:
             # If a config is invalid then prometheus would exit immediately.
@@ -547,8 +607,10 @@ def _configure(self, _):
                 self._prometheus_layer.to_dict(),
                 e,
             )
-            self.unit.status = early_return_statuses["restart_fail"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["restart_fail"])
             return
+        else:
+            self._stored.status["config"] = to_tuple(ActiveStatus())

         # We only need to reload if pebble didn't replan (if pebble replanned, then new config
         # would be picked up on startup anyway).
@@ -556,21 +618,14 @@ def _configure(self, _):
         reloaded = self._prometheus_client.reload_configuration()
         if not reloaded:
             logger.error("Prometheus failed to reload the configuration")
-            self.unit.status = early_return_statuses["cfg_load_fail"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_fail"])
             return
         if reloaded == "read_timeout":
-            self.unit.status = early_return_statuses["cfg_load_timeout"]
+            self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_timeout"])
             return

         logger.info("Prometheus configuration reloaded")
-
-        if (
-            isinstance(self.unit.status, BlockedStatus)
-            and self.unit.status not in early_return_statuses.values()
-        ):
-            return
-
-        self.unit.status = ActiveStatus()
+        self._stored.status["config"] = to_tuple(ActiveStatus())

     def _on_pebble_ready(self, event) -> None:
         """Pebble ready hook.
@@ -681,7 +736,9 @@ def _generate_command(self) -> str:

         except ValueError as e:
             logger.warning(e)
-            self.unit.status = BlockedStatus(f"Invalid retention size: {e}")
+            self._stored.status["retention_size"] = to_tuple(
+                BlockedStatus(f"Invalid retention size: {e}")
+            )

         else:
             # `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB"
@@ -692,15 +749,20 @@ def _generate_command(self) -> str:
                     self._get_pvc_capacity(), ratio
                 )
             except ValueError as e:
-                self.unit.status = BlockedStatus(f"Error calculating retention size: {e}")
+                self._stored.status["retention_size"] = to_tuple(
+                    BlockedStatus(f"Error calculating retention size: {e}")
+                )
             except LightkubeApiError as e:
-                self.unit.status = BlockedStatus(
-                    "Error calculating retention size "
-                    f"(try running `juju trust` on this application): {e}"
+                self._stored.status["retention_size"] = to_tuple(
+                    BlockedStatus(
+                        "Error calculating retention size "
+                        f"(try running `juju trust` on this application): {e}"
+                    )
                 )
             else:
                 logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100)
                 args.append(f"--storage.tsdb.retention.size={capacity}")
+                self._stored.status["retention_size"] = to_tuple(ActiveStatus())

         command = ["/bin/prometheus"] + args
@@ -807,9 +869,7 @@ def _is_valid_timespec(self, timeval: str) -> bool:
         timespec_re = re.compile(
             r"^((([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?|0)$"
         )
-        if not (matched := timespec_re.search(timeval)):
-            self.unit.status = BlockedStatus(f"Invalid time spec : {timeval}")
-
+        matched = timespec_re.search(timeval)
         return bool(matched)

     def _percent_string_to_ratio(self, percentage: str) -> float:
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 9bc8a3b0..9df7ae06 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -216,6 +216,7 @@ def test_configuration_reload(self, trigger_configuration_reload, *unused):
     def test_configuration_reload_success(self, trigger_configuration_reload, *unused):
         trigger_configuration_reload.return_value = True
         self.harness.update_config({"evaluation_interval": "1234m"})
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)

     @k8s_resource_multipatch
@@ -224,6 +225,7 @@ def test_configuration_reload_success(self, trigger_configuration_reload, *unuse
     def test_configuration_reload_error(self, trigger_configuration_reload, *unused):
         trigger_configuration_reload.return_value = False
         self.harness.update_config({"evaluation_interval": "1234m"})
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)

     @k8s_resource_multipatch
@@ -232,6 +234,7 @@ def test_configuration_reload_error(self, trigger_configuration_reload, *unused)
     def test_configuration_reload_read_timeout(self, trigger_configuration_reload, *unused):
         trigger_configuration_reload.return_value = "read_timeout"
         self.harness.update_config({"evaluation_interval": "1234m"})
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, MaintenanceStatus)


@@ -287,6 +290,14 @@ def test_default_maximum_retention_size_is_80_percent(self, *unused):
         plan = self.harness.get_container_pebble_plan("prometheus")
         self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")

+        # AND WHEN the config option is set and then unset
+        self.harness.update_config({"maximum_retention_size": "50%"})
+        self.harness.update_config(unset={"maximum_retention_size"})
+
+        # THEN the pebble plan is back to 80%
+        plan = self.harness.get_container_pebble_plan("prometheus")
+        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
+
     @k8s_resource_multipatch
     @patch("lightkube.core.client.GenericSyncClient")
     def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
@@ -294,16 +305,46 @@ def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
         # GIVEN a capacity limit in binary notation (k8s notation)
         self.mock_capacity.return_value = "1Gi"

-        # AND a multiplication factor as a config option
-        self.harness.update_config({"maximum_retention_size": "50%"})
-
         # WHEN the charm starts
         self.harness.begin_with_initial_hooks()
         self.harness.container_pebble_ready("prometheus")

-        # THEN the pebble plan the adjusted capacity
+        for set_point, read_back in [("0%", "0GB"), ("50%", "0.5GB"), ("100%", "1GB")]:
+            with self.subTest(limit=set_point):
+                # WHEN a limit is set
+                self.harness.update_config({"maximum_retention_size": set_point})
+
+                # THEN the pebble plan has the adjusted capacity
+                plan = self.harness.get_container_pebble_plan("prometheus")
+                self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), read_back)
+
+    @k8s_resource_multipatch
+    @patch("lightkube.core.client.GenericSyncClient")
+    def test_invalid_retention_size_config_option_string(self, *unused):
+        # GIVEN a running charm with default values
+        self.mock_capacity.return_value = "1Gi"
+        self.harness.begin_with_initial_hooks()
+        self.harness.container_pebble_ready("prometheus")
+        self.harness.evaluate_status()
+        self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)
+
+        # WHEN the config option is set to an invalid string
+        self.harness.update_config({"maximum_retention_size": "42"})
+
+        # THEN cli arg is unspecified and the unit is blocked
         plan = self.harness.get_container_pebble_plan("prometheus")
-        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.5GB")
+        self.assertIsNone(cli_arg(plan, "--storage.tsdb.retention.size"))
+        self.harness.evaluate_status()
+        self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)
+
+        # AND WHEN the config option is corrected
+        self.harness.update_config({"maximum_retention_size": "42%"})
+
+        # THEN the cli arg is updated and the unit goes back to active
+        plan = self.harness.get_container_pebble_plan("prometheus")
+        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.42GB")
+        self.harness.evaluate_status()
+        self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)


 @prom_multipatch
@@ -654,6 +695,7 @@ def test_ca_file(self, *_):
             },
         )

+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)
         container = self.harness.charm.unit.get_container("prometheus")
         self.assertEqual(container.pull("/etc/prometheus/job1-ca.crt").read(), "CA 1")
@@ -690,6 +732,7 @@ def test_no_tls_config(self, *_):
             },
         )

+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)

     @k8s_resource_multipatch
@@ -725,6 +768,7 @@ def test_tls_config_missing_cert(self, *_):
             },
         )

+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)

     @k8s_resource_multipatch
@@ -760,4 +804,5 @@ def test_tls_config_missing_key(self, *_):
             },
         )

+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)
diff --git a/tests/unit/test_charm_status.py b/tests/unit/test_charm_status.py
index b7aae061..e0517e67 100644
--- a/tests/unit/test_charm_status.py
+++ b/tests/unit/test_charm_status.py
@@ -58,6 +58,7 @@ def test_unit_is_active_if_deployed_without_relations_or_config(self, *unused):
         # WHEN no config is provided or relations created

         # THEN the unit goes into active state
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus)

         # AND pebble plan is not empty
@@ -90,6 +91,7 @@ def test_unit_is_blocked_if_reload_configuration_fails(self, *unused):
         # WHEN no config is provided or relations created

         # THEN the unit goes into blocked state
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus)

         # AND pebble plan is not empty
diff --git a/tests/unit/test_remote_write.py b/tests/unit/test_remote_write.py
index b97e1b55..3a1fc31d 100644
--- a/tests/unit/test_remote_write.py
+++ b/tests/unit/test_remote_write.py
@@ -241,6 +241,7 @@ def test_port_is_set(self, *unused):
             self.harness.get_relation_data(rel_id, self.harness.charm.unit.name),
             {"remote_write": json.dumps({"url": "http://fqdn:9090/api/v1/write"})},
         )
+        self.harness.evaluate_status()
         self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus)

     @k8s_resource_multipatch
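
The charm.py changes above follow a pull/push status pattern: statuses that can only be captured when their event fires ("push") are saved into StoredState as (name, message) tuples, while cheap checks ("pull") are re-evaluated on every hook, and both are fed to ops through the collect-unit-status handler. The standalone sketch below restates that wiring in isolation. It is illustrative only, not part of the patch: the charm name and config keys are made up, and only the ops API calls (StoredState, CollectStatusEvent, event.add_status, StatusBase.from_name) mirror what the diff uses.

# Illustrative sketch of the pull/push status pattern (assumes an ops version with
# the collect-status events, as used in the patch; names here are hypothetical).
from ops import CollectStatusEvent, StoredState
from ops.charm import CharmBase
from ops.main import main
from ops.model import BlockedStatus, StatusBase


class SketchCharm(CharmBase):
    """Toy charm: push statuses live in stored state, pull statuses are recomputed."""

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        # Push status: recorded when its event fires, stored as a marshallable tuple.
        self._stored.set_default(status={"retention_size": ("active", "")})
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status)

    def _on_config_changed(self, _):
        # Validate where the event happens and remember the outcome for later hooks.
        size = self.model.config.get("maximum_retention_size", "80%")
        if str(size).endswith("%"):
            self._stored.status["retention_size"] = ("active", "")
        else:
            self._stored.status["retention_size"] = ("blocked", f"Invalid retention size: {size}")

    def _on_collect_unit_status(self, event: CollectStatusEvent):
        # Pull statuses: cheap, non-racy checks re-evaluated on every hook.
        if not self.model.config.get("metrics_retention_time", ""):
            event.add_status(BlockedStatus("metrics_retention_time is not set"))
        # Push statuses: replayed from stored state; ops surfaces the most severe status.
        for name, message in self._stored.status.values():
            event.add_status(StatusBase.from_name(name, message))


if __name__ == "__main__":
    main(SketchCharm)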