diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
index e3d35c6f..2f475dc6 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
@@ -362,13 +362,12 @@ def _on_scrape_targets_changed(self, event):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 47
+LIBPATCH = 48
 
-PYDEPS = ["cosl"]
+PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"]
 
 logger = logging.getLogger(__name__)
 
-
 ALLOWED_KEYS = {
     "job_name",
     "metrics_path",
@@ -399,6 +398,40 @@ def _on_scrape_targets_changed(self, event):
 
 DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules"
 
+GENERIC_ALERT_RULES_GROUP = {
+    "groups": [
+        {
+            "name": "HostHealth",
+            "rules": [
+                {
+                    "alert": "HostDown",
+                    "expr": "up < 1",
+                    "for": "5m",
+                    "labels": {"severity": "critical"},
+                    "annotations": {
+                        "summary": "Host '{{ $labels.instance }}' is down.",
+                        "description": """Host '{{ $labels.instance }}' is down, failed to scrape.
+                            VALUE = {{ $value }}
+                            LABELS = {{ $labels }}""",
+                    },
+                },
+                {
+                    "alert": "HostMetricsMissing",
+                    "expr": "absent(up)",  # absent() output has no "instance" label to template
+                    "for": "5m",
+                    "labels": {"severity": "critical"},
+                    "annotations": {
+                        "summary": "Metrics not received from host, failed to remote write.",
+                        "description": """Metrics not received from host, failed to remote write.
+                            VALUE = {{ $value }}
+                            LABELS = {{ $labels }}""",
+                    },
+                },
+            ],
+        }
+    ]
+}
+
 
 class PrometheusConfig:
     """A namespace for utility functions for manipulating the prometheus config dict."""
@@ -1531,6 +1564,7 @@ def set_scrape_job_spec(self, _=None):
 
         alert_rules = AlertRules(query_type="promql", topology=self.topology)
         alert_rules.add_path(self._alert_rules_path, recursive=True)
+        alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier)
         alert_rules_as_dict = alert_rules.as_dict()
 
         for relation in self._charm.model.relations[self._relation_name]:
diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py
index cf24b9f7..d7b0b9fa 100644
--- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py
+++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py
@@ -46,9 +46,9 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 4
+LIBPATCH = 5
 
-PYDEPS = ["cosl"]
+PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"]
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +60,28 @@
 
 DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules"
 
+GENERIC_ALERT_RULES_GROUP = {
+    "groups": [
+        {
+            "name": "AggregatorHostHealth",
+            "rules": [
+                {
+                    "alert": "HostMetricsMissing",
+                    "expr": "absent(up)",  # absent() output has no "instance" label to template
+                    "for": "5m",
+                    "labels": {"severity": "critical"},
+                    "annotations": {
+                        "summary": "Metrics not received from host, failed to remote write.",
+                        "description": """Metrics not received from host, failed to remote write.
+                            VALUE = {{ $value }}
+                            LABELS = {{ $labels }}""",
+                    },
+                }
+            ],
+        }
+    ]
+}
+
 
 class RelationNotFoundError(Exception):
     """Raised if there is no relation with the given name."""
@@ -485,6 +507,7 @@ def _push_alerts_to_relation_databag(self, relation: Relation) -> None:
 
         alert_rules = AlertRules(query_type="promql", topology=self.topology)
         alert_rules.add_path(self._alert_rules_path)
+        alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier)
 
         alert_rules_as_dict = alert_rules.as_dict()
 
diff --git a/requirements.txt b/requirements.txt
index 08a78f21..7fc56b1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-cosl>=0.0.46
+git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl
 cryptography
 jsonschema
 ops
diff --git a/tests/unit/test_endpoint_provider.py b/tests/unit/test_endpoint_provider.py
index 12651b69..ee2ad12e 100644
--- a/tests/unit/test_endpoint_provider.py
+++ b/tests/unit/test_endpoint_provider.py
@@ -324,7 +324,7 @@ def test_each_alert_rule_is_topology_labeled(self):
         self.assertIn("alert_rules", data)
         alerts = json.loads(data["alert_rules"])
         self.assertIn("groups", alerts)
-        self.assertEqual(len(alerts["groups"]), 6)
+        self.assertEqual(len(alerts["groups"]), 7)
         for group in alerts["groups"]:
             for rule in group["rules"]:
                 if "and_unit" not in group["name"]:
@@ -360,7 +360,7 @@ def test_each_alert_expression_is_topology_labeled(self):
         self.assertIn("alert_rules", data)
         alerts = json.loads(data["alert_rules"])
         self.assertIn("groups", alerts)
-        self.assertEqual(len(alerts["groups"]), 6)
+        self.assertEqual(len(alerts["groups"]), 7)
         group = alerts["groups"][0]
         for rule in group["rules"]:
             self.assertIn("expr", rule)
@@ -755,8 +755,11 @@ def test_unit_label_is_retained_if_hard_coded(self):
         alert_rules = json.loads(relation.data[self.harness.charm.app].get("alert_rules"))
         for group in alert_rules["groups"]:
             for rule in group["rules"]:
-                self.assertIn("juju_unit", rule["labels"])
-                self.assertIn("juju_unit=", rule["expr"])
+                if (
+                    "_HostHealth_alerts" not in group["name"]
+                ):  # _HostHealth_alerts are injected alerts without juju_unit labels
+                    self.assertIn("juju_unit", rule["labels"])
+                    self.assertIn("juju_unit=", rule["expr"])
 
 
 class TestNoLeader(unittest.TestCase):
diff --git a/tox.ini b/tox.ini
index b1c393ee..57b76083 100644
--- a/tox.ini
+++ b/tox.ini
@@ -45,7 +45,7 @@ commands =
 [testenv:static-{charm,lib,unit,integration}]
 description = Run static analysis checks
 deps =
-    cosl
+    git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl
     pyright
     charm: -r{toxinidir}/requirements.txt
     lib: ops