Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update scrape and remote_write libs for generic HostHealth rules #660

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions lib/charms/prometheus_k8s/v0/prometheus_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,12 @@ def _on_scrape_targets_changed(self, event):

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 47
LIBPATCH = 48

PYDEPS = ["cosl"]
PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reminder to revert before merging here and elsewhere.


logger = logging.getLogger(__name__)


ALLOWED_KEYS = {
"job_name",
"metrics_path",
Expand Down Expand Up @@ -399,6 +398,40 @@ def _on_scrape_targets_changed(self, event):

DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules"

# Generic host-health alert rules, expressed in the Prometheus rule-file layout
# ({"groups": [{"name": ..., "rules": [...]}]}).
#
# The "HostHealth" group holds two critical-severity rules, each with a 5m "for"
# window:
#   * HostDown           - `up < 1`: the target exists but the scrape failed. The
#                          resulting series carries the target's `instance` label,
#                          so the annotations can name the host.
#   * HostMetricsMissing - `absent(up)`: no `up` series exists at all.
#                          absent() only copies labels that come from equality
#                          matchers in its argument, so `instance` is never set on
#                          the resulting series; the annotations therefore do not
#                          reference `$labels.instance` (it would render as '').
#
# Multi-line descriptions are written with explicit "\n" so their content does not
# depend on how this literal is indented in the source file.
GENERIC_ALERT_RULES_GROUP = {
    "groups": [
        {
            "name": "HostHealth",
            "rules": [
                {
                    "alert": "HostDown",
                    "expr": "up < 1",
                    "for": "5m",
                    "labels": {"severity": "critical"},
                    "annotations": {
                        "summary": "Host '{{ $labels.instance }}' is down.",
                        "description": (
                            "Host '{{ $labels.instance }}' is down, failed to scrape.\n"
                            "VALUE = {{ $value }}\n"
                            "LABELS = {{ $labels }}"
                        ),
                    },
                },
                {
                    "alert": "HostMetricsMissing",
                    "expr": "absent(up)",
                    "for": "5m",
                    "labels": {"severity": "critical"},
                    "annotations": {
                        # No `$labels.instance` here: absent(up) yields a series
                        # without an `instance` label. `{{ $labels }}` below still
                        # prints whatever labels are present.
                        "summary": "Metrics not received from host, failed to remote write.",
                        "description": (
                            "Metrics not received from host, failed to remote write.\n"
                            "VALUE = {{ $value }}\n"
                            "LABELS = {{ $labels }}"
                        ),
                    },
                },
            ],
        }
    ]
}


class PrometheusConfig:
"""A namespace for utility functions for manipulating the prometheus config dict."""
Expand Down Expand Up @@ -1531,6 +1564,7 @@ def set_scrape_job_spec(self, _=None):

alert_rules = AlertRules(query_type="promql", topology=self.topology)
alert_rules.add_path(self._alert_rules_path, recursive=True)
alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier)
alert_rules_as_dict = alert_rules.as_dict()

for relation in self._charm.model.relations[self._relation_name]:
Expand Down
27 changes: 25 additions & 2 deletions lib/charms/prometheus_k8s/v1/prometheus_remote_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5

PYDEPS = ["cosl"]
PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"]


logger = logging.getLogger(__name__)
Expand All @@ -60,6 +60,28 @@

DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules"

# Generic alert rule for the remote-write path, expressed in the Prometheus
# rule-file layout ({"groups": [{"name": ..., "rules": [...]}]}).
#
# The "AggregatorHostHealth" group holds one critical-severity rule with a 5m
# "for" window:
#   * HostMetricsMissing - `absent(up)`: no `up` series exists at all.
#                          absent() only copies labels that come from equality
#                          matchers in its argument, so `instance` is never set on
#                          the resulting series; the annotations therefore do not
#                          reference `$labels.instance` (it would render as '').
#
# The multi-line description is written with explicit "\n" so its content does not
# depend on how this literal is indented in the source file.
GENERIC_ALERT_RULES_GROUP = {
    "groups": [
        {
            "name": "AggregatorHostHealth",
            "rules": [
                {
                    "alert": "HostMetricsMissing",
                    "expr": "absent(up)",
                    "for": "5m",
                    "labels": {"severity": "critical"},
                    "annotations": {
                        # No `$labels.instance` here: absent(up) yields a series
                        # without an `instance` label. `{{ $labels }}` below still
                        # prints whatever labels are present.
                        "summary": "Metrics not received from host, failed to remote write.",
                        "description": (
                            "Metrics not received from host, failed to remote write.\n"
                            "VALUE = {{ $value }}\n"
                            "LABELS = {{ $labels }}"
                        ),
                    },
                }
            ],
        }
    ]
}


class RelationNotFoundError(Exception):
"""Raised if there is no relation with the given name."""
Expand Down Expand Up @@ -485,6 +507,7 @@ def _push_alerts_to_relation_databag(self, relation: Relation) -> None:

alert_rules = AlertRules(query_type="promql", topology=self.topology)
alert_rules.add_path(self._alert_rules_path)
alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier)

alert_rules_as_dict = alert_rules.as_dict()

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cosl>=0.0.46
git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl
cryptography
jsonschema
ops
Expand Down
11 changes: 7 additions & 4 deletions tests/unit/test_endpoint_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def test_each_alert_rule_is_topology_labeled(self):
self.assertIn("alert_rules", data)
alerts = json.loads(data["alert_rules"])
self.assertIn("groups", alerts)
self.assertEqual(len(alerts["groups"]), 6)
self.assertEqual(len(alerts["groups"]), 7)
for group in alerts["groups"]:
for rule in group["rules"]:
if "and_unit" not in group["name"]:
Expand Down Expand Up @@ -360,7 +360,7 @@ def test_each_alert_expression_is_topology_labeled(self):
self.assertIn("alert_rules", data)
alerts = json.loads(data["alert_rules"])
self.assertIn("groups", alerts)
self.assertEqual(len(alerts["groups"]), 6)
self.assertEqual(len(alerts["groups"]), 7)
group = alerts["groups"][0]
for rule in group["rules"]:
self.assertIn("expr", rule)
Expand Down Expand Up @@ -755,8 +755,11 @@ def test_unit_label_is_retained_if_hard_coded(self):
alert_rules = json.loads(relation.data[self.harness.charm.app].get("alert_rules"))
for group in alert_rules["groups"]:
for rule in group["rules"]:
self.assertIn("juju_unit", rule["labels"])
self.assertIn("juju_unit=", rule["expr"])
if (
"_HostHealth_alerts" not in group["name"]
): # _HostHealth_alerts are injected alerts without juju_unit labels
self.assertIn("juju_unit", rule["labels"])
self.assertIn("juju_unit=", rule["expr"])


class TestNoLeader(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ commands =
[testenv:static-{charm,lib,unit,integration}]
description = Run static analysis checks
deps =
cosl
git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl
pyright
charm: -r{toxinidir}/requirements.txt
lib: ops
Expand Down
Loading