Skip to content

Commit

Permalink
remote_write_2 (#254)
Browse files Browse the repository at this point in the history
* Overhaul of prometheus_remote_write bringing it up to working order
  • Loading branch information
dstathis authored Mar 30, 2022
1 parent e964e63 commit a369c99
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 52 deletions.
96 changes: 56 additions & 40 deletions lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
import os
import platform
import re
import subprocess
from collections import OrderedDict
from pathlib import Path
Expand All @@ -32,7 +33,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 2
LIBPATCH = 3


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -131,6 +132,10 @@ def __init__(
application: an application name as a string
unit: a unit name as a string
charm_name: name of charm as a string
Note:
`JujuTopology` should not be constructed directly by charm code. Please
use `ProviderTopology` or `AggregatorTopology`.
"""
self.model = model
self.model_uuid = model_uuid
Expand Down Expand Up @@ -167,6 +172,7 @@ def from_relation_data(cls, data: dict):
- "application"
- "unit"
- "charm_name"
`unit` and `charm_name` may be empty, but will result in more limited
labels. However, this allows us to support payload-only charms.
Expand All @@ -185,16 +191,15 @@ def from_relation_data(cls, data: dict):
def identifier(self) -> str:
"""Format the topology information into a terse string."""
# This is odd, but may have `None` as a model key
return "_".join([str(val) for val in self.as_dict().values()]).replace("/", "_")
return "_".join([str(val) for val in self.as_promql_label_dict().values()]).replace(
"/", "_"
)

@property
def promql_labels(self) -> str:
"""Format the topology information into a verbose string."""
return ", ".join(
[
'juju_{}="{}"'.format(key, value)
for key, value in self.as_dict(rename_keys={"charm_name": "charm"}).items()
]
['{}="{}"'.format(key, value) for key, value in self.as_promql_label_dict().items()]
)

def as_dict(self, rename_keys: Optional[Dict[str, str]] = None) -> OrderedDict:
Expand Down Expand Up @@ -234,6 +239,10 @@ def as_promql_label_dict(self):
"juju_{}".format(key): val
for key, val in self.as_dict(rename_keys={"charm_name": "charm"}).items()
}
# The leader is the only unit that sets alert rules, if "juju_unit" is present,
# then the rules will only be evaluated for that unit
if "juju_unit" in vals:
vals.pop("juju_unit")

return vals

Expand Down Expand Up @@ -392,20 +401,24 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]:

# update rules with additional metadata
for alert_group in alert_groups:
# update group name with topology and sub-path
alert_group["name"] = self._group_name(
str(root_path),
str(file_path),
alert_group["name"],
)
if not self._is_already_modified(alert_group["name"]):
# update group name with topology and sub-path
alert_group["name"] = self._group_name(
str(root_path),
str(file_path),
alert_group["name"],
)

# add "juju_" topology labels
for alert_rule in alert_group["rules"]:
if "labels" not in alert_rule:
alert_rule["labels"] = {}

if self.topology:
alert_rule["labels"].update(self.topology.as_promql_label_dict())
# only insert labels that do not already exist
for label, val in self.topology.as_promql_label_dict().items():
if label not in alert_rule["labels"]:
alert_rule["labels"][label] = val
# insert juju topology filters into a prometheus alert rule
alert_rule["expr"] = self.topology.render(alert_rule["expr"])

Expand Down Expand Up @@ -436,6 +449,13 @@ def _group_name(self, root_path: str, file_path: str, group_name: str) -> str:
# filter to remove empty strings
return "_".join(filter(None, group_name_parts))

def _is_already_modified(self, name: str) -> bool:
"""Detect whether a group name has already been modified with juju topology."""
modified_matcher = re.compile(r"^.*?_[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}_.*?alerts$")
if modified_matcher.match(name) is not None:
return True
return False

@classmethod
def _multi_suffix_glob(
cls, dir_path: Path, suffixes: List[str], recursive: bool = True
Expand Down Expand Up @@ -793,7 +813,7 @@ def _handle_endpoints_changed(self, event: RelationEvent) -> None:
def _push_alerts_on_relation_joined(self, event: RelationEvent) -> None:
self._push_alerts_to_relation_databag(event.relation)

def _push_alerts_to_all_relation_databags(self, _: HookEvent) -> None:
def _push_alerts_to_all_relation_databags(self, _: Optional[HookEvent]) -> None:
for relation in self.model.relations[self._relation_name]:
self._push_alerts_to_relation_databag(relation)

Expand All @@ -809,6 +829,10 @@ def _push_alerts_to_relation_databag(self, relation: Relation) -> None:
if alert_rules_as_dict:
relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)

def reload_alerts(self) -> None:
"""Reload alert rules from disk and push to relation data."""
self._push_alerts_to_all_relation_databags(None)

@property
def endpoints(self) -> List[Dict[str, str]]:
"""A config object ready to be dropped into a prometheus config file.
Expand Down Expand Up @@ -1027,32 +1051,24 @@ def alerts(self) -> dict:
if not alert_rules:
continue

try:
scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
identifier = ProviderTopology.from_relation_data(scrape_metadata).identifier
alerts[identifier] = self._transformer.apply_label_matchers(alert_rules)
except KeyError as e:
logger.warning(
"Relation %s has no 'scrape_metadata': %s",
relation.id,
e,
)

if "groups" not in alert_rules:
logger.warning("No alert groups were found in relation data")
continue
# Construct an ID based on what's in the alert rules
for group in alert_rules["groups"]:
try:
labels = group["rules"][0]["labels"]
identifier = "{}_{}_{}".format(
labels["juju_model"],
labels["juju_model_uuid"],
labels["juju_application"],
)
alerts[identifier] = alert_rules
except KeyError:
logger.error("Alert rules were found but no usable labels were present")
if "groups" not in alert_rules:
logger.warning("No alert groups were found in relation data")
continue
# Construct an ID based on what's in the alert rules
for group in alert_rules["groups"]:
try:
labels = group["rules"][0]["labels"]
identifier = "{}_{}_{}".format(
labels["juju_model"],
labels["juju_model_uuid"],
labels["juju_application"],
)
if identifier not in alerts:
alerts[identifier] = {"groups": [group]}
else:
alerts[identifier]["groups"].append(group)
except KeyError:
logger.error("Alert rules were found but no usable labels were present")

return alerts

Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ profile = "black"
[tool.flake8]
max-line-length = 99
max-doc-length = 99
max-complexity = 10
exclude = [".git", "__pycache__", ".tox", "build", "dist", "*.egg_info", "venv"]
select = ["E", "W", "F", "C", "N", "R", "D", "H"]
# Ignore W503, E501 because using black creates errors with this
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
alert: PrometheusTargetMissing
expr: up{%%juju_topology%%, juju_unit="app/0"} == 0
for: 0m
labels:
severity: critical
juju_unit: app/0
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
33 changes: 22 additions & 11 deletions tests/unit/test_endpoint_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,18 +192,29 @@ def test_each_alert_rule_is_topology_labeled(self, _):
self.assertIn("alert_rules", data)
alerts = json.loads(data["alert_rules"])
self.assertIn("groups", alerts)
self.assertEqual(len(alerts["groups"]), 5)
self.assertEqual(len(alerts["groups"]), 6)
for group in alerts["groups"]:
for rule in group["rules"]:
self.assertIn("labels", rule)
labels = rule["labels"]
self.assertIn("juju_model", labels)
self.assertIn("juju_application", labels)
self.assertIn("juju_model_uuid", labels)
self.assertIn("juju_charm", labels)
# alerts should not have unit information if not already present
self.assertNotIn("juju_unit", rule["labels"])
self.assertNotIn("juju_unit=", rule["expr"])
if "and_unit" not in group["name"]:
self.assertIn("labels", rule)
labels = rule["labels"]
self.assertIn("juju_model", labels)
self.assertIn("juju_application", labels)
self.assertIn("juju_model_uuid", labels)
self.assertIn("juju_charm", labels)
# alerts should not have unit information if not already present
self.assertNotIn("juju_unit", rule["labels"])
self.assertNotIn("juju_unit=", rule["expr"])
else:
self.assertIn("labels", rule)
labels = rule["labels"]
self.assertIn("juju_model", labels)
self.assertIn("juju_application", labels)
self.assertIn("juju_model_uuid", labels)
self.assertIn("juju_charm", labels)
# unit information is already present
self.assertIn("juju_unit", rule["labels"])
self.assertIn("juju_unit=", rule["expr"])

@patch("ops.testing._TestingModelBackend.network_get")
def test_each_alert_expression_is_topology_labeled(self, _):
Expand All @@ -213,7 +224,7 @@ def test_each_alert_expression_is_topology_labeled(self, _):
self.assertIn("alert_rules", data)
alerts = json.loads(data["alert_rules"])
self.assertIn("groups", alerts)
self.assertEqual(len(alerts["groups"]), 5)
self.assertEqual(len(alerts["groups"]), 6)
group = alerts["groups"][0]
for rule in group["rules"]:
self.assertIn("expr", rule)
Expand Down
57 changes: 57 additions & 0 deletions tests/unit/test_remote_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,63 @@ def test_no_remote_write_endpoint_provided(self):
self.harness.update_relation_data(rel_id, "provider/0", {})
assert list(self.harness.charm.remote_write_consumer.endpoints) == []

def test_alert_rule_has_correct_labels(self):
rel_id = self.harness.add_relation(RELATION_NAME, "provider")
self.harness.add_relation_unit(rel_id, "provider/0")
rules = json.loads(
self.harness.get_relation_data(rel_id, self.harness.charm.app)["alert_rules"]
)
for group in rules["groups"]:
if group["name"].endswith("with_template_string_alerts"):
expr = group["rules"][0]["expr"]
self.assertIn("juju_model", expr)
self.assertIn("juju_model_uuid", expr)
self.assertIn("juju_application", expr)
self.assertIn("juju_charm", expr)
self.assertNotIn("juju_unit", expr)
self.assertEqual(
set(group["rules"][0]["labels"]),
{
"juju_application",
"juju_charm",
"juju_model",
"juju_model_uuid",
"severity",
},
)
break
else:
assert False # Could not find the correct alert rule to check

def test_alert_rule_has_correct_labels_with_unit(self):
rel_id = self.harness.add_relation(RELATION_NAME, "provider")
self.harness.add_relation_unit(rel_id, "provider/0")
rules = json.loads(
self.harness.get_relation_data(rel_id, self.harness.charm.app)["alert_rules"]
)
for group in rules["groups"]:
if group["name"].endswith("with_template_string_and_unit_alerts"):
expr = group["rules"][0]["expr"]
self.assertIn("juju_model", expr)
self.assertIn("juju_model_uuid", expr)
self.assertIn("juju_application", expr)
self.assertIn("juju_charm", expr)
self.assertIn("juju_unit", expr)
self.assertEqual(
set(group["rules"][0]["labels"]),
{
"juju_application",
"juju_charm",
"juju_model",
"juju_model_uuid",
"severity",
"juju_unit",
},
)
break
else:
assert False # Could not find the correct alert rule to check


class TestRemoteWriteProvider(unittest.TestCase):
@patch_network_get(private_address="1.1.1.1")
Expand Down

0 comments on commit a369c99

Please sign in to comment.