diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py new file mode 100644 index 00000000..bbf15ed7 --- /dev/null +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -0,0 +1,1544 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +"""## Overview. + +This document explains how to integrate with the Grafana charm +for the purpose of providing a dashboard which can be used by +end users. It also explains the structure of the data +expected by the `grafana-dashboard` interface, and may provide a +mechanism or reference point for providing a compatible interface +or library by providing a definitive reference guide to the +structure of relation data which is shared between the Grafana +charm and any charm providing datasource information. + +## Provider Library Usage + +The Grafana charm interacts with its dashboards using its charm +library. The goal of this library is to be as simple to use as +possible, and instantiation of the class with or without changing +the default arguments provides a complete use case. For the simplest +use case of a charm which bundles dashboards and provides a +`provides: grafana-dashboard` interface, + + requires: + grafana-dashboard: + interface: grafana_dashboard + +creation of a `GrafanaDashboardProvider` object with the default arguments is +sufficient. + +:class:`GrafanaDashboardProvider` expects that bundled dashboards should +be included in your charm with a default path of: + + path/to/charm.py + path/to/src/grafana_dashboards/*.{json|json.tmpl|.tmpl} + +Where the files are Grafana dashboard JSON data either from the +Grafana marketplace, or directly exported from a Grafana instance. +Refer to the [official docs](https://grafana.com/tutorials/provision-dashboards-and-data-sources/) +for more information. 
+ +When constructing a dashboard that is intended to be consumed by COS, make sure to use variables +for your datasources, and name them "prometheusds" and "lokids". You can also use the following +juju topology variables in your dashboards: $juju_model, $juju_model_uuid, $juju_application +and $juju_unit. Note, however, that if metrics are coming via peripheral charms (scrape-config +or cos-config) then topology labels would not exist. + +The default constructor arguments are: + + `charm`: `self` from the charm instantiating this library + `relation_name`: grafana-dashboard + `dashboards_path`: "/src/grafana_dashboards" + +If your configuration requires any changes from these defaults, they +may be set from the class constructor. It may be instantiated as +follows: + + from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider + + class FooCharm: + def __init__(self, *args): + super().__init__(*args, **kwargs) + ... + self.grafana_dashboard_provider = GrafanaDashboardProvider(self) + ... + +The first argument (`self`) should be a reference to the parent (providing +dashboards), as this charm's lifecycle events will be used to re-submit +dashboard information if a charm is upgraded, the pod is restarted, or other. + +An instantiated `GrafanaDashboardProvider` validates that the path specified +in the constructor (or the default) exists, reads the file contents, then +compresses them with LZMA and adds them to the application relation data +when a relation is established with Grafana. + +Provided dashboards will be checked by Grafana, and a series of dropdown menus +providing the ability to select query targets by Juju Model, application instance, +and unit will be added if they do not exist. 
+ +To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation +and rendering occurs on the other side of the relation, and relation data in +the form of: + + { + "event": { + "valid": `true|false`, + "errors": [], + } + } + +Will be returned if rendering or validation fails. In this case, the +`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event +of the type :class:`GrafanaDashboardEvent`, which will contain information +about the validation error. + +This information is added to the relation data for the charms as serialized JSON +from a dict, with a structure of: +``` +{ + "application": { + "dashboards": { + "uuid": a uuid generated to ensure a relation event triggers, + "templates": { + "file:{hash}": { + "content": `{compressed_template_data}`, + "charm": `charm.meta.name`, + "juju_topology": { + "model": `charm.model.name`, + "model_uuid": `charm.model.uuid`, + "application": `charm.app.name`, + "unit": `charm.unit.name`, + } + }, + "file:{other_file_hash}": { + ... + }, + }, + }, + }, +} +``` + +This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration. + +The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to +add dashboards which are not bundled with charms. + +## Consumer Library Usage + +The `GrafanaDashboardConsumer` object may be used by Grafana +charms to manage relations with available dashboards. For this +purpose, a charm consuming Grafana dashboard information should do +the following things: + +1. Instantiate the `GrafanaDashboardConsumer` object by providing it a +reference to the parent (Grafana) charm and, optionally, the name of +the relation that the Grafana charm uses to interact with dashboards. +This relation must confirm to the `grafana-dashboard` interface. 
+ +For example a Grafana charm may instantiate the +`GrafanaDashboardConsumer` in its constructor as follows + + from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer + + def __init__(self, *args): + super().__init__(*args) + ... + self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self) + ... + +2. A Grafana charm also needs to listen to the +`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer` +by adding itself as an observer for these events: + + self.framework.observe( + self.grafana_source_consumer.on.sources_changed, + self._on_dashboards_changed, + ) + +Dashboards can be retrieved the :meth:`dashboards`: + +It will be returned in the format of: + +``` +[ + { + "id": unique_id, + "relation_id": relation_id, + "charm": the name of the charm which provided the dashboard, + "content": compressed_template_data + }, +] +``` + +The consuming charm should decompress the dashboard. +""" + +import base64 +import json +import logging +import lzma +import os +import re +import uuid +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from ops.charm import ( + CharmBase, + HookEvent, + RelationBrokenEvent, + RelationChangedEvent, + RelationCreatedEvent, + RelationEvent, + RelationRole, +) +from ops.framework import ( + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 12 + +logger = logging.getLogger(__name__) + + +DEFAULT_RELATION_NAME = "grafana-dashboard" +DEFAULT_PEER_NAME = "grafana" +RELATION_INTERFACE_NAME = "grafana_dashboard" + +TEMPLATE_DROPDOWNS = [ + { + "allValue": None, + 
"datasource": "${prometheusds}", + "definition": "label_values(up,juju_model)", + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju model", + "multi": False, + "name": "juju_model", + "query": { + "query": "label_values(up,juju_model)", + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju model uuid", + "multi": False, + "name": "juju_model_uuid", + "query": { + "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju application", + "multi": False, + "name": "juju_application", + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "description": None, + "error": None, + "hide": 0, + 
"includeAll": False, + "label": "Juju unit", + "multi": False, + "name": "juju_unit", + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": None, + "multi": False, + "name": "prometheusds", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "type": "datasource", + }, +] + +REACTIVE_CONVERTER = { # type: ignore + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "hosts", + "multi": True, + "name": "host", + "options": [], + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, +} + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + 
): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as " + "interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different direction.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +class InvalidDirectoryPathError(Exception): + """Raised if the grafana dashboards folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + grafana_dashboards_absolute_path: str, + message: str, + ): + self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path + self.message = message + + super().__init__(self.message) + + +def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: + """Resolve the provided path items against the directory of the main file. + + Look up the directory of the charmed operator file being executed. This is normally + going to be the charm.py file of the charm including this library. Then, resolve + the provided path elements and return its absolute path. 
+ + Raises: + InvalidDirectoryPathError if the resolved path does not exist or it is not a directory + + """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not dir_path.exists(): + raise InvalidDirectoryPathError(str(dir_path), "directory does not exist") + if not dir_path.is_dir(): + raise InvalidDirectoryPathError(str(dir_path), "is not a directory") + + return str(dir_path) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +) -> None: + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + named like the value of the `relation_name` argument. 
+ RelationInterfaceMismatchError: If the relation interface of the + relation named as the provided `relation_name` argument does not + match the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation named as the provided `relation_name` + argument has a different role than what is specified by the + `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +def _encode_dashboard_content(content: Union[str, bytes]) -> str: + if isinstance(content, str): + content = bytes(content, "utf-8") + + return base64.b64encode(lzma.compress(content)).decode("utf-8") + + +def _decode_dashboard_content(encoded_content: str) -> str: + return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() + + +def _convert_dashboard_fields(content: str) -> str: + """Make sure values are present for Juju topology. + + Inserts Juju topology variables and selectors into the template, as well as + a variable for Prometheus. 
+ """ + dict_content = json.loads(content) + datasources = {} + existing_templates = False + + # If no existing template variables exist, just insert our own + if "templating" not in dict_content: + dict_content["templating"] = {"list": [d for d in TEMPLATE_DROPDOWNS]} + else: + # Otherwise, set a flag so we can go back later + existing_templates = True + for maybe in dict_content["templating"]["list"]: + # Build a list of `datasource_name`: `datasource_type` mappings + # The "query" field is actually "prometheus", "loki", "influxdb", etc + if "type" in maybe and maybe["type"] == "datasource": + datasources[maybe["name"]] = maybe["query"] + + # Put our own variables in the template + for d in TEMPLATE_DROPDOWNS: + if d not in dict_content["templating"]["list"]: + dict_content["templating"]["list"].insert(0, d) + + dict_content = _replace_template_fields(dict_content, datasources, existing_templates) + + return json.dumps(dict_content) + + +def _replace_template_fields( # noqa: C901 + dict_content: dict, datasources: dict, existing_templates: bool +) -> dict: + """Make templated fields get cleaned up afterwards. + + If existing datasource variables are present, try to substitute them, otherwise + assume they are all for Prometheus and put the prometheus variable there. + """ + replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} + used_replacements = [] + + # If any existing datasources match types we know, or we didn't find + # any templating variables at all, template them. + if datasources or not existing_templates: + panels = dict_content["panels"] + + # Go through all of the panels. If they have a datasource set, AND it's one + # that we can convert to ${lokids} or ${prometheusds}, by stripping off the + # ${} templating and comparing the name to the list we built, replace it, + # otherwise, leave it alone. + # + # COS only knows about Prometheus and Loki. 
+ for panel in panels: + if "datasource" not in panel or not panel.get("datasource", ""): + continue + if not existing_templates: + panel["datasource"] = "${prometheusds}" + else: + if panel["datasource"] in replacements.values(): + # Already a known template variable + continue + if not panel["datasource"]: + # Don't worry about null values + continue + # Strip out variable characters and maybe braces + ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) + replacement = replacements.get(datasources[ds], "") + if replacement: + used_replacements.append(ds) + panel["datasource"] = replacement or panel["datasource"] + + # Put our substitutions back + dict_content["panels"] = panels + + # Finally, go back and pop off the templates we stubbed out + deletions = [] + for tmpl in dict_content["templating"]["list"]: + if tmpl["name"] and tmpl["name"] in used_replacements: + deletions.append(tmpl) + + for d in deletions: + dict_content["templating"]["list"].remove(d) + + return dict_content + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + elif isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + else: + return obj + + +class GrafanaDashboardsChanged(EventBase): + """Event emitted when Grafana dashboards change.""" + + def __init__(self, handle, data=None): + super().__init__(handle) + self.data = data + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return {"data": self.data} + + def restore(self, snapshot): + """Restore grafana source information.""" + self.data = snapshot["data"] + + +class GrafanaDashboardEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboards_changed = EventSource(GrafanaDashboardsChanged) + + +class GrafanaDashboardEvent(EventBase): + """Event emitted when Grafana 
dashboards cannot be resolved. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: List[Dict[str, str]] = [], valid: bool = False): + super().__init__(handle) + self.errors = errors + self.error_message = "; ".join([error["error"] for error in errors if "error" in error]) + self.valid = valid + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return { + "error_message": self.error_message, + "valid": self.valid, + "errors": json.dumps(self.errors), + } + + def restore(self, snapshot): + """Restore grafana source information.""" + self.error_message = snapshot["error_message"] + self.valid = snapshot["valid"] + self.errors = json.loads(snapshot["errors"]) + + +class GrafanaProviderEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboard_status_changed = EventSource(GrafanaDashboardEvent) + + +class GrafanaDashboardProvider(Object): + """An API to provide Grafana dashboards to a Grafana charm.""" + + _stored = StoredState() + on = GrafanaProviderEvents() + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dashboards_path: str = "src/grafana_dashboards", + ) -> None: + """API to provide Grafana dashboard to a Grafana charmed operator. + + The :class:`GrafanaDashboardProvider` object provides an API + to upload dashboards to a Grafana charm. In its most streamlined + usage, the :class:`GrafanaDashboardProvider` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardProvider(self) + + The :class:`GrafanaDashboardProvider` will look for dashboard + templates in the `/grafana_dashboards` folder. + Additionally, dashboard templates can be uploaded programmatically + via the :method:`GrafanaDashboardProvider.add_dashboard` method. 
+ + To use the :class:`GrafanaDashboardProvider` API, you need a relation + defined in your charm operator's metadata.yaml as follows: + + provides: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use relation name other than `grafana-dashboard`, + you will need to specify the relation name via the `relation_name` + argument when instantiating the :class:`GrafanaDashboardProvider` object. + However, it is strongly advised to keep the the default relation name, + so that people deploying your charm will have a consistent experience + with all other charms that provide Grafana dashboards. + + It is possible to provide a different file path for the Grafana dashboards + to be automatically managed by the :class:`GrafanaDashboardProvider` object + via the `dashboards_path` argument. This may be necessary when the directory + structure of your charmed operator repository is not the "usual" one as + generated by `charmcraft init`, for example when adding the charmed operator + in a Java repository managed by Maven or Gradle. However, unless there are + such constraints with other tooling, it is strongly advised to store the + Grafana dashboards in the default `/grafana_dashboards` + folder, in order to provide a consistent experience for other charmed operator + authors. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard". + dashboards_path: a filesystem path relative to the charm root + where dashboard templates can be located. By default, the library + expects dashboard files to be in the `/grafana_dashboards` + directory. 
+ """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path) + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + + self._charm = charm + self._relation_name = relation_name + self._dashboards_path = dashboards_path + + # No peer relation bucket we can rely on providers, keep StoredState here, too + self._stored.set_default(dashboard_templates={}) + + self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) + self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) + + self.framework.observe( + self._charm.on[self._relation_name].relation_created, + self._on_grafana_dashboard_relation_created, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + + def add_dashboard(self, content: str) -> None: + """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`. + + Args: + content: a string representing a Jinja template. Currently, no + global variables are added to the Jinja template evaluation + context. + """ + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates = self._stored.dashboard_templates + + encoded_dashboard = _encode_dashboard_content(content) + + # Use as id the first chars of the encoded dashboard, so that + # it is predictable across units. 
+ id = "prog:{}".format(encoded_dashboard[-24:-16]) + stored_dashboard_templates[id] = self._content_to_dashboard_object(encoded_dashboard) + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def remove_non_builtin_dashboards(self) -> None: + """Remove all dashboards to the relation added via :method:`add_dashboard`.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("prog:"): + del stored_dashboard_templates[dashboard_id] + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def update_dashboards(self) -> None: + """Trigger the re-evaluation of the data on all relations.""" + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None: + """Scans the built-in dashboards and updates relations with changes.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + + # Ensure we do not leave outdated dashboards by removing from stored all + # the encoded dashboards that start with "file/". 
+ if self._dashboards_path: + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("file:"): + del stored_dashboard_templates[dashboard_id] + + # Path.glob uses fnmatch on the backend, which is pretty limited, so use a + # custom function for the filter + def _is_dashbaord(p: Path) -> bool: + return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + + for path in filter(_is_dashbaord, Path(self._dashboards_path).glob("*")): + # path = Path(path) + id = "file:{}".format(path.stem) + stored_dashboard_templates[id] = self._content_to_dashboard_object( + _encode_dashboard_content(path.read_bytes()) + ) + + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _reinitialize_dashboard_data(self) -> None: + """Triggers a reload of dashboard outside of an eventing workflow. + + This will destroy any existing relation data. 
+ """ + try: + _resolve_dir_against_charm_path(self._charm, self._dashboards_path) + self._update_all_dashboards_from_dir() + + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("file:"): + del stored_dashboard_templates[dashboard_id] + self._stored.dashboard_templates = stored_dashboard_templates + + # With all of the file-based dashboards cleared out, force a refresh + # of relation data + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None: + """Watch for a relation being created and automatically send dashboards. + + Args: + event: The :class:`RelationJoinedEvent` sent when a + `grafana_dashboaard` relationship is joined + """ + if self._charm.unit.is_leader(): + self._upset_dashboards_on_relation(event.relation) + + def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: + """Watch for changes so we know if there's an error to signal back to the parent charm. + + Args: + event: The `RelationChangedEvent` that triggered this handler. 
+ """ + if self._charm.unit.is_leader(): + data = json.loads(event.relation.data[event.app].get("event", "{}")) + + if not data: + return + + valid = bool(data.get("valid", True)) + errors = data.get("errors", []) + if valid and not errors: + self.on.dashboard_status_changed.emit(valid=valid) + else: + self.on.dashboard_status_changed.emit(valid=valid, errors=errors) + + def _upset_dashboards_on_relation(self, relation: Relation) -> None: + """Update the dashboards in the relation data bucket.""" + # It's completely ridiculous to add a UUID, but if we don't have some + # pseudo-random value, this never makes it across 'juju set-state' + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), + "uuid": str(uuid.uuid4()), + } + + relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + def _content_to_dashboard_object(self, content: str) -> Dict: + return { + "charm": self._charm.meta.name, + "content": content, + "juju_topology": self._juju_topology, + } + + # This is not actually used in the dashboards, but is present to provide a secondary + # salt to ensure uniqueness in the dict keys in case individual charm units provide + # dashboards + @property + def _juju_topology(self) -> Dict: + return { + "model": self._charm.model.name, + "model_uuid": self._charm.model.uuid, + "application": self._charm.app.name, + "unit": self._charm.unit.name, + } + + @property + def dashboard_templates(self) -> List: + """Return a list of the known dashboard templates.""" + return [v for v in self._stored.dashboard_templates.values()] + + +class GrafanaDashboardConsumer(Object): + """A consumer object for working with Grafana Dashboards.""" + + on = GrafanaDashboardEvents() + _stored = StoredState() + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + ) -> None: + """API to receive Grafana dashboards from charmed operators. 
+ + The :class:`GrafanaDashboardConsumer` object provides an API + to consume dashboards provided by a charmed operator using the + :class:`GrafanaDashboardProvider` library. The + :class:`GrafanaDashboardConsumer` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardConsumer(self) + + To use this library, you need a relation defined as follows in + your charm operator's metadata.yaml: + + requires: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use a different relation name than + `grafana-dashboard`, you need to specify the relation name via the + `relation_name` argument. However, it is strongly advised not to + change the default, so that people deploying your charm will have + a consistent experience with all other charms that consume Grafana + dashboards. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard". + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + + self._stored.set_default(dashboards=dict()) + + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_broken, + self._on_grafana_dashboard_relation_broken, + ) + self.framework.observe( + self._charm.on[DEFAULT_PEER_NAME].relation_changed, + self._on_grafana_peer_changed, + ) + + def get_dashboards_from_relation(self, relation_id: int) -> List: + """Get a list of known dashboards for one instance of the monitored relation. 
+ + Args: + relation_id: the identifier of the relation instance, as returned by + :method:`ops.model.Relation.id`. + + Returns: a list of known dashboards coming from the provided relation instance. + """ + return [ + self._to_external_object(relation_id, dashboard) + for dashboard in self._get_stored_dashboards(relation_id) + ] + + def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: + """Handle relation changes in related providers. + + If there are changes in relations between Grafana dashboard consumers + and providers, this event handler (if the unit is the leader) will + get data for an incoming grafana-dashboard relation through a + :class:`GrafanaDashboardsChanged` event, and make the relation data + available in the app's datastore object. The Grafana charm can + then respond to the event to update its configuration. + """ + changes = False + if self._charm.unit.is_leader(): + changes = self._render_dashboards_and_signal_changed(event.relation) + + if changes: + self.on.dashboards_changed.emit() + + def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: + """Emit dashboard events on peer events so secondary charm data updates.""" + if self._charm.unit.is_leader(): + return + self.on.dashboards_changed.emit() + + def update_dashboards(self, relation: Optional[Relation] = None) -> None: + """Re-establish dashboards on one or more relations. + + If something changes between this library and a datasource, try to re-establish + invalid dashboards and invalidate active ones. + + Args: + relation: a specific relation for which the dashboards have to be + updated. If not specified, all relations managed by this + :class:`GrafanaDashboardConsumer` will be updated. 
+        """
+        changes = False
+        if self._charm.unit.is_leader():
+            relations = (
+                [relation] if relation else self._charm.model.relations[self._relation_name]
+            )
+
+            # Capture the return value: previously it was discarded, so `changes`
+            # stayed False and `dashboards_changed` was never emitted from here.
+            # Also use a distinct loop variable so the `relation` argument is not
+            # shadowed.
+            for dashboard_relation in relations:
+                changes = (
+                    self._render_dashboards_and_signal_changed(dashboard_relation) or changes
+                )
+
+        if changes:
+            self.on.dashboards_changed.emit()
+
+    def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None:
+        """Update job config when providers depart.
+
+        When a Grafana dashboard provider departs, the configuration
+        for that provider is removed from the list of dashboards
+        """
+        if not self._charm.unit.is_leader():
+            return
+
+        self._remove_all_dashboards_for_relation(event.relation)
+
+    def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # type: ignore
+        """Validate a given dashboard.
+
+        Verify that the passed dashboard data is able to be found in our list
+        of datasources and will render. If they do, let the charm know by
+        emitting an event.
+
+        Args:
+            relation: Relation; The relation the dashboard is associated with.
+
+        Returns:
+            a boolean indicating whether an event should be emitted
+        """
+        other_app = relation.app
+
+        raw_data = relation.data[other_app].get("dashboards", {})
+
+        if not raw_data:
+            logger.warning(
+                "No dashboard data found in the %s:%s relation",
+                self._relation_name,
+                str(relation.id),
+            )
+            return False
+
+        data = json.loads(raw_data)
+
+        # The only piece of data needed on this side of the relations is "templates"
+        templates = data.pop("templates")
+
+        # Import only if a charmed operator uses the consumer, we don't impose these
+        # dependencies on the client
+        from jinja2 import Template  # type: ignore
+        from jinja2.exceptions import TemplateSyntaxError  # type: ignore
+
+        # The dashboards are WAY too big since this ultimately calls out to Juju to
+        # set the relation data, and it overflows the maximum argument length for
+        # subprocess, so we have to use b64, annoyingly.
+ # Worse, Python3 expects absolutely everything to be a byte, and a plain + # `base64.b64encode()` is still too large, so we have to go through hoops + # of encoding to byte, compressing with lzma, converting to base64 so it + # can be converted to JSON, then all the way back. + + rendered_dashboards = [] + relation_has_invalid_dashboards = False + + for _, (fname, template) in enumerate(templates.items()): + decoded_content = None + content = None + error = None + try: + decoded_content = _decode_dashboard_content(template["content"]) + content = Template(decoded_content).render() + content = _encode_dashboard_content(_convert_dashboard_fields(content)) + except lzma.LZMAError as e: + error = str(e) + relation_has_invalid_dashboards = True + except json.JSONDecodeError as e: + error = str(e.msg) + logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) + continue + except TemplateSyntaxError as e: + error = str(e) + relation_has_invalid_dashboards = True + + # Prepend the relation name and ID to the dashboard ID to avoid clashes with + # multiple relations with apps from the same charm, or having dashboards with + # the same ids inside their charm operators + rendered_dashboards.append( + { + "id": "{}:{}/{}".format(relation.name, relation.id, fname), + "original_id": fname, + "content": content if content else None, + "template": template, + "valid": (error is None), + "error": error, + } + ) + + if relation_has_invalid_dashboards: + self._remove_all_dashboards_for_relation(relation) + + invalid_templates = [ + data["original_id"] for data in rendered_dashboards if not data["valid"] + ] + + logger.warning( + "Cannot add one or more Grafana dashboards from relation '{}:{}': the following " + "templates are invalid: {}".format( + relation.name, + relation.id, + invalid_templates, + ) + ) + + relation.data[self._charm.app]["event"] = json.dumps( + { + "errors": [ + { + "dashboard_id": rendered_dashboard["original_id"], + "error": 
rendered_dashboard["error"], + } + for rendered_dashboard in rendered_dashboards + if rendered_dashboard["error"] + ] + } + ) + + # Dropping dashboards for a relation needs to be signalled + return True + else: + stored_data = rendered_dashboards + currently_stored_data = self._get_stored_dashboards(relation.id) + + coerced_data = ( + _type_convert_stored(currently_stored_data) if currently_stored_data else {} + ) + + if not coerced_data == stored_data: + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards[relation.id] = stored_data + self.set_peer_data("dashboards", stored_dashboards) + return True + + def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: + """If an errored dashboard is in stored data, remove it and trigger a deletion.""" + if self._get_stored_dashboards(relation.id): + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards.pop(str(relation.id)) + self.set_peer_data("dashboards", stored_dashboards) + self.on.dashboards_changed.emit() + + def _to_external_object(self, relation_id, dashboard): + return { + "id": dashboard["original_id"], + "relation_id": relation_id, + "charm": dashboard["template"]["charm"], + "content": _decode_dashboard_content(dashboard["content"]), + } + + @property + def dashboards(self) -> List[Dict]: + """Get a list of known dashboards across all instances of the monitored relation. + + Returns: a list of known dashboards. The JSON of each of the dashboards is available + in the `content` field of the corresponding `dict`. 
+ """ + dashboards = [] + + for _, (relation_id, dashboards_for_relation) in enumerate( + self.get_peer_data("dashboards").items() + ): + for dashboard in dashboards_for_relation: + dashboards.append(self._to_external_object(relation_id, dashboard)) + + return dashboards + + def _get_stored_dashboards(self, relation_id: int) -> list: + """Pull stored dashboards out of the peer data bucket.""" + return self.get_peer_data("dashboards").get(str(relation_id), {}) + + def _set_default_data(self) -> None: + """Set defaults if they are not in peer relation data.""" + data = {"dashboards": {}} # type: ignore + for k, v in data.items(): + if not self.get_peer_data(k): + self.set_peer_data(k, v) + + def set_peer_data(self, key: str, data: Any) -> None: + """Put information into the peer data bucket instead of `StoredState`.""" + self._charm.peers.data[self._charm.app][key] = json.dumps(data) # type: ignore + + def get_peer_data(self, key: str) -> Any: + """Retrieve information from the peer data bucket instead of `StoredState`.""" + data = self._charm.peers.data[self._charm.app].get(key, "") # type: ignore + return json.loads(data) if data else {} + + +class GrafanaDashboardAggregator(Object): + """API to retrieve Grafana dashboards from machine dashboards. + + The :class:`GrafanaDashboardAggregator` object provides a way to + collate and aggregate Grafana dashboards from reactive/machine charms + and transport them into Charmed Operators, using Juju topology. + + For detailed usage instructions, see the documentation for + :module:`lma-proxy-operator`, as this class is intended for use as a + single point of intersection rather than use in individual charms. 
+ + Since :class:`GrafanaDashboardAggregator` serves as a bridge between + Canonical Observability Stack Charmed Operators and Reactive Charms, + deployed in a Reactive Juju model, both a target relation which is + used to collect events from Reactive charms and a `grafana_relation` + which is used to send the collected data back to the Canonical + Observability Stack are required. + + In its most streamlined usage, :class:`GrafanaDashboardAggregator` is + integrated in a charmed operator as follows: + + self.grafana = GrafanaDashboardAggregator(self) + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + target_relation: a :string: name of a relation managed by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with reactive/machine charms it defaults to "dashboards". + grafana_relation: a :string: name of a relation used by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with charmed grafana. It defaults to "downstream-grafana-dashboard" + """ + + _stored = StoredState() + on = GrafanaProviderEvents() + + def __init__( + self, + charm: CharmBase, + target_relation: str = "dashboards", + grafana_relation: str = "downstream-grafana-dashboard", + ): + super().__init__(charm, grafana_relation) + + # Reactive charms may be RPC-ish and not leave reliable data around. 
Keep + # StoredState here + self._stored.set_default( + dashboard_templates={}, + id_mappings={}, + ) + + self._charm = charm + self._target_relation = target_relation + self._grafana_relation = grafana_relation + + self.framework.observe( + self._charm.on[self._grafana_relation].relation_joined, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._grafana_relation].relation_changed, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_changed, + self.update_dashboards, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_broken, + self.remove_dashboards, + ) + + def update_dashboards(self, event: RelationEvent) -> None: + """If we get a dashboard from a reactive charm, parse it out and update.""" + if self._charm.unit.is_leader(): + self._upset_dashboards_on_event(event) + + def _upset_dashboards_on_event(self, event: RelationEvent) -> None: + """Update the dashboards in the relation data bucket.""" + dashboards = self._handle_reactive_dashboards(event) + + if not dashboards: + logger.warning( + "Could not find dashboard data after a relation change for {}".format(event.app) + ) + return + + for id in dashboards: + self._stored.dashboard_templates[id] = self._content_to_dashboard_object( + dashboards[id], event + ) + + self._stored.id_mappings[event.app.name] = dashboards + self._update_remote_grafana(event) + + def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: + """Push dashboards to the downstream Grafana relation.""" + # It's still ridiculous to add a UUID here, but needed + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), + "uuid": str(uuid.uuid4()), + } + + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + def remove_dashboards(self, event: RelationBrokenEvent) -> None: + 
"""Remove a dashboard if the relation is broken."""
+        # Guard: a reactive app may depart without ever having contributed
+        # dashboards (no `request_*` data was seen), in which case there is no
+        # mapping stored for it and the previously unguarded lookup raised
+        # KeyError, crashing the relation-broken hook.
+        if event.app.name not in self._stored.id_mappings:
+            logger.info("Could not look up stored dashboards for %s", event.app.name)
+            return
+
+        app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name])
+
+        del self._stored.id_mappings[event.app.name]
+        for id in app_ids:
+            del self._stored.dashboard_templates[id]
+
+        stored_data = {
+            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "uuid": str(uuid.uuid4()),
+        }
+
+        for grafana_relation in self.model.relations[self._grafana_relation]:
+            grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+
+    # Yes, this has a fair amount of branching. It's not that complex, though
+    def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
+        """Remove existing reactive charm datasource templating out.
+
+        This method iterates through *known* places where reactive charms may set
+        data in contributed dashboards and removes them.
+
+        `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from
+        the Grafana UI. It is not present in earlier Grafana versions, and can be disabled
+        in 5.3.4 and above (optionally). If set, any values present will be substituted on
+        import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors
+        for datasources, and leaving this present results in "default" datasource values
+        which are broken.
+
+        Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to
+        set a `host` variable for use in dashboards which is not meaningful in the context
+        of Juju topology and will yield broken dashboards.
+
+        Further properties may be discovered.
+        """
+        dash = template["dashboard"]
+        try:
+            if "list" in dash["templating"]:
+                for i in range(len(dash["templating"]["list"])):
+                    if (
+                        "datasource" in dash["templating"]["list"][i]
+                        and "Juju" in dash["templating"]["list"][i]["datasource"]
+                    ):
+                        dash["templating"]["list"][i]["datasource"] = r"${prometheusds}"
+                    if (
+                        "name" in dash["templating"]["list"][i]
+                        and dash["templating"]["list"][i]["name"] == "host"
+                    ):
+                        dash["templating"]["list"][i] = REACTIVE_CONVERTER
+        except KeyError:
+            logger.debug("No existing templating data in dashboard")
+
+        if "__inputs" in dash:
+            # Filter instead of deleting in place: deleting items from the list
+            # while indexing over its original length skips elements and can raise
+            # IndexError. (The previous `inputs = dash` was an alias, not a copy,
+            # so `if inputs:` was always truthy and the fallback `del` branch was
+            # unreachable.)
+            inputs = [i for i in dash["__inputs"] if i.get("pluginName") != "Prometheus"]
+            if inputs:
+                dash["__inputs"] = inputs
+            else:
+                del dash["__inputs"]
+
+        template["dashboard"] = dash
+        return template
+
+    def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]:
+        """Look for a dashboard in relation data (during a reactive hook) or builtin by name."""
+        templates = []
+        id = ""
+
+        # Reactive data can reliably be pulled out of events.
In theory, if we got an event, + # it's on the bucket, but using event explicitly keeps the mental model in + # place for reactive + for k in event.relation.data[event.unit].keys(): + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) + + for k in event.relation.data[event.app].keys(): + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) + + builtins = self._maybe_get_builtin_dashboards(event) + + if not templates and not builtins: + return {} + + dashboards = {} + for t in templates: + # Replace values with LMA-style templating + t = self._strip_existing_datasources(t) + + # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON + # in the bucket back out to the actual "dashboard" we _need_, this is the way + # This is not a mistake -- there's a double nesting in reactive charms, and + # Grafana won't load it. We have to unbox: + # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], + # and the final unboxing is below. + dash = json.dumps(t["dashboard"]) + + # Replace the old-style datasource templates + dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) + dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + + from jinja2 import Template + + content = _encode_dashboard_content( + Template(dash).render(host=event.unit.name, datasource="prometheus") + ) + id = "prog:{}".format(content[-24:-16]) + + dashboards[id] = content + return {**builtins, **dashboards} + + def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: + """Tries to match the event with an included dashboard. + + Scans dashboards packed with the charm instantiating this class, and tries to match + one with the event. There is no guarantee that any given event will match a builtin, + since each charm instantiating this class may include a different set of dashboards, + or none. 
+        """
+        builtins = {}
+        dashboards_path = None
+
+        try:
+            dashboards_path = _resolve_dir_against_charm_path(
+                self._charm, "src/grafana_dashboards"
+            )
+        except InvalidDirectoryPathError as e:
+            logger.warning(
+                "Invalid Grafana dashboards folder at %s: %s",
+                e.grafana_dashboards_absolute_path,
+                e.message,
+            )
+
+        if dashboards_path:
+
+            # Fixed: `p.is_file` is a bound method and is always truthy; it must
+            # be called. Also corrected the `_is_dashbaord` typo in the helper name.
+            def _is_dashboard(p: Path) -> bool:
+                return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl"))
+
+            for path in filter(_is_dashboard, Path(dashboards_path).glob("*")):
+                if event.app.name in path.name:
+                    id = "file:{}".format(path.stem)
+                    builtins[id] = self._content_to_dashboard_object(
+                        _encode_dashboard_content(path.read_bytes()), event
+                    )
+
+        return builtins
+
+    def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict:
+        return {
+            "charm": event.app.name,
+            "content": content,
+            "juju_topology": self._juju_topology(event),
+        }
+
+    # This is not actually used in the dashboards, but is present to provide a secondary
+    # salt to ensure uniqueness in the dict keys in case individual charm units provide
+    # dashboards
+    def _juju_topology(self, event: RelationEvent) -> Dict:
+        return {
+            "model": self._charm.model.name,
+            "model_uuid": self._charm.model.uuid,
+            "application": event.app.name,
+            "unit": event.unit.name,
+        }
diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py
index a065dd53..c985b1e7 100644
--- a/lib/charms/observability_libs/v0/juju_topology.py
+++ b/lib/charms/observability_libs/v0/juju_topology.py
@@ -76,7 +76,7 @@
 
 LIBID = "bced1658f20f49d28b88f61f83c2d232"
 LIBAPI = 0
-LIBPATCH = 1
+LIBPATCH = 2
 
 
 class InvalidUUIDError(Exception):
@@ -126,9 +126,27 @@ def __init__(
         self._unit = unit
 
     def is_valid_uuid(self, uuid):
-        """Validates the supplied UUID against the Juju Model UUID pattern."""
+        """Validate the supplied UUID against the Juju Model UUID pattern."""
+        # TODO:
+        # Harness is
harcoding an UUID that is v1 not v4: f2c1b2a6-e006-11eb-ba80-0242ac130004 + # See: https://github.com/canonical/operator/issues/779 + # + # >>> uuid.UUID("f2c1b2a6-e006-11eb-ba80-0242ac130004").version + # 1 + # + # we changed the validation of the 3ed UUID block: 4[a-f0-9]{3} -> [a-f0-9]{4} + # See: https://github.com/canonical/operator/blob/main/ops/testing.py#L1094 + # + # Juju in fact generates a UUID v4: https://github.com/juju/utils/blob/master/uuid.go#L62 + # but does not validate it is actually v4: + # See: + # - https://github.com/juju/utils/blob/master/uuid.go#L22 + # - https://github.com/juju/schema/blob/master/strings.go#L79 + # + # Once Harness fixes this, we should remove this comment and refactor the regex or + # the entire method using the uuid module to validate UUIDs regex = re.compile( - "^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$" + "^[a-f0-9]{8}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$" ) return bool(regex.match(uuid)) diff --git a/lib/charms/traefik_k8s/v0/ingress_per_unit.py b/lib/charms/traefik_k8s/v0/ingress_per_unit.py index 21c58a98..59dfebfa 100644 --- a/lib/charms/traefik_k8s/v0/ingress_per_unit.py +++ b/lib/charms/traefik_k8s/v0/ingress_per_unit.py @@ -10,15 +10,13 @@ ## Getting Started To get started using the library, you just need to fetch the library using `charmcraft`. -**Note that you also need to add the `serialized_data_interface` dependency to your -charm's `requirements.txt`.** ```shell -cd some-charm charmcraft fetch-lib charms.traefik_k8s.v0.ingress_per_unit -echo -e "serialized_data_interface\n" >> requirements.txt ``` +Add the `jsonschema` dependency to the `requirements.txt` of your charm. 
+ ```yaml requires: ingress: @@ -48,285 +46,534 @@ def _handle_ingress_per_unit(self, event): logger.info("This unit's ingress URL: %s", self.ingress_per_unit.url) ``` """ - import logging -from typing import Optional - -from ops.charm import CharmBase, RelationBrokenEvent, RelationEvent, RelationRole -from ops.framework import EventSource -from ops.model import Relation, Unit - -try: - from serialized_data_interface import EndpointWrapper - from serialized_data_interface.errors import RelationDataError - from serialized_data_interface.events import EndpointWrapperEvents -except ImportError: - import os - - library_name = os.path.basename(__file__) - raise ModuleNotFoundError( - "To use the '{}' library, you must include " - "the '{}' package in your dependencies".format(library_name, "serialized_data_interface") - ) from None # Suppress original ImportError - -try: - # introduced in 3.9 - from functools import cache # type: ignore -except ImportError: - from functools import lru_cache - - cache = lru_cache(maxsize=None) +import socket +import typing +from typing import Dict, Optional, Union + +import ops.model +import yaml +from ops.charm import CharmBase, RelationBrokenEvent, RelationEvent +from ops.framework import EventSource, Object, ObjectEvents +from ops.model import ( + ActiveStatus, + Application, + BlockedStatus, + Relation, + StatusBase, + Unit, + WaitingStatus, +) # The unique Charmhub library identifier, never change it -LIBID = "7ef06111da2945ed84f4f5d4eb5b353a" # can't register a library until the charm is in the store 9_9 +LIBID = "7ef06111da2945ed84f4f5d4eb5b353a" # Increment this major API version when introducing breaking changes LIBAPI = 0 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 10 log = logging.getLogger(__name__) -INGRESS_SCHEMA = { - "v1": { - "requires": { - "unit": { - "type": "object", - "properties": { - "model": {"type": 
"string"}, - "name": {"type": "string"}, - "host": {"type": "string"}, - "port": {"type": "integer"}, - }, - "required": ["model", "name", "host", "port"], - } - }, - "provides": { - "app": { - "type": "object", - "properties": { - "ingress": { - "type": "object", - "patternProperties": { - "": { - "type": "object", - "properties": {"url": {"type": "string"}}, - "required": ["url"], - } - }, - } - }, - "required": ["ingress"], - } +try: + import jsonschema + + DO_VALIDATION = True +except ModuleNotFoundError: + log.warning( + "The `ingress_per_unit` library needs the `jsonschema` package to be able " + "to do runtime data validation; without it, it will still work but validation " + "will be disabled. \n" + "It is recommended to add `jsonschema` to the 'requirements.txt' of your charm, " + "which will enable this feature." + ) + DO_VALIDATION = False + +# LIBRARY GLOBS +RELATION_INTERFACE = "ingress_per_unit" +DEFAULT_RELATION_NAME = RELATION_INTERFACE.replace("_", "-") + +INGRESS_REQUIRES_UNIT_SCHEMA = { + "type": "object", + "properties": { + "model": {"type": "string"}, + "name": {"type": "string"}, + "host": {"type": "string"}, + "port": {"type": "integer"}, + }, + "required": ["model", "name", "host", "port"], +} +INGRESS_PROVIDES_APP_SCHEMA = { + "type": "object", + "properties": { + "ingress": { + "type": "object", + "patternProperties": { + "": { + "type": "object", + "properties": { + "url": {"type": "string"}, + }, + "required": ["url"], + } + }, }, - } + # Optional key for backwards compatibility + # with legacy requirers based on SDI + "_supported_versions": {"type": "string"}, + }, + "required": ["ingress"], } -class IngressPerUnitRequestEvent(RelationEvent): - """Event representing an incoming request. 
+# TYPES +try: + from typing import TypedDict +except ImportError: + from typing_extensions import TypedDict # py35 compat + + +class RequirerData(TypedDict): # pyright: reportGeneralTypeIssues=false + """Model of the data a unit implementing the requirer will need to provide.""" + + model: str + name: str + host: str + port: int + + +RequirerUnitData = Dict[Unit, "RequirerData"] +KeyValueMapping = Dict[str, str] +ProviderApplicationData = Dict[str, KeyValueMapping] + + +def _validate_data(data, schema): + """Checks whether `data` matches `schema`. - This is equivalent to the "ready" event, but is more semantically meaningful. + Will raise DataValidationError if the data is not valid, else return None. """ + if not DO_VALIDATION: + return + try: + jsonschema.validate(instance=data, schema=schema) + except jsonschema.ValidationError as e: + raise DataValidationError(data, schema) from e -class IngressPerUnitProviderEvents(EndpointWrapperEvents): - """Container for IUP events.""" +# EXCEPTIONS +class DataValidationError(RuntimeError): + """Raised when data validation fails on IPU relation data.""" - request = EventSource(IngressPerUnitRequestEvent) +class RelationException(RuntimeError): + """Base class for relation exceptions from this library. -class IngressPerUnitProvider(EndpointWrapper): - """Implementation of the provider of ingress_per_unit.""" + Attributes: + relation: The Relation which caused the exception. + entity: The Application or Unit which caused the exception. 
+ """ - ROLE = RelationRole.provides.name - INTERFACE = "ingress_per_unit" - SCHEMA = INGRESS_SCHEMA + def __init__(self, relation: Relation, entity: Union[Application, Unit]): + super().__init__(relation) + self.args = ( + "There is an error with the relation {}:{} with {}".format( + relation.name, relation.id, entity.name + ), + ) + self.relation = relation + self.entity = entity - on = IngressPerUnitProviderEvents() - def __init__(self, charm: CharmBase, endpoint: str = None): - """Constructor for IngressPerUnitProvider. +class RelationDataMismatchError(RelationException): + """Data from different units do not match where they should.""" + + +class RelationPermissionError(RelationException): + """Ingress is requested to do something for which it lacks permissions.""" + + def __init__(self, relation: Relation, entity: Union[Application, Unit], message: str): + super(RelationPermissionError, self).__init__(relation, entity) + self.args = ( + "Unable to write data to relation '{}:{}' with {}: {}".format( + relation.name, relation.id, entity.name, message + ), + ) + + +# EVENTS +class RelationAvailableEvent(RelationEvent): + """Event triggered when a relation is ready to provide ingress.""" + + +class RelationFailedEvent(RelationEvent): + """Event triggered when something went wrong with a relation.""" + + +class RelationReadyEvent(RelationEvent): + """Event triggered when a remote relation has the expected data.""" + + +class IngressPerUnitEvents(ObjectEvents): + """Container for events for IngressPerUnit.""" + + available = EventSource(RelationAvailableEvent) + ready = EventSource(RelationReadyEvent) + failed = EventSource(RelationFailedEvent) + broken = EventSource(RelationBrokenEvent) + + +class _IngressPerUnitBase(Object): + """Base class for IngressPerUnit interface classes.""" + + if typing.TYPE_CHECKING: + + @property + def on(self) -> IngressPerUnitEvents: + ... 
# noqa + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """Constructor for _IngressPerUnitBase. Args: charm: The charm that is instantiating the instance. - endpoint: The name of the relation endpoint to bind to + relation_name: The name of the relation name to bind to (defaults to "ingress-per-unit"). """ - super().__init__(charm, endpoint) - self.framework.observe(self.on.ready, self._emit_request_event) - - def _emit_request_event(self, event): - self.on.request.emit(event.relation) - - def get_request(self, relation: Relation): - """Get the IngressRequest for the given Relation.""" - return IngressRequest(self, relation) + super().__init__(charm, relation_name) + self.charm: CharmBase = charm + + self.relation_name = relation_name + self.app = self.charm.app + self.unit = self.charm.unit + + observe = self.framework.observe + rel_events = charm.on[relation_name] + observe(rel_events.relation_created, self._handle_relation) + observe(rel_events.relation_joined, self._handle_relation) + observe(rel_events.relation_changed, self._handle_relation) + observe(rel_events.relation_broken, self._handle_relation_broken) + observe(charm.on.leader_elected, self._handle_upgrade_or_leader) + observe(charm.on.upgrade_charm, self._handle_upgrade_or_leader) - @cache - def is_failed(self, relation: Relation = None): - """Checks whether the given relation, or any relation if not specified, has an error.""" + @property + def relations(self): + """The list of Relation instances associated with this relation_name.""" + return list(self.charm.model.relations[self.relation_name]) + + def _handle_relation(self, event): + relation = event.relation + if self.is_ready(relation): + self.on.ready.emit(relation) + elif self.is_available(relation): + self.on.available.emit(relation) + elif self.is_failed(relation): + self.on.failed.emit(relation) + else: + log.debug( + "Relation {} is neither ready, nor available, nor failed. 
" + "Something fishy's going on...".format(relation) + ) + + def get_status(self, relation: Relation) -> StatusBase: + """Get the suggested status for the given Relation.""" + if self.is_failed(relation): + return BlockedStatus( + "Error handling relation {}:{}".format(relation.name, relation.id) + ) + elif not self.is_available(relation): + return WaitingStatus("Waiting on relation {}:{}".format(relation.name, relation.id)) + elif not self.is_ready(relation): + return WaitingStatus("Waiting on relation {}:{}".format(relation.name, relation.id)) + else: + return ActiveStatus() + + def _handle_relation_broken(self, event): + self.on.broken.emit(event.relation) + + def _handle_upgrade_or_leader(self, _): + pass + + def is_available(self, relation: Optional[Relation] = None) -> bool: + """Check whether the given relation is available. + + Or any relation if not specified. + """ if relation is None: - return any(self.is_failed(relation) for relation in self.relations) - if not relation.units: + return any(map(self.is_available, self.relations)) + if relation.app is None: + return False + if not relation.app.name: + # Juju doesn't provide JUJU_REMOTE_APP during relation-broken + # hooks. See https://github.com/canonical/operator/issues/693. + # Relation in the process of breaking cannot be available. return False - if super().is_failed(relation): - return True - data = self.unwrap(relation) - prev_fields = None - for unit in relation.units: - if not data[unit]: - continue - new_fields = {field: data[unit][field] for field in ("model", "port")} - if prev_fields is None: - prev_fields = new_fields - if new_fields != prev_fields: - raise RelationDataMismatchError(relation, unit) - return False - @property - def proxied_endpoints(self): - """Returns the ingress settings provided to units by this IngressPerUnitProvider. 
+ return True - For example, when this IngressPerUnitProvider has provided the - `http://foo.bar/my-model.my-app-1` and `http://foo.bar/my-model.my-app-2` URLs to - the two units of the my-app application, the returned dictionary will be: + def is_ready(self, relation: Optional[Relation] = None) -> bool: + """Checks whether the given relation is ready. - ``` - { - "my-app/1": { - "url": "http://foo.bar/my-model.my-app-1" - }, - "my-app/2": { - "url": "http://foo.bar/my-model.my-app-2" - } - } - ``` + Or any relation if not specified. + A given relation is ready if the remote side has sent valid data. + The base implementation does nothing but check that the relation is + available. It's up to subclasses to decide what it means for the + relation to be actually 'ready'. """ - results = {} + if relation is None: + return any(map(self.is_ready, self.relations)) + return self.is_available(relation) - for ingress_relation in self.charm.model.relations[self.endpoint]: - results.update(self.unwrap(ingress_relation)[self.charm.app].get("ingress", {})) + def is_failed(self, _: Optional[Relation] = None) -> bool: + """Checks whether the given relation is failed. - return results + Or any relation if not specified. + """ + raise NotImplementedError("implement in subclass") -class IngressRequest: - """A request for per-unit ingress.""" +class IngressPerUnitProvider(_IngressPerUnitBase): + """Implementation of the provider of ingress_per_unit.""" - def __init__(self, provider: IngressPerUnitProvider, relation: Relation): - """Construct an IngressRequest.""" - self._provider = provider - self._relation = relation - self._data = provider.unwrap(relation) + on = IngressPerUnitEvents() - @property - def model(self): - """The name of the model the request was made from.""" - return self._get_data_from_first_unit("model") + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """Constructor for IngressPerUnitProvider. 
- @property - def app(self): - """The remote application.""" - return self._relation.app + Args: + charm: The charm that is instantiating the instance. + relation_name: The name of the relation relation_name to bind to + (defaults to "ingress-per-unit"). + """ + super().__init__(charm, relation_name) + observe = self.framework.observe + observe(self.charm.on[relation_name].relation_joined, self._share_version_info) - @property - def app_name(self): - """The name of the remote app. + def _share_version_info(self, event): + """Backwards-compatibility shim for version negotiation. - Note: This is not the same as `self.app.name` when using CMR relations, - since `self.app.name` is replaced by a `remote-{UUID}` pattern. + Allows older versions of IPU (requirer side) to interact with this + provider without breaking. + Will be removed in a future version of this library. + Do not use. """ - first_unit_name = self._get_data_from_first_unit("name") + relation = event.relation + if self.charm.unit.is_leader(): + log.info("shared supported_versions shim information") + relation.data[self.charm.app]["_supported_versions"] = "- v1" - if first_unit_name: - return first_unit_name.split("/")[0] + def is_ready(self, relation: Optional[Relation] = None) -> bool: + """Checks whether the given relation is ready. - return None + Or any relation if not specified. + A given relation is ready if SOME remote side has sent valid data. 
+ """ + if relation is None: + return any(map(self.is_ready, self.relations)) - @property - def units(self): - """The remote units.""" - return sorted(self._relation.units, key=lambda unit: unit.name) + if not super().is_ready(relation): + return False - @property - def port(self): - """The backend port.""" - return self._get_data_from_first_unit("port") + try: + requirer_unit_data = self._requirer_unit_data(relation) + except Exception: + log.exception("Cannot fetch ingress data for the '{}' relation".format(relation)) + return False - def get_host(self, unit: Unit): - """The hostname (DNS address, ip) of the given unit.""" - return self._get_unit_data(unit, "host") + return any(requirer_unit_data.values()) - def get_unit_name(self, unit: Unit): - """The name of the remote unit. + def is_failed(self, relation: Optional[Relation] = None) -> bool: + """Checks whether the given relation is failed. - Note: This is not the same as `self.unit.name` when using CMR relations, - since `self.unit.name` is replaced by a `remote-{UUID}` pattern. + Or any relation if not specified. """ - return self._get_unit_data(unit, "name") + if relation is None: + return any(map(self.is_failed, self.relations)) - def _get_data_from_first_unit(self, key: str): - if self.units: - first_unit_data = self._data[self.units[0]] + if not relation.app.name: # type: ignore + # Juju doesn't provide JUJU_REMOTE_APP during relation-broken + # hooks. 
See https://github.com/canonical/operator/issues/693 + return False - if key in first_unit_data: - return first_unit_data[key] + if not relation.units: + # Relations without requiring units cannot be in failed state + return False - return None + try: + # grab the data and validate it; might raise + requirer_unit_data = self._requirer_unit_data(relation) + except DataValidationError as e: + log.warning("Failed to validate relation data for {} relation: {}".format(relation, e)) + return True - def _get_unit_data(self, unit: Unit, key: str): - if self.units: - if unit in self.units: - unit_data = self._data[unit] + # verify that all remote units (requirer's side) publish the same model. + # We do not validate the port because, in case of changes to the configuration + # of the charm or a new version of the charmed workload, e.g. over an upgrade, + # the remote port may be different among units. + expected_model = None # It may be none for units that have not yet written data - if key in unit_data: - return unit_data[key] + for remote_unit, remote_unit_data in requirer_unit_data.items(): + if "model" in remote_unit_data: + remote_model = remote_unit_data["model"] + if not expected_model: + expected_model = remote_model + elif expected_model != remote_model: + raise RelationDataMismatchError(relation, remote_unit) - return None + return False + + def is_unit_ready(self, relation: Relation, unit: Unit) -> bool: + """Report whether the given unit has shared data in its unit data bag.""" + # sanity check: this should not occur in production, but it may happen + # during testing: cfr https://github.com/canonical/traefik-k8s-operator/issues/39 + assert ( + unit in relation.units + ), "attempting to get ready state for unit that does not belong to relation" + if relation.data.get(unit, {}).get("data"): + # TODO consider doing schema-based validation here + return True + return False + + def get_data(self, relation: Relation, unit: Unit) -> "RequirerData": + """Fetch the data 
shared by the specified unit on the relation (Requirer side).""" + data = yaml.safe_load(relation.data[unit]["data"]) + _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA) + return data - def respond(self, unit: Unit, url: str): - """Send URL back for the given unit. + def publish_url(self, relation: Relation, unit_name: str, url: str): + """Place the ingress url in the application data bag for the units on the requires side. - Note: only the leader can send URLs. + Assumes that this unit is leader. """ - # Can't use `unit.name` because with CMR it's a UUID. - remote_unit_name = self.get_unit_name(unit) - ingress = self._data[self._provider.charm.app].setdefault("ingress", {}) - ingress.setdefault(remote_unit_name, {})["url"] = url - self._provider.wrap(self._relation, self._data) + raw_data = relation.data[self.app].get("data", None) + data = yaml.safe_load(raw_data) if raw_data else {"ingress": {}} + + # we ensure that the application databag has the shape we think it + # should have; to catch any inconsistencies early on. + try: + _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA) + except DataValidationError as e: + log.error( + "unable to publish url to {}: corrupted application databag ({})".format( + unit_name, e + ) + ) + return + + # we update the data with a new url + data["ingress"][unit_name] = {"url": url} + + # we validate the data **again**, to ensure that we respected the schema + # and did not accidentally corrupt our own databag. + _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA) + + try: + relation.data[self.app]["data"] = yaml.safe_dump(data) + except ops.model.RelationDataError: + unit = self.unit + raise RelationPermissionError( + relation, + unit, + "failed to write application data: leader={}".format(unit.is_leader()), + ) + + def wipe_ingress_data(self, relation): + """Remove all published ingress data. + + Assumes that this unit is leader. 
+ """ + relation.data[self.app]["data"] = "" + + def _requirer_unit_data(self, relation: Relation) -> RequirerUnitData: + """Fetch and validate the requirer's unit databag.""" + if not relation.app or not relation.app.name: + # Handle edge case where remote app name can be missing, e.g., + # relation_broken events. + # FIXME https://github.com/canonical/traefik-k8s-operator/issues/34 + return {} + remote_units = [unit for unit in relation.units if unit.app is not self.app] + + requirer_unit_data = {} + for remote_unit in remote_units: + remote_data = relation.data[remote_unit].get("data") + remote_deserialized = {} + if remote_data: + remote_deserialized = yaml.safe_load(remote_data) + _validate_data(remote_deserialized, INGRESS_REQUIRES_UNIT_SCHEMA) + requirer_unit_data[remote_unit] = remote_deserialized + return requirer_unit_data + + def _provider_app_data(self, relation: Relation) -> ProviderApplicationData: + """Fetch and validate the provider's app databag.""" + if not relation.app or not relation.app.name: + # Handle edge case where remote app name can be missing, e.g., + # relation_broken events. + # FIXME https://github.com/canonical/traefik-k8s-operator/issues/34 + return {} -class RelationDataMismatchError(RelationDataError): - """Data from different units do not match where they should.""" + provider_app_data = {} + # we start by looking at the provider's app databag + if self.unit.is_leader(): + # only leaders can read their app's data + data = relation.data[self.app].get("data") + deserialized = {} + if data: + deserialized = yaml.safe_load(data) + _validate_data(deserialized, INGRESS_PROVIDES_APP_SCHEMA) + provider_app_data = deserialized.get("ingress", {}) + + return provider_app_data + + @property + def proxied_endpoints(self) -> dict: + """The ingress settings provided to units by this provider. 
+ + For example, when this IngressPerUnitProvider has provided the + `http://foo.bar/my-model.my-app-1` and + `http://foo.bar/my-model.my-app-2` URLs to the two units of the + my-app application, the returned dictionary will be: + + ``` + { + "my-app/1": { + "url": "http://foo.bar/my-model.my-app-1" + }, + "my-app/2": { + "url": "http://foo.bar/my-model.my-app-2" + } + } + ``` + """ + results = {} + + for ingress_relation in self.relations: + provider_app_data = self._provider_app_data(ingress_relation) + results.update(provider_app_data) + + return results class IngressPerUnitConfigurationChangeEvent(RelationEvent): """Event representing a change in the data sent by the ingress.""" -class IngressPerUnitRequirerEvents(EndpointWrapperEvents): +class IngressPerUnitRequirerEvents(IngressPerUnitEvents): """Container for IUP events.""" ingress_changed = EventSource(IngressPerUnitConfigurationChangeEvent) -class IngressPerUnitRequirer(EndpointWrapper): +class IngressPerUnitRequirer(_IngressPerUnitBase): """Implementation of the requirer of ingress_per_unit.""" on = IngressPerUnitRequirerEvents() - ROLE = RelationRole.requires.name - INTERFACE = "ingress_per_unit" - SCHEMA = INGRESS_SCHEMA - LIMIT = 1 - def __init__( self, charm: CharmBase, - endpoint: str = None, + relation_name: str = DEFAULT_RELATION_NAME, *, host: str = None, port: int = None, @@ -340,29 +587,103 @@ def __init__( Args: charm: the charm that is instantiating the library. 
- endpoint: the name of the relation endpoint to bind to - (defaults to "ingress-per-unit"; relation must be of interface type - "ingress_per_unit" and have "limit: 1") - host: Hostname to be used by the ingress provider to address the requirer - unit; if unspecified, the pod ip of the unit will be used instead + relation_name: the name of the relation name to bind to + (defaults to "ingress-per-unit"; relation must be of interface + type "ingress_per_unit" and have "limit: 1") + host: Hostname to be used by the ingress provider to address the + requirer unit; if unspecified, the pod ip of the unit will be used + instead Request Args: port: the port of the service """ - super().__init__(charm, endpoint) + super().__init__(charm, relation_name) + + # if instantiated with a port, and we are related, then + # we immediately publish our ingress data to speed up the process. if port: - self.auto_data = self._complete_request(host or "", port) + self._auto_data = host, port + else: + self._auto_data = None # Workaround for SDI not marking the EndpointWrapper as not # ready upon a relation broken event self.is_relation_broken = False self.framework.observe( - self.charm.on[self.endpoint].relation_changed, self._emit_ingress_change_event + self.charm.on[self.relation_name].relation_changed, self._emit_ingress_change_event ) self.framework.observe( - self.charm.on[self.endpoint].relation_broken, self._emit_ingress_change_event + self.charm.on[self.relation_name].relation_broken, self._emit_ingress_change_event ) + def _handle_relation(self, event): + super()._handle_relation(event) + self._publish_auto_data(event.relation) + + def _handle_upgrade_or_leader(self, event): + for relation in self.relations: + self._publish_auto_data(relation) + + def _publish_auto_data(self, relation: Relation): + if self._auto_data and self.is_available(relation): + host, port = self._auto_data + self.provide_ingress_requirements(host=host, port=port) + + @property + def relation(self) -> 
Optional[Relation]: + """The established Relation instance, or None if still unrelated.""" + if len(self.relations) > 1: + raise ValueError("Multiple ingress-per-unit relations found.") + return self.relations[0] if self.relations else None + + def is_ready(self, relation: Optional[Relation] = None) -> bool: + """Checks whether the given relation is ready. + + Or any relation if not specified. + A given relation is ready if the remote side has sent valid data. + """ + if super().is_ready(relation) is False: + return False + + return bool(self.url) + + def is_failed(self, relation: Optional[Relation] = None) -> bool: + """Checks whether the given relation is failed. + + Or any relation if not specified. + """ + if not self.relations: # can't fail if you can't try + return False + + if relation is None: + return any(map(self.is_failed, self.relations)) + + if not relation.app.name: # type: ignore + # Juju doesn't provide JUJU_REMOTE_APP during relation-broken + # hooks. See https://github.com/canonical/operator/issues/693 + return False + + if not relation.units: + return False + + try: + # grab the data and validate it; might raise + raw = relation.data[self.unit].get("data") + except Exception: + log.exception("Error accessing relation databag") + return True + + if raw: + # validate data + data = yaml.safe_load(raw) + try: + _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA) + except DataValidationError: + log.exception("Error validating relation data") + return True + + return False + def _emit_ingress_change_event(self, event): if isinstance(event, RelationBrokenEvent): self.is_relation_broken = True @@ -370,49 +691,57 @@ def _emit_ingress_change_event(self, event): # TODO Avoid spurious events, emit only when URL changes self.on.ingress_changed.emit(self.relation) - def _complete_request(self, host: Optional[str], port: int): - if not host: - binding = self.charm.model.get_binding(self.endpoint) - host = str(binding.network.bind_address) - - return { - 
self.charm.unit: { - "model": self.model.name, - "name": self.charm.unit.name, - "host": host, - "port": port, - }, - } - - def request(self, *, host: str = None, port: int): - """Request ingress to this unit. + def provide_ingress_requirements(self, *, host: str = None, port: int): + """Publishes the data that Traefik needs to provide ingress. Args: - host: Hostname to be used by the ingress provider to address the requirer - unit; if unspecified, the pod ip of the unit will be used instead + host: Hostname to be used by the ingress provider to address the + requirer unit; if unspecified, the pod ip of the unit will be used + instead port: the port of the service (required) """ - self.wrap(self.relation, self._complete_request(host, port)) + if not host: + host = socket.getfqdn() - @property - def relation(self): - """The established Relation instance, or None.""" - return self.relations[0] if self.relations else None + data = { + "model": self.model.name, + "name": self.unit.name, + "host": host, + "port": port, + } + _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA) + + if not self.relation: + raise RuntimeError("Can't publish ingress data: no relation found.") + self.relation.data[self.unit]["data"] = yaml.safe_dump(data) @property - def urls(self): + def urls(self) -> dict: """The full ingress URLs to reach every unit. May return an empty dict if the URLs aren't available yet. 
""" - if self.is_relation_broken or not self.is_ready(): + relation = self.relation + if not relation or self.is_relation_broken: return {} - data = self.unwrap(self.relation) - ingress = data[self.relation.app].get("ingress", {}) + + raw = None + if relation.app.name: # type: ignore + # FIXME Workaround for https://github.com/canonical/operator/issues/693 + # We must be in a relation_broken hook + raw = relation.data.get(relation.app, {}).get("data") + + if not raw: + return {} + + data = yaml.safe_load(raw) + _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA) + + ingress = data.get("ingress", {}) return {unit_name: unit_data["url"] for unit_name, unit_data in ingress.items()} @property - def url(self): + def url(self) -> Optional[str]: """The full ingress URL to reach the current unit. May return None if the URL isn't available yet. diff --git a/metadata.yaml b/metadata.yaml index a0547038..97c85608 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -22,11 +22,17 @@ containers: mounts: - storage: database location: /var/lib/prometheus + provides: + self-metrics-endpoint: + interface: prometheus_scrape grafana-source: interface: grafana_datasource + grafana-dashboard: + interface: grafana_dashboard receive-remote-write: interface: prometheus_remote_write + requires: metrics-endpoint: interface: prometheus_scrape @@ -35,12 +41,15 @@ requires: ingress: interface: ingress_per_unit limit: 1 + peers: prometheus-peers: interface: prometheus_peers + storage: database: type: filesystem + resources: prometheus-image: type: oci-image diff --git a/pyproject.toml b/pyproject.toml index d1c8e29d..65f207fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ module = ["ops.*", "pytest.*", "pytest_operator.*", "prometheus_api_client.*", " ignore_missing_imports = true [[tool.mypy.overrides]] -module = ["charms.grafana_k8s.*", "charms.alertmanager_k8s.*"] +module = ["charms.grafana_k8s.*", "charms.alertmanager_k8s.*", "charms.traefik_k8s.*"] follow_imports = "silent" 
[tool.pytest.ini_options] diff --git a/src/charm.py b/src/charm.py index 4d7d3435..6374e382 100755 --- a/src/charm.py +++ b/src/charm.py @@ -14,7 +14,9 @@ import bitmath import yaml from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerConsumer +from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider +from charms.observability_libs.v0.juju_topology import JujuTopology from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch from charms.prometheus_k8s.v0.prometheus_remote_write import ( DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME, @@ -22,7 +24,10 @@ from charms.prometheus_k8s.v0.prometheus_remote_write import ( PrometheusRemoteWriteProvider, ) -from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer +from charms.prometheus_k8s.v0.prometheus_scrape import ( + MetricsEndpointConsumer, + MetricsEndpointProvider, +) from charms.traefik_k8s.v0.ingress_per_unit import IngressPerUnitRequirer from lightkube import Client from lightkube.core.exceptions import ApiError as LightkubeApiError @@ -52,14 +57,23 @@ def __init__(self, *args): self._port = 9090 self.service_patch = KubernetesServicePatch(self, [(f"{self.app.name}", self._port)]) + self._topology = JujuTopology.from_charm(self) # Relation handler objects + # Self-monitoring + self._scraping = MetricsEndpointProvider( + self, + relation_name="self-metrics-endpoint", + jobs=[{"static_configs": [{"targets": [f"*:{self._port}"]}]}], + ) + self.grafana_dashboard_provider = GrafanaDashboardProvider(charm=self) + # Gathers scrape job information from metrics endpoints self.metrics_consumer = MetricsEndpointConsumer(self) # Manages ingress for this charm - self.ingress = IngressPerUnitRequirer(self, endpoint="ingress", port=self._port) + self.ingress = IngressPerUnitRequirer(self, relation_name="ingress", port=self._port) external_url = 
urlparse(self._external_url) @@ -74,7 +88,7 @@ def __init__(self, *args): ) # Allows Grafana to aggregate metrics - self.grafana_source_consumer = GrafanaSourceProvider( + self.grafana_source_provider = GrafanaSourceProvider( charm=self, source_type="prometheus", source_url=self._external_url, @@ -163,7 +177,7 @@ def _configure(self, _): # Make sure that if the remote_write endpoint changes, it is reflected in relation data. self.remote_write_provider.update_endpoint() - self.grafana_source_consumer.update_source(self._external_url) + self.grafana_source_provider.update_source(self._external_url) self.unit.status = ActiveStatus() @@ -452,7 +466,32 @@ def _prometheus_config(self) -> str: "metrics_path": "/metrics", "honor_timestamps": True, "scheme": "http", - "static_configs": [{"targets": [f"localhost:{self._port}"]}], + "static_configs": [ + { + "targets": [f"localhost:{self._port}"], + "labels": { + "juju_model": self._topology.model, + "juju_model_uuid": self._topology.model_uuid, + "juju_application": self._topology.application, + "juju_unit": self._topology.charm_name, + "host": "localhost", + }, + } + ], + # Replace the value of the "instance" label with a juju topology identifier + "relabel_configs": [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ], } prometheus_config["scrape_configs"].append(default_config) # type: ignore scrape_jobs = self.metrics_consumer.jobs() diff --git a/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl b/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl new file mode 100644 index 00000000..a9db8a51 --- /dev/null +++ b/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl @@ -0,0 +1,2972 @@ +{ + "annotations": { + "list": [ + ] + }, + "description": "Dashboard for the Prometheus Operator, powered by Juju", + "editable": true, + "gnetId": 3662, + "graphTooltip": 0, + "hideControls": false, + "id": 
null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": 250, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${prometheusds}", + "decimals": 3, + "description": "Percentage of uptime during the most recent $interval period. Change the period with the 'interval' dropdown above.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "%", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(avg_over_time(up{instance=~\"$instance\",job=~\"$job\"}[$interval]) * 100)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 40 + } + ], + "thresholds": "90, 99", + "title": "Uptime [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [], + "datasource": "${prometheusds}", + "description": "Servers which are DOWN RIGHT NOW! 
\nFIX THEM!!", + "fontSize": "100%", + "hideTimeOverride": true, + "id": 25, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "span": 3, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/__name__|job|Value/", + "thresholds": [], + "type": "hidden", + "unit": "short" + }, + { + "alias": " ", + "colorMode": "cell", + "colors": [ + "rgba(255, 0, 0, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(255, 0, 0, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "pattern": "instance", + "thresholds": [ + "", + "", + "" + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "up{instance=~\"$instance\",job=~\"$job\"} < 1", + "format": "table", + "intervalFactor": 2, + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1s", + "title": "Currently Down", + "transform": "table", + "type": "table" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": "Total number of time series in prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 12, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + 
"rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "B", + "step": 40 + } + ], + "thresholds": "1000000,2000000", + "title": "Total Series", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${prometheusds}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "B", + "step": 40 + } + ], + "thresholds": "", + "title": "Memory Chunks", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + 
"valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "at a glance", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 236, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": "The total number of rule group evaluations missed due to slow rule group evaluation.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 16, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(sum_over_time(prometheus_evaluator_iterations_missed_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,10", + "title": "Missed Iterations [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": 
"The total number of rule group evaluations skipped due to throttled metric storage.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(sum_over_time(prometheus_evaluator_iterations_skipped_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,10", + "title": "Skipped Iterations [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": "Total number of scrapes that hit the sample limit and were rejected.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 19, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + 
"nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,10", + "title": "Tardy Scrapes [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": "Number of times the database failed to reload block data from disk.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 13, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(sum_over_time(prometheus_tsdb_reloads_failures_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,10", + "title": "Reload Failures [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${prometheusds}", + "description": "Sum of all skipped scrapes", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 20, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) ", + "format": 
"time_series", + "intervalFactor": 2, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,10", + "title": "Skipped Scrapes [$interval]", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "quick numbers", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "All non-zero failures and errors", + "fill": 1, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(net_conntrack_dialer_conn_failed_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Failed Connections", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Missed Iterations", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Skipped Iterations", + "refId": "C", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Evaluation", + 
"refId": "D", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_azure_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Azure Refresh", + "refId": "E", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Consul RPC", + "refId": "F", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_dns_lookup_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "DNS Lookup", + "refId": "G", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_ec2_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "EC2 Refresh", + "refId": "H", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_gce_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "GCE Refresh", + "refId": "I", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Marathon Refresh", + "refId": "J", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Openstack Refresh", + "refId": "K", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_triton_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Triton Refresh", + "refId": "L", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "Sample Limit", + "refId": "M", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Duplicate Timestamp", + "refId": "N", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Timestamp Out of Bounds", + "refId": "O", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Sample Out of Order", + "refId": "P", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_treecache_zookeeper_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Zookeeper", + "refId": "Q", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "TSDB Compactions", + "refId": "R", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Series Not Found", + "refId": "S", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reload", + "refId": "T", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Failures and Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Errors", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "errors", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "up{instance=~\"$instance\",job=~\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upness (stacked)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "none", + "label": "Up", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + 
"total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Storage Memory Chunks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Chunks", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "up", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + 
"step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Series Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Series", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "removed", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum( increase(prometheus_tsdb_head_series_created_total{instance=~\"$instance\"}[5m]) )", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "created", + "refId": "A", + "step": 4 + }, + { + "expr": "sum( increase(prometheus_tsdb_head_series_removed_total{instance=~\"$instance\"}[5m]) )", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "removed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Series Created / Removed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Series Count", + "logBase": 1, + "max": 
null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "series", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": { + "10.58.3.10:80": "#BA43A9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Rate of total number of appended samples", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Appended Samples per Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Samples / Second", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "appended samples", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Total number of syncs that were executed on a scrape pool.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_target_scrape_pool_sync_total{job=~\"$job\",instance=~\"$instance\"}) by (scrape_job)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{scrape_job}}", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Sync Total", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Syncs", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Actual interval to sync the scrape pool.", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, 
+ "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[2m])) by (scrape_job) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{scrape_job}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Target Sync", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Milliseconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "sync", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "scrape_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Total number of rejected scrapes", + "fill": 1, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "exceeded sample limit", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "duplicate timestamp", + "refId": "B", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "out of bounds", + "refId": "C", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}) ", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "out of order", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": 
"Rejected Scrapes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "Scrapes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "scrapes", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "The duration of rule group evaluations", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1000 * rate(prometheus_evaluator_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[5m]) / rate(prometheus_evaluator_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "E", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Average Rule Evaluation Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Milliseconds", 
+ "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_request_duration_microseconds_count{job=~\"$job\",instance=~\"$instance\"}[1m])) by (handler) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{handler}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Microseconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_engine_query_duration_seconds_sum{job=~\"$job\",instance=~\"$instance\"}) by (slice)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{slice}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Prometheus Engine Query Duration Seconds", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Rule-group evaluations \n - total\n - missed due to slow rule group evaluation\n - skipped due to throttled metric storage", + "fill": 1, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_evaluator_iterations_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "B", + "step": 4 + }, + { + "expr": "sum(rate(prometheus_evaluator_iterations_missed_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Missed", + 
"refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(prometheus_evaluator_iterations_skipped_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Skipped", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Rule Evaluator Iterations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "iterations", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "durations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(prometheus_notifications_sent_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Notifications Sent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + 
"name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Notifications", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "notifications", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(time() - prometheus_config_last_reload_success_timestamp_seconds{job=~\"$job\",instance=~\"$instance\"}) / 60", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Minutes Since Successful Config Reload", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Minutes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 
24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_config_last_reload_successful{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Successful Config Reload", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "Success", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "config", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "GC invocation durations", + "fill": 1, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"expr": "sum(rate(go_gc_duration_seconds_sum{instance=~\"$instance\",job=~\"$job\"}[2m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GC Rate / 2m", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "garbage collection", + "titleSize": "h6" + }, + { + "collapse": true, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "This is probably wrong! 
Please help.", + "fill": 1, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "allocated", + "stack": false + } + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(go_memstats_alloc_bytes_total{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "alloc_bytes_total", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(go_memstats_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "allocated", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(go_memstats_buck_hash_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "profiling bucket hash table", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(go_memstats_gc_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "GC metadata", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap in-use", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_idle_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap idle", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap in use", + "refId": "G", + "step": 
10 + }, + { + "expr": "sum(go_memstats_heap_released_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap released", + "refId": "H", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap system", + "refId": "I", + "step": 10 + }, + { + "expr": "sum(go_memstats_mcache_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mcache in use", + "refId": "J", + "step": 10 + }, + { + "expr": "sum(go_memstats_mcache_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mcache sys", + "refId": "K", + "step": 10 + }, + { + "expr": "sum(go_memstats_mspan_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mspan in use", + "refId": "L", + "step": 10 + }, + { + "expr": "sum(go_memstats_mspan_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mspan sys", + "refId": "M", + "step": 10 + }, + { + "expr": "sum(go_memstats_next_gc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap next gc", + "refId": "N", + "step": 10 + }, + { + "expr": "sum(go_memstats_other_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "other sys", + "refId": "O", + "step": 10 + }, + { + "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "stack in use", + "refId": "P", + "step": 10 + }, + { + "expr": "sum(go_memstats_stack_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", 
+ "intervalFactor": 2, + "legendFormat": "stack sys", + "refId": "Q", + "step": 10 + }, + { + "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "sys", + "refId": "R", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Go Memory Usage (FIXME)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_target_interval_length_seconds{instance=~\"$instance\", job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{interval}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + 
"min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m])) by (interval)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{interval}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Target Scrapes / 5m", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Scrapes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Broken, ignore", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [], + "query": "query_result(prometheus_tsdb_head_samples_appended_total)", + "refresh": 
2, + "regex": "/.*job=\"([^\"]+)/", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "query_result(up{job=~\"$job\"})", + "refresh": 2, + "regex": "/.*instance=\"([^\"]+).*/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "1h", + "value": "1h" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "3h", + "value": "3h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "2d", + "value": "2d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + }, + { + "selected": false, + "text": "90d", + "value": "90d" + }, + { + "selected": false, + "text": "180d", + "value": "180d" + } + ], + "query": "1h, 3h, 6h, 12h, 1d, 2d, 7d, 30d, 90d, 180d", + "type": "custom" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Prometheus Operator Overview", + "version": 21 +} + diff --git a/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule 
b/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule new file mode 100644 index 00000000..64a3ad75 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusConfigurationReloadFailure +expr: prometheus_config_last_reload_successful{} != 1 +for: 0m +labels: + severity: warning +annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: | + Prometheus configuration reload error + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_exporters_slowly.rule b/src/prometheus_alert_rules/prometheus_exporters_slowly.rule new file mode 100644 index 00000000..e3c182c0 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_exporters_slowly.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTargetScrapingSlow +expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 +for: 5m +labels: + severity: warning +annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: | + Prometheus is scraping exporters slowly + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_large_scrape.rule b/src/prometheus_alert_rules/prometheus_large_scrape.rule new file mode 100644 index 00000000..c4a7e005 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_large_scrape.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusLargeScrape +expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{}[10m]) > 10 +for: 5m +labels: + severity: warning +annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: | + 
Prometheus has many scrapes that exceed the sample limit + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_missing.rule b/src/prometheus_alert_rules/prometheus_missing.rule new file mode 100644 index 00000000..10c9789f --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_missing.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusJobMissing +expr: absent(up{}) +for: 0m +labels: + severity: warning +annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: | + A Prometheus job has disappeared + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_notifications_backlog.rule b/src/prometheus_alert_rules/prometheus_notifications_backlog.rule new file mode 100644 index 00000000..76b0adb7 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_notifications_backlog.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusNotificationsBacklog +expr: min_over_time(prometheus_notifications_queue_length{}[10m]) > 0 +for: 0m +labels: + severity: warning +annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: | + The Prometheus notification queue has not been empty for 10 minutes + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule b/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule new file mode 100644 index 00000000..88137fc1 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule @@ -0,0 +1,13 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusRuleEvaluationSlow +expr: prometheus_rule_group_last_duration_seconds{} > prometheus_rule_group_interval_seconds{} 
+for: 5m +labels: + severity: warning +annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: | + Prometheus rule evaluation took more time than the scheduled interval. + It indicates a slower storage backend access or too complex query. + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule b/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule new file mode 100644 index 00000000..898623ec --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTargetScrapeDuplicate +expr: | + increase( + prometheus_target_scrapes_sample_duplicate_timestamp_total{}[5m] + ) > 0 +for: 0m +labels: + severity: warning +annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: | + Prometheus has many samples rejected due to duplicate timestamps but differing values + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule new file mode 100644 index 00000000..2f8ded7a --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbCheckpointCreationFailures +expr: | + increase( + prometheus_tsdb_checkpoint_creations_failed_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} checkpoint creation failures + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git 
a/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule new file mode 100644 index 00000000..18cc337a --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbCheckpointDeletionFailures +expr: | + increase( + prometheus_tsdb_checkpoint_deletions_failed_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} checkpoint deletion failures. + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule new file mode 100644 index 00000000..693e2288 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbCompactionsFailed +expr: | + increase( + prometheus_tsdb_compactions_failed_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} TSDB compactions failures. 
+ VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule new file mode 100644 index 00000000..33c35072 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbHeadTruncationsFailed +expr: | + increase( + prometheus_tsdb_head_truncations_failed_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} TSDB head truncation failures. + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule new file mode 100644 index 00000000..66bcc745 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbReloadFailures +expr: | + increase( + prometheus_tsdb_reloads_failures_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} TSDB reload failures. 
+ VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule b/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule new file mode 100644 index 00000000..f86ac2c2 --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbWalCorruptions +expr: | + increase( + prometheus_tsdb_wal_corruptions_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} TSDB WAL corruptions. + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule new file mode 100644 index 00000000..91a2e59b --- /dev/null +++ b/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule @@ -0,0 +1,15 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: PrometheusTsdbWalTruncationsFailed +expr: | + increase( + prometheus_tsdb_wal_truncations_failed_total{}[1m] + ) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: | + Prometheus encountered {{ $value }} TSDB WAL truncation failures. 
+ VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/tests/integration/test_check_config.py b/tests/integration/test_check_config.py index f0b8d7aa..9017b652 100644 --- a/tests/integration/test_check_config.py +++ b/tests/integration/test_check_config.py @@ -87,7 +87,9 @@ async def test_bad_config_sets_action_results(ops_test, prometheus_charm, promet await asyncio.gather( ops_test.model.add_relation(bad_scrape_tester, scrape_shim), - ops_test.model.add_relation(prometheus_app_name, scrape_shim), + ops_test.model.add_relation( + f"{prometheus_app_name}:metrics-endpoint", f"{scrape_shim}:metrics-endpoint" + ), ) await ops_test.model.wait_for_idle(apps=[prometheus_app_name, scrape_shim, bad_scrape_tester]) diff --git a/tests/integration/test_prometheus_scrape_multiunit.py b/tests/integration/test_prometheus_scrape_multiunit.py index 87fd92f8..b0f6c6e7 100644 --- a/tests/integration/test_prometheus_scrape_multiunit.py +++ b/tests/integration/test_prometheus_scrape_multiunit.py @@ -101,12 +101,17 @@ async def test_prometheus_scrape_relation_with_prometheus_tester( assert len(targets) == 1 self_scrape = next(iter(targets)) assert self_scrape["labels"]["job"] == "prometheus" - assert self_scrape["labels"]["instance"] == "localhost:9090" + assert self_scrape["labels"]["host"] == "localhost" # WHEN prometheus is related to the testers await asyncio.gather( - ops_test.model.add_relation(prometheus_app_name, scrape_tester), - ops_test.model.add_relation(prometheus_app_name, remote_write_tester), + ops_test.model.add_relation( + f"{prometheus_app_name}:metrics-endpoint", f"{scrape_tester}:metrics-endpoint" + ), + ops_test.model.add_relation( + f"{prometheus_app_name}:receive-remote-write", + f"{remote_write_tester}:send-remote-write", + ), ) await ops_test.model.wait_for_idle(apps=app_names, status="active") @@ -175,7 +180,7 @@ async def test_prometheus_scrape_relation_with_prometheus_tester( async def test_upgrade_prometheus(ops_test: OpsTest, prometheus_charm): 
"""Upgrade prometheus and confirm all is still green (see also test_upgrade_charm.py).""" # GIVEN an existing "up" timeseries - query = 'count_over_time(up{instance="localhost:9090",job="prometheus"}[1y])' + query = 'count_over_time(up{host="localhost",job="prometheus"}[1y])' up_before = await asyncio.gather( *[run_promql(ops_test, query, prometheus_app_name, u) for u in range(num_units)] ) diff --git a/tests/integration/test_remote_write_grafana_agent.py b/tests/integration/test_remote_write_grafana_agent.py index d2ab3a83..1dda16e8 100644 --- a/tests/integration/test_remote_write_grafana_agent.py +++ b/tests/integration/test_remote_write_grafana_agent.py @@ -36,7 +36,9 @@ async def test_remote_write_with_grafana_agent(ops_test, prometheus_charm): await ops_test.model.wait_for_idle(apps=apps, status="active", wait_for_units=1) assert await check_prometheus_is_ready(ops_test, prometheus_name, 0) - await ops_test.model.add_relation(prometheus_name, agent_name) + await ops_test.model.add_relation( + f"{prometheus_name}:receive-remote-write", f"{agent_name}:send-remote-write" + ) # A considerable idle_period is needed to guarantee metrics show up in prometheus # (60 sec was not enough). 
diff --git a/tests/integration/test_remote_write_with_zinc.py b/tests/integration/test_remote_write_with_zinc.py index e772da48..7b53fce5 100644 --- a/tests/integration/test_remote_write_with_zinc.py +++ b/tests/integration/test_remote_write_with_zinc.py @@ -46,7 +46,9 @@ async def test_remote_write_with_zinc(ops_test, prometheus_charm): assert await check_prometheus_is_ready(ops_test, prometheus_name, 0) await asyncio.gather( - ops_test.model.add_relation(prometheus_name, agent_name), + ops_test.model.add_relation( + f"{prometheus_name}:receive-remote-write", f"{agent_name}:send-remote-write" + ), ops_test.model.add_relation( f"{agent_name}:metrics-endpoint", f"{zinc_name}:metrics-endpoint" ), diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 748637bc..e078e56b 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -4,10 +4,10 @@ import json import socket import unittest +import uuid from unittest.mock import patch import yaml -from helpers import patch_network_get from ops.testing import Harness from charm import PROMETHEUS_CONFIG, PrometheusCharm @@ -16,16 +16,14 @@ DEFAULT_JOBS = [{"metrics_path": "/metrics"}] SCRAPE_METADATA = { "model": "provider-model", - "model_uuid": "abcdef", + "model_uuid": str(uuid.uuid4()), "application": "provider", "charm_name": "provider-charm", } -@patch("charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True) class TestCharm(unittest.TestCase): @patch("charm.KubernetesServicePatch", lambda x, y: None) - @patch_network_get() def setUp(self, *unused): self.harness = Harness(PrometheusCharm) self.addCleanup(self.harness.cleanup) @@ -33,7 +31,7 @@ def setUp(self, *unused): patcher = patch.object(PrometheusCharm, "_get_pvc_capacity") self.mock_capacity = patcher.start() self.addCleanup(patcher.stop) - + self.harness.set_model_name("prometheus_model") self.mock_capacity.return_value = "1Gi" self.harness.begin_with_initial_hooks() @@ -253,7 +251,6 @@ def 
setUp(self): self.addCleanup(patcher.stop) @patch("charm.KubernetesServicePatch", lambda x, y: None) - @patch_network_get() def test_default_maximum_retention_size_is_80_percent(self): """This test is here to guarantee backwards compatibility. @@ -273,7 +270,6 @@ def test_default_maximum_retention_size_is_80_percent(self): self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB") @patch("charm.KubernetesServicePatch", lambda x, y: None) - @patch_network_get() def test_multiplication_factor_applied_to_pvc_capacity(self): """The `--storage.tsdb.retention.size` arg must be multiplied by maximum_retention_size.""" # GIVEN a capacity limit in binary notation (k8s notation)