diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py index ef4ec58..e68e93f 100644 --- a/lib/charms/observability_libs/v0/juju_topology.py +++ b/lib/charms/observability_libs/v0/juju_topology.py @@ -75,7 +75,7 @@ LIBID = "bced1658f20f49d28b88f61f83c2d232" LIBAPI = 0 -LIBPATCH = 3 +LIBPATCH = 4 class InvalidUUIDError(Exception): @@ -94,8 +94,8 @@ def __init__( model: str, model_uuid: str, application: str, - unit: str = None, - charm_name: str = None, + unit: Optional[str] = None, + charm_name: Optional[str] = None, ): """Build a JujuTopology object. @@ -181,7 +181,10 @@ def from_dict(cls, data: dict): ) def as_dict( - self, *, remapped_keys: Dict[str, str] = None, excluded_keys: List[str] = None + self, + *, + remapped_keys: Optional[Dict[str, str]] = None, + excluded_keys: Optional[List[str]] = None, ) -> OrderedDict: """Format the topology information into an ordered dict. diff --git a/lib/charms/observability_libs/v0/kubernetes_service_patch.py b/lib/charms/observability_libs/v1/kubernetes_service_patch.py similarity index 65% rename from lib/charms/observability_libs/v0/kubernetes_service_patch.py rename to lib/charms/observability_libs/v1/kubernetes_service_patch.py index a3fb910..b458795 100644 --- a/lib/charms/observability_libs/v0/kubernetes_service_patch.py +++ b/lib/charms/observability_libs/v1/kubernetes_service_patch.py @@ -9,21 +9,20 @@ default contains a "placeholder" port, which is 65536/TCP. When modifying the default set of resources managed by Juju, one must consider the lifecycle of the -charm. In this case, any modifications to the default service (created during deployment), will -be overwritten during a charm upgrade. +charm. In this case, any modifications to the default service (created during deployment), will be +overwritten during a charm upgrade. 
When initialised, this library binds a handler to the parent charm's `install` and `upgrade_charm` events which applies the patch to the cluster. This should ensure that the service ports are correct throughout the charm's life. -The constructor simply takes a reference to the parent charm, and a list of tuples that each define -a port for the service, where each tuple contains: +The constructor simply takes a reference to the parent charm, and a list of +[`lightkube`](https://github.com/gtsystem/lightkube) ServicePorts that each define a port for the +service. For information regarding the `lightkube` `ServicePort` model, please visit the +`lightkube` [docs](https://gtsystem.github.io/lightkube-models/1.23/models/core_v1/#serviceport). -- a name for the port -- port for the service to listen on -- optionally: a targetPort for the service (the port in the container!) -- optionally: a nodePort for the service (for NodePort or LoadBalancer services only!) -- optionally: a name of the service (in case service name needs to be patched as well) +Optionally, a name of the service (in case service name needs to be patched as well), labels, +selectors, and annotations can be provided as keyword arguments. ## Getting Started @@ -32,8 +31,8 @@ ```shell cd some-charm -charmcraft fetch-lib charms.observability_libs.v0.kubernetes_service_patch -echo <<-EOF >> requirements.txt +charmcraft fetch-lib charms.observability_libs.v1.kubernetes_service_patch +cat << EOF >> requirements.txt lightkube lightkube-models EOF @@ -41,28 +40,71 @@ Then, to initialise the library: -For ClusterIP services: +For `ClusterIP` services: + ```python # ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch +from lightkube.models.core_v1 import ServicePort class SomeCharm(CharmBase): def __init__(self, *args): # ... 
- self.service_patcher = KubernetesServicePatch(self, [(f"{self.app.name}", 8080)]) + port = ServicePort(443, name=f"{self.app.name}") + self.service_patcher = KubernetesServicePatch(self, [port]) # ... ``` -For LoadBalancer/NodePort services: +For `LoadBalancer`/`NodePort` services: + ```python # ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch +from lightkube.models.core_v1 import ServicePort class SomeCharm(CharmBase): def __init__(self, *args): # ... + port = ServicePort(443, name=f"{self.app.name}", targetPort=443, nodePort=30666) self.service_patcher = KubernetesServicePatch( - self, [(f"{self.app.name}", 443, 443, 30666)], "LoadBalancer" + self, [port], "LoadBalancer" + ) + # ... +``` + +Port protocols can also be specified. Valid protocols are `"TCP"`, `"UDP"`, and `"SCTP"` + +```python +# ... +from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch +from lightkube.models.core_v1 import ServicePort + +class SomeCharm(CharmBase): + def __init__(self, *args): + # ... + tcp = ServicePort(443, name=f"{self.app.name}-tcp", protocol="TCP") + udp = ServicePort(443, name=f"{self.app.name}-udp", protocol="UDP") + sctp = ServicePort(443, name=f"{self.app.name}-sctp", protocol="SCTP") + self.service_patcher = KubernetesServicePatch(self, [tcp, udp, sctp]) + # ... +``` + +Bound with custom events by providing `refresh_event` argument: +For example, you would like to have a configurable port in your charm and want to apply +service patch every time charm config is changed. + +```python +from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch +from lightkube.models.core_v1 import ServicePort + +class SomeCharm(CharmBase): + def __init__(self, *args): + # ... 
+ port = ServicePort(int(self.config["charm-config-port"]), name=f"{self.app.name}") + self.service_patcher = KubernetesServicePatch( + self, + [port], + refresh_event=self.on.config_changed ) # ... ``` @@ -83,15 +125,16 @@ def setUp(self, *unused): import logging from types import MethodType -from typing import Literal, Sequence, Tuple, Union +from typing import List, Literal, Optional, Union from lightkube import ApiError, Client +from lightkube.core import exceptions from lightkube.models.core_v1 import ServicePort, ServiceSpec from lightkube.models.meta_v1 import ObjectMeta from lightkube.resources.core_v1 import Service from lightkube.types import PatchType from ops.charm import CharmBase -from ops.framework import Object +from ops.framework import BoundEvent, Object logger = logging.getLogger(__name__) @@ -99,13 +142,12 @@ def setUp(self, *unused): LIBID = "0042f86d0a874435adef581806cddbbb" # Increment this major API version when introducing breaking changes -LIBAPI = 0 +LIBAPI = 1 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 5 -PortDefinition = Union[Tuple[str, int], Tuple[str, int, int], Tuple[str, int, int, int]] ServiceType = Literal["ClusterIP", "LoadBalancer"] @@ -115,18 +157,20 @@ class KubernetesServicePatch(Object): def __init__( self, charm: CharmBase, - ports: Sequence[PortDefinition], - service_name: str = None, + ports: List[ServicePort], + service_name: Optional[str] = None, service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, + additional_labels: Optional[dict] = None, + additional_selectors: Optional[dict] = None, + additional_annotations: Optional[dict] = None, + *, + refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, ): """Constructor for KubernetesServicePatch. Args: charm: the charm that is instantiating the library. 
- ports: a list of tuples (name, port, targetPort, nodePort) for every service port. + ports: a list of ServicePorts service_name: allows setting custom name to the patched service. If none given, application name will be used. service_type: desired type of K8s service. Default value is in line with ServiceSpec's @@ -136,6 +180,9 @@ def __init__( additional_selectors: Selectors to be added to the kubernetes service (by default only "app.kubernetes.io/name" is set to the service name) additional_annotations: Annotations to be added to the kubernetes service. + refresh_event: an optional bound event or list of bound events which + will be observed to re-apply the patch (e.g. on port change). + The `install` and `upgrade-charm` events would be observed regardless. """ super().__init__(charm, "kubernetes-service-patch") self.charm = charm @@ -155,22 +202,27 @@ def __init__( self.framework.observe(charm.on.install, self._patch) self.framework.observe(charm.on.upgrade_charm, self._patch) + # apply user defined events + if refresh_event: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + for evt in refresh_event: + self.framework.observe(evt, self._patch) + def _service_object( self, - ports: Sequence[PortDefinition], - service_name: str = None, + ports: List[ServicePort], + service_name: Optional[str] = None, service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, + additional_labels: Optional[dict] = None, + additional_selectors: Optional[dict] = None, + additional_annotations: Optional[dict] = None, ) -> Service: """Creates a valid Service representation. Args: - ports: a list of tuples of the form (name, port) or (name, port, targetPort) - or (name, port, targetPort, nodePort) for every service port. 
If the 'targetPort' - is omitted, it is assumed to be equal to 'port', with the exception of NodePort - and LoadBalancer services, where all port numbers have to be specified. + ports: a list of ServicePorts service_name: allows setting custom name to the patched service. If none given, application name will be used. service_type: desired type of K8s service. Default value is in line with ServiceSpec's @@ -203,15 +255,7 @@ def _service_object( ), spec=ServiceSpec( selector=selector, - ports=[ - ServicePort( - name=p[0], - port=p[1], - targetPort=p[2] if len(p) > 2 else p[1], # type: ignore[misc] - nodePort=p[3] if len(p) > 3 else None, # type: ignore[arg-type, misc] - ) - for p in ports - ], + ports=ports, type=service_type, ), ) @@ -222,11 +266,15 @@ def _patch(self, _) -> None: Raises: PatchFailed: if patching fails due to lack of permissions, or otherwise. """ - if not self.charm.unit.is_leader(): + try: + client = Client() + except exceptions.ConfigError as e: + logger.warning("Error creating k8s client: %s", e) return - client = Client() try: + if self._is_patched(client): + return if self.service_name != self._app: self._delete_and_create_service(client) client.patch(Service, self.service_name, self.service, patch_type=PatchType.MERGE) @@ -252,12 +300,25 @@ def is_patched(self) -> bool: bool: A boolean indicating if the service patch has been applied. 
""" client = Client() + return self._is_patched(client) + + def _is_patched(self, client: Client) -> bool: # Get the relevant service from the cluster - service = client.get(Service, name=self.service_name, namespace=self._namespace) + try: + service = client.get(Service, name=self.service_name, namespace=self._namespace) + except ApiError as e: + if e.status.code == 404 and self.service_name != self._app: + return False + else: + logger.error("Kubernetes service get failed: %s", str(e)) + raise + # Construct a list of expected ports, should the patch be applied expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] # Construct a list in the same manner, using the fetched service - fetched_ports = [(p.port, p.targetPort) for p in service.spec.ports] # type: ignore[attr-defined] # noqa: E501 + fetched_ports = [ + (p.port, p.targetPort) for p in service.spec.ports # type: ignore[attr-defined] + ] # noqa: E501 return expected_ports == fetched_ports @property diff --git a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py index 63f6857..07a379f 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py @@ -4,6 +4,9 @@ This library facilitates the integration of the prometheus_remote_write interface. +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + Charms that need to push data to a charm exposing the Prometheus remote_write API, should use the `PrometheusRemoteWriteConsumer`. 
Charms that operate software that exposes the Prometheus remote_write API, that is, they can receive metrics data over remote_write, @@ -23,7 +26,14 @@ import yaml from charms.observability_libs.v0.juju_topology import JujuTopology -from ops.charm import CharmBase, HookEvent, RelationEvent, RelationMeta, RelationRole +from ops.charm import ( + CharmBase, + HookEvent, + RelationBrokenEvent, + RelationEvent, + RelationMeta, + RelationRole, +) from ops.framework import EventBase, EventSource, Object, ObjectEvents from ops.model import Relation @@ -35,7 +45,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 10 logger = logging.getLogger(__name__) @@ -321,7 +331,9 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: alert_groups = [] # type: List[dict] # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): + for file_path in self._multi_suffix_glob( + dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive + ): alert_groups_from_file = self._from_file(dir_path, file_path) if alert_groups_from_file: logger.debug("Reading alert rule from %s", file_path) @@ -629,7 +641,7 @@ def __init__( self.framework.observe(on_relation.relation_joined, self._handle_endpoints_changed) self.framework.observe(on_relation.relation_changed, self._handle_endpoints_changed) self.framework.observe(on_relation.relation_departed, self._handle_endpoints_changed) - self.framework.observe(on_relation.relation_broken, self._handle_endpoints_changed) + self.framework.observe(on_relation.relation_broken, self._on_relation_broken) self.framework.observe(on_relation.relation_joined, self._push_alerts_on_relation_joined) self.framework.observe( self._charm.on.leader_elected, self._push_alerts_to_all_relation_databags @@ -638,6 +650,9 @@ def __init__( self._charm.on.upgrade_charm, 
self._push_alerts_to_all_relation_databags ) + def _on_relation_broken(self, event: RelationBrokenEvent) -> None: + self.on.endpoints_changed.emit(relation_id=event.relation.id) + def _handle_endpoints_changed(self, event: RelationEvent) -> None: if self._charm.unit.is_leader(): ev = json.loads(event.relation.data[event.app].get("event", "{}")) @@ -805,7 +820,7 @@ def __init__( def _on_relation_change(self, event: RelationEvent) -> None: self.update_endpoint(event.relation) - def update_endpoint(self, relation: Relation = None) -> None: + def update_endpoint(self, relation: Optional[Relation] = None) -> None: """Triggers programmatically the update of the relation data. This method should be used when the charm relying on this library needs diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py new file mode 100644 index 0000000..f080fb8 --- /dev/null +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -0,0 +1,2357 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +"""Prometheus Scrape Library. + +## Overview + +This document explains how to integrate with the Prometheus charm +for the purpose of providing a metrics endpoint to Prometheus. It +also explains how alternative implementations of the Prometheus charms +may maintain the same interface and be backward compatible with all +currently integrated charms. Finally this document is the +authoritative reference on the structure of relation data that is +shared between Prometheus charms and any other charm that intends to +provide a scrape target for Prometheus. + +## Source code + +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + +## Dependencies + +Using this library requires you to fetch the juju_topology library from +[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). 
+ +`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` + +## Provider Library Usage + +This Prometheus charm interacts with its scrape targets using its +charm library. Charms seeking to expose metric endpoints for the +Prometheus charm, must do so using the `MetricsEndpointProvider` +object from this charm library. For the simplest use cases, using the +`MetricsEndpointProvider` object only requires instantiating it, +typically in the constructor of your charm (the one which exposes a +metrics endpoint). The `MetricsEndpointProvider` constructor requires +the name of the relation over which a scrape target (metrics endpoint) +is exposed to the Prometheus charm. This relation must use the +`prometheus_scrape` interface. By default address of the metrics +endpoint is set to the unit IP address, by each unit of the +`MetricsEndpointProvider` charm. These units set their address in +response to the `PebbleReady` event of each container in the unit, +since container restarts of Kubernetes charms can result in change of +IP addresses. The default name for the metrics endpoint relation is +`metrics-endpoint`. It is strongly recommended to use the same +relation name for consistency across charms and doing so obviates the +need for an additional constructor argument. The +`MetricsEndpointProvider` object may be instantiated as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_endpoint = MetricsEndpointProvider(self) + ... + +Note that the first argument (`self`) to `MetricsEndpointProvider` is +always a reference to the parent (scrape target) charm. + +An instantiated `MetricsEndpointProvider` object will ensure that each +unit of its parent charm, is a scrape target for the +`MetricsEndpointConsumer` (Prometheus) charm. 
By default +`MetricsEndpointProvider` assumes each unit of the consumer charm +exports its metrics at a path given by `/metrics` on port 80. These +defaults may be changed by providing the `MetricsEndpointProvider` +constructor an optional argument (`jobs`) that represents a +Prometheus scrape job specification using Python standard data +structures. This job specification is a subset of Prometheus' own +[scrape +configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) +format but represented using Python data structures. More than one job +may be provided using the `jobs` argument. Hence `jobs` accepts a list +of dictionaries where each dictionary represents one `<scrape_config>` +object as described in the Prometheus documentation. The currently +supported configuration subset is: `job_name`, `metrics_path`, +`static_configs` + +Suppose it is required to change the port on which scraped metrics are +exposed to 8000. This may be done by providing the following data +structure as the value of `jobs`. + +``` +[ + { + "static_configs": [ + { + "targets": ["*:8000"] + } + ] + } +] +``` + +The wildcard ("*") host specification implies that the scrape targets +will automatically be set to the host addresses advertised by each +unit of the consumer charm. + +It is also possible to change the metrics path and scrape multiple +ports, for example + +``` +[ + { + "metrics_path": "/my-metrics-path", + "static_configs": [ + { + "targets": ["*:8000", "*:8081"], + } + ] + } +] +``` + +More complex scrape configurations are possible. For example + +``` +[ + { + "static_configs": [ + { + "targets": ["10.1.32.215:7000", "*:8000"], + "labels": { + "some-key": "some-value" + } + } + ] + } +] +``` + +This example scrapes the target "10.1.32.215" at port 7000 in addition +to scraping each unit at port 8000. There is however one difference +between wildcard targets (specified using "*") and fully qualified +targets (such as "10.1.32.215").
The Prometheus charm automatically +associates labels with metrics generated by each target. These labels +localise the source of metrics within the Juju topology by specifying +its "model name", "model UUID", "application name" and "unit +name". However unit name is associated only with wildcard targets but +not with fully qualified targets. + +Multiple jobs with different metrics paths and labels are allowed, but +each job must be given a unique name: + +``` +[ + { + "job_name": "my-first-job", + "metrics_path": "one-path", + "static_configs": [ + { + "targets": ["*:7000"], + "labels": { + "some-key": "some-value" + } + } + ] + }, + { + "job_name": "my-second-job", + "metrics_path": "another-path", + "static_configs": [ + { + "targets": ["*:8000"], + "labels": { + "some-other-key": "some-other-value" + } + } + ] + } +] +``` + +**Important:** `job_name` should be a fixed string (e.g. hardcoded literal). +For instance, if you include variable elements, like your `unit.name`, it may break +the continuity of the metrics time series gathered by Prometheus when the leader unit +changes (e.g. on upgrade or rescale). + +Additionally, it is also technically possible, but **strongly discouraged**, to +configure the following scrape-related settings, which behave as described by the +[Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config): + +- `static_configs` +- `scrape_interval` +- `scrape_timeout` +- `proxy_url` +- `relabel_configs` +- `metrics_relabel_configs` +- `sample_limit` +- `label_limit` +- `label_name_length_limit` +- `label_value_length_limit` + +The settings above are supported by the `prometheus_scrape` library only for the sake of +specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s) +charm. Virtually no charms should use these settings, and charmers definitely **should not** +expose them to the Juju administrator via configuration options. 
+ +## Consumer Library Usage + +The `MetricsEndpointConsumer` object may be used by Prometheus +charms to manage relations with their scrape targets. For this +purpose a Prometheus charm needs to do two things + +1. Instantiate the `MetricsEndpointConsumer` object by providing it a +reference to the parent (Prometheus) charm and optionally the name of +the relation that the Prometheus charm uses to interact with scrape +targets. This relation must conform to the `prometheus_scrape` +interface and it is strongly recommended that this relation be named +`metrics-endpoint` which is its default value. + +For example a Prometheus charm may instantiate the +`MetricsEndpointConsumer` in its constructor as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_consumer = MetricsEndpointConsumer(self) + ... + +2. A Prometheus charm also needs to respond to the +`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as +an observer for these events, as in + + self.framework.observe( + self.metrics_consumer.on.targets_changed, + self._on_scrape_targets_changed, + ) + +In responding to the `TargetsChangedEvent` event the Prometheus +charm must update the Prometheus configuration so that any new scrape +targets are added and/or old ones removed from the list of scraped +endpoints. For this purpose the `MetricsEndpointConsumer` object +exposes a `jobs()` method that returns a list of scrape jobs. Each +element of this list is the Prometheus scrape configuration for that +job. In order to update the Prometheus configuration, the Prometheus +charm needs to replace the current list of jobs with the list provided +by `jobs()` as follows + + def _on_scrape_targets_changed(self, event): + ... + scrape_jobs = self.metrics_consumer.jobs() + for job in scrape_jobs: + prometheus_scrape_config.append(job) + ...
+ +## Alerting Rules + +This charm library also supports gathering alerting rules from all +related `MetricsEndpointProvider` charms and enabling corresponding alerts within the +Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider` +charms when using this library, from a directory conventionally named +`prometheus_alert_rules`. This directory must reside at the top level +in the `src` folder of the consumer charm. Each file in this directory +is assumed to be in one of two formats: +- the official prometheus alert rule format, conforming to the +[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +- a single rule format, which is a simplified subset of the official format, +comprising a single alert rule per file, using the same YAML fields. + +The file name must have one of the following extensions: +- `.rule` +- `.rules` +- `.yml` +- `.yaml` + +An example of the contents of such a file in the custom single rule +format is shown below. + +``` +alert: HighRequestLatency +expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5 +for: 10m +labels: + severity: Medium + type: HighLatency +annotations: + summary: High request latency for {{ $labels.instance }}. +``` + +The `MetricsEndpointProvider` will read all available alert rules and +also inject "filtering labels" into the alert expressions. The +filtering labels ensure that alert rules are localised to the metrics +provider charm's Juju topology (application, model and its UUID). Such +a topology filter is essential to ensure that alert rules submitted by +one provider charm generates alerts only for that same charm. When +alert rules are embedded in a charm, and the charm is deployed as a +Juju application, the alert rules from that application have their +expressions automatically updated to filter for metrics coming from +the units of that application alone. 
This removes the risk of spurious +evaluation, e.g., when you have multiple deployments of the same charm +monitored by the same Prometheus. + +Not all alerts one may want to specify can be embedded in a +charm. Some alert rules will be specific to a user's use case. This is +the case, for example, of alert rules that are based on business +constraints, like expecting a certain amount of requests to a specific +API every five minutes. Such alert rules can be specified via the +[COS Config Charm](https://charmhub.io/cos-configuration-k8s), +which allows importing alert rules and other settings like dashboards +from a Git repository. + +Gathering alert rules and generating rule files within the Prometheus +charm is easily done using the `alerts()` method of +`MetricsEndpointConsumer`. Alerts generated by Prometheus will +automatically include Juju topology labels in the alerts. These labels +indicate the source of the alert. The following labels are +automatically included with each alert + +- `juju_model` +- `juju_model_uuid` +- `juju_application` + +## Relation Data + +The Prometheus charm uses both application and unit relation data to +obtain information regarding its scrape jobs, alert rules and scrape +targets. This relation data is in JSON format and it closely resembles +the YAML structure of Prometheus [scrape configuration] +(https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). + +Units of Metrics provider charms advertise their names and addresses +over unit relation data using the `prometheus_scrape_unit_name` and +`prometheus_scrape_unit_address` keys. Meanwhile, the `scrape_metadata`, +`scrape_jobs` and `alert_rules` keys in application relation data +of Metrics provider charms hold eponymous information.
+ +""" # noqa: W505 + +import copy +import hashlib +import ipaddress +import json +import logging +import os +import platform +import re +import socket +import subprocess +import tempfile +from collections import defaultdict +from pathlib import Path +from typing import Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import yaml +from charms.observability_libs.v0.juju_topology import JujuTopology +from ops.charm import CharmBase, RelationRole +from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "bc84295fef5f4049878f07b131968ee2" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 27 + +logger = logging.getLogger(__name__) + + +ALLOWED_KEYS = { + "job_name", + "metrics_path", + "static_configs", + "scrape_interval", + "scrape_timeout", + "proxy_url", + "relabel_configs", + "metrics_relabel_configs", + "sample_limit", + "label_limit", + "label_name_length_limit", + "label_value_length_limit", + "scheme", + "basic_auth", + "tls_config", +} +DEFAULT_JOB = { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], +} + + +DEFAULT_RELATION_NAME = "metrics-endpoint" +RELATION_INTERFACE_NAME = "prometheus_scrape" + +DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" + + +class PrometheusConfig: + """A namespace for utility functions for manipulating the prometheus config dict.""" + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + topology_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + topology_relabel_config_wildcard = { + 
"source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + @staticmethod + def sanitize_scrape_config(job: dict) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. + """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + @staticmethod + def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: + """A vectorized version of `sanitize_scrape_config`.""" + return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] + + @staticmethod + def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: + """Adds the given prefix to all the job names in the given scrape_configs list.""" + modified_scrape_configs = [] + for scrape_config in scrape_configs: + job_name = scrape_config.get("job_name") + modified = scrape_config.copy() + modified["job_name"] = prefix + "_" + job_name if job_name else prefix + modified_scrape_configs.append(modified) + + return modified_scrape_configs + + @staticmethod + def expand_wildcard_targets_into_individual_jobs( + scrape_jobs: List[dict], + hosts: Dict[str, Tuple[str, str]], + topology: Optional[JujuTopology] = None, + ) -> List[dict]: + """Extract wildcard hosts from the given scrape_configs list into separate jobs. + + Args: + scrape_jobs: list of scrape jobs. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. 
@staticmethod
def expand_wildcard_targets_into_individual_jobs(
    scrape_jobs: List[dict],
    hosts: Dict[str, Tuple[str, str]],
    topology: Optional[JujuTopology] = None,
) -> List[dict]:
    """Extract wildcard hosts from the given scrape_configs list into separate jobs.

    For every static_config of every job, targets that start with "*" are
    expanded into one job per entry of `hosts` (job name suffixed with the
    unit number, "*" replaced by the unit address, metrics path prefixed by
    the unit path). Targets that do not start with "*" are gathered, per job,
    into a single additional job that keeps the original job name. Jobs
    without "static_configs", and static_configs without "targets", are
    dropped. The input jobs are only shallow-copied, never modified.

    Args:
        scrape_jobs: list of scrape jobs.
        hosts: a dictionary mapping unit names to (address, path) tuples for
            all units of the relation for which this job configuration
            must be constructed.
        topology: optional arg for adding topology labels to scrape targets.

    Returns:
        A new list of scrape jobs: for each input job, first the per-unit
        jobs generated from wildcard targets, then (if any) the job holding
        the non-wildcard targets.
    """
    # Compiled once instead of once per target. `match` anchors at the start
    # only, so any target beginning with "*" counts as a wildcard target.
    wildcard_pattern = re.compile(r"\*(?:(:\d+))?")

    modified_scrape_jobs = []
    for job in scrape_jobs:
        static_configs = job.get("static_configs")
        if not static_configs:
            continue

        # When a single unit specified more than one wildcard target, then they are expanded
        # into a static_config per target
        non_wildcard_static_configs = []

        for static_config in static_configs:
            targets = static_config.get("targets")
            if not targets:
                continue

            # All non-wildcard targets remain in the same static_config
            non_wildcard_targets = []

            # All wildcard targets are extracted to a job per unit. If multiple wildcard
            # targets are specified, they remain in the same static_config (per unit).
            wildcard_targets = []

            for target in targets:
                if wildcard_pattern.match(target):
                    # This is a wildcard target.
                    # Need to expand into separate jobs and remove it from this job here
                    wildcard_targets.append(target)
                else:
                    # This is not a wildcard target. Copy it over into its own static_config.
                    non_wildcard_targets.append(target)

            # All non-wildcard targets remain in the same static_config
            if non_wildcard_targets:
                non_wildcard_static_config = static_config.copy()
                non_wildcard_static_config["targets"] = non_wildcard_targets

                if topology:
                    # When non-wildcard targets (aka fully qualified hostnames) are specified,
                    # there is no reliable way to determine the name (Juju topology unit name)
                    # for such a target. Therefore labeling with Juju topology, excluding the
                    # unit name.
                    non_wildcard_static_config["labels"] = {
                        **non_wildcard_static_config.get("labels", {}),
                        **topology.label_matcher_dict,
                    }

                non_wildcard_static_configs.append(non_wildcard_static_config)

            # Extract wildcard targets into individual jobs
            if wildcard_targets:
                for unit_name, (unit_hostname, unit_path) in hosts.items():
                    modified_job = job.copy()
                    modified_job["static_configs"] = [static_config.copy()]
                    modified_static_config = modified_job["static_configs"][0]
                    modified_static_config["targets"] = [
                        target.replace("*", unit_hostname) for target in wildcard_targets
                    ]

                    unit_num = unit_name.split("/")[-1]
                    job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num
                    modified_job["job_name"] = job_name
                    modified_job["metrics_path"] = unit_path + (
                        job.get("metrics_path") or "/metrics"
                    )

                    if topology:
                        # Add topology labels
                        modified_static_config["labels"] = {
                            **modified_static_config.get("labels", {}),
                            **topology.label_matcher_dict,
                            "juju_unit": unit_name,
                        }

                        # Instance relabeling for topology should be last in order.
                        modified_job["relabel_configs"] = modified_job.get(
                            "relabel_configs", []
                        ) + [PrometheusConfig.topology_relabel_config_wildcard]

                    modified_scrape_jobs.append(modified_job)

        if non_wildcard_static_configs:
            modified_job = job.copy()
            modified_job["static_configs"] = non_wildcard_static_configs
            modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics"

            if topology:
                # Instance relabeling for topology should be last in order.
                modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [
                    PrometheusConfig.topology_relabel_config
                ]

            modified_scrape_jobs.append(modified_job)

    return modified_scrape_jobs
@staticmethod
def render_alertmanager_static_configs(alertmanagers: List[str]):
    """Render the alertmanager static_configs section from a list of URLs.

    Each target must be in the hostname:port format, and prefixes are specified in a separate
    key. Therefore, with ingress in place, would need to extract the path into the
    `path_prefix` key, which is higher up in the config hierarchy.

    https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config

    Args:
        alertmanagers: List of alertmanager URLs. Entries that do not start with
            "http://" or "https://" are treated as if prefixed with "http://".

    Returns:
        A dict representation for the static_configs section: one entry per
        distinct URL path (an empty path counts as "/"), each listing the
        netlocs that share that path.
    """
    # Group the netlocs by path: {path: [netloc1, netloc2]}
    netlocs_by_path = {}  # type: Dict[str, List[str]]
    for url in alertmanagers:
        # urlparse only recognizes the netloc when a scheme is present.
        if not re.match(r"^https?://", url):
            url = "http://" + url
        parts = urlparse(url)
        netlocs_by_path.setdefault(parts.path or "/", []).append(parts.netloc)

    rendered = []
    for path_prefix, netlocs in netlocs_by_path.items():
        rendered.append({"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]})
    return {"alertmanagers": rendered}
class RelationNotFoundError(Exception):
    """Raised if no relation with the given name is found."""

    def __init__(self, relation_name: str):
        self.relation_name = relation_name
        self.message = "No relation named '{}' found".format(relation_name)

        super().__init__(self.message)


class RelationInterfaceMismatchError(Exception):
    """Raised if the relation with the given name has a different interface."""

    def __init__(
        self,
        relation_name: str,
        expected_relation_interface: str,
        actual_relation_interface: str,
    ):
        self.relation_name = relation_name
        self.expected_relation_interface = expected_relation_interface
        self.actual_relation_interface = actual_relation_interface
        self.message = (
            "The '{}' relation has '{}' as interface rather than the expected '{}'".format(
                relation_name, actual_relation_interface, expected_relation_interface
            )
        )

        super().__init__(self.message)


class RelationRoleMismatchError(Exception):
    """Raised if the relation with the given name has a different role."""

    def __init__(
        self,
        relation_name: str,
        expected_relation_role: RelationRole,
        actual_relation_role: RelationRole,
    ):
        self.relation_name = relation_name
        self.expected_relation_role = expected_relation_role
        # Earlier revisions stored the expected role under this (misnamed)
        # attribute only; keep it as an alias so existing readers still work.
        self.expected_relation_interface = expected_relation_role
        self.actual_relation_role = actual_relation_role
        self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format(
            relation_name, repr(actual_relation_role), repr(expected_relation_role)
        )

        super().__init__(self.message)
class InvalidAlertRuleEvent(EventBase):
    """Event emitted when alert rule files are not parsable.

    Carries the validation `errors` and a `valid` flag so that the provider
    charm can set a clear status.
    """

    def __init__(self, handle, errors: str = "", valid: bool = False):
        super().__init__(handle)
        self.valid = valid
        self.errors = errors

    def snapshot(self) -> Dict:
        """Save alert rule information."""
        return dict(valid=self.valid, errors=self.errors)

    def restore(self, snapshot):
        """Restore alert rule information."""
        self.valid, self.errors = snapshot["valid"], snapshot["errors"]


class MetricsEndpointProviderEvents(ObjectEvents):
    """Event descriptor exposing `alert_rule_status_changed` (an `InvalidAlertRuleEvent`)."""

    alert_rule_status_changed = EventSource(InvalidAlertRuleEvent)


def _validate_relation_by_interface_and_direction(
    charm: CharmBase,
    relation_name: str,
    expected_relation_interface: str,
    expected_relation_role: RelationRole,
):
    """Verify that a relation has the necessary characteristics.

    Checks, in this order, that `relation_name` (1) is listed in
    `charm.meta.relations`, (2) declares `expected_relation_interface` as its
    interface, and (3) has the right "direction", i.e. it is listed among the
    relations that `charm` provides or requires, as requested.

    Args:
        charm: a `CharmBase` object to scan for the matching relation.
        relation_name: the name of the relation to be verified.
        expected_relation_interface: the interface name to be matched by the
            relation named `relation_name`.
        expected_relation_role: whether the `relation_name` must be either
            provided or required by `charm`.

    Raises:
        RelationNotFoundError: If there is no relation in the charm's metadata
            with the same name as provided via `relation_name` argument.
        RelationInterfaceMismatchError: The relation with the same name as provided
            via `relation_name` argument does not have the same relation interface
            as specified via the `expected_relation_interface` argument.
        RelationRoleMismatchError: If the relation with the same name as provided
            via `relation_name` argument does not have the same role as specified
            via the `expected_relation_role` argument.
        Exception: If `expected_relation_role` is neither `RelationRole.provides`
            nor `RelationRole.requires`.
    """
    declared_relations = charm.meta.relations
    if relation_name not in declared_relations:
        raise RelationNotFoundError(relation_name)

    actual_relation_interface = declared_relations[relation_name].interface_name
    if actual_relation_interface != expected_relation_interface:
        raise RelationInterfaceMismatchError(
            relation_name, expected_relation_interface, actual_relation_interface
        )

    # Pick the metadata section to look in, plus the roles to report on a mismatch.
    if expected_relation_role == RelationRole.provides:
        wanted_role, opposite_role = RelationRole.provides, RelationRole.requires
        same_role_relations = charm.meta.provides
    elif expected_relation_role == RelationRole.requires:
        wanted_role, opposite_role = RelationRole.requires, RelationRole.provides
        same_role_relations = charm.meta.requires
    else:
        raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role))

    if relation_name not in same_role_relations:
        raise RelationRoleMismatchError(relation_name, wanted_role, opposite_role)
class InvalidAlertRulePathError(Exception):
    """Raised if the alert rules folder cannot be found or is otherwise invalid."""

    def __init__(
        self,
        alert_rules_absolute_path: Path,
        message: str,
    ):
        super().__init__(message)
        self.message = message
        self.alert_rules_absolute_path = alert_rules_absolute_path


def _is_official_alert_rule_format(rules_dict: dict) -> bool:
    """Tell whether alert rules are in the upstream format supported by Prometheus.

    Rules in dictionary form count as "official" when they have a "groups"
    key, since that implies they hold a list of alert rule groups.

    Args:
        rules_dict: a set of alert rules in Python dictionary format

    Returns:
        True if alert rules are in official Prometheus file format.
    """
    has_groups = "groups" in rules_dict
    return has_groups


def _is_single_alert_rule_format(rules_dict: dict) -> bool:
    """Tell whether alert rules are in single rule format.

    The Prometheus charm library supports reading of alert rules in a
    custom format that consists of a single alert rule per file. This
    does not conform to the official Prometheus alert rule file format
    which requires that each alert rules file consists of a list of
    alert rule groups and each group consists of a list of alert
    rules.

    A dictionary is considered to be in single rule format if it has at
    least the two keys "alert" (rule name) and "expr" (alert expression).

    Returns:
        True if alert rule is in single rule file format.
    """
    # one alert rule per file
    required_keys = {"alert", "expr"}
    return required_keys.issubset(rules_dict)
class AlertRules:
    """Utility class for amalgamating prometheus alert rule files and injecting juju topology.

    An `AlertRules` object supports aggregating alert rules from files and directories in both
    official and single rule file formats using the `add_path()` method. All the alert rules
    read are annotated with Juju topology labels and amalgamated into a single data structure
    in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be
    easily dumped into JSON format and exchanged over relation data. The dictionary can also
    be dumped into YAML format and written directly into an alert rules file that is read by
    Prometheus. Note that multiple `AlertRules` objects must not be written into the same file,
    since Prometheus allows only a single list of alert rule groups per alert rules file.

    The official Prometheus format is a YAML file conforming to the Prometheus documentation
    (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
    The custom single rule format is a subsection of the official YAML, having a single alert
    rule, effectively "one alert per file".
    """

    # This class uses the following terminology for the various parts of a rule file:
    # - alert rules file: the entire groups[] yaml, including the "groups:" key.
    # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list
    #   of dictionaries that have the "name" and "rules" keys.
    # - alert group (singular): a single dictionary that has the "name" and "rules" keys.
    # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with
    #   the "alert" and "expr" keys.
    # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys.

    def __init__(self, topology: Optional[JujuTopology] = None):
        """Build an alert rule object.

        Args:
            topology: an optional `JujuTopology` instance that is used to annotate all alert rules.
        """
        self.topology = topology
        # NOTE(review): CosTool is built without a charm here; presumably only its
        # expression-rewriting helper is needed - confirm against CosTool.
        self.tool = CosTool(None)
        # Accumulates the alert groups gathered by every `add_path()` call.
        self.alert_groups = []  # type: List[dict]

    def _from_file(self, root_path: Path, file_path: Path) -> List[dict]:
        """Read a rules file from path, injecting juju topology.

        The loaded structure is modified in place: every group is renamed via
        `_group_name`, every rule gets a "labels" dict, and, when a topology is
        set, topology labels and label matchers are injected into each rule.

        Args:
            root_path: full path to the root rules folder (used only for generating group name)
            file_path: full path to a *.rule file.

        Returns:
            A list of dictionaries representing the rules file, if file is valid (the structure is
            formed by `yaml.safe_load` of the file); an empty list if the file cannot be loaded,
            is empty, is not a dict, or is in neither the official nor the single rule format.
        """
        with file_path.open() as rf:
            # Load a list of rules from file then add labels and filters
            try:
                rule_file = yaml.safe_load(rf)

            except Exception as e:
                # Best effort: an unreadable file is logged and skipped, not raised.
                logger.error("Failed to read alert rules from %s: %s", file_path.name, e)
                return []

            if not rule_file:
                logger.warning("Empty rules file: %s", file_path.name)
                return []
            if not isinstance(rule_file, dict):
                logger.error("Invalid rules file (must be a dict): %s", file_path.name)
                return []
            if _is_official_alert_rule_format(rule_file):
                alert_groups = rule_file["groups"]
            elif _is_single_alert_rule_format(rule_file):
                # convert to list of alert groups
                # group name is made up from the file name
                alert_groups = [{"name": file_path.stem, "rules": [rule_file]}]
            else:
                # invalid/unsupported
                logger.error("Invalid rules file: %s", file_path.name)
                return []

            # update rules with additional metadata
            # NOTE(review): groups lacking "name"/"rules", or rules lacking "expr" while a
            # topology is set, raise KeyError here instead of being skipped.
            for alert_group in alert_groups:
                # update group name with topology and sub-path
                alert_group["name"] = self._group_name(
                    str(root_path),
                    str(file_path),
                    alert_group["name"],
                )

                # add "juju_" topology labels
                for alert_rule in alert_group["rules"]:
                    if "labels" not in alert_rule:
                        alert_rule["labels"] = {}

                    if self.topology:
                        alert_rule["labels"].update(self.topology.label_matcher_dict)
                        # insert juju topology filters into a prometheus alert rule
                        # (the "%%juju_topology%%" placeholder, with an optional trailing
                        # comma, is stripped from the expression first)
                        alert_rule["expr"] = self.tool.inject_label_matchers(
                            re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]),
                            self.topology.label_matcher_dict,
                        )

            return alert_groups

    def _group_name(self, root_path: str, file_path: str, group_name: str) -> str:
        """Generate group name from path and topology.

        The group name is made up of the topology identifier (when a topology is set), the
        relative path between the root dir and the rule file's directory, the original group
        name and the literal suffix "alerts", joined with underscores.

        Args:
            root_path: path to the root rules dir.
            file_path: path to rule file.
            group_name: original group name to keep as part of the new augmented group name

        Returns:
            New group name, augmented by juju topology and relative path.
        """
        rel_path = os.path.relpath(os.path.dirname(file_path), root_path)
        # A file directly under the root contributes no path component.
        rel_path = "" if rel_path == "." else rel_path.replace(os.path.sep, "_")

        # Generate group name:
        #  - name, from juju topology
        #  - suffix, from the relative path of the rule file;
        group_name_parts = [self.topology.identifier] if self.topology else []
        group_name_parts.extend([rel_path, group_name, "alerts"])
        # filter to remove empty strings
        return "_".join(filter(None, group_name_parts))

    @classmethod
    def _multi_suffix_glob(
        cls, dir_path: Path, suffixes: List[str], recursive: bool = True
    ) -> list:
        """Helper function for getting all files in a directory that have a matching suffix.

        Args:
            dir_path: path to the directory to glob from.
            suffixes: list of suffixes to include in the glob (items should begin with a period).
            recursive: a flag indicating whether a glob is recursive (nested) or not.

        Returns:
            List of files in `dir_path` that have one of the suffixes specified in `suffixes`.
        """
        all_files_in_dir = dir_path.glob("**/*" if recursive else "*")
        # Keep regular files only; `Path.suffix` is the last extension, including the period.
        return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir))

    def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]:
        """Read all rule files in a directory.

        Files with a ".rule", ".rules", ".yml" or ".yaml" suffix are read with `_from_file`,
        using `dir_path` as the root for group naming, so the generated group names include
        juju topology (when set) and the sub-path of each file.

        Args:
            dir_path: directory containing rule files.
            recursive: flag indicating whether to scan for rule files recursively.

        Returns:
            a list of dictionaries representing prometheus alert rule groups, each dictionary
            representing an alert group (structure determined by `yaml.safe_load`).
        """
        alert_groups = []  # type: List[dict]

        # Gather all alerts into a list of groups
        for file_path in self._multi_suffix_glob(
            dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive
        ):
            alert_groups_from_file = self._from_file(dir_path, file_path)
            if alert_groups_from_file:
                # Logged after the fact: only files that yielded groups are reported.
                logger.debug("Reading alert rule from %s", file_path)
                alert_groups.extend(alert_groups_from_file)

        return alert_groups

    def add_path(self, path: str, *, recursive: bool = False) -> None:
        """Add rules from a dir path.

        All rules from files are aggregated into a data structure representing a single rule file.
        All group names are augmented with juju topology.

        Args:
            path: either a rules file or a dir of rules files.
            recursive: whether to read files recursively or not (no impact if `path` is a file).

        Returns:
            None. A path that is neither a directory nor a file is only logged at debug level.
        """
        path = Path(path)  # type: Path
        if path.is_dir():
            self.alert_groups.extend(self._from_dir(path, recursive))
        elif path.is_file():
            # For a single file, its parent directory acts as the root for group naming.
            self.alert_groups.extend(self._from_file(path.parent, path))
        else:
            logger.debug("Alert rules path does not exist: %s", path)

    def as_dict(self) -> dict:
        """Return standard alert rules file in dict representation.

        Returns:
            a dictionary containing a single list of alert rule groups.
            The list of alert rule groups is provided as value of the
            "groups" dictionary key. An empty dictionary is returned when
            no alert groups have been added.
        """
        return {"groups": self.alert_groups} if self.alert_groups else {}
+ """ + return {"groups": self.alert_groups} if self.alert_groups else {} + + +class TargetsChangedEvent(EventBase): + """Event emitted when Prometheus scrape targets change.""" + + def __init__(self, handle, relation_id): + super().__init__(handle) + self.relation_id = relation_id + + def snapshot(self): + """Save scrape target relation information.""" + return {"relation_id": self.relation_id} + + def restore(self, snapshot): + """Restore scrape target relation information.""" + self.relation_id = snapshot["relation_id"] + + +class MonitoringEvents(ObjectEvents): + """Event descriptor for events raised by `MetricsEndpointConsumer`.""" + + targets_changed = EventSource(TargetsChangedEvent) + + +class MetricsEndpointConsumer(Object): + """A Prometheus based Monitoring service.""" + + on = MonitoringEvents() + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """A Prometheus based Monitoring service. + + Args: + charm: a `CharmBase` instance that manages this + instance of the Prometheus service. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that consume metrics endpoints. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. 
+ """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._tool = CosTool(self._charm) + events = self._charm.on[relation_name] + self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) + self.framework.observe( + events.relation_departed, self._on_metrics_provider_relation_departed + ) + + def _on_metrics_provider_relation_changed(self, event): + """Handle changes with related metrics providers. + + Anytime there are changes in relations between Prometheus + and metrics provider charms the Prometheus charm is informed, + through a `TargetsChangedEvent` event. The Prometheus charm can + then choose to update its scrape configuration. + + Args: + event: a `CharmEvent` in response to which the Prometheus + charm must update its scrape configuration. + """ + rel_id = event.relation.id + + self.on.targets_changed.emit(relation_id=rel_id) + + def _on_metrics_provider_relation_departed(self, event): + """Update job config when a metrics provider departs. + + When a metrics provider departs the Prometheus charm is informed + through a `TargetsChangedEvent` event so that it can update its + scrape configuration to ensure that the departed metrics provider + is removed from the list of scrape jobs and + + Args: + event: a `CharmEvent` that indicates a metrics provider + unit has departed. + """ + rel_id = event.relation.id + self.on.targets_changed.emit(relation_id=rel_id) + + def jobs(self) -> list: + """Fetch the list of scrape jobs. + + Returns: + A list consisting of all the static scrape configurations + for each related `MetricsEndpointProvider` that has specified + its scrape targets. 
+ """ + scrape_jobs = [] + + for relation in self._charm.model.relations[self._relation_name]: + static_scrape_jobs = self._static_scrape_config(relation) + if static_scrape_jobs: + scrape_jobs.extend(static_scrape_jobs) + + scrape_jobs = _dedupe_job_names(scrape_jobs) + + return scrape_jobs + + def alerts(self) -> dict: + """Fetch alerts for all relations. + + A Prometheus alert rules file consists of a list of "groups". Each + group consists of a list of alerts (`rules`) that are sequentially + executed. This method returns all the alert rules provided by each + related metrics provider charm. These rules may be used to generate a + separate alert rules file for each relation since the returned list + of alert groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + alert rule related metadata such as the Juju model, model UUID and the + application name from where the alert rule originates. Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of alert rule groups are + written. For each relation, the structure of data returned is a dictionary + representation of a standard prometheus rules file: + + {"groups": [{"name": ...}, ...]} + + per official prometheus documentation + https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + + The value of the `groups` key is such that it may be used to generate + a Prometheus alert rules file directly using `yaml.dump` but the + `groups` key itself must be included as this is required by Prometheus. 
+ + For example the list of alert rule groups returned by this method may + be written into files consumed by Prometheus as follows + + ``` + for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): + filename = "juju_" + topology_identifier + ".rules" + path = os.path.join(PROMETHEUS_RULES_DIR, filename) + rules = yaml.safe_dump(alert_rule_groups) + container.push(path, rules, make_dirs=True) + ``` + + Returns: + A dictionary mapping the Juju topology identifier of the source charm to + its list of alert rule groups. + """ + alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units or not relation.app: + continue + + alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) + if not alert_rules: + continue + + try: + scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) + identifier = JujuTopology.from_dict(scrape_metadata).identifier + alerts[identifier] = self._tool.apply_label_matchers(alert_rules) + + except KeyError as e: + logger.debug( + "Relation %s has no 'scrape_metadata': %s", + relation.id, + e, + ) + identifier = self._get_identifier_by_alert_rules(alert_rules) + + if not identifier: + logger.error( + "Alert rules were found but no usable group or identifier was present" + ) + continue + + alerts[identifier] = alert_rules + + _, errmsg = self._tool.validate_alert_rules(alert_rules) + if errmsg: + if alerts[identifier]: + del alerts[identifier] + relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg}) + continue + + return alerts + + def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: + """Determine an appropriate dict key for alert rules. + + The key is used as the filename when writing alerts to disk, so the structure + and uniqueness is important. 
+ + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + logger.debug("No alert groups were found in relation data") + return None + + # Construct an ID based on what's in the alert rules if they have labels + for group in rules["groups"]: + try: + labels = group["rules"][0]["labels"] + identifier = "{}_{}_{}".format( + labels["juju_model"], + labels["juju_model_uuid"], + labels["juju_application"], + ) + return identifier + except KeyError: + logger.debug("Alert rules were found but no usable labels were present") + continue + + logger.warning( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." + ) + try: + for group in rules["groups"]: + return group["name"] + except KeyError: + logger.debug("No group name was found to use as identifier") + + return None + + def _static_scrape_config(self, relation) -> list: + """Generate the static scrape configuration for a single relation. + + If the relation data includes `scrape_metadata` then the value + of this key is used to annotate the scrape jobs with Juju + Topology labels before returning them. + + Args: + relation: an `ops.model.Relation` object whose static + scrape configuration is required. + + Returns: + A list (possibly empty) of scrape jobs. Each job is a + valid Prometheus scrape configuration for that job, + represented as a Python dictionary. 
+ """ + if not relation.units: + return [] + + scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + + if not scrape_jobs: + return [] + + scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) + + if not scrape_metadata: + return scrape_jobs + + topology = JujuTopology.from_dict(scrape_metadata) + + job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) + scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix) + scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs) + + hosts = self._relation_hosts(relation) + + scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_jobs, hosts, topology + ) + + return scrape_jobs + + def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: + """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" + hosts = {} + for unit in relation.units: + # TODO deprecate and remove unit.name + unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name + # TODO deprecate and remove "prometheus_scrape_host" + unit_address = relation.data[unit].get( + "prometheus_scrape_unit_address" + ) or relation.data[unit].get("prometheus_scrape_host") + unit_path = relation.data[unit].get("prometheus_scrape_unit_path", "") + if unit_name and unit_address: + hosts.update({unit_name: (unit_address, unit_path)}) + + return hosts + + def _target_parts(self, target) -> list: + """Extract host and port from a wildcard target. + + Args: + target: a string specifying a scrape target. A + scrape target is expected to have the format + "host:port". The host part may be a wildcard + "*" and the port part can be missing (along + with ":") in which case port is set to 80. 
def _dedupe_job_names(jobs: List[dict]) -> List[dict]:
    """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key.

    Jobs that share a name are renamed to "<name>_<sha256 of the job's JSON>".
    Additionally, jobs that end up fully identical are reduced to one.

    The input is deep-copied and left untouched. In the result, jobs are
    grouped by their original name, in order of each name's first appearance.

    Args:
        jobs: A list of prometheus scrape jobs

    Returns:
        A new list of scrape jobs with unique names and no identical entries.
    """

    def _fingerprint(job: dict) -> str:
        return hashlib.sha256(json.dumps(job).encode()).hexdigest()

    jobs_copy = copy.deepcopy(jobs)

    # Group by name in a single pass; dicts preserve insertion order, so
    # groups come out in order of first appearance.
    jobs_by_name = {}  # type: Dict[str, List[dict]]
    for job in jobs_copy:
        jobs_by_name.setdefault(job["job_name"], []).append(job)

    deduped_jobs = []
    seen = set()
    for same_name_jobs in jobs_by_name.values():
        name_clashes = len(same_name_jobs) > 1
        for job in same_name_jobs:
            if name_clashes:
                # The suffix is the hash of the job as it was before renaming.
                job["job_name"] = "{}_{}".format(job["job_name"], _fingerprint(job))

            # Identical jobs hash identically (also after renaming): keep the first.
            fingerprint = _fingerprint(job)
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
            deduped_jobs.append(job)

    return deduped_jobs
+ """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not alerts_dir_path.exists(): + raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist") + if not alerts_dir_path.is_dir(): + raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory") + + return str(alerts_dir_path) + + +class MetricsEndpointProvider(Object): + """A metrics endpoint for Prometheus.""" + + on = MetricsEndpointProviderEvents() + + def __init__( + self, + charm, + relation_name: str = DEFAULT_RELATION_NAME, + jobs=None, + alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, + external_url: str = "", + lookaside_jobs_callable: Optional[Callable] = None, + ): + """Construct a metrics provider for a Prometheus charm. + + If your charm exposes a Prometheus metrics endpoint, the + `MetricsEndpointProvider` object enables your charm to easily + communicate how to reach that metrics endpoint. + + By default, a charm instantiating this object has the metrics + endpoints of each of its units scraped by the related Prometheus + charms. The scraped metrics are automatically tagged by the + Prometheus charms with Juju topology data via the + `juju_model_name`, `juju_model_uuid`, `juju_application_name` + and `juju_unit` labels. To support such tagging `MetricsEndpointProvider` + automatically forwards scrape metadata to a `MetricsEndpointConsumer` + (Prometheus charm). 
+ + Scrape targets provided by `MetricsEndpointProvider` can be + customized when instantiating this object. For example in the + case of a charm exposing the metrics endpoint for each of its + units on port 8080 and the `/metrics` path, the + `MetricsEndpointProvider` can be instantiated as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "static_configs": [{"targets": ["*:8080"]}], + }]) + + The notation `*:` means "scrape each unit of this charm on port + ``. + + In case the metrics endpoints are not on the standard `/metrics` path, + a custom path can be specified as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "metrics_path": "/my/strange/metrics/path", + "static_configs": [{"targets": ["*:8080"]}], + }]) + + Note how the `jobs` argument is a list: this allows you to expose multiple + combinations of paths "metrics_path" and "static_configs" in case your charm + exposes multiple endpoints, which could happen, for example, when you have + multiple workload containers, with applications in each needing to be scraped. + The structure of the objects in the `jobs` list is one-to-one with the + `scrape_config` configuration item of Prometheus' own configuration (see + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + ), but with only a subset of the fields allowed. The permitted fields are + listed in `ALLOWED_KEYS` object in this charm library module. + + It is also possible to specify alert rules. By default, this library will look + into the `/prometheus_alert_rules`, which in a standard charm + layouts resolves to `src/prometheus_alert_rules`. Each alert rule goes into a + separate `*.rule` file. If the syntax of a rule is invalid, + the `MetricsEndpointProvider` logs an error and does not load the particular + rule. 
+ + To avoid false positives and negatives in the evaluation of alert rules, + all ingested alert rule expressions are automatically qualified using Juju + Topology filters. This ensures that alert rules provided by your charm trigger + alerts based only on data scraped from your charm. For example, an alert rule + such as the following + + alert: UnitUnavailable + expr: up < 1 + for: 0m + + will be automatically transformed into something along the lines of the following + + alert: UnitUnavailable + expr: up{juju_model=<model>, juju_model_uuid=<model-uuid>, juju_application=<application>} < 1 + for: 0m + + An attempt will be made to validate alert rules prior to loading them into Prometheus. + If they are invalid, an event will be emitted from this object which charms can respond + to in order to set a meaningful status for administrators. + + This can be observed via `consumer.on.alert_rule_status_changed` which contains: + - The error(s) encountered when validating as `errors` + - A `valid` attribute, which can be used to reset the state of charms if alert rules + are updated via another mechanism (e.g. `cos-config`) and refreshed. + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointProvider` object. Typically, this is + `self` in the instantiating class. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that provide metrics endpoints. + jobs: an optional list of dictionaries where each + dictionary represents the Prometheus scrape + configuration for a single job. When not provided, a + default scrape configuration is provided for the + `/metrics` endpoint polling all units of the charm on port `80` + using the `MetricsEndpointProvider` object. + alert_rules_path: an optional path for the location of alert rules + files.
Defaults to "./prometheus_alert_rules", + resolved relative to the directory hosting the charm entry file. + The alert rules are automatically updated on charm upgrade. + refresh_event: an optional bound event or list of bound events which + will be observed to re-set scrape job data (IP address and others). + external_url: an optional argument that represents an external url that + can be generated by an Ingress or a Proxy. + lookaside_jobs_callable: an optional `Callable` which should be invoked + when the job configuration is built as a secondary mapping. The callable + should return a `List[Dict]` which is syntactically identical to the + `jobs` parameter, but can be updated out of step with the initialization of + this library without disrupting the 'global' job spec. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.provides` + role.
+ """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + self.topology = JujuTopology.from_charm(charm) + + self._charm = charm + self._alert_rules_path = alert_rules_path + self._relation_name = relation_name + # sanitize job configurations to the supported subset of parameters + jobs = [] if jobs is None else jobs + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + + if external_url: + external_url = ( + external_url if urlparse(external_url).scheme else ("http://" + external_url) + ) + self.external_url = external_url + self._lookaside_jobs = lookaside_jobs_callable + + events = self._charm.on[self._relation_name] + self.framework.observe(events.relation_changed, self._on_relation_changed) + + if not refresh_event: + # FIXME remove once podspec charms are verified. + # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. + if len(self._charm.meta.containers) == 1: + if "kubernetes" in self._charm.meta.series: + # This is a podspec charm + refresh_event = [self._charm.on.update_status] + else: + # This is a sidecar/pebble charm + container = list(self._charm.meta.containers.values())[0] + refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] + else: + logger.warning( + "%d containers are present in metadata.yaml and " + "refresh_event was not specified. Defaulting to update_status. 
" + "Metrics IP may not be set in a timely fashion.", + len(self._charm.meta.containers), + ) + refresh_event = [self._charm.on.update_status] + + else: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + for ev in refresh_event: + self.framework.observe(ev, self.set_scrape_job_spec) + + # Update relation data every reinit. If instead we used event hooks then observing only + # relation-joined would not be sufficient: + # - Would need to observe leader-elected, in case there was no leader during + # relation-joined. + # - If later related to an ingress provider, then would need to register and wait for + # update-status interval to elapse before changes would apply. + # - The ingerss-ready custom event is currently emitted prematurely and cannot be relied + # upon: https://github.com/canonical/traefik-k8s-operator/issues/78 + # NOTE We may still end up waiting for update-status before changes are applied. + self.set_scrape_job_spec() + + def _on_relation_changed(self, event): + """Check for alert rule messages in the relation data before moving on.""" + if self._charm.unit.is_leader(): + ev = json.loads(event.relation.data[event.app].get("event", "{}")) + + if ev: + valid = bool(ev.get("valid", True)) + errors = ev.get("errors", "") + + if valid and not errors: + self.on.alert_rule_status_changed.emit(valid=valid) + else: + self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) + + def update_scrape_job_spec(self, jobs): + """Update scrape job specification.""" + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + self.set_scrape_job_spec() + + def set_scrape_job_spec(self, _=None): + """Ensure scrape target information is made available to prometheus. + + When a metrics provider charm is related to a prometheus charm, the + metrics provider sets specification and metadata related to its own + scrape configuration. This information is set using Juju application + data. 
In addition, each of the consumer units also sets its own + host address in Juju unit relation data. + """ + self._set_unit_ip() + + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(topology=self.topology) + alert_rules.add_path(self._alert_rules_path, recursive=True) + alert_rules_as_dict = alert_rules.as_dict() + + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) + + if alert_rules_as_dict: + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + + def _set_unit_ip(self, _=None): + """Set unit host address. + + Each time a metrics provider charm container is restarted it updates its own + host address in the unit relation data for the prometheus charm. + + The only argument specified is an event, and it ignored. This is for expediency + to be able to use this method as an event handler, although no access to the + event is actually needed. + """ + for relation in self._charm.model.relations[self._relation_name]: + unit_ip = str(self._charm.model.get_binding(relation).network.bind_address) + + # TODO store entire url in relation data, instead of only select url parts. 
+ + if self.external_url: + parsed = urlparse(self.external_url) + unit_address = parsed.hostname + path = parsed.path + elif self._is_valid_unit_address(unit_ip): + unit_address = unit_ip + path = "" + else: + unit_address = socket.getfqdn() + path = "" + + relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address + relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path + relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( + self._charm.model.unit.name + ) + + def _is_valid_unit_address(self, address: str) -> bool: + """Validate a unit address. + + At present only IP address validation is supported, but + this may be extended to DNS addresses also, as needed. + + Args: + address: a string representing a unit address + """ + try: + _ = ipaddress.ip_address(address) + except ValueError: + return False + + return True + + @property + def _scrape_jobs(self) -> list: + """Fetch list of scrape jobs. + + Returns: + A list of dictionaries, where each dictionary specifies a + single scrape job for Prometheus. + """ + jobs = self._jobs if self._jobs else [DEFAULT_JOB] + if callable(self._lookaside_jobs): + return jobs + PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs()) + else: + return jobs + + @property + def _scrape_metadata(self) -> dict: + """Generate scrape metadata. + + Returns: + Scrape configuration metadata for this metrics provider charm. + """ + return self.topology.as_dict() + + +class PrometheusRulesProvider(Object): + """Forward rules to Prometheus. + + This object may be used to forward rules to Prometheus. At present it only supports + forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which + is used for forwarding both scrape targets and associated alert rules. This object + is typically used when there is a desire to forward rules that apply globally (across + all deployed charms and units) rather than to a single charm. 
All rule files are + forwarded using the same 'prometheus_scrape' interface that is also used by + `MetricsEndpointProvider`. + + Args: + charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. + relation_name: Name of the relation in `metadata.yaml` that + has the `prometheus_scrape` interface. + dir_path: Root directory for the collection of rule files. + recursive: Whether to scan for rule files recursively. + """ + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recursive=True, + ): + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._recursive = recursive + + try: + dir_path = _resolve_dir_against_charm_path(charm, dir_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + self.dir_path = dir_path + + events = self._charm.on[self._relation_name] + event_sources = [ + events.relation_joined, + events.relation_changed, + self._charm.on.leader_elected, + self._charm.on.upgrade_charm, + ] + + for event_source in event_sources: + self.framework.observe(event_source, self._update_relation_data) + + def _reinitialize_alert_rules(self): + """Reloads alert rules and updates all relations.""" + self._update_relation_data(None) + + def _update_relation_data(self, _): + """Update application relation data with alert rules for all relations.""" + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules() + alert_rules.add_path(self.dir_path, recursive=self._recursive) + alert_rules_as_dict = alert_rules.as_dict() + + logger.info("Updating relation data with rule files from disk") + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["alert_rules"] = json.dumps( + alert_rules_as_dict, + sort_keys=True, # sort, to 
prevent unnecessary relation_changed events + ) + + +class MetricsEndpointAggregator(Object): + """Aggregate metrics from multiple scrape targets. + + `MetricsEndpointAggregator` collects scrape target information from one + or more related charms and forwards this to a `MetricsEndpointConsumer` + charm, which may be in a different Juju model. However, it is + essential that `MetricsEndpointAggregator` itself resides in the same + model as its scrape targets, as this is currently the only way to + ensure in Juju that the `MetricsEndpointAggregator` will be able to + determine the model name and uuid of the scrape targets. + + `MetricsEndpointAggregator` should be used in place of + `MetricsEndpointProvider` in the following two use cases: + + 1. Integrating one or more scrape targets that do not support the + `prometheus_scrape` interface. + + 2. Integrating one or more scrape targets through cross model + relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s) + may also be used for the purpose of supporting cross model + relations. + + Using `MetricsEndpointAggregator` to build a Prometheus charm client + only requires instantiating it. Instantiating + `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except + that it requires specifying the names of three relations: the + relation with scrape targets, the relation for alert rules, and + that with the Prometheus charms. For example + + ```python + self._aggregator = MetricsEndpointAggregator( + self, + { + "prometheus": "monitoring", + "scrape_target": "prometheus-target", + "alert_rules": "prometheus-rules" + } + ) + ``` + + `MetricsEndpointAggregator` assumes that each unit of a scrape target + sets in its unit-level relation data two entries with keys + "hostname" and "port". 
If it is required to integrate with charms + that do not honor these assumptions, it is always possible to + derive from `MetricsEndpointAggregator` overriding the `_get_targets()` + method, which is responsible for aggregating the unit name, host + address ("hostname") and port of the scrape target. + + `MetricsEndpointAggregator` also assumes that each unit of a + scrape target sets in its unit-level relation data a key named + "groups". The value of this key is expected to be the string + representation of list of Prometheus Alert rules in YAML format. + An example of a single such alert rule is + + ```yaml + - alert: HighRequestLatency + expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 + for: 10m + labels: + severity: page + annotations: + summary: High request latency + ``` + + Once again if it is required to integrate with charms that do not + honour these assumptions about alert rules then an object derived + from `MetricsEndpointAggregator` may be used by overriding the + `_get_alert_rules()` method. + + `MetricsEndpointAggregator` ensures that Prometheus scrape job + specifications and alert rules are annotated with Juju topology + information, just like `MetricsEndpointProvider` and + `MetricsEndpointConsumer` do. + + By default, `MetricsEndpointAggregator` ensures that Prometheus + "instance" labels refer to Juju topology. This ensures that + instance labels are stable over unit recreation. While it is not + advisable to change this option, if required it can be done by + setting the "relabel_instance" keyword argument to `False` when + constructing an aggregator object. + """ + + def __init__(self, charm, relation_names, relabel_instance=True): + """Construct a `MetricsEndpointAggregator`. + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointAggregator` object. Typically, this is + `self` in the instantiating class. + relation_names: a dictionary with three keys. 
The value + of the "scrape_target" and "alert_rules" keys are + the relation names over which scrape job and alert rule + information is gathered by this `MetricsEndpointAggregator`. + And the value of the "prometheus" key is the name of + the relation with a `MetricsEndpointConsumer` such as + the Prometheus charm. + relabel_instance: A boolean flag indicating if Prometheus + scrape job "instance" labels must refer to Juju Topology. + """ + super().__init__(charm, relation_names["prometheus"]) + + self._charm = charm + self._target_relation = relation_names["scrape_target"] + self._prometheus_relation = relation_names["prometheus"] + self._alert_rules_relation = relation_names["alert_rules"] + self._relabel_instance = relabel_instance + + # manage Prometheus charm relation events + prometheus_events = self._charm.on[self._prometheus_relation] + self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data) + + # manage list of Prometheus scrape jobs from related scrape targets + target_events = self._charm.on[self._target_relation] + self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs) + self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs) + + # manage alert rules for Prometheus from related scrape targets + alert_rule_events = self._charm.on[self._alert_rules_relation] + self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules) + self.framework.observe(alert_rule_events.relation_departed, self._remove_alert_rules) + + def _set_prometheus_data(self, event): + """Ensure every new Prometheus instances is updated. + + Any time a new Prometheus unit joins the relation with + `MetricsEndpointAggregator`, that Prometheus unit is provided + with the complete set of existing scrape jobs and alert rules. 
+ """ + jobs = [] # list of scrape jobs, one per relation + for relation in self.model.relations[self._target_relation]: + targets = self._get_targets(relation) + if targets and relation.app: + jobs.append(self._static_scrape_job(targets, relation.app.name)) + + groups = [] # list of alert rule groups, one group per relation + for relation in self.model.relations[self._alert_rules_relation]: + unit_rules = self._get_alert_rules(relation) + if unit_rules and relation.app: + appname = relation.app.name + rules = self._label_alert_rules(unit_rules, appname) + group = {"name": self._group_name(appname), "rules": rules} + groups.append(group) + + event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + + def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: + """Update scrape jobs in response to scrape target changes. + + When there is any change in relation data with any scrape + target, the Prometheus scrape job, for that specific target is + updated. Additionally, if this method is called manually, do the + same. + + Args: + targets: a `dict` containing target information + app_name: a `str` identifying the application + """ + # new scrape job for the relation that has changed + updated_job = self._static_scrape_job(targets, app_name, **kwargs) + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + # list of scrape jobs that have not changed + jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] + jobs.append(updated_job) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + def _update_prometheus_jobs(self, event): + """Update scrape jobs in response to scrape target changes. 
+ + When there is any change in relation data with any scrape + target, the Prometheus scrape job, for that specific target is + updated. + """ + targets = self._get_targets(event.relation) + if not targets: + return + + # new scrape job for the relation that has changed + updated_job = self._static_scrape_job(targets, event.relation.app.name) + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + # list of scrape jobs that have not changed + jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] + jobs.append(updated_job) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + def _remove_prometheus_jobs(self, event): + """Remove scrape jobs when a target departs. + + Any time a scrape target departs, any Prometheus scrape job + associated with that specific scrape target is removed. + """ + job_name = self._job_name(event.relation.app.name) + unit_name = event.unit.name + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + if not jobs: + continue + + changed_job = [j for j in jobs if j.get("job_name") == job_name] + if not changed_job: + continue + changed_job = changed_job[0] + + # list of scrape jobs that have not changed + jobs = [job for job in jobs if job.get("job_name") != job_name] + + # list of scrape jobs for units of the same application that still exist + configs_kept = [ + config + for config in changed_job["static_configs"] # type: ignore + if config.get("labels", {}).get("juju_unit") != unit_name + ] + + if configs_kept: + changed_job["static_configs"] = configs_kept # type: ignore + jobs.append(changed_job) + + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + def _update_alert_rules(self, event): + """Update alert rules in response to scrape target changes. 
+ + When there is any change in alert rule relation data for any + scrape target, the list of alert rules for that specific + target is updated. + """ + unit_rules = self._get_alert_rules(event.relation) + if not unit_rules: + return + + appname = event.relation.app.name + rules = self._label_alert_rules(unit_rules, appname) + # the alert rule group that has changed + updated_group = {"name": self._group_name(appname), "rules": rules} + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + # list of alert rule groups that have not changed + groups = [group for group in groups if updated_group["name"] != group["name"]] + groups.append(updated_group) + relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + + def _remove_alert_rules(self, event): + """Remove alert rules for departed targets. + + Any time a scrape target departs any alert rules associated + with that specific scrape target is removed. 
+ """ + group_name = self._group_name(event.relation.app.name) + unit_name = event.unit.name + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + if not alert_rules: + continue + + groups = alert_rules.get("groups", []) + if not groups: + continue + + changed_group = [group for group in groups if group["name"] == group_name] + if not changed_group: + continue + changed_group = changed_group[0] + + # list of alert rule groups that have not changed + groups = [group for group in groups if group["name"] != group_name] + + # list of alert rules not associated with departing unit + rules_kept = [ + rule + for rule in changed_group.get("rules") # type: ignore + if rule.get("labels").get("juju_unit") != unit_name + ] + + if rules_kept: + changed_group["rules"] = rules_kept # type: ignore + groups.append(changed_group) + + relation.data[self._charm.app]["alert_rules"] = ( + json.dumps({"groups": groups}) if groups else "{}" + ) + + def _get_targets(self, relation) -> dict: + """Fetch scrape targets for a relation. + + Scrape target information is returned for each unit in the + relation. This information contains the unit name, network + hostname (or address) for that unit, and port on which a + metrics endpoint is exposed in that unit. + + Args: + relation: an `ops.model.Relation` object for which scrape + targets are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is itself + a dictionary of the form + ``` + {"hostname": hostname, "port": port} + ``` + """ + targets = {} + for unit in relation.units: + port = relation.data[unit].get("port", 80) + hostname = relation.data[unit].get("hostname") + if hostname: + targets.update({unit.name: {"hostname": hostname, "port": port}}) + + return targets + + def _get_alert_rules(self, relation) -> dict: + """Fetch alert rules for a relation. 
+ + Each unit of the related scrape target may have its own + associated alert rules. Alert rules for all units are returned + indexed by unit name. + + Args: + relation: an `ops.model.Relation` object for which alert + rules are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is a list + of alert rules. Each rule is in dictionary format. The + structure "rule dictionary" corresponds to single + Prometheus alert rule. + """ + rules = {} + for unit in relation.units: + unit_rules = yaml.safe_load(relation.data[unit].get("groups", "")) + if unit_rules: + rules.update({unit.name: unit_rules}) + + return rules + + def _job_name(self, appname) -> str: + """Construct a scrape job name. + + Each relation has its own unique scrape job name. All units in + the relation are scraped as part of the same scrape job. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus scrape job name for the application. + """ + return "juju_{}_{}_{}_prometheus_scrape".format( + self.model.name, self.model.uuid[:7], appname + ) + + def _group_name(self, appname) -> str: + """Construct name for an alert rule group. + + Each unit in a relation may define its own alert rules. All + rules, for all units in a relation are grouped together and + given a single alert rule group name. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus alert rules group name for the application. + """ + return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname) + + def _label_alert_rules(self, unit_rules, appname) -> list: + """Apply juju topology labels to alert rules. + + Args: + unit_rules: a list of alert rules, where each rule is in + dictionary format. + appname: a string name of the application to which the + alert rules belong. + + Returns: + a list of alert rules with Juju topology labels. 
+ """ + labeled_rules = [] + for unit_name, rules in unit_rules.items(): + for rule in rules: + # the new JujuTopology removed this, so build it up by hand + matchers = { + "juju_{}".format(k): v + for k, v in JujuTopology(self.model.name, self.model.uuid, appname, unit_name) + .as_dict(excluded_keys=["charm_name"]) + .items() + } + rule["labels"].update(matchers.items()) + labeled_rules.append(rule) + + return labeled_rules + + def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: + """Construct a static scrape job for an application. + + Args: + targets: a dictionary providing hostname and port for all + scrape target. The keys of this dictionary are unit + names. Values corresponding to these keys are + themselves a dictionary with keys "hostname" and + "port". + application_name: a string name of the application for + which this static scrape job is being constructed. + + Returns: + A dictionary corresponding to a Prometheus static scrape + job configuration for one application. The returned + dictionary may be transformed into YAML and appended to + the list of any existing list of Prometheus static configs. + """ + juju_model = self.model.name + juju_model_uuid = self.model.uuid + job = { + "job_name": self._job_name(application_name), + "static_configs": [ + { + "targets": ["{}:{}".format(target["hostname"], target["port"])], + "labels": { + "juju_model": juju_model, + "juju_model_uuid": juju_model_uuid, + "juju_application": application_name, + "juju_unit": unit_name, + "host": target["hostname"], + }, + } + for unit_name, target in targets.items() + ], + "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), + } + job.update(kwargs.get("updates", {})) + + return job + + @property + def _relabel_configs(self) -> list: + """Create Juju topology relabeling configuration. + + Using Juju topology for instance labels ensures that these + labels are stable across unit recreation. 
+ + Returns: + a list of Prometheus relabeling configurations. Each item in + this list is one relabel configuration. + """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. 
Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + rule_path.write_text(yaml.dump(rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join( + [ + line + for line in e.output.decode("utf8").splitlines() + if "error validating" in line + ] + ) + + def inject_label_matchers(self, expression, topology) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "transform"] + args.extend( + ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] + ) + + args.extend(["{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return result.stdout.decode("utf-8").strip() diff --git a/metadata.yaml b/metadata.yaml index 0c051fb..a2df776 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -36,5 +36,7 @@ resources: upstream-source: 
docker.io/grafana/mimir:2.4.0 provides: + metrics-endpoint: + interface: prometheus_scrape receive-remote-write: interface: prometheus_remote_write diff --git a/requirements.txt b/requirements.txt index d5cce20..14d0f14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -ops >= 1.5.0 -deepdiff -lightkube -lightkube-models +ops +lightkube >= 0.11 +lightkube-models >= 1.22.0.4 parse diff --git a/src/charm.py b/src/charm.py index 69658d6..2d81974 100755 --- a/src/charm.py +++ b/src/charm.py @@ -13,18 +13,22 @@ from typing import Optional import yaml -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.observability_libs.v0.juju_topology import JujuTopology +from charms.observability_libs.v1.kubernetes_service_patch import ( + KubernetesServicePatch, + ServicePort, +) from charms.prometheus_k8s.v0.prometheus_remote_write import ( DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME, ) from charms.prometheus_k8s.v0.prometheus_remote_write import PrometheusRemoteWriteProvider -from deepdiff import DeepDiff # type: ignore +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from ops.charm import CharmBase from ops.framework import StoredState from ops.main import main from ops.model import ActiveStatus, BlockedStatus, WaitingStatus from ops.pebble import Error as PebbleError -from ops.pebble import PathError, ProtocolError +from ops.pebble import Layer, PathError, ProtocolError from parse import search # type: ignore MIMIR_CONFIG = "/etc/mimir/mimir-config.yaml" @@ -54,8 +58,30 @@ def __init__(self, *args): self._stored.set_default(alerts_hash=None) self._container = self.unit.get_container(self._name) + self.topology = JujuTopology.from_charm(self) + self.service_patch = KubernetesServicePatch( - self, [(self.app.name, self._http_listen_port)] + self, [ServicePort(self._http_listen_port, name=self.app.name)] + ) + + self.metrics_provider = MetricsEndpointProvider( + 
self, + jobs=[ + { + "static_configs": [ + { + "targets": [f"*:{self._http_listen_port}"], + "labels": { + "cluster": self.topology.model_uuid, + "namespace": self.topology.model, + "job": f"{self.topology.model}/mimir", + "pod": self.topology.unit, + }, + } + ], + "scrape_interval": "15s", + } + ], ) self.remote_write_provider = PrometheusRemoteWriteProvider( @@ -116,11 +142,12 @@ def _set_pebble_layer(self) -> bool: Returns: True if Pebble layer was added, otherwise False. """ - current_layer = self._container.get_plan().to_dict() + current_layer = self._container.get_plan() new_layer = self._pebble_layer - if "services" not in current_layer or DeepDiff( - current_layer["services"], new_layer["services"], ignore_order=True + if ( + "services" not in current_layer.to_dict() + or current_layer.services != new_layer.services ): self._container.add_layer(self._name, new_layer, combine=True) return True @@ -193,18 +220,20 @@ def _push_alert_rules(self, alerts): @property def _pebble_layer(self): - return { - "summary": "mimir layer", - "description": "pebble config layer for mimir", - "services": { - "mimir": { - "override": "replace", - "summary": "mimir daemon", - "command": f"/bin/mimir --config.file={MIMIR_CONFIG}", - "startup": "enabled", - } - }, - } + return Layer( + { + "summary": "mimir layer", + "description": "pebble config layer for mimir", + "services": { + "mimir": { + "override": "replace", + "summary": "mimir daemon", + "command": f"/bin/mimir --config.file={MIMIR_CONFIG}", + "startup": "enabled", + } + }, + } + ) @property def _mimir_config(self) -> dict: diff --git a/src/prometheus_alert_rules/alert-rules.yaml b/src/prometheus_alert_rules/alert-rules.yaml new file mode 100644 index 0000000..bc5701d --- /dev/null +++ b/src/prometheus_alert_rules/alert-rules.yaml @@ -0,0 +1,943 @@ +groups: +- name: mimir_alerts + rules: + - alert: MimirIngesterUnhealthy + annotations: + message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ + 
printf "%f" $value }} unhealthy ingester(s). + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + expr: | + min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: 15m + labels: + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m])) + / + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m])) + > 1 + for: 15m + labels: + severity: critical + - alert: MimirRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + expr: | + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + > + 2.5 + for: 15m + labels: + severity: warning + - alert: MimirQueriesIncorrect + annotations: + message: | + The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect + expr: | + 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + / + sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 + for: 15m + labels: + severity: warning + - alert: MimirInconsistentRuntimeConfig + annotations: + message: | + An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + expr: | + count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 + for: 1h + labels: + severity: critical + - alert: MimirBadRuntimeConfig + annotations: + message: | + {{ $labels.job }} failed to reload runtime config. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + expr: | + # The metric value is reset to 0 on error while reloading the config at runtime. + cortex_runtime_config_last_reload_successful == 0 + for: 5m + labels: + severity: critical + - alert: MimirFrontendQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirSchedulerQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 + for: 7m + labels: + severity: critical + - alert: MimirMemcachedRequestErrors + annotations: + message: | + Memcached {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemcachedrequesterrors + expr: | + ( + sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) / + sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operations_total[1m])) + ) * 100 > 5 + for: 5m + labels: + severity: warning + - alert: MimirIngesterRestarts + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has restarted {{ printf "%.2f" + $value }} times in the last 30 mins.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + expr: | + changes(process_start_time_seconds{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}[30m]) >= 2 + labels: + severity: warning + - alert: MimirKVStoreFailure + annotations: + message: | + Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + expr: | + ( + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. 
+ == 1 + for: 5m + labels: + severity: critical + - alert: MimirMemoryMapAreasTooHigh + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas + close to the limit.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + expr: | + process_memory_map_areas{job=~".*/((ingester.*|cortex|mimir|mimir-write.*)|(store-gateway.*|cortex|mimir|mimir-backend.*))"} / process_memory_map_areas_limit{job=~".*/((ingester.*|cortex|mimir|mimir-write.*)|(store-gateway.*|cortex|mimir|mimir-backend.*))"} > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirDistributorForwardingErrorRate + annotations: + message: | + Mimir in {{ $labels.cluster }}/{{ $labels.namespace }} has a high failure rate when forwarding samples. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorforwardingerrorrate + expr: | + sum by (cluster, namespace) (rate(cortex_distributor_forward_errors_total{}[1m])) + / + sum by (cluster, namespace) (rate(cortex_distributor_forward_requests_total{}[1m])) + > 0.01 + for: 5m + labels: + severity: critical + - alert: MimirIngesterInstanceHasNoTenants + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has no tenants assigned. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + expr: | + (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) + and on (cluster, namespace) + # Only if there are more time-series than would be expected due to continuous testing load + ( + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) > 100000 + for: 1h + labels: + severity: warning + - alert: MimirRulerInstanceHasNoRuleGroups + annotations: + message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has no rule groups assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + expr: | + # Alert on ruler instances in microservices mode that have no rule groups assigned, + min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*-mimir-)?ruler.*"}) == 0 + # but only if other ruler instances of the same cell do have rule groups assigned + and on (cluster, namespace) + (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) + # and there are more than two instances overall + and on (cluster, namespace) + (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) + for: 1h + labels: + severity: warning + - alert: MimirRingMembersMismatch + annotations: + message: | + Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + expr: | + ( + avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~"(.*/)?(ingester.*|cortex|mimir|mimir-write.*)"})) + != sum by(cluster, namespace) (up{job=~"(.*/)?(ingester.*|cortex|mimir|mimir-write.*)"}) + ) + and + ( + count by(cluster, namespace) (cortex_build_info) > 0 + ) + for: 15m + labels: + component: ingester + severity: warning +- name: mimir_instance_limits_alerts + rules: + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + for: 3h + labels: + severity: warning + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.9 + for: 5m + labels: + severity: critical + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirReachingTCPConnectionsLimit + annotations: + message: | + Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + expr: | + cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and + cortex_tcp_connections_limit > 0 + for: 5m + labels: + severity: critical + - alert: MimirDistributorReachingInflightPushRequestLimit + annotations: + message: | + Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + expr: | + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical +- name: mimir-rollout-alerts + rules: + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + - alert: RolloutOperatorNotReconciling + annotations: + message: | + Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + expr: | + max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 + for: 5m + labels: + severity: critical +- name: mimir-provisioning + rules: + - alert: MimirProvisioningTooManyActiveSeries + annotations: + message: | + The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries + expr: | + avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6 + for: 2h + labels: + severity: warning + - alert: MimirProvisioningTooManyWrites + annotations: + message: | + Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanywrites + expr: | + avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical +- name: ruler_alerts + rules: + - alert: MimirRulerTooManyFailedPushes + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerTooManyFailedQueries + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerMissedEvaluations + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + expr: | + 100 * ( + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + / + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + ) > 1 + for: 5m + labels: + severity: warning + - alert: MimirRulerFailedRingCheck + annotations: + message: | + Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + expr: | + sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + 100 * ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + / + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) > 1 + for: 5m + labels: + severity: warning +- name: gossip_alerts + rules: + - alert: MimirGossipMembersMismatch + annotations: + message: Mimir instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} sees incorrect number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch + expr: | + avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(alertmanager|compactor|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + for: 15m + labels: + severity: warning +- name: etcd_alerts + rules: + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical +- name: alertmanager_alerts + rules: + - alert: MimirAlertmanagerSyncConfigsFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing + expr: | + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + for: 30m + labels: + severity: critical + - alert: MimirAlertmanagerRingCheckFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing + expr: | + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPartialStateMergeFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing + expr: | + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerReplicationFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicate partial state to its replicas. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing + expr: | + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPersistStateFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing + expr: | + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: MimirAlertmanagerInitialSyncFailed + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed + expr: | + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + labels: + severity: critical + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: warning + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: critical + - alert: MimirAlertmanagerInstanceHasNoTenants + annotations: + message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} owns no tenants. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants + expr: | + # Alert on alertmanager instances in microservices mode that own no tenants, + min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*-mimir-)?alertmanager.*"}) == 0 + # but only if other instances of the same cell do have tenants assigned. + and on (cluster, namespace) + max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 + for: 1h + labels: + severity: warning +- name: mimir_blocks_alerts + rules: + - alert: MimirIngesterHasNotShippedBlocks + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not shipped any block in the last 4 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks + expr: | + (min by(cluster, namespace, pod) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4) + and + (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) > 0) + and + # Only if the ingester has ingested samples over the last 4h. + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + and + # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica + # had ingested samples in the past, then no traffic was received for a long period and then it starts + # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving + # samples, while the a block shipping is expected within the next 4h. + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterHasNotShippedBlocksSinceStart + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not shipped any block in the last 4 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart + expr: | + (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) == 0) + and + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + for: 4h + labels: + severity: critical + - alert: MimirIngesterHasUnshippedBlocks + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't + been successfully uploaded to the storage yet. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks + expr: | + (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) + and + (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadCompactionFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to compact TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed + expr: | + rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to truncate TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed + expr: | + rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointCreationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to create TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointDeletionFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to delete TSDB checkpoint. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBWALTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to truncate TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed + expr: | + rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + labels: + severity: warning + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBWALWritesFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to write to TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed + expr: | + rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + for: 3m + labels: + severity: critical + - alert: MimirQuerierHasNotScanTheBucket + annotations: + message: Mimir Querier {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully scanned the bucket since {{ $value | humanizeDuration + }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirquerierhasnotscanthebucket + expr: | + (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30) + and + cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0 + for: 5m + labels: + severity: critical + - alert: MimirQuerierHighRefetchRate + annotations: + message: Mimir Queries in {{ $labels.cluster }}/{{ $labels.namespace }} are + refetching series from different store-gateways (because of missing blocks) + for the {{ printf "%.0f" $value }}% of queries. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirquerierhighrefetchrate + expr: | + 100 * ( + ( + sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + - + sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) + ) + / + sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + ) + > 1 + for: 10m + labels: + severity: warning + - alert: MimirStoreGatewayHasNotSyncTheBucket + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully synched the bucket since {{ $value | humanizeDuration + }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + expr: | + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) + and + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + for: 5m + labels: + severity: critical + - alert: MimirStoreGatewayNoSyncedTenants + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is not syncing any blocks for any tenant. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + expr: | + min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 + for: 1h + labels: + severity: warning + - alert: MimirBucketIndexNotUpdated + annotations: + message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster + }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration + }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + expr: | + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + labels: + severity: critical + - alert: MimirTenantHasPartialBlocks + annotations: + message: Mimir tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has {{ $value }} partial blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirtenanthaspartialblocks + expr: | + max by(cluster, namespace, user) (cortex_bucket_blocks_partials_count) > 0 + for: 6h + labels: + severity: warning +- name: mimir_compactor_alerts + rules: + - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully cleaned up blocks in the last 6 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. 
+ (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) + for: 1h + labels: + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + for: 1h + labels: + reason: in-last-24h + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + cortex_compactor_last_successful_run_timestamp_seconds == 0 + for: 24h + labels: + reason: since-startup + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} failed to run 2 consecutive compactions. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + labels: + reason: consecutive-failures + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) + and + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 15m + labels: + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). 
+ (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 24h + labels: + severity: critical + - alert: MimirCompactorSkippedBlocksWithOutOfOrderChunks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has found and ignored blocks with out of order chunks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedblockswithoutoforderchunks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total{reason="block-index-out-of-order-chunk"}[5m]) > 0 + for: 1m + labels: + severity: warning +- name: mimir_autoscaling + rules: + - alert: MimirAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler + }} in {{ $labels.namespace }} is not active. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + expr: | + ( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. + + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0 + ) + # Do not alert if metric is 0, because in that case we expect the HPA to be inactive. + unless on (cluster, namespace, metric) + (label_replace(keda_metrics_adapter_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") == 0) + for: 1h + labels: + severity: critical +- name: mimir_continuous_test + rules: + - alert: MimirContinuousTestNotRunningOnWrites + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} is not effectively running because writes are failing. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestNotRunningOnReads + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} is not effectively running because queries are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestFailed + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} failed when asserting query results. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + labels: + severity: warning diff --git a/src/prometheus_alert_rules/recording-rules.yaml b/src/prometheus_alert_rules/recording-rules.yaml new file mode 100644 index 0000000..05e6a48 --- /dev/null +++ b/src/prometheus_alert_rules/recording-rules.yaml @@ -0,0 +1,563 @@ +groups: +- name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + by (cluster, job) + 
record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, 
namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: 
sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by + (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) / 
sum(rate(cortex_querier_request_duration_seconds_count[1m])) + by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate +- name: mimir_cache + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) + by (cluster, job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds:avg + - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: 
cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds:avg + - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_cache_request_duration_seconds:avg + - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method) + record: 
cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + method) + record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate +- name: mimir_storage + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:50quantile + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds:avg + - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate +- name: mimir_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:50quantile + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / 
sum(rate(cortex_query_frontend_retries_count[1m])) + by (cluster, job) + record: cluster_job:cortex_query_frontend_retries:avg + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by + (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, + cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate +- name: mimir_ingester_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:99quantile + - expr: 
histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:50quantile + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) + by (cluster, job) + record: cluster_job:cortex_ingester_queried_series:avg + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:50quantile + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) + by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples:avg + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job)) + record: 
cluster_job:cortex_ingester_queried_exemplars:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:50quantile + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / + sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars:avg + - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate +- name: mimir_received_samples + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_samples:rate5m +- name: mimir_exemplars_in + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m +- name: mimir_received_exemplars + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m +- name: mimir_exemplars_ingested + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m +- name: mimir_exemplars_appended + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + record: 
cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m +- name: mimir_scaling_rules + rules: + - expr: | + # Convenience rule to get the number of replicas for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") + ) + record: cluster_namespace_deployment:actual_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / 240000 + ) + labels: + deployment: distributor + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 240000 + ) + labels: + deployment: distributor + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / 1500000 + ) + labels: + deployment: ingester + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum 
by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + * 3 * 0.59999999999999998 / 1500000 + ) + labels: + deployment: ingester + reason: active_series_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + labels: + deployment: memcached + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate + - expr: | + # Convenience rule to get the CPU request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_cpu_cores was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + # Jobs should be sized to their CPU usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + ) + labels: + reason: cpu_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + # Convenience rule to get the Memory utilization for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + container_memory_usage_bytes{image!=""}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_memory_usage_bytes:sum + - expr: | + # Convenience rule to get the Memory request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_memory_bytes was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + # Jobs should be sized to their Memory usage. 
+ # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + ) + labels: + reason: memory_usage + record: cluster_namespace_deployment_reason:required_replicas:count +- name: mimir_alertmanager_rules + rules: + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_alerts) + record: cluster_job_pod:cortex_alertmanager_alerts:sum + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_silences) + record: cluster_job_pod:cortex_alertmanager_silences:sum + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + record: 
class Store(defaultdict):
    """Auto-vivifying nested mapping whose keys are also reachable as attributes.

    The ``defaultdict`` factory is ``Store`` itself, so looking up a missing
    key creates (and stores) an empty nested ``Store``. That is what makes
    chained dot syntax such as ``store.a.b = 1`` work without any set-up.
    """

    def __init__(self):
        super().__init__(Store)

    def __getattr__(self, key):
        """Return ``self[key]`` so dot syntax works on keys.

        Python only calls ``__getattr__`` after normal attribute lookup has
        failed. Item lookup can never raise ``KeyError`` here (the default
        factory always supplies a value), so without a guard every protocol
        probe such as ``hasattr(store, "__deepcopy__")`` would succeed and
        silently insert a junk key. Dunder names are therefore rejected.

        Raises:
            AttributeError: if ``key`` is a dunder name (``__x__``).
        """
        if key.startswith("__") and key.endswith("__"):
            raise AttributeError(key)
        return self[key]

    def __setattr__(self, key, value):
        """Store ``value`` under ``key`` so dot syntax works on keys."""
        self[key] = value


store = Store()


def timed_memoizer(func):
    """Decorate an async callable so its result is computed once and timed.

    The result is cached in the module-level ``store`` under the function's
    ``__qualname__`` only: call arguments are NOT part of the cache key, so
    every later call returns the first result regardless of its arguments.

    Args:
        func: the coroutine function to wrap.

    Returns:
        An async wrapper that logs start/finish and returns the cached result.
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        fname = func.__qualname__
        logger.info("Started: %s", fname)
        start_time = datetime.now()
        if fname in store:
            ret = store[fname]
        else:
            logger.info("Return for %s not cached", fname)
            ret = await func(*args, **kwargs)
            store[fname] = ret
        # Log a real number of seconds; formatting the raw timedelta would
        # print "H:MM:SS.ffffff" next to the word "seconds".
        elapsed = (datetime.now() - start_time).total_seconds()
        logger.info("Finished: %s in: %.3f seconds", fname, elapsed)
        return ret

    return wrapper
async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str:
    """Return the address reported in the model status for one unit.

    Args:
        ops_test: object exposing ``model.get_status()`` as an awaitable.
        app_name: name of the deployed application.
        unit_num: number of the unit, i.e. the ``N`` in ``<app_name>/N``.

    Returns:
        The ``address`` field of the unit's status entry.

    Raises:
        KeyError: if the application or unit is absent from the status.
    """
    status = await ops_test.model.get_status()
    unit_name = f"{app_name}/{unit_num}"
    return status["applications"][app_name]["units"][unit_name]["address"]


def oci_image(metadata_file: str, image_name: str) -> str:
    """Find upstream source for a container image.

    Args:
        metadata_file: string path of metadata YAML file relative
            to top level charm directory
        image_name: OCI container image string name as defined in
            metadata.yaml file
    Returns:
        upstream image source
    Raises:
        FileNotFoundError: if metadata_file path is invalid
        ValueError: if upstream source for image name can not be found
    """
    # Read first, parse second, so a bad path surfaces as FileNotFoundError.
    text = Path(metadata_file).read_text(encoding="utf-8")

    # safe_load returns None for an empty document, and YAML keys without a
    # value load as None too; normalise both so the documented ValueError is
    # raised instead of an AttributeError on None.
    metadata = yaml.safe_load(text) or {}

    resources = metadata.get("resources") or {}
    if not resources:
        raise ValueError("No resources found")

    image = resources.get(image_name) or {}
    if not image:
        raise ValueError(f"{image_name} image not found")

    upstream_source = image.get("upstream-source") or ""
    if not upstream_source:
        raise ValueError("Upstream source not found")

    return upstream_source
MIMIR = "mimir"
PROMETHEUS = "prometheus"


@pytest.mark.abort_on_fail
async def test_deploy_and_relate_charms(ops_test: OpsTest, mimir_charm):
    """Test that Mimir can be related with Prometheus over prometheus_scrape."""
    # The fixture value is awaited here to obtain the charm to deploy.
    charm = await mimir_charm

    await asyncio.gather(
        ops_test.model.deploy(
            charm,
            resources={"mimir-image": oci_image("./metadata.yaml", "mimir-image")},
            application_name=MIMIR,
            trust=True,
        ),
        ops_test.model.deploy(
            "prometheus-k8s",
            application_name=PROMETHEUS,
            channel="edge",
            trust=True,
        ),
    )

    await ops_test.model.add_relation(MIMIR, f"{PROMETHEUS}:metrics-endpoint")
    await ops_test.model.wait_for_idle(apps=[MIMIR, PROMETHEUS], status="active")


async def test_metrics_are_available(ops_test):
    """Test that the Mimir unit serves a non-empty /metrics page."""
    address = await get_unit_address(ops_test, MIMIR, 0)
    mimir = Mimir(host=address)
    metrics = await mimir.api_request("/metrics")
    assert len(metrics) > 0


async def test_query_metrics_from_prometheus(ops_test):
    """Test that Prometheus reports every Mimir `up` series as 1."""
    address = await get_unit_address(ops_test, PROMETHEUS, 0)
    url = f"http://{address}:9090/api/v1/query"
    params = {"query": f"up{{juju_application='{MIMIR}'}}"}

    # Bound the call so an unreachable unit fails the test instead of hanging
    # it, and let request errors propagate: the original traceback says far
    # more than a bare `assert False`.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    payload = response.json()
    assert payload["status"] == "success"
    # NOTE(review): an empty result list passes this loop vacuously; consider
    # asserting it is non-empty (with a retry) once scrape timing is known.
    for result in payload["data"]["result"]:
        assert "1" in result["value"]
class Mimir:
    """A class that represents a running instance of Mimir."""

    def __init__(self, host="localhost", port=9009):
        """Utility to manage a Mimir application.

        Args:
            host: Optional; host address of Mimir application.
            port: Optional; port on which Mimir service is exposed.
        """
        self.base_url = f"http://{host}:{port}"

        # Set a timeout of 5 seconds - should be sufficient for all the checks here.
        # The default (5 min) prolongs itests unnecessarily.
        self.timeout = aiohttp.ClientTimeout(total=5)

    async def is_ready(self) -> bool:
        """Send a GET request to check readiness.

        Returns:
            True if Mimir is ready (returned 200 OK); False otherwise.
        """
        url = f"{self.base_url}/ready"

        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            async with session.get(url) as response:
                return response.status == 200

    async def config(self) -> str:
        """Send a GET request to get Mimir configuration.

        Returns:
            YAML config in string format or empty string
        """
        return await self.api_request("/config")

    async def api_request(self, endpoint: str) -> str:
        """Send a GET request to an arbitrary endpoint of this Mimir instance.

        Args:
            endpoint: path joined onto the base URL, e.g. "/metrics".

        Returns:
            The response body as text if the status is 200 OK; an empty
            string for any other status.
        """
        url = urljoin(self.base_url, endpoint)
        # Use the same short timeout as the readiness check so that an
        # unresponsive unit cannot stall the test run on aiohttp's default.
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            async with session.get(url) as response:
                result = await response.text()
                return result if response.status == 200 else ""
--skip {toxinidir}/.git --skip {toxinidir}/.tox \ - --skip {toxinidir}/build --skip {toxinidir}/lib --skip {toxinidir}/venv \ - --skip {toxinidir}/.mypy_cache --skip {toxinidir}/icon.svg + codespell . --skip .git --skip .tox --skip build --skip lib --skip venv --skip .mypy_cache \ + --skip icon.svg --skip prometheus_alert_rules # pflake8 wrapper supports config from pyproject.toml pflake8 {[vars]all_path} isort --check-only --diff {[vars]all_path} @@ -100,9 +101,10 @@ commands = [testenv:integration] description = Run integration tests deps = + aiohttp pytest juju pytest-operator -r{toxinidir}/requirements.txt commands = - pytest -vv --tb native --log-cli-level=INFO --color=yes -s {posargs} {toxinidir}/tests/integration \ No newline at end of file + pytest -vv --tb native --log-cli-level=INFO --color=yes -s {posargs} {toxinidir}/tests/integration