From e93c81c4250d94f3e6363cb0b3499b47f7f9f316 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 10:22:36 +0000 Subject: [PATCH 01/13] update kubernetes_service_patch to v1 --- .../{v0 => v1}/kubernetes_service_patch.py | 161 ++++++++++++------ src/charm.py | 7 +- 2 files changed, 116 insertions(+), 52 deletions(-) rename lib/charms/observability_libs/{v0 => v1}/kubernetes_service_patch.py (65%) diff --git a/lib/charms/observability_libs/v0/kubernetes_service_patch.py b/lib/charms/observability_libs/v1/kubernetes_service_patch.py similarity index 65% rename from lib/charms/observability_libs/v0/kubernetes_service_patch.py rename to lib/charms/observability_libs/v1/kubernetes_service_patch.py index a3fb910..b458795 100644 --- a/lib/charms/observability_libs/v0/kubernetes_service_patch.py +++ b/lib/charms/observability_libs/v1/kubernetes_service_patch.py @@ -9,21 +9,20 @@ default contains a "placeholder" port, which is 65536/TCP. When modifying the default set of resources managed by Juju, one must consider the lifecycle of the -charm. In this case, any modifications to the default service (created during deployment), will -be overwritten during a charm upgrade. +charm. In this case, any modifications to the default service (created during deployment), will be +overwritten during a charm upgrade. When initialised, this library binds a handler to the parent charm's `install` and `upgrade_charm` events which applies the patch to the cluster. This should ensure that the service ports are correct throughout the charm's life. -The constructor simply takes a reference to the parent charm, and a list of tuples that each define -a port for the service, where each tuple contains: +The constructor simply takes a reference to the parent charm, and a list of +[`lightkube`](https://github.com/gtsystem/lightkube) ServicePorts that each define a port for the +service. For information regarding the `lightkube` `ServicePort` model, please visit the +`lightkube` [docs](https://gtsystem.github.io/lightkube-models/1.23/models/core_v1/#serviceport). -- a name for the port -- port for the service to listen on -- optionally: a targetPort for the service (the port in the container!) -- optionally: a nodePort for the service (for NodePort or LoadBalancer services only!) -- optionally: a name of the service (in case service name needs to be patched as well) +Optionally, a name of the service (in case service name needs to be patched as well), labels, +selectors, and annotations can be provided as keyword arguments. ## Getting Started @@ -32,8 +31,8 @@ ```shell cd some-charm -charmcraft fetch-lib charms.observability_libs.v0.kubernetes_service_patch -echo <<-EOF >> requirements.txt +charmcraft fetch-lib charms.observability_libs.v1.kubernetes_service_patch +cat << EOF >> requirements.txt lightkube lightkube-models EOF @@ -41,28 +40,71 @@ Then, to initialise the library: -For ClusterIP services: +For `ClusterIP` services: + ```python # ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch +from lightkube.models.core_v1 import ServicePort class SomeCharm(CharmBase): def __init__(self, *args): # ... - self.service_patcher = KubernetesServicePatch(self, [(f"{self.app.name}", 8080)]) + port = ServicePort(443, name=f"{self.app.name}") + self.service_patcher = KubernetesServicePatch(self, [port]) # ... 
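+    # Note (illustrative): with targetPort omitted, as here, Kubernetes
+    # defaults it to the value of `port` (443), per the ServicePort spec.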
```

-For LoadBalancer/NodePort services:
+For `LoadBalancer`/`NodePort` services:
+
```python
# ...
-from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch
+from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
+from lightkube.models.core_v1 import ServicePort

class SomeCharm(CharmBase):
  def __init__(self, *args):
    # ...
+    port = ServicePort(443, name=f"{self.app.name}", targetPort=443, nodePort=30666)
    self.service_patcher = KubernetesServicePatch(
-        self, [(f"{self.app.name}", 443, 443, 30666)], "LoadBalancer"
+        self, [port], "LoadBalancer"
    )
    # ...
+```
+
+Port protocols can also be specified. Valid protocols are `"TCP"`, `"UDP"`, and `"SCTP"`:
+
+```python
+# ...
+from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
+from lightkube.models.core_v1 import ServicePort
+
+class SomeCharm(CharmBase):
+  def __init__(self, *args):
+    # ...
+    tcp = ServicePort(443, name=f"{self.app.name}-tcp", protocol="TCP")
+    udp = ServicePort(443, name=f"{self.app.name}-udp", protocol="UDP")
+    sctp = ServicePort(443, name=f"{self.app.name}-sctp", protocol="SCTP")
+    self.service_patcher = KubernetesServicePatch(self, [tcp, udp, sctp])
+    # ...
+```
+
+The patch can also be bound to custom events by providing the `refresh_event` argument.
+For example, if your charm exposes a configurable port, you may want to re-apply the
+service patch every time the charm configuration changes:
+
+```python
+from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
+from lightkube.models.core_v1 import ServicePort
+
+class SomeCharm(CharmBase):
+  def __init__(self, *args):
+    # ...
+    port = ServicePort(int(self.config["charm-config-port"]), name=f"{self.app.name}")
+    self.service_patcher = KubernetesServicePatch(
+        self,
+        [port],
+        refresh_event=self.on.config_changed
+    )
+    # ...
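+    # refresh_event also accepts a list of events; an illustrative sketch:
+    #   refresh_event=[self.on.config_changed, self.on.update_status]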
``` @@ -83,15 +125,16 @@ def setUp(self, *unused): import logging from types import MethodType -from typing import Literal, Sequence, Tuple, Union +from typing import List, Literal, Optional, Union from lightkube import ApiError, Client +from lightkube.core import exceptions from lightkube.models.core_v1 import ServicePort, ServiceSpec from lightkube.models.meta_v1 import ObjectMeta from lightkube.resources.core_v1 import Service from lightkube.types import PatchType from ops.charm import CharmBase -from ops.framework import Object +from ops.framework import BoundEvent, Object logger = logging.getLogger(__name__) @@ -99,13 +142,12 @@ def setUp(self, *unused): LIBID = "0042f86d0a874435adef581806cddbbb" # Increment this major API version when introducing breaking changes -LIBAPI = 0 +LIBAPI = 1 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 5 -PortDefinition = Union[Tuple[str, int], Tuple[str, int, int], Tuple[str, int, int, int]] ServiceType = Literal["ClusterIP", "LoadBalancer"] @@ -115,18 +157,20 @@ class KubernetesServicePatch(Object): def __init__( self, charm: CharmBase, - ports: Sequence[PortDefinition], - service_name: str = None, + ports: List[ServicePort], + service_name: Optional[str] = None, service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, + additional_labels: Optional[dict] = None, + additional_selectors: Optional[dict] = None, + additional_annotations: Optional[dict] = None, + *, + refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, ): """Constructor for KubernetesServicePatch. Args: charm: the charm that is instantiating the library. - ports: a list of tuples (name, port, targetPort, nodePort) for every service port. + ports: a list of ServicePorts service_name: allows setting custom name to the patched service. If none given, application name will be used. service_type: desired type of K8s service. Default value is in line with ServiceSpec's @@ -136,6 +180,9 @@ def __init__( additional_selectors: Selectors to be added to the kubernetes service (by default only "app.kubernetes.io/name" is set to the service name) additional_annotations: Annotations to be added to the kubernetes service. + refresh_event: an optional bound event or list of bound events which + will be observed to re-apply the patch (e.g. on port change). + The `install` and `upgrade-charm` events would be observed regardless. """ super().__init__(charm, "kubernetes-service-patch") self.charm = charm @@ -155,22 +202,27 @@ def __init__( self.framework.observe(charm.on.install, self._patch) self.framework.observe(charm.on.upgrade_charm, self._patch) + # apply user defined events + if refresh_event: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + for evt in refresh_event: + self.framework.observe(evt, self._patch) + def _service_object( self, - ports: Sequence[PortDefinition], - service_name: str = None, + ports: List[ServicePort], + service_name: Optional[str] = None, service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, + additional_labels: Optional[dict] = None, + additional_selectors: Optional[dict] = None, + additional_annotations: Optional[dict] = None, ) -> Service: """Creates a valid Service representation. 
Args: - ports: a list of tuples of the form (name, port) or (name, port, targetPort) - or (name, port, targetPort, nodePort) for every service port. If the 'targetPort' - is omitted, it is assumed to be equal to 'port', with the exception of NodePort - and LoadBalancer services, where all port numbers have to be specified. + ports: a list of ServicePorts service_name: allows setting custom name to the patched service. If none given, application name will be used. service_type: desired type of K8s service. Default value is in line with ServiceSpec's @@ -203,15 +255,7 @@ def _service_object( ), spec=ServiceSpec( selector=selector, - ports=[ - ServicePort( - name=p[0], - port=p[1], - targetPort=p[2] if len(p) > 2 else p[1], # type: ignore[misc] - nodePort=p[3] if len(p) > 3 else None, # type: ignore[arg-type, misc] - ) - for p in ports - ], + ports=ports, type=service_type, ), ) @@ -222,11 +266,15 @@ def _patch(self, _) -> None: Raises: PatchFailed: if patching fails due to lack of permissions, or otherwise. """ - if not self.charm.unit.is_leader(): + try: + client = Client() + except exceptions.ConfigError as e: + logger.warning("Error creating k8s client: %s", e) return - client = Client() try: + if self._is_patched(client): + return if self.service_name != self._app: self._delete_and_create_service(client) client.patch(Service, self.service_name, self.service, patch_type=PatchType.MERGE) @@ -252,12 +300,25 @@ def is_patched(self) -> bool: bool: A boolean indicating if the service patch has been applied. """ client = Client() + return self._is_patched(client) + + def _is_patched(self, client: Client) -> bool: # Get the relevant service from the cluster - service = client.get(Service, name=self.service_name, namespace=self._namespace) + try: + service = client.get(Service, name=self.service_name, namespace=self._namespace) + except ApiError as e: + if e.status.code == 404 and self.service_name != self._app: + return False + else: + logger.error("Kubernetes service get failed: %s", str(e)) + raise + # Construct a list of expected ports, should the patch be applied expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] # Construct a list in the same manner, using the fetched service - fetched_ports = [(p.port, p.targetPort) for p in service.spec.ports] # type: ignore[attr-defined] # noqa: E501 + fetched_ports = [ + (p.port, p.targetPort) for p in service.spec.ports # type: ignore[attr-defined] + ] # noqa: E501 return expected_ports == fetched_ports @property diff --git a/src/charm.py b/src/charm.py index 69658d6..4f99695 100755 --- a/src/charm.py +++ b/src/charm.py @@ -13,7 +13,10 @@ from typing import Optional import yaml -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.observability_libs.v1.kubernetes_service_patch import ( + KubernetesServicePatch, + ServicePort, +) from charms.prometheus_k8s.v0.prometheus_remote_write import ( DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME, ) @@ -55,7 +58,7 @@ def __init__(self, *args): self._container = self.unit.get_container(self._name) self.service_patch = KubernetesServicePatch( - self, [(self.app.name, self._http_listen_port)] + self, [ServicePort(self._http_listen_port, name=self.app.name)] ) self.remote_write_provider = PrometheusRemoteWriteProvider( From b894ba708be42e52734c663faff8156bebeaac9b Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 10:23:08 +0000 Subject: [PATCH 02/13] update libraries --- .../observability_libs/v0/juju_topology.py | 
11 +++++--- .../v0/prometheus_remote_write.py | 25 +++++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py index ef4ec58..e68e93f 100644 --- a/lib/charms/observability_libs/v0/juju_topology.py +++ b/lib/charms/observability_libs/v0/juju_topology.py @@ -75,7 +75,7 @@ LIBID = "bced1658f20f49d28b88f61f83c2d232" LIBAPI = 0 -LIBPATCH = 3 +LIBPATCH = 4 class InvalidUUIDError(Exception): @@ -94,8 +94,8 @@ def __init__( model: str, model_uuid: str, application: str, - unit: str = None, - charm_name: str = None, + unit: Optional[str] = None, + charm_name: Optional[str] = None, ): """Build a JujuTopology object. @@ -181,7 +181,10 @@ def from_dict(cls, data: dict): ) def as_dict( - self, *, remapped_keys: Dict[str, str] = None, excluded_keys: List[str] = None + self, + *, + remapped_keys: Optional[Dict[str, str]] = None, + excluded_keys: Optional[List[str]] = None, ) -> OrderedDict: """Format the topology information into an ordered dict. diff --git a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py index 63f6857..07a379f 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py @@ -4,6 +4,9 @@ This library facilitates the integration of the prometheus_remote_write interface. +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + Charms that need to push data to a charm exposing the Prometheus remote_write API, should use the `PrometheusRemoteWriteConsumer`. Charms that operate software that exposes the Prometheus remote_write API, that is, they can receive metrics data over remote_write, @@ -23,7 +26,14 @@ import yaml from charms.observability_libs.v0.juju_topology import JujuTopology -from ops.charm import CharmBase, HookEvent, RelationEvent, RelationMeta, RelationRole +from ops.charm import ( + CharmBase, + HookEvent, + RelationBrokenEvent, + RelationEvent, + RelationMeta, + RelationRole, +) from ops.framework import EventBase, EventSource, Object, ObjectEvents from ops.model import Relation @@ -35,7 +45,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 10 logger = logging.getLogger(__name__) @@ -321,7 +331,9 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: alert_groups = [] # type: List[dict] # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): + for file_path in self._multi_suffix_glob( + dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive + ): alert_groups_from_file = self._from_file(dir_path, file_path) if alert_groups_from_file: logger.debug("Reading alert rule from %s", file_path) @@ -629,7 +641,7 @@ def __init__( self.framework.observe(on_relation.relation_joined, self._handle_endpoints_changed) self.framework.observe(on_relation.relation_changed, self._handle_endpoints_changed) self.framework.observe(on_relation.relation_departed, self._handle_endpoints_changed) - self.framework.observe(on_relation.relation_broken, self._handle_endpoints_changed) + self.framework.observe(on_relation.relation_broken, self._on_relation_broken) self.framework.observe(on_relation.relation_joined, self._push_alerts_on_relation_joined) self.framework.observe( 
self._charm.on.leader_elected, self._push_alerts_to_all_relation_databags @@ -638,6 +650,9 @@ def __init__( self._charm.on.upgrade_charm, self._push_alerts_to_all_relation_databags ) + def _on_relation_broken(self, event: RelationBrokenEvent) -> None: + self.on.endpoints_changed.emit(relation_id=event.relation.id) + def _handle_endpoints_changed(self, event: RelationEvent) -> None: if self._charm.unit.is_leader(): ev = json.loads(event.relation.data[event.app].get("event", "{}")) @@ -805,7 +820,7 @@ def __init__( def _on_relation_change(self, event: RelationEvent) -> None: self.update_endpoint(event.relation) - def update_endpoint(self, relation: Relation = None) -> None: + def update_endpoint(self, relation: Optional[Relation] = None) -> None: """Triggers programmatically the update of the relation data. This method should be used when the charm relying on this library needs From 0a254919f9436050eaef940e859746fbfafef559 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 10:59:05 +0000 Subject: [PATCH 03/13] add first implementation of MetricsEndpointProvider --- .../prometheus_k8s/v0/prometheus_scrape.py | 2357 +++++++++++++++++ metadata.yaml | 2 + src/charm.py | 8 + 3 files changed, 2367 insertions(+) create mode 100644 lib/charms/prometheus_k8s/v0/prometheus_scrape.py diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py new file mode 100644 index 0000000..f080fb8 --- /dev/null +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -0,0 +1,2357 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +"""Prometheus Scrape Library. + +## Overview + +This document explains how to integrate with the Prometheus charm +for the purpose of providing a metrics endpoint to Prometheus. It +also explains how alternative implementations of the Prometheus charms +may maintain the same interface and be backward compatible with all +currently integrated charms. Finally this document is the +authoritative reference on the structure of relation data that is +shared between Prometheus charms and any other charm that intends to +provide a scrape target for Prometheus. + +## Source code + +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + +## Dependencies + +Using this library requires you to fetch the juju_topology library from +[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). + +`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` + +## Provider Library Usage + +This Prometheus charm interacts with its scrape targets using its +charm library. Charms seeking to expose metric endpoints for the +Prometheus charm, must do so using the `MetricsEndpointProvider` +object from this charm library. For the simplest use cases, using the +`MetricsEndpointProvider` object only requires instantiating it, +typically in the constructor of your charm (the one which exposes a +metrics endpoint). The `MetricsEndpointProvider` constructor requires +the name of the relation over which a scrape target (metrics endpoint) +is exposed to the Prometheus charm. This relation must use the +`prometheus_scrape` interface. By default address of the metrics +endpoint is set to the unit IP address, by each unit of the +`MetricsEndpointProvider` charm. 
These units set their address in +response to the `PebbleReady` event of each container in the unit, +since container restarts of Kubernetes charms can result in change of +IP addresses. The default name for the metrics endpoint relation is +`metrics-endpoint`. It is strongly recommended to use the same +relation name for consistency across charms and doing so obviates the +need for an additional constructor argument. The +`MetricsEndpointProvider` object may be instantiated as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_endpoint = MetricsEndpointProvider(self) + ... + +Note that the first argument (`self`) to `MetricsEndpointProvider` is +always a reference to the parent (scrape target) charm. + +An instantiated `MetricsEndpointProvider` object will ensure that each +unit of its parent charm, is a scrape target for the +`MetricsEndpointConsumer` (Prometheus) charm. By default +`MetricsEndpointProvider` assumes each unit of the consumer charm +exports its metrics at a path given by `/metrics` on port 80. These +defaults may be changed by providing the `MetricsEndpointProvider` +constructor an optional argument (`jobs`) that represents a +Prometheus scrape job specification using Python standard data +structures. This job specification is a subset of Prometheus' own +[scrape +configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) +format but represented using Python data structures. More than one job +may be provided using the `jobs` argument. Hence `jobs` accepts a list +of dictionaries where each dictionary represents one `` +object as described in the Prometheus documentation. The currently +supported configuration subset is: `job_name`, `metrics_path`, +`static_configs` + +Suppose it is required to change the port on which scraped metrics are +exposed to 8000. This may be done by providing the following data +structure as the value of `jobs`. + +``` +[ + { + "static_configs": [ + { + "targets": ["*:8000"] + } + ] + } +] +``` + +The wildcard ("*") host specification implies that the scrape targets +will automatically be set to the host addresses advertised by each +unit of the consumer charm. + +It is also possible to change the metrics path and scrape multiple +ports, for example + +``` +[ + { + "metrics_path": "/my-metrics-path", + "static_configs": [ + { + "targets": ["*:8000", "*:8081"], + } + ] + } +] +``` + +More complex scrape configurations are possible. For example + +``` +[ + { + "static_configs": [ + { + "targets": ["10.1.32.215:7000", "*:8000"], + "labels": { + "some-key": "some-value" + } + } + ] + } +] +``` + +This example scrapes the target "10.1.32.215" at port 7000 in addition +to scraping each unit at port 8000. There is however one difference +between wildcard targets (specified using "*") and fully qualified +targets (such as "10.1.32.215"). The Prometheus charm automatically +associates labels with metrics generated by each target. These labels +localise the source of metrics within the Juju topology by specifying +its "model name", "model UUID", "application name" and "unit +name". However unit name is associated only with wildcard targets but +not with fully qualified targets. 
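+
+As an illustrative sketch (the port and path here are placeholders, not
+defaults), such a job list is passed straight to the provider:
+
+```
+self.metrics_endpoint = MetricsEndpointProvider(
+    self,
+    jobs=[{
+        "metrics_path": "/my-metrics-path",
+        "static_configs": [{"targets": ["*:8000"]}],
+    }],
+)
+```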
+
+Multiple jobs with different metrics paths and labels are allowed, but
+each job must be given a unique name:
+
+```
+[
+    {
+        "job_name": "my-first-job",
+        "metrics_path": "one-path",
+        "static_configs": [
+            {
+                "targets": ["*:7000"],
+                "labels": {
+                    "some-key": "some-value"
+                }
+            }
+        ]
+    },
+    {
+        "job_name": "my-second-job",
+        "metrics_path": "another-path",
+        "static_configs": [
+            {
+                "targets": ["*:8000"],
+                "labels": {
+                    "some-other-key": "some-other-value"
+                }
+            }
+        ]
+    }
+]
+```
+
+**Important:** `job_name` should be a fixed string (e.g. a hardcoded literal).
+For instance, if you include variable elements, like your `unit.name`, it may break
+the continuity of the metrics time series gathered by Prometheus when the leader unit
+changes (e.g. on upgrade or rescale).
+
+Additionally, it is also technically possible, but **strongly discouraged**, to
+configure the following scrape-related settings, which behave as described by the
+[Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config):
+
+- `static_configs`
+- `scrape_interval`
+- `scrape_timeout`
+- `proxy_url`
+- `relabel_configs`
+- `metrics_relabel_configs`
+- `sample_limit`
+- `label_limit`
+- `label_name_length_limit`
+- `label_value_length_limit`
+
+The settings above are supported by the `prometheus_scrape` library only for the sake of
+specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s)
+charm. Virtually no charms should use these settings, and charmers definitely **should not**
+expose them to the Juju administrator via configuration options.
+
+## Consumer Library Usage
+
+The `MetricsEndpointConsumer` object may be used by Prometheus
+charms to manage relations with their scrape targets. For this
+purpose, a Prometheus charm needs to do two things:
+
+1. Instantiate the `MetricsEndpointConsumer` object by providing it a
+reference to the parent (Prometheus) charm and optionally the name of
+the relation that the Prometheus charm uses to interact with scrape
+targets. This relation must conform to the `prometheus_scrape`
+interface, and it is strongly recommended that this relation be named
+`metrics-endpoint`, which is its default value.
+
+For example, a Prometheus charm may instantiate the
+`MetricsEndpointConsumer` in its constructor as follows
+
+    from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        ...
+        self.metrics_consumer = MetricsEndpointConsumer(self)
+        ...
+
+2. A Prometheus charm also needs to respond to the
+`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as
+an observer for these events, as in
+
+    self.framework.observe(
+        self.metrics_consumer.on.targets_changed,
+        self._on_scrape_targets_changed,
+    )
+
+In responding to the `TargetsChangedEvent` event the Prometheus
+charm must update the Prometheus configuration so that any new scrape
+targets are added and/or old ones removed from the list of scraped
+endpoints. For this purpose the `MetricsEndpointConsumer` object
+exposes a `jobs()` method that returns a list of scrape jobs. Each
+element of this list is the Prometheus scrape configuration for that
+job. In order to update the Prometheus configuration, the Prometheus
+charm needs to replace the current list of jobs with the list provided
+by `jobs()` as follows
+
+    def _on_scrape_targets_changed(self, event):
+        ...
+        scrape_jobs = self.metrics_consumer.jobs()
+        for job in scrape_jobs:
+            prometheus_scrape_config.append(job)
+        ...
+
+## Alerting Rules
+
+This charm library also supports gathering alerting rules from all
+related `MetricsEndpointProvider` charms and enabling corresponding alerts within the
+Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider`
+charms when using this library, from a directory conventionally named
+`prometheus_alert_rules`. This directory must reside at the top level
+in the `src` folder of the consumer charm. Each file in this directory
+is assumed to be in one of two formats:
+- the official prometheus alert rule format, conforming to the
+[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
+- a single rule format, which is a simplified subset of the official format,
+comprising a single alert rule per file, using the same YAML fields.
+
+The file name must have one of the following extensions:
+- `.rule`
+- `.rules`
+- `.yml`
+- `.yaml`
+
+An example of the contents of such a file in the custom single rule
+format is shown below.
+
+```
+alert: HighRequestLatency
+expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5
+for: 10m
+labels:
+  severity: Medium
+  type: HighLatency
+annotations:
+  summary: High request latency for {{ $labels.instance }}.
+```
+
+The `MetricsEndpointProvider` will read all available alert rules and
+also inject "filtering labels" into the alert expressions. The
+filtering labels ensure that alert rules are localised to the metrics
+provider charm's Juju topology (application, model and its UUID). Such
+a topology filter is essential to ensure that alert rules submitted by
+one provider charm generate alerts only for that same charm. When
+alert rules are embedded in a charm, and the charm is deployed as a
+Juju application, the alert rules from that application have their
+expressions automatically updated to filter for metrics coming from
+the units of that application alone. This removes the risk of spurious
+evaluation, e.g., when you have multiple deployments of the same charm
+monitored by the same Prometheus.
+
+Not all alerts one may want to specify can be embedded in a
+charm. Some alert rules will be specific to a user's use case. This is
+the case, for example, of alert rules that are based on business
+constraints, like expecting a certain number of requests to a specific
+API every five minutes. Such alert rules can be specified via the
+[COS Config Charm](https://charmhub.io/cos-configuration-k8s),
+which allows importing alert rules and other settings like dashboards
+from a Git repository.
+
+Gathering alert rules and generating rule files within the Prometheus
+charm is easily done using the `alerts()` method of
+`MetricsEndpointConsumer`. Alerts generated by Prometheus will
+automatically include Juju topology labels in the alerts. These labels
+indicate the source of the alert. The following labels are
+automatically included with each alert:
+
+- `juju_model`
+- `juju_model_uuid`
+- `juju_application`
+
+## Relation Data
+
+The Prometheus charm uses both application and unit relation data to
+obtain information regarding its scrape jobs, alert rules and scrape
+targets. This relation data is in JSON format and it closely resembles
+the YAML structure of Prometheus
+[scrape configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config).
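+
+For illustration only (hypothetical, abbreviated values), the databags might
+carry entries shaped like:
+
+```
+# unit data
+prometheus_scrape_unit_name: "my-app/0"
+prometheus_scrape_unit_address: "10.1.32.215"
+# application data (JSON-encoded strings)
+scrape_metadata: '{"model": "cos", "application": "my-app", ...}'
+scrape_jobs: '[{"metrics_path": "/metrics", "static_configs": [...]}]'
+```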
+ +Units of Metrics provider charms advertise their names and addresses +over unit relation data using the `prometheus_scrape_unit_name` and +`prometheus_scrape_unit_address` keys. While the `scrape_metadata`, +`scrape_jobs` and `alert_rules` keys in application relation data +of Metrics provider charms hold eponymous information. + +""" # noqa: W505 + +import copy +import hashlib +import ipaddress +import json +import logging +import os +import platform +import re +import socket +import subprocess +import tempfile +from collections import defaultdict +from pathlib import Path +from typing import Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import yaml +from charms.observability_libs.v0.juju_topology import JujuTopology +from ops.charm import CharmBase, RelationRole +from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "bc84295fef5f4049878f07b131968ee2" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 27 + +logger = logging.getLogger(__name__) + + +ALLOWED_KEYS = { + "job_name", + "metrics_path", + "static_configs", + "scrape_interval", + "scrape_timeout", + "proxy_url", + "relabel_configs", + "metrics_relabel_configs", + "sample_limit", + "label_limit", + "label_name_length_limit", + "label_value_length_limit", + "scheme", + "basic_auth", + "tls_config", +} +DEFAULT_JOB = { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], +} + + +DEFAULT_RELATION_NAME = "metrics-endpoint" +RELATION_INTERFACE_NAME = "prometheus_scrape" + +DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" + + +class PrometheusConfig: + """A namespace for utility functions for manipulating the prometheus config dict.""" + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + topology_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + topology_relabel_config_wildcard = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + @staticmethod + def sanitize_scrape_config(job: dict) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. 
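+
+        Example (illustrative):
+
+            sanitize_scrape_config({"job_name": "foo", "bogus": 1}) returns
+            {"metrics_path": "/metrics",
+             "static_configs": [{"targets": ["*:80"]}],
+             "job_name": "foo"} ("bogus" is not an allowed key and is dropped).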
+ """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + @staticmethod + def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: + """A vectorized version of `sanitize_scrape_config`.""" + return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] + + @staticmethod + def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: + """Adds the given prefix to all the job names in the given scrape_configs list.""" + modified_scrape_configs = [] + for scrape_config in scrape_configs: + job_name = scrape_config.get("job_name") + modified = scrape_config.copy() + modified["job_name"] = prefix + "_" + job_name if job_name else prefix + modified_scrape_configs.append(modified) + + return modified_scrape_configs + + @staticmethod + def expand_wildcard_targets_into_individual_jobs( + scrape_jobs: List[dict], + hosts: Dict[str, Tuple[str, str]], + topology: Optional[JujuTopology] = None, + ) -> List[dict]: + """Extract wildcard hosts from the given scrape_configs list into separate jobs. + + Args: + scrape_jobs: list of scrape jobs. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. + topology: optional arg for adding topology labels to scrape targets. + """ + # hosts = self._relation_hosts(relation) + + modified_scrape_jobs = [] + for job in scrape_jobs: + static_configs = job.get("static_configs") + if not static_configs: + continue + + # When a single unit specified more than one wildcard target, then they are expanded + # into a static_config per target + non_wildcard_static_configs = [] + + for static_config in static_configs: + targets = static_config.get("targets") + if not targets: + continue + + # All non-wildcard targets remain in the same static_config + non_wildcard_targets = [] + + # All wildcard targets are extracted to a job per unit. If multiple wildcard + # targets are specified, they remain in the same static_config (per unit). + wildcard_targets = [] + + for target in targets: + match = re.compile(r"\*(?:(:\d+))?").match(target) + if match: + # This is a wildcard target. + # Need to expand into separate jobs and remove it from this job here + wildcard_targets.append(target) + else: + # This is not a wildcard target. Copy it over into its own static_config. + non_wildcard_targets.append(target) + + # All non-wildcard targets remain in the same static_config + if non_wildcard_targets: + non_wildcard_static_config = static_config.copy() + non_wildcard_static_config["targets"] = non_wildcard_targets + + if topology: + # When non-wildcard targets (aka fully qualified hostnames) are specified, + # there is no reliable way to determine the name (Juju topology unit name) + # for such a target. Therefore labeling with Juju topology, excluding the + # unit name. 
+ non_wildcard_static_config["labels"] = { + **non_wildcard_static_config.get("labels", {}), + **topology.label_matcher_dict, + } + + non_wildcard_static_configs.append(non_wildcard_static_config) + + # Extract wildcard targets into individual jobs + if wildcard_targets: + for unit_name, (unit_hostname, unit_path) in hosts.items(): + modified_job = job.copy() + modified_job["static_configs"] = [static_config.copy()] + modified_static_config = modified_job["static_configs"][0] + modified_static_config["targets"] = [ + target.replace("*", unit_hostname) for target in wildcard_targets + ] + + unit_num = unit_name.split("/")[-1] + job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num + modified_job["job_name"] = job_name + modified_job["metrics_path"] = unit_path + ( + job.get("metrics_path") or "/metrics" + ) + + if topology: + # Add topology labels + modified_static_config["labels"] = { + **modified_static_config.get("labels", {}), + **topology.label_matcher_dict, + **{"juju_unit": unit_name}, + } + + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get( + "relabel_configs", [] + ) + [PrometheusConfig.topology_relabel_config_wildcard] + + modified_scrape_jobs.append(modified_job) + + if non_wildcard_static_configs: + modified_job = job.copy() + modified_job["static_configs"] = non_wildcard_static_configs + modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" + + if topology: + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ + PrometheusConfig.topology_relabel_config + ] + + modified_scrape_jobs.append(modified_job) + + return modified_scrape_jobs + + @staticmethod + def render_alertmanager_static_configs(alertmanagers: List[str]): + """Render the alertmanager static_configs section from a list of URLs. + + Each target must be in the hostname:port format, and prefixes are specified in a separate + key. Therefore, with ingress in place, would need to extract the path into the + `path_prefix` key, which is higher up in the config hierarchy. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config + + Args: + alertmanagers: List of alertmanager URLs. + + Returns: + A dict representation for the static_configs section. + """ + # Make sure it's a valid url so urlparse could parse it. 
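+        # (Illustrative note: without a scheme, urlparse() leaves netloc empty
+        # for "host:port" inputs, so "http://" is prepended first.)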
+ scheme = re.compile(r"^https?://") + sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] + + # Create a mapping from paths to netlocs + # Group alertmanager targets into a dictionary of lists: + # {path: [netloc1, netloc2]} + paths = defaultdict(list) # type: Dict[str, List[str]] + for parsed in map(urlparse, sanitized): + path = parsed.path or "/" + paths[path].append(parsed.netloc) + + return { + "alertmanagers": [ + {"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]} + for path_prefix, netlocs in paths.items() + ] + } + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name is found.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different role.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +class InvalidAlertRuleEvent(EventBase): + """Event emitted when alert rule files are not parsable. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: str = "", valid: bool = False): + super().__init__(handle) + self.errors = errors + self.valid = valid + + def snapshot(self) -> Dict: + """Save alert rule information.""" + return { + "valid": self.valid, + "errors": self.errors, + } + + def restore(self, snapshot): + """Restore alert rule information.""" + self.valid = snapshot["valid"] + self.errors = snapshot["errors"] + + +class MetricsEndpointProviderEvents(ObjectEvents): + """Events raised by :class:`InvalidAlertRuleEvent`s.""" + + alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +): + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. 
+ expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the same relation interface + as specified via the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the same role as specified + via the `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +class InvalidAlertRulePathError(Exception): + """Raised if the alert rules folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + alert_rules_absolute_path: Path, + message: str, + ): + self.alert_rules_absolute_path = alert_rules_absolute_path + self.message = message + + super().__init__(self.message) + + +def _is_official_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in the upstream format as supported by Prometheus. + + Alert rules in dictionary format are in "official" form if they + contain a "groups" key, since this implies they contain a list of + alert rule groups. + + Args: + rules_dict: a set of alert rules in Python dictionary format + + Returns: + True if alert rules are in official Prometheus file format. + """ + return "groups" in rules_dict + + +def _is_single_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in single rule format. + + The Prometheus charm library supports reading of alert rules in a + custom format that consists of a single alert rule per file. This + does not conform to the official Prometheus alert rule file format + which requires that each alert rules file consists of a list of + alert rule groups and each group consists of a list of alert + rules. + + Alert rules in dictionary form are considered to be in single rule + format if in the least it contains two keys corresponding to the + alert rule name and alert expression. + + Returns: + True if alert rule is in single rule file format. + """ + # one alert rule per file + return set(rules_dict) >= {"alert", "expr"} + + +class AlertRules: + """Utility class for amalgamating prometheus alert rule files and injecting juju topology. + + An `AlertRules` object supports aggregating alert rules from files and directories in both + official and single rule file formats using the `add_path()` method. 
All the alert rules + read are annotated with Juju topology labels and amalgamated into a single data structure + in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be + easily dumped into JSON format and exchanged over relation data. The dictionary can also + be dumped into YAML format and written directly into an alert rules file that is read by + Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, + since Prometheus allows only a single list of alert rule groups per alert rules file. + + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). + The custom single rule format is a subsection of the official YAML, having a single alert + rule, effectively "one alert per file". + """ + + # This class uses the following terminology for the various parts of a rule file: + # - alert rules file: the entire groups[] yaml, including the "groups:" key. + # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list + # of dictionaries that have the "name" and "rules" keys. + # - alert group (singular): a single dictionary that has the "name" and "rules" keys. + # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with + # the "alert" and "expr" keys. + # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. + + def __init__(self, topology: Optional[JujuTopology] = None): + """Build and alert rule object. + + Args: + topology: an optional `JujuTopology` instance that is used to annotate all alert rules. + """ + self.topology = topology + self.tool = CosTool(None) + self.alert_groups = [] # type: List[dict] + + def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: + """Read a rules file from path, injecting juju topology. + + Args: + root_path: full path to the root rules folder (used only for generating group name) + file_path: full path to a *.rule file. + + Returns: + A list of dictionaries representing the rules file, if file is valid (the structure is + formed by `yaml.safe_load` of the file); an empty list otherwise. 
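+
+            For instance (illustrative), a single-rule file "foo.rule" is first
+            wrapped as [{"name": "foo", "rules": [<the rule>]}]; `_group_name`
+            then augments the group name with topology and relative path.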
+ """ + with file_path.open() as rf: + # Load a list of rules from file then add labels and filters + try: + rule_file = yaml.safe_load(rf) + + except Exception as e: + logger.error("Failed to read alert rules from %s: %s", file_path.name, e) + return [] + + if not rule_file: + logger.warning("Empty rules file: %s", file_path.name) + return [] + if not isinstance(rule_file, dict): + logger.error("Invalid rules file (must be a dict): %s", file_path.name) + return [] + if _is_official_alert_rule_format(rule_file): + alert_groups = rule_file["groups"] + elif _is_single_alert_rule_format(rule_file): + # convert to list of alert groups + # group name is made up from the file name + alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] + else: + # invalid/unsupported + logger.error("Invalid rules file: %s", file_path.name) + return [] + + # update rules with additional metadata + for alert_group in alert_groups: + # update group name with topology and sub-path + alert_group["name"] = self._group_name( + str(root_path), + str(file_path), + alert_group["name"], + ) + + # add "juju_" topology labels + for alert_rule in alert_group["rules"]: + if "labels" not in alert_rule: + alert_rule["labels"] = {} + + if self.topology: + alert_rule["labels"].update(self.topology.label_matcher_dict) + # insert juju topology filters into a prometheus alert rule + alert_rule["expr"] = self.tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), + self.topology.label_matcher_dict, + ) + + return alert_groups + + def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: + """Generate group name from path and topology. + + The group name is made up of the relative path between the root dir_path, the file path, + and topology identifier. + + Args: + root_path: path to the root rules dir. + file_path: path to rule file. + group_name: original group name to keep as part of the new augmented group name + + Returns: + New group name, augmented by juju topology and relative path. + """ + rel_path = os.path.relpath(os.path.dirname(file_path), root_path) + rel_path = "" if rel_path == "." else rel_path.replace(os.path.sep, "_") + + # Generate group name: + # - name, from juju topology + # - suffix, from the relative path of the rule file; + group_name_parts = [self.topology.identifier] if self.topology else [] + group_name_parts.extend([rel_path, group_name, "alerts"]) + # filter to remove empty strings + return "_".join(filter(None, group_name_parts)) + + @classmethod + def _multi_suffix_glob( + cls, dir_path: Path, suffixes: List[str], recursive: bool = True + ) -> list: + """Helper function for getting all files in a directory that have a matching suffix. + + Args: + dir_path: path to the directory to glob from. + suffixes: list of suffixes to include in the glob (items should begin with a period). + recursive: a flag indicating whether a glob is recursive (nested) or not. + + Returns: + List of files in `dir_path` that have one of the suffixes specified in `suffixes`. + """ + all_files_in_dir = dir_path.glob("**/*" if recursive else "*") + return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) + + def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: + """Read all rule files in a directory. + + All rules from files for the same directory are loaded into a single + group. The generated name of this group includes juju topology. + By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. 
+ + Args: + dir_path: directory containing *.rule files (alert rules without groups). + recursive: flag indicating whether to scan for rule files recursively. + + Returns: + a list of dictionaries representing prometheus alert rule groups, each dictionary + representing an alert group (structure determined by `yaml.safe_load`). + """ + alert_groups = [] # type: List[dict] + + # Gather all alerts into a list of groups + for file_path in self._multi_suffix_glob( + dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive + ): + alert_groups_from_file = self._from_file(dir_path, file_path) + if alert_groups_from_file: + logger.debug("Reading alert rule from %s", file_path) + alert_groups.extend(alert_groups_from_file) + + return alert_groups + + def add_path(self, path: str, *, recursive: bool = False) -> None: + """Add rules from a dir path. + + All rules from files are aggregated into a data structure representing a single rule file. + All group names are augmented with juju topology. + + Args: + path: either a rules file or a dir of rules files. + recursive: whether to read files recursively or not (no impact if `path` is a file). + + Returns: + True if path was added else False. + """ + path = Path(path) # type: Path + if path.is_dir(): + self.alert_groups.extend(self._from_dir(path, recursive)) + elif path.is_file(): + self.alert_groups.extend(self._from_file(path.parent, path)) + else: + logger.debug("Alert rules path does not exist: %s", path) + + def as_dict(self) -> dict: + """Return standard alert rules file in dict representation. + + Returns: + a dictionary containing a single list of alert rule groups. + The list of alert rule groups is provided as value of the + "groups" dictionary key. + """ + return {"groups": self.alert_groups} if self.alert_groups else {} + + +class TargetsChangedEvent(EventBase): + """Event emitted when Prometheus scrape targets change.""" + + def __init__(self, handle, relation_id): + super().__init__(handle) + self.relation_id = relation_id + + def snapshot(self): + """Save scrape target relation information.""" + return {"relation_id": self.relation_id} + + def restore(self, snapshot): + """Restore scrape target relation information.""" + self.relation_id = snapshot["relation_id"] + + +class MonitoringEvents(ObjectEvents): + """Event descriptor for events raised by `MetricsEndpointConsumer`.""" + + targets_changed = EventSource(TargetsChangedEvent) + + +class MetricsEndpointConsumer(Object): + """A Prometheus based Monitoring service.""" + + on = MonitoringEvents() + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """A Prometheus based Monitoring service. + + Args: + charm: a `CharmBase` instance that manages this + instance of the Prometheus service. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that consume metrics endpoints. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. 
+ RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._tool = CosTool(self._charm) + events = self._charm.on[relation_name] + self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) + self.framework.observe( + events.relation_departed, self._on_metrics_provider_relation_departed + ) + + def _on_metrics_provider_relation_changed(self, event): + """Handle changes with related metrics providers. + + Anytime there are changes in relations between Prometheus + and metrics provider charms the Prometheus charm is informed, + through a `TargetsChangedEvent` event. The Prometheus charm can + then choose to update its scrape configuration. + + Args: + event: a `CharmEvent` in response to which the Prometheus + charm must update its scrape configuration. + """ + rel_id = event.relation.id + + self.on.targets_changed.emit(relation_id=rel_id) + + def _on_metrics_provider_relation_departed(self, event): + """Update job config when a metrics provider departs. + + When a metrics provider departs the Prometheus charm is informed + through a `TargetsChangedEvent` event so that it can update its + scrape configuration to ensure that the departed metrics provider + is removed from the list of scrape jobs and + + Args: + event: a `CharmEvent` that indicates a metrics provider + unit has departed. + """ + rel_id = event.relation.id + self.on.targets_changed.emit(relation_id=rel_id) + + def jobs(self) -> list: + """Fetch the list of scrape jobs. + + Returns: + A list consisting of all the static scrape configurations + for each related `MetricsEndpointProvider` that has specified + its scrape targets. + """ + scrape_jobs = [] + + for relation in self._charm.model.relations[self._relation_name]: + static_scrape_jobs = self._static_scrape_config(relation) + if static_scrape_jobs: + scrape_jobs.extend(static_scrape_jobs) + + scrape_jobs = _dedupe_job_names(scrape_jobs) + + return scrape_jobs + + def alerts(self) -> dict: + """Fetch alerts for all relations. + + A Prometheus alert rules file consists of a list of "groups". Each + group consists of a list of alerts (`rules`) that are sequentially + executed. This method returns all the alert rules provided by each + related metrics provider charm. These rules may be used to generate a + separate alert rules file for each relation since the returned list + of alert groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + alert rule related metadata such as the Juju model, model UUID and the + application name from where the alert rule originates. Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of alert rule groups are + written. 
For each relation, the structure of data returned is a dictionary
+        representation of a standard prometheus rules file:
+
+        {"groups": [{"name": ...}, ...]}
+
+        per the official Prometheus documentation:
+        https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+
+        The value of the `groups` key is such that it may be used to generate
+        a Prometheus alert rules file directly using `yaml.dump`, but the
+        `groups` key itself must be included, as this is required by Prometheus.
+
+        For example, the list of alert rule groups returned by this method may
+        be written into files consumed by Prometheus as follows:
+
+        ```
+        for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items():
+            filename = "juju_" + topology_identifier + ".rules"
+            path = os.path.join(PROMETHEUS_RULES_DIR, filename)
+            rules = yaml.safe_dump(alert_rule_groups)
+            container.push(path, rules, make_dirs=True)
+        ```
+
+        Returns:
+            A dictionary mapping the Juju topology identifier of the source charm to
+            its list of alert rule groups.
+        """
+        alerts = {}  # type: Dict[str, dict]  # mapping between juju identifiers and alert rule files
+        for relation in self._charm.model.relations[self._relation_name]:
+            if not relation.units or not relation.app:
+                continue
+
+            alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}"))
+            if not alert_rules:
+                continue
+
+            try:
+                scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
+                identifier = JujuTopology.from_dict(scrape_metadata).identifier
+                alerts[identifier] = self._tool.apply_label_matchers(alert_rules)
+
+            except KeyError as e:
+                logger.debug(
+                    "Relation %s has no 'scrape_metadata': %s",
+                    relation.id,
+                    e,
+                )
+                identifier = self._get_identifier_by_alert_rules(alert_rules)
+
+                if not identifier:
+                    logger.error(
+                        "Alert rules were found but no usable group or identifier was present"
+                    )
+                    continue
+
+                alerts[identifier] = alert_rules
+
+            _, errmsg = self._tool.validate_alert_rules(alert_rules)
+            if errmsg:
+                if alerts[identifier]:
+                    del alerts[identifier]
+                relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg})
+                continue
+
+        return alerts
+
+    def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]:
+        """Determine an appropriate dict key for alert rules.
+
+        The key is used as the filename when writing alerts to disk, so its structure
+        and uniqueness are important.
+
+        Args:
+            rules: a dict of alert rules
+        """
+        if "groups" not in rules:
+            logger.debug("No alert groups were found in relation data")
+            return None
+
+        # Construct an ID based on what's in the alert rules if they have labels
+        for group in rules["groups"]:
+            try:
+                labels = group["rules"][0]["labels"]
+                identifier = "{}_{}_{}".format(
+                    labels["juju_model"],
+                    labels["juju_model_uuid"],
+                    labels["juju_application"],
+                )
+                return identifier
+            except KeyError:
+                logger.debug("Alert rules were found but no usable labels were present")
+                continue
+
+        logger.warning(
+            "No labeled alert rules were found, and no 'scrape_metadata' "
+            "was available. Using the alert group name as filename."
+        )
+        try:
+            for group in rules["groups"]:
+                return group["name"]
+        except KeyError:
+            logger.debug("No group name was found to use as identifier")
+
+        return None
+
+    def _static_scrape_config(self, relation) -> list:
+        """Generate the static scrape configuration for a single relation. 
+
+        If the relation data includes `scrape_metadata` then the value
+        of this key is used to annotate the scrape jobs with Juju
+        Topology labels before returning them.
+
+        Args:
+            relation: an `ops.model.Relation` object whose static
+                scrape configuration is required.
+
+        Returns:
+            A list (possibly empty) of scrape jobs. Each job is a
+            valid Prometheus scrape configuration, represented as a
+            Python dictionary.
+        """
+        if not relation.units:
+            return []
+
+        scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]"))
+
+        if not scrape_jobs:
+            return []
+
+        scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}"))
+
+        if not scrape_metadata:
+            return scrape_jobs
+
+        topology = JujuTopology.from_dict(scrape_metadata)
+
+        job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier)
+        scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix)
+        scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs)
+
+        hosts = self._relation_hosts(relation)
+
+        scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs(
+            scrape_jobs, hosts, topology
+        )
+
+        return scrape_jobs
+
+    def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]:
+        """Return a mapping from unit names to (address, path) tuples for the given relation."""
+        hosts = {}
+        for unit in relation.units:
+            # TODO deprecate and remove unit.name
+            unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name
+            # TODO deprecate and remove "prometheus_scrape_host"
+            unit_address = relation.data[unit].get(
+                "prometheus_scrape_unit_address"
+            ) or relation.data[unit].get("prometheus_scrape_host")
+            unit_path = relation.data[unit].get("prometheus_scrape_unit_path", "")
+            if unit_name and unit_address:
+                hosts.update({unit_name: (unit_address, unit_path)})
+
+        return hosts
+
+    def _target_parts(self, target) -> list:
+        """Extract host and port from a wildcard target.
+
+        Args:
+            target: a string specifying a scrape target. A
+                scrape target is expected to have the format
+                "host:port". The host part may be a wildcard
+                "*" and the port part can be missing (along
+                with ":"), in which case the port is set to 80.
+
+        Returns:
+            a list with target host and port as in [host, port]
+        """
+        if ":" in target:
+            parts = target.split(":")
+        else:
+            parts = [target, "80"]
+
+        return parts
+
+
+def _dedupe_job_names(jobs: List[dict]):
+    """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key.
+
+    Additionally, fully de-duplicate any identical jobs. 
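+
+    For example (a sketch; hashes shortened for readability), two distinct
+    jobs sharing a name are renamed, while exact duplicates collapse:
+
+    ```
+    [{"job_name": "a", "metrics_path": "/m1"},
+     {"job_name": "a", "metrics_path": "/m2"},
+     {"job_name": "b"},
+     {"job_name": "b"}]
+    # becomes
+    [{"job_name": "a_3f0f...", "metrics_path": "/m1"},
+     {"job_name": "a_87ab...", "metrics_path": "/m2"},
+     {"job_name": "b"}]
+    ```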
+
+    Args:
+        jobs: A list of prometheus scrape jobs
+    """
+    jobs_copy = copy.deepcopy(jobs)
+
+    # Convert to a dict with job names as keys
+    # This is O(n^2), but it should be acceptable given the expected list sizes
+    jobs_dict = {
+        job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy))
+        for job in jobs_copy
+    }
+
+    # If multiple jobs have the same name, convert the name to "name_<hash-of-job>"
+    for key in jobs_dict:
+        if len(jobs_dict[key]) > 1:
+            for job in jobs_dict[key]:
+                job_json = json.dumps(job)
+                hashed = hashlib.sha256(job_json.encode()).hexdigest()
+                job["job_name"] = "{}_{}".format(job["job_name"], hashed)
+    new_jobs = []
+    for key in jobs_dict:
+        new_jobs.extend([i for i in jobs_dict[key]])
+
+    # Deduplicate jobs which are equal
+    # Again, this is O(n^2), but it should be acceptable
+    deduped_jobs = []
+    seen = []
+    for job in new_jobs:
+        job_json = json.dumps(job)
+        hashed = hashlib.sha256(job_json.encode()).hexdigest()
+        if hashed in seen:
+            continue
+        seen.append(hashed)
+        deduped_jobs.append(job)
+
+    return deduped_jobs
+
+
+def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
+    """Resolve the provided path items against the directory of the main file.
+
+    Look up the directory of the `main.py` file being executed. This is normally
+    going to be the charm.py file of the charm including this library. Then, resolve
+    the provided path elements and, if the resulting path exists and is a directory,
+    return its absolute path; otherwise, raise an exception.
+
+    Raises:
+        InvalidAlertRulePathError, if the path does not exist or is not a directory.
+    """
+    charm_dir = Path(str(charm.charm_dir))
+    if not charm_dir.exists() or not charm_dir.is_dir():
+        # Operator Framework does not currently expose a robust
+        # way to determine the top level charm source directory
+        # that is consistent across deployed charms and unit tests
+        # Hence, for unit tests, the current working directory is used
+        # TODO: update this logic when the following ticket is resolved
+        # https://github.com/canonical/operator/issues/643
+        charm_dir = Path(os.getcwd())
+
+    alerts_dir_path = charm_dir.absolute().joinpath(*path_elements)
+
+    if not alerts_dir_path.exists():
+        raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist")
+    if not alerts_dir_path.is_dir():
+        raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory")
+
+    return str(alerts_dir_path)
+
+
+class MetricsEndpointProvider(Object):
+    """A metrics endpoint for Prometheus."""
+
+    on = MetricsEndpointProviderEvents()
+
+    def __init__(
+        self,
+        charm,
+        relation_name: str = DEFAULT_RELATION_NAME,
+        jobs=None,
+        alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
+        refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
+        external_url: str = "",
+        lookaside_jobs_callable: Optional[Callable] = None,
+    ):
+        """Construct a metrics provider for a Prometheus charm.
+
+        If your charm exposes a Prometheus metrics endpoint, the
+        `MetricsEndpointProvider` object enables your charm to easily
+        communicate how to reach that metrics endpoint.
+
+        By default, a charm instantiating this object has the metrics
+        endpoints of each of its units scraped by the related Prometheus
+        charms. The scraped metrics are automatically tagged by the
+        Prometheus charms with Juju topology data via the
+        `juju_model_name`, `juju_model_uuid`, `juju_application_name`
+        and `juju_unit` labels. 
To support such tagging, `MetricsEndpointProvider`
+        automatically forwards scrape metadata to a `MetricsEndpointConsumer`
+        (Prometheus charm).
+
+        Scrape targets provided by `MetricsEndpointProvider` can be
+        customized when instantiating this object. For example, in the
+        case of a charm exposing the metrics endpoint for each of its
+        units on port 8080 and the `/metrics` path, the
+        `MetricsEndpointProvider` can be instantiated as follows:
+
+            self.metrics_endpoint_provider = MetricsEndpointProvider(
+                self,
+                jobs=[{
+                    "static_configs": [{"targets": ["*:8080"]}],
+                }])
+
+        The notation `*:<port>` means "scrape each unit of this charm on port
+        `<port>`".
+
+        In case the metrics endpoints are not on the standard `/metrics` path,
+        a custom path can be specified as follows:
+
+            self.metrics_endpoint_provider = MetricsEndpointProvider(
+                self,
+                jobs=[{
+                    "metrics_path": "/my/strange/metrics/path",
+                    "static_configs": [{"targets": ["*:8080"]}],
+                }])
+
+        Note how the `jobs` argument is a list: this allows you to expose multiple
+        combinations of paths "metrics_path" and "static_configs" in case your charm
+        exposes multiple endpoints, which could happen, for example, when you have
+        multiple workload containers, with applications in each needing to be scraped.
+        The structure of the objects in the `jobs` list is one-to-one with the
+        `scrape_config` configuration item of Prometheus' own configuration (see
+        https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
+        ), but with only a subset of the fields allowed. The permitted fields are
+        listed in the `ALLOWED_KEYS` object in this charm library module.
+
+        It is also possible to specify alert rules. By default, this library will look
+        into the `<charm_parent_dir>/prometheus_alert_rules`, which in a standard charm
+        layout resolves to `src/prometheus_alert_rules`. Each alert rule goes into a
+        separate `*.rule` file. If the syntax of a rule is invalid,
+        the `MetricsEndpointProvider` logs an error and does not load the particular
+        rule.
+
+        To avoid false positives and negatives in the evaluation of alert rules,
+        all ingested alert rule expressions are automatically qualified using Juju
+        Topology filters. This ensures that alert rules provided by your charm trigger
+        alerts based only on data scraped from your charm. For example, an alert rule
+        such as the following
+
+            alert: UnitUnavailable
+            expr: up < 1
+            for: 0m
+
+        will be automatically transformed into something along the lines of the following
+
+            alert: UnitUnavailable
+            expr: up{juju_model=<model_name>, juju_model_uuid=<model_uuid>, juju_application=<application_name>} < 1
+            for: 0m
+
+        An attempt will be made to validate alert rules prior to loading them into Prometheus.
+        If they are invalid, an event will be emitted from this object which charms can respond
+        to in order to set a meaningful status for administrators.
+
+        This can be observed via `consumer.on.alert_rule_status_changed` which contains:
+            - The error(s) encountered when validating as `errors`
+            - A `valid` attribute, which can be used to reset the state of charms if alert rules
+              are updated via another mechanism (e.g. `cos-config`) and refreshed.
+
+        Args:
+            charm: a `CharmBase` object that manages this
+                `MetricsEndpointProvider` object. Typically, this is
+                `self` in the instantiating class.
+            relation_name: an optional string name of the relation between `charm`
+                and the Prometheus charmed service. The default is "metrics-endpoint".
+                It is strongly advised not to change the default, so that people
+                deploying your charm will have a consistent experience with all
+                other charms that provide metrics endpoints.
+            jobs: an optional list of dictionaries where each
+                dictionary represents the Prometheus scrape
+                configuration for a single job. When not provided, a
+                default scrape configuration is provided for the
+                `/metrics` endpoint polling all units of the charm on port `80`
+                using the `MetricsEndpointProvider` object.
+            alert_rules_path: an optional path for the location of alert rules
+                files. Defaults to "./prometheus_alert_rules",
+                resolved relative to the directory hosting the charm entry file.
+                The alert rules are automatically updated on charm upgrade.
+            refresh_event: an optional bound event or list of bound events which
+                will be observed to re-set scrape job data (IP address and others).
+            external_url: an optional argument that represents an external URL that
+                can be generated by an Ingress or a Proxy.
+            lookaside_jobs_callable: an optional `Callable` which should be invoked
+                when the job configuration is built as a secondary mapping. The callable
+                should return a `List[Dict]` which is syntactically identical to the
+                `jobs` parameter, but can be updated out of step with the initialization
+                of this library without disrupting the 'global' job spec.
+
+        Raises:
+            RelationNotFoundError: If there is no relation in the charm's metadata.yaml
+                with the same name as provided via `relation_name` argument.
+            RelationInterfaceMismatchError: The relation with the same name as provided
+                via `relation_name` argument does not have the `prometheus_scrape` relation
+                interface.
+            RelationRoleMismatchError: If the relation with the same name as provided
+                via `relation_name` argument does not have the `RelationRole.provides`
+                role.
+        """
+        _validate_relation_by_interface_and_direction(
+            charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides
+        )
+
+        try:
+            alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path)
+        except InvalidAlertRulePathError as e:
+            logger.debug(
+                "Invalid Prometheus alert rules folder at %s: %s",
+                e.alert_rules_absolute_path,
+                e.message,
+            )
+
+        super().__init__(charm, relation_name)
+        self.topology = JujuTopology.from_charm(charm)
+
+        self._charm = charm
+        self._alert_rules_path = alert_rules_path
+        self._relation_name = relation_name
+        # sanitize job configurations to the supported subset of parameters
+        jobs = [] if jobs is None else jobs
+        self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs)
+
+        if external_url:
+            external_url = (
+                external_url if urlparse(external_url).scheme else ("http://" + external_url)
+            )
+        self.external_url = external_url
+        self._lookaside_jobs = lookaside_jobs_callable
+
+        events = self._charm.on[self._relation_name]
+        self.framework.observe(events.relation_changed, self._on_relation_changed)
+
+        if not refresh_event:
+            # FIXME remove once podspec charms are verified.
+            # `self.set_scrape_job_spec()` is called every re-init so this should not be needed.
+            if len(self._charm.meta.containers) == 1:
+                if "kubernetes" in self._charm.meta.series:
+                    # This is a podspec charm
+                    refresh_event = [self._charm.on.update_status]
+                else:
+                    # This is a sidecar/pebble charm
+                    container = list(self._charm.meta.containers.values())[0]
+                    refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready]
+            else:
+                logger.warning(
+                    "%d containers are present in metadata.yaml and "
+                    "refresh_event was not specified. 
Defaulting to update_status. "
+                    "Metrics IP may not be set in a timely fashion.",
+                    len(self._charm.meta.containers),
+                )
+                refresh_event = [self._charm.on.update_status]
+
+        else:
+            if not isinstance(refresh_event, list):
+                refresh_event = [refresh_event]
+
+        for ev in refresh_event:
+            self.framework.observe(ev, self.set_scrape_job_spec)
+
+        # Update relation data every reinit. If instead we used event hooks then observing only
+        # relation-joined would not be sufficient:
+        # - Would need to observe leader-elected, in case there was no leader during
+        #   relation-joined.
+        # - If later related to an ingress provider, then would need to register and wait for
+        #   update-status interval to elapse before changes would apply.
+        # - The ingress-ready custom event is currently emitted prematurely and cannot be relied
+        #   upon: https://github.com/canonical/traefik-k8s-operator/issues/78
+        # NOTE We may still end up waiting for update-status before changes are applied.
+        self.set_scrape_job_spec()
+
+    def _on_relation_changed(self, event):
+        """Check for alert rule messages in the relation data before moving on."""
+        if self._charm.unit.is_leader():
+            ev = json.loads(event.relation.data[event.app].get("event", "{}"))
+
+            if ev:
+                valid = bool(ev.get("valid", True))
+                errors = ev.get("errors", "")
+
+                if valid and not errors:
+                    self.on.alert_rule_status_changed.emit(valid=valid)
+                else:
+                    self.on.alert_rule_status_changed.emit(valid=valid, errors=errors)
+
+    def update_scrape_job_spec(self, jobs):
+        """Update scrape job specification."""
+        self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs)
+        self.set_scrape_job_spec()
+
+    def set_scrape_job_spec(self, _=None):
+        """Ensure scrape target information is made available to prometheus.
+
+        When a metrics provider charm is related to a prometheus charm, the
+        metrics provider sets specification and metadata related to its own
+        scrape configuration. This information is set using Juju application
+        data. In addition, each of the consumer units also sets its own
+        host address in Juju unit relation data.
+        """
+        self._set_unit_ip()
+
+        if not self._charm.unit.is_leader():
+            return
+
+        alert_rules = AlertRules(topology=self.topology)
+        alert_rules.add_path(self._alert_rules_path, recursive=True)
+        alert_rules_as_dict = alert_rules.as_dict()
+
+        for relation in self._charm.model.relations[self._relation_name]:
+            relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata)
+            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs)
+
+            if alert_rules_as_dict:
+                # Update relation data with the string representation of the rule file.
+                # Juju topology is already included in the "scrape_metadata" field above.
+                # The consumer side of the relation uses this information to name the rules file
+                # that is written to the filesystem.
+                relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)
+
+    def _set_unit_ip(self, _=None):
+        """Set unit host address.
+
+        Each time a metrics provider charm container is restarted, it updates its own
+        host address in the unit relation data for the prometheus charm.
+
+        The only argument specified is an event, and it is ignored. This is for expediency
+        to be able to use this method as an event handler, although no access to the
+        event is actually needed. 
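+
+        For example (illustrative values only), after this method runs, the unit
+        relation data may contain:
+
+            prometheus_scrape_unit_address: 10.1.2.3
+            prometheus_scrape_unit_path: ""
+            prometheus_scrape_unit_name: mimir/0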
+ """ + for relation in self._charm.model.relations[self._relation_name]: + unit_ip = str(self._charm.model.get_binding(relation).network.bind_address) + + # TODO store entire url in relation data, instead of only select url parts. + + if self.external_url: + parsed = urlparse(self.external_url) + unit_address = parsed.hostname + path = parsed.path + elif self._is_valid_unit_address(unit_ip): + unit_address = unit_ip + path = "" + else: + unit_address = socket.getfqdn() + path = "" + + relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address + relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path + relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( + self._charm.model.unit.name + ) + + def _is_valid_unit_address(self, address: str) -> bool: + """Validate a unit address. + + At present only IP address validation is supported, but + this may be extended to DNS addresses also, as needed. + + Args: + address: a string representing a unit address + """ + try: + _ = ipaddress.ip_address(address) + except ValueError: + return False + + return True + + @property + def _scrape_jobs(self) -> list: + """Fetch list of scrape jobs. + + Returns: + A list of dictionaries, where each dictionary specifies a + single scrape job for Prometheus. + """ + jobs = self._jobs if self._jobs else [DEFAULT_JOB] + if callable(self._lookaside_jobs): + return jobs + PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs()) + else: + return jobs + + @property + def _scrape_metadata(self) -> dict: + """Generate scrape metadata. + + Returns: + Scrape configuration metadata for this metrics provider charm. + """ + return self.topology.as_dict() + + +class PrometheusRulesProvider(Object): + """Forward rules to Prometheus. + + This object may be used to forward rules to Prometheus. At present it only supports + forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which + is used for forwarding both scrape targets and associated alert rules. This object + is typically used when there is a desire to forward rules that apply globally (across + all deployed charms and units) rather than to a single charm. All rule files are + forwarded using the same 'prometheus_scrape' interface that is also used by + `MetricsEndpointProvider`. + + Args: + charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. + relation_name: Name of the relation in `metadata.yaml` that + has the `prometheus_scrape` interface. + dir_path: Root directory for the collection of rule files. + recursive: Whether to scan for rule files recursively. 
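+
+    A minimal usage sketch (assuming the charm's metadata defines a relation
+    named "metrics-endpoint" with the `prometheus_scrape` interface):
+
+    ```python
+    self.rules_provider = PrometheusRulesProvider(self, relation_name="metrics-endpoint")
+    ```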
+    """
+
+    def __init__(
+        self,
+        charm: CharmBase,
+        relation_name: str = DEFAULT_RELATION_NAME,
+        dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
+        recursive=True,
+    ):
+        super().__init__(charm, relation_name)
+        self._charm = charm
+        self._relation_name = relation_name
+        self._recursive = recursive
+
+        try:
+            dir_path = _resolve_dir_against_charm_path(charm, dir_path)
+        except InvalidAlertRulePathError as e:
+            logger.debug(
+                "Invalid Prometheus alert rules folder at %s: %s",
+                e.alert_rules_absolute_path,
+                e.message,
+            )
+        self.dir_path = dir_path
+
+        events = self._charm.on[self._relation_name]
+        event_sources = [
+            events.relation_joined,
+            events.relation_changed,
+            self._charm.on.leader_elected,
+            self._charm.on.upgrade_charm,
+        ]
+
+        for event_source in event_sources:
+            self.framework.observe(event_source, self._update_relation_data)
+
+    def _reinitialize_alert_rules(self):
+        """Reload alert rules and update all relations."""
+        self._update_relation_data(None)
+
+    def _update_relation_data(self, _):
+        """Update application relation data with alert rules for all relations."""
+        if not self._charm.unit.is_leader():
+            return
+
+        alert_rules = AlertRules()
+        alert_rules.add_path(self.dir_path, recursive=self._recursive)
+        alert_rules_as_dict = alert_rules.as_dict()
+
+        logger.info("Updating relation data with rule files from disk")
+        for relation in self._charm.model.relations[self._relation_name]:
+            relation.data[self._charm.app]["alert_rules"] = json.dumps(
+                alert_rules_as_dict,
+                sort_keys=True,  # sort, to prevent unnecessary relation_changed events
+            )
+
+
+class MetricsEndpointAggregator(Object):
+    """Aggregate metrics from multiple scrape targets.
+
+    `MetricsEndpointAggregator` collects scrape target information from one
+    or more related charms and forwards this to a `MetricsEndpointConsumer`
+    charm, which may be in a different Juju model. However, it is
+    essential that `MetricsEndpointAggregator` itself resides in the same
+    model as its scrape targets, as this is currently the only way to
+    ensure in Juju that the `MetricsEndpointAggregator` will be able to
+    determine the model name and uuid of the scrape targets.
+
+    `MetricsEndpointAggregator` should be used in place of
+    `MetricsEndpointProvider` in the following two use cases:
+
+    1. Integrating one or more scrape targets that do not support the
+    `prometheus_scrape` interface.
+
+    2. Integrating one or more scrape targets through cross model
+    relations, although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s)
+    may also be used to support cross model relations.
+
+    Using `MetricsEndpointAggregator` to build a Prometheus charm client
+    only requires instantiating it. Instantiating
+    `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except
+    that it requires specifying the names of three relations: the
+    relation with scrape targets, the relation for alert rules, and
+    that with the Prometheus charms. For example:
+
+    ```python
+    self._aggregator = MetricsEndpointAggregator(
+        self,
+        {
+            "prometheus": "monitoring",
+            "scrape_target": "prometheus-target",
+            "alert_rules": "prometheus-rules"
+        }
+    )
+    ```
+
+    `MetricsEndpointAggregator` assumes that each unit of a scrape target
+    sets in its unit-level relation data two entries with keys
+    "hostname" and "port". 
If it is required to integrate with charms
+    that do not honor these assumptions, it is always possible to
+    derive from `MetricsEndpointAggregator`, overriding the `_get_targets()`
+    method, which is responsible for aggregating the unit name, host
+    address ("hostname") and port of the scrape target.
+
+    `MetricsEndpointAggregator` also assumes that each unit of a
+    scrape target sets in its unit-level relation data a key named
+    "groups". The value of this key is expected to be the string
+    representation of a list of Prometheus alert rules in YAML format.
+    An example of a single such alert rule is:
+
+    ```yaml
+    - alert: HighRequestLatency
+      expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
+      for: 10m
+      labels:
+        severity: page
+      annotations:
+        summary: High request latency
+    ```
+
+    Once again, if it is required to integrate with charms that do not
+    honor these assumptions about alert rules, an object derived
+    from `MetricsEndpointAggregator` may be used, overriding the
+    `_get_alert_rules()` method.
+
+    `MetricsEndpointAggregator` ensures that Prometheus scrape job
+    specifications and alert rules are annotated with Juju topology
+    information, just like `MetricsEndpointProvider` and
+    `MetricsEndpointConsumer` do.
+
+    By default, `MetricsEndpointAggregator` ensures that Prometheus
+    "instance" labels refer to Juju topology. This ensures that
+    instance labels are stable over unit recreation. While it is not
+    advisable to change this option, if required it can be done by
+    setting the "relabel_instance" keyword argument to `False` when
+    constructing an aggregator object.
+    """
+
+    def __init__(self, charm, relation_names, relabel_instance=True):
+        """Construct a `MetricsEndpointAggregator`.
+
+        Args:
+            charm: a `CharmBase` object that manages this
+                `MetricsEndpointAggregator` object. Typically, this is
+                `self` in the instantiating class.
+            relation_names: a dictionary with three keys. The values
+                of the "scrape_target" and "alert_rules" keys are
+                the relation names over which scrape job and alert rule
+                information is gathered by this `MetricsEndpointAggregator`.
+                The value of the "prometheus" key is the name of
+                the relation with a `MetricsEndpointConsumer` such as
+                the Prometheus charm.
+            relabel_instance: A boolean flag indicating if Prometheus
+                scrape job "instance" labels must refer to Juju Topology. 
+        """
+        super().__init__(charm, relation_names["prometheus"])
+
+        self._charm = charm
+        self._target_relation = relation_names["scrape_target"]
+        self._prometheus_relation = relation_names["prometheus"]
+        self._alert_rules_relation = relation_names["alert_rules"]
+        self._relabel_instance = relabel_instance
+
+        # manage Prometheus charm relation events
+        prometheus_events = self._charm.on[self._prometheus_relation]
+        self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data)
+
+        # manage list of Prometheus scrape jobs from related scrape targets
+        target_events = self._charm.on[self._target_relation]
+        self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs)
+        self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs)
+
+        # manage alert rules for Prometheus from related scrape targets
+        alert_rule_events = self._charm.on[self._alert_rules_relation]
+        self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules)
+        self.framework.observe(alert_rule_events.relation_departed, self._remove_alert_rules)
+
+    def _set_prometheus_data(self, event):
+        """Ensure every new Prometheus instance is updated.
+
+        Any time a new Prometheus unit joins the relation with
+        `MetricsEndpointAggregator`, that Prometheus unit is provided
+        with the complete set of existing scrape jobs and alert rules.
+        """
+        jobs = []  # list of scrape jobs, one per relation
+        for relation in self.model.relations[self._target_relation]:
+            targets = self._get_targets(relation)
+            if targets and relation.app:
+                jobs.append(self._static_scrape_job(targets, relation.app.name))
+
+        groups = []  # list of alert rule groups, one group per relation
+        for relation in self.model.relations[self._alert_rules_relation]:
+            unit_rules = self._get_alert_rules(relation)
+            if unit_rules and relation.app:
+                appname = relation.app.name
+                rules = self._label_alert_rules(unit_rules, appname)
+                group = {"name": self._group_name(appname), "rules": rules}
+                groups.append(group)
+
+        event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+        event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})
+
+    def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated. The same update is performed when this method is
+        called manually.
+
+        Args:
+            targets: a `dict` containing target information
+            app_name: a `str` identifying the application
+        """
+        # new scrape job for the relation that has changed
+        updated_job = self._static_scrape_job(targets, app_name, **kwargs)
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
+            # list of scrape jobs that have not changed
+            jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]]
+            jobs.append(updated_job)
+            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+
+    def _update_prometheus_jobs(self, event):
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated. 
+        """
+        targets = self._get_targets(event.relation)
+        if not targets:
+            return
+
+        # new scrape job for the relation that has changed
+        updated_job = self._static_scrape_job(targets, event.relation.app.name)
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
+            # list of scrape jobs that have not changed
+            jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]]
+            jobs.append(updated_job)
+            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+
+    def _remove_prometheus_jobs(self, event):
+        """Remove scrape jobs when a target departs.
+
+        Any time a scrape target departs, any Prometheus scrape job
+        associated with that specific scrape target is removed.
+        """
+        job_name = self._job_name(event.relation.app.name)
+        unit_name = event.unit.name
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
+            if not jobs:
+                continue
+
+            changed_job = [j for j in jobs if j.get("job_name") == job_name]
+            if not changed_job:
+                continue
+            changed_job = changed_job[0]
+
+            # list of scrape jobs that have not changed
+            jobs = [job for job in jobs if job.get("job_name") != job_name]
+
+            # list of scrape jobs for units of the same application that still exist
+            configs_kept = [
+                config
+                for config in changed_job["static_configs"]  # type: ignore
+                if config.get("labels", {}).get("juju_unit") != unit_name
+            ]
+
+            if configs_kept:
+                changed_job["static_configs"] = configs_kept  # type: ignore
+                jobs.append(changed_job)
+
+            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+
+    def _update_alert_rules(self, event):
+        """Update alert rules in response to scrape target changes.
+
+        When there is any change in alert rule relation data for any
+        scrape target, the list of alert rules for that specific
+        target is updated.
+        """
+        unit_rules = self._get_alert_rules(event.relation)
+        if not unit_rules:
+            return
+
+        appname = event.relation.app.name
+        rules = self._label_alert_rules(unit_rules, appname)
+        # the alert rule group that has changed
+        updated_group = {"name": self._group_name(appname), "rules": rules}
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
+            groups = alert_rules.get("groups", [])
+            # list of alert rule groups that have not changed
+            groups = [group for group in groups if updated_group["name"] != group["name"]]
+            groups.append(updated_group)
+            relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})
+
+    def _remove_alert_rules(self, event):
+        """Remove alert rules for departed targets.
+
+        Any time a scrape target departs, any alert rules associated
+        with that specific scrape target are removed. 
+        """
+        group_name = self._group_name(event.relation.app.name)
+        unit_name = event.unit.name
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
+            if not alert_rules:
+                continue
+
+            groups = alert_rules.get("groups", [])
+            if not groups:
+                continue
+
+            changed_group = [group for group in groups if group["name"] == group_name]
+            if not changed_group:
+                continue
+            changed_group = changed_group[0]
+
+            # list of alert rule groups that have not changed
+            groups = [group for group in groups if group["name"] != group_name]
+
+            # list of alert rules not associated with departing unit
+            rules_kept = [
+                rule
+                for rule in changed_group.get("rules")  # type: ignore
+                if rule.get("labels").get("juju_unit") != unit_name
+            ]
+
+            if rules_kept:
+                changed_group["rules"] = rules_kept  # type: ignore
+                groups.append(changed_group)
+
+            relation.data[self._charm.app]["alert_rules"] = (
+                json.dumps({"groups": groups}) if groups else "{}"
+            )
+
+    def _get_targets(self, relation) -> dict:
+        """Fetch scrape targets for a relation.
+
+        Scrape target information is returned for each unit in the
+        relation. This information contains the unit name, network
+        hostname (or address) for that unit, and port on which a
+        metrics endpoint is exposed in that unit.
+
+        Args:
+            relation: an `ops.model.Relation` object for which scrape
+                targets are required.
+
+        Returns:
+            a dictionary whose keys are names of the units in the
+            relation. The value associated with each key is itself
+            a dictionary of the form
+            ```
+            {"hostname": hostname, "port": port}
+            ```
+        """
+        targets = {}
+        for unit in relation.units:
+            port = relation.data[unit].get("port", 80)
+            hostname = relation.data[unit].get("hostname")
+            if hostname:
+                targets.update({unit.name: {"hostname": hostname, "port": port}})
+
+        return targets
+
+    def _get_alert_rules(self, relation) -> dict:
+        """Fetch alert rules for a relation.
+
+        Each unit of the related scrape target may have its own
+        associated alert rules. Alert rules for all units are returned
+        indexed by unit name.
+
+        Args:
+            relation: an `ops.model.Relation` object for which alert
+                rules are required.
+
+        Returns:
+            a dictionary whose keys are names of the units in the
+            relation. The value associated with each key is a list
+            of alert rules. Each rule is in dictionary format, and
+            each such "rule dictionary" corresponds to a single
+            Prometheus alert rule.
+        """
+        rules = {}
+        for unit in relation.units:
+            unit_rules = yaml.safe_load(relation.data[unit].get("groups", ""))
+            if unit_rules:
+                rules.update({unit.name: unit_rules})
+
+        return rules
+
+    def _job_name(self, appname) -> str:
+        """Construct a scrape job name.
+
+        Each relation has its own unique scrape job name. All units in
+        the relation are scraped as part of the same scrape job.
+
+        Args:
+            appname: string name of a related application.
+
+        Returns:
+            a string Prometheus scrape job name for the application.
+        """
+        return "juju_{}_{}_{}_prometheus_scrape".format(
+            self.model.name, self.model.uuid[:7], appname
+        )
+
+    def _group_name(self, appname) -> str:
+        """Construct name for an alert rule group.
+
+        Each unit in a relation may define its own alert rules. All
+        rules for all units in a relation are grouped together and
+        given a single alert rule group name.
+
+        Args:
+            appname: string name of a related application.
+
+        Returns:
+            a string Prometheus alert rules group name for the application.
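+
+            For example (illustrative values): "juju_lma_0a1b2c3_node-exporter_alert_rules".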
+        """
+        return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname)
+
+    def _label_alert_rules(self, unit_rules, appname) -> list:
+        """Apply juju topology labels to alert rules.
+
+        Args:
+            unit_rules: a list of alert rules, where each rule is in
+                dictionary format.
+            appname: a string name of the application to which the
+                alert rules belong.
+
+        Returns:
+            a list of alert rules with Juju topology labels.
+        """
+        labeled_rules = []
+        for unit_name, rules in unit_rules.items():
+            for rule in rules:
+                # the new JujuTopology removed this, so build it up by hand
+                matchers = {
+                    "juju_{}".format(k): v
+                    for k, v in JujuTopology(self.model.name, self.model.uuid, appname, unit_name)
+                    .as_dict(excluded_keys=["charm_name"])
+                    .items()
+                }
+                rule["labels"].update(matchers.items())
+                labeled_rules.append(rule)
+
+        return labeled_rules
+
+    def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
+        """Construct a static scrape job for an application.
+
+        Args:
+            targets: a dictionary providing hostname and port for all
+                scrape targets. The keys of this dictionary are unit
+                names. Values corresponding to these keys are
+                themselves a dictionary with keys "hostname" and
+                "port".
+            application_name: a string name of the application for
+                which this static scrape job is being constructed.
+
+        Returns:
+            A dictionary corresponding to a Prometheus static scrape
+            job configuration for one application. The returned
+            dictionary may be transformed into YAML and appended to
+            any existing list of Prometheus static configs.
+        """
+        juju_model = self.model.name
+        juju_model_uuid = self.model.uuid
+        job = {
+            "job_name": self._job_name(application_name),
+            "static_configs": [
+                {
+                    "targets": ["{}:{}".format(target["hostname"], target["port"])],
+                    "labels": {
+                        "juju_model": juju_model,
+                        "juju_model_uuid": juju_model_uuid,
+                        "juju_application": application_name,
+                        "juju_unit": unit_name,
+                        "host": target["hostname"],
+                    },
+                }
+                for unit_name, target in targets.items()
+            ],
+            "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []),
+        }
+        job.update(kwargs.get("updates", {}))
+
+        return job
+
+    @property
+    def _relabel_configs(self) -> list:
+        """Create Juju topology relabeling configuration.
+
+        Using Juju topology for instance labels ensures that these
+        labels are stable across unit recreation.
+
+        Returns:
+            a list of Prometheus relabeling configurations. Each item in
+            this list is one relabel configuration.
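+
+            For example (illustrative), with "relabel_instance" enabled the
+            "instance" label of each scraped series has the form
+            "<juju_model>_<juju_model_uuid>_<juju_application>_<juju_unit>"
+            instead of the scraped target's address.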
+ """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + rule_path.write_text(yaml.dump(rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join( + [ + line + for line in e.output.decode("utf8").splitlines() + if "error validating" in line + ] + ) + + def inject_label_matchers(self, expression, topology) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. 
Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "transform"] + args.extend( + ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] + ) + + args.extend(["{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return result.stdout.decode("utf-8").strip() diff --git a/metadata.yaml b/metadata.yaml index 0c051fb..a2df776 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -36,5 +36,7 @@ resources: upstream-source: docker.io/grafana/mimir:2.4.0 provides: + metrics-endpoint: + interface: prometheus_scrape receive-remote-write: interface: prometheus_remote_write diff --git a/src/charm.py b/src/charm.py index 4f99695..4bf7458 100755 --- a/src/charm.py +++ b/src/charm.py @@ -21,6 +21,7 @@ DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME, ) from charms.prometheus_k8s.v0.prometheus_remote_write import PrometheusRemoteWriteProvider +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from deepdiff import DeepDiff # type: ignore from ops.charm import CharmBase from ops.framework import StoredState @@ -61,6 +62,13 @@ def __init__(self, *args): self, [ServicePort(self._http_listen_port, name=self.app.name)] ) + self.metrics_provider = MetricsEndpointProvider( + self, + refresh_event=[ + self.on.update_status, + ], + ) + self.remote_write_provider = PrometheusRemoteWriteProvider( charm=self, relation_name=DEFAULT_REMOTE_WRITE_RELATION_NAME, From 862e84a4b04a67a3652d3c2fd61947db038f563e Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 11:08:10 +0000 Subject: [PATCH 04/13] fix linting configuration --- tests/unit/test_charm.py | 2 +- tox.ini | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 2cfdb0f..52ec78f 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1,4 +1,4 @@ -# Copyright 2022 Canonical +# Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. # # Learn more about testing at: https://juju.is/docs/sdk/testing diff --git a/tox.ini b/tox.ini index 86f0947..8b91aa6 100644 --- a/tox.ini +++ b/tox.ini @@ -35,18 +35,19 @@ commands = description = Check code against coding style standards deps = black + codespell + flake8 < 5 flake8-docstrings + flake8-copyright flake8-builtins pyproject-flake8 pep8-naming isort - codespell commands = # uncomment the following line if this charm owns a lib # codespell {[vars]lib_path} - codespell {toxinidir}/. --skip {toxinidir}/.git --skip {toxinidir}/.tox \ - --skip {toxinidir}/build --skip {toxinidir}/lib --skip {toxinidir}/venv \ - --skip {toxinidir}/.mypy_cache --skip {toxinidir}/icon.svg + codespell . 
--skip .git --skip .tox --skip build --skip lib --skip venv --skip .mypy_cache \ + --skip icon.svg # pflake8 wrapper supports config from pyproject.toml pflake8 {[vars]all_path} isort --check-only --diff {[vars]all_path} @@ -105,4 +106,4 @@ deps = pytest-operator -r{toxinidir}/requirements.txt commands = - pytest -vv --tb native --log-cli-level=INFO --color=yes -s {posargs} {toxinidir}/tests/integration \ No newline at end of file + pytest -vv --tb native --log-cli-level=INFO --color=yes -s {posargs} {toxinidir}/tests/integration From 2209831bbe83684714658e97a309eab9e5f4e217 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 16:20:22 +0000 Subject: [PATCH 05/13] pin deepdiff version and use correct mimir port --- requirements.txt | 8 ++++---- src/charm.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index d5cce20..7856aca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -ops >= 1.5.0 -deepdiff -lightkube -lightkube-models +ops +deepdiff == 6.2.2 +lightkube >= 0.11 +lightkube-models >= 1.22.0.4 parse diff --git a/src/charm.py b/src/charm.py index 4bf7458..49e94a7 100755 --- a/src/charm.py +++ b/src/charm.py @@ -64,6 +64,7 @@ def __init__(self, *args): self.metrics_provider = MetricsEndpointProvider( self, + jobs=[{"static_configs": [{"targets": ["*:9009"]}]}], refresh_event=[ self.on.update_status, ], From d40a83281c8a0d650883a5739b48d455fe3089e3 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 16:50:20 +0000 Subject: [PATCH 06/13] add integration tests --- tests/integration/helpers.py | 59 +++++++++++++++++++++++ tests/integration/test_self_monitoring.py | 54 +++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tests/integration/helpers.py create mode 100644 tests/integration/test_self_monitoring.py diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py new file mode 100644 index 0000000..57cc94e --- /dev/null +++ b/tests/integration/helpers.py @@ -0,0 +1,59 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +from pathlib import Path +from urllib.parse import urljoin + +import requests +import yaml + +logger = logging.getLogger(__name__) + + +async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str: + status = await ops_test.model.get_status() # noqa: F821 + return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] + + +async def mimir_endpoint_request(ops_test, app_name: str, endpoint: str, unit_num: int = 0): + address = await get_unit_address(ops_test, app_name, unit_num) + url = urljoin(f"http://{address}:9009/", endpoint) + try: + response = requests.get(url) + if response.status_code == 200: + return response.text + return "" + except requests.exceptions.RequestException: + return "" + + +def oci_image(metadata_file: str, image_name: str) -> str: + """Find upstream source for a container image. 
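+
+    For example (values taken from this charm's metadata.yaml),
+    `oci_image("./metadata.yaml", "mimir-image")` returns
+    "docker.io/grafana/mimir:2.4.0".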
+ + Args: + metadata_file: string path of metadata YAML file relative + to top level charm directory + image_name: OCI container image string name as defined in + metadata.yaml file + Returns: + upstream image source + Raises: + FileNotFoundError: if metadata_file path is invalid + ValueError: if upstream source for image name can not be found + """ + metadata = yaml.safe_load(Path(metadata_file).read_text()) + + resources = metadata.get("resources", {}) + if not resources: + raise ValueError("No resources found") + + image = resources.get(image_name, {}) + if not image: + raise ValueError("{} image not found".format(image_name)) + + upstream_source = image.get("upstream-source", "") + if not upstream_source: + raise ValueError("Upstream source not found") + + return upstream_source diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py new file mode 100644 index 0000000..ccb3047 --- /dev/null +++ b/tests/integration/test_self_monitoring.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import asyncio +import logging + +import pytest +import requests +from helpers import get_unit_address, mimir_endpoint_request, oci_image + +logger = logging.getLogger(__name__) + +MIMIR = "mimir" +PROMETHEUS = "prometheus" + + +@pytest.mark.abort_on_fail +async def test_deploy_and_relate_charms(ops_test, mimir_charm): + """Test that Mimir can be related with Prometheus over prometheus_scrape.""" + await asyncio.gather( + ops_test.model.deploy( + mimir_charm, + resources={"mimir-image": oci_image("./metadata.yaml", "mimir-image")}, + application_name=MIMIR, + trust=True, + ), + ops_test.model.deploy( + "prometheus-k8s", + application_name=PROMETHEUS, + channel="edge", + trust=True, + ), + ) + + await ops_test.model.add_relation(MIMIR, f"{PROMETHEUS}:metrics-endpoint") + apps = [MIMIR, PROMETHEUS] + await ops_test.model.wait_for_idle(apps=apps, status="active") + + +async def test_metrics_are_available(ops_test): + metrics = await mimir_endpoint_request(ops_test, MIMIR, "metrics", 0) + assert len(metrics) > 0 + + +async def test_query_metrics_from_prometheus(ops_test): + address = await get_unit_address(ops_test, PROMETHEUS, 0) + url = f"http://{address}:9090/api/v1/query" + params = {"query": f"up{{juju_application='{MIMIR}'}}"} + try: + response = requests.get(url, params=params) + assert response.json()["status"] == "success" + except requests.exceptions.RequestException: + assert False From 567440920b6342851d5e6cae47eea783279de2f4 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 17:20:41 +0000 Subject: [PATCH 07/13] improve integration tests --- tests/integration/test_self_monitoring.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py index ccb3047..9c43347 100644 --- a/tests/integration/test_self_monitoring.py +++ b/tests/integration/test_self_monitoring.py @@ -9,6 +9,8 @@ import requests from helpers import get_unit_address, mimir_endpoint_request, oci_image +from pytest_operator.plugin import OpsTest + logger = logging.getLogger(__name__) MIMIR = "mimir" @@ -16,8 +18,11 @@ @pytest.mark.abort_on_fail -async def test_deploy_and_relate_charms(ops_test, mimir_charm): +async def test_deploy_and_relate_charms(ops_test: OpsTest): """Test that Mimir can be related with Prometheus over prometheus_scrape.""" + # Build charm from local source folder + mimir_charm = await 
ops_test.build_charm(".") + await asyncio.gather( ops_test.model.deploy( mimir_charm, From 5d978ef637db05e40d7a57314f84cd3cd83c5470 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Tue, 17 Jan 2023 17:22:15 +0000 Subject: [PATCH 08/13] satisfy linter --- tests/integration/test_self_monitoring.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py index 9c43347..a2f1ab5 100644 --- a/tests/integration/test_self_monitoring.py +++ b/tests/integration/test_self_monitoring.py @@ -8,7 +8,6 @@ import pytest import requests from helpers import get_unit_address, mimir_endpoint_request, oci_image - from pytest_operator.plugin import OpsTest logger = logging.getLogger(__name__) From 3d1ccf167f327b752d5f3f620d14e3e94953f4cb Mon Sep 17 00:00:00 2001 From: Ryan Barry Date: Tue, 17 Jan 2023 19:00:54 +0000 Subject: [PATCH 09/13] No DeepDiff for Pebble Layers --- requirements.txt | 1 - src/charm.py | 36 +++++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7856aca..14d0f14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ ops -deepdiff == 6.2.2 lightkube >= 0.11 lightkube-models >= 1.22.0.4 parse diff --git a/src/charm.py b/src/charm.py index 49e94a7..82b3656 100755 --- a/src/charm.py +++ b/src/charm.py @@ -22,13 +22,12 @@ ) from charms.prometheus_k8s.v0.prometheus_remote_write import PrometheusRemoteWriteProvider from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider -from deepdiff import DeepDiff # type: ignore from ops.charm import CharmBase from ops.framework import StoredState from ops.main import main from ops.model import ActiveStatus, BlockedStatus, WaitingStatus from ops.pebble import Error as PebbleError -from ops.pebble import PathError, ProtocolError +from ops.pebble import Layer, PathError, ProtocolError from parse import search # type: ignore MIMIR_CONFIG = "/etc/mimir/mimir-config.yaml" @@ -128,11 +127,12 @@ def _set_pebble_layer(self) -> bool: Returns: True if Pebble layer was added, otherwise False. 
""" - current_layer = self._container.get_plan().to_dict() + current_layer = self._container.get_plan() new_layer = self._pebble_layer - if "services" not in current_layer or DeepDiff( - current_layer["services"], new_layer["services"], ignore_order=True + if ( + "services" not in current_layer.to_dict() + or current_layer.services != new_layer.services ): self._container.add_layer(self._name, new_layer, combine=True) return True @@ -205,18 +205,20 @@ def _push_alert_rules(self, alerts): @property def _pebble_layer(self): - return { - "summary": "mimir layer", - "description": "pebble config layer for mimir", - "services": { - "mimir": { - "override": "replace", - "summary": "mimir daemon", - "command": f"/bin/mimir --config.file={MIMIR_CONFIG}", - "startup": "enabled", - } - }, - } + return Layer( + { + "summary": "mimir layer", + "description": "pebble config layer for mimir", + "services": { + "mimir": { + "override": "replace", + "summary": "mimir daemon", + "command": f"/bin/mimir --config.file={MIMIR_CONFIG}", + "startup": "enabled", + } + }, + } + ) @property def _mimir_config(self) -> dict: From 03243f34f521b6fc3763f6bb277b312bf04d5f90 Mon Sep 17 00:00:00 2001 From: Luca Bello Date: Fri, 20 Jan 2023 11:16:51 +0000 Subject: [PATCH 10/13] add recording rules and juju topology --- src/charm.py | 12 +- src/prometheus_alert_rules/alert-rules.yaml | 943 ++++++++++++++++++ .../recording-rules.yaml | 571 +++++++++++ 3 files changed, 1525 insertions(+), 1 deletion(-) create mode 100644 src/prometheus_alert_rules/alert-rules.yaml create mode 100644 src/prometheus_alert_rules/recording-rules.yaml diff --git a/src/charm.py b/src/charm.py index 82b3656..4d5eb8c 100755 --- a/src/charm.py +++ b/src/charm.py @@ -57,13 +57,23 @@ def __init__(self, *args): self._stored.set_default(alerts_hash=None) self._container = self.unit.get_container(self._name) + self.topology = JujuTopology.from_charm(self) + self.service_patch = KubernetesServicePatch( self, [ServicePort(self._http_listen_port, name=self.app.name)] ) self.metrics_provider = MetricsEndpointProvider( self, - jobs=[{"static_configs": [{"targets": ["*:9009"]}]}], + jobs=[{"static_configs": [{ + "targets": [f"*:{self._http_listen_port}"], + "labels": { + "cluster": self.topology.model_uuid, + "namespace": self.topology.model, + "job": f"{self.topology.model}/mimir", + "pod": self.topology.unit, + } + }]}], refresh_event=[ self.on.update_status, ], diff --git a/src/prometheus_alert_rules/alert-rules.yaml b/src/prometheus_alert_rules/alert-rules.yaml new file mode 100644 index 0000000..bc5701d --- /dev/null +++ b/src/prometheus_alert_rules/alert-rules.yaml @@ -0,0 +1,943 @@ +groups: +- name: mimir_alerts + rules: + - alert: MimirIngesterUnhealthy + annotations: + message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ + printf "%f" $value }} unhealthy ingester(s). + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + expr: | + min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: 15m + labels: + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m])) + / + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m])) + > 1 + for: 15m + labels: + severity: critical + - alert: MimirRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + expr: | + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + > + 2.5 + for: 15m + labels: + severity: warning + - alert: MimirQueriesIncorrect + annotations: + message: | + The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect + expr: | + 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + / + sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 + for: 15m + labels: + severity: warning + - alert: MimirInconsistentRuntimeConfig + annotations: + message: | + An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + expr: | + count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 + for: 1h + labels: + severity: critical + - alert: MimirBadRuntimeConfig + annotations: + message: | + {{ $labels.job }} failed to reload runtime config. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + expr: | + # The metric value is reset to 0 on error while reloading the config at runtime. + cortex_runtime_config_last_reload_successful == 0 + for: 5m + labels: + severity: critical + - alert: MimirFrontendQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirSchedulerQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 + for: 7m + labels: + severity: critical + - alert: MimirMemcachedRequestErrors + annotations: + message: | + Memcached {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemcachedrequesterrors + expr: | + ( + sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) / + sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operations_total[1m])) + ) * 100 > 5 + for: 5m + labels: + severity: warning + - alert: MimirIngesterRestarts + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has restarted {{ printf "%.2f" + $value }} times in the last 30 mins.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + expr: | + changes(process_start_time_seconds{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}[30m]) >= 2 + labels: + severity: warning + - alert: MimirKVStoreFailure + annotations: + message: | + Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + expr: | + ( + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + for: 5m + labels: + severity: critical + - alert: MimirMemoryMapAreasTooHigh + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas + close to the limit.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + expr: | + process_memory_map_areas{job=~".*/((ingester.*|cortex|mimir|mimir-write.*)|(store-gateway.*|cortex|mimir|mimir-backend.*))"} / process_memory_map_areas_limit{job=~".*/((ingester.*|cortex|mimir|mimir-write.*)|(store-gateway.*|cortex|mimir|mimir-backend.*))"} > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirDistributorForwardingErrorRate + annotations: + message: | + Mimir in {{ $labels.cluster }}/{{ $labels.namespace }} has a high failure rate when forwarding samples. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorforwardingerrorrate + expr: | + sum by (cluster, namespace) (rate(cortex_distributor_forward_errors_total{}[1m])) + / + sum by (cluster, namespace) (rate(cortex_distributor_forward_requests_total{}[1m])) + > 0.01 + for: 5m + labels: + severity: critical + - alert: MimirIngesterInstanceHasNoTenants + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has no tenants assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + expr: | + (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) + and on (cluster, namespace) + # Only if there are more time-series than would be expected due to continuous testing load + ( + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) > 100000 + for: 1h + labels: + severity: warning + - alert: MimirRulerInstanceHasNoRuleGroups + annotations: + message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has no rule groups assigned. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + expr: | + # Alert on ruler instances in microservices mode that have no rule groups assigned, + min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*-mimir-)?ruler.*"}) == 0 + # but only if other ruler instances of the same cell do have rule groups assigned + and on (cluster, namespace) + (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) + # and there are more than two instances overall + and on (cluster, namespace) + (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) + for: 1h + labels: + severity: warning + - alert: MimirRingMembersMismatch + annotations: + message: | + Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + expr: | + ( + avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~"(.*/)?(ingester.*|cortex|mimir|mimir-write.*)"})) + != sum by(cluster, namespace) (up{job=~"(.*/)?(ingester.*|cortex|mimir|mimir-write.*)"}) + ) + and + ( + count by(cluster, namespace) (cortex_build_info) > 0 + ) + for: 15m + labels: + component: ingester + severity: warning +- name: mimir_instance_limits_alerts + rules: + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + for: 3h + labels: + severity: warning + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.9 + for: 5m + labels: + severity: critical + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirReachingTCPConnectionsLimit + annotations: + message: | + Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + expr: | + cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and + cortex_tcp_connections_limit > 0 + for: 5m + labels: + severity: critical + - alert: MimirDistributorReachingInflightPushRequestLimit + annotations: + message: | + Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + expr: | + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical +- name: mimir-rollout-alerts + rules: + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + - alert: RolloutOperatorNotReconciling + annotations: + message: | + Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + expr: | + max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 + for: 5m + labels: + severity: critical +- name: mimir-provisioning + rules: + - alert: MimirProvisioningTooManyActiveSeries + annotations: + message: | + The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries + expr: | + avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6 + for: 2h + labels: + severity: warning + - alert: MimirProvisioningTooManyWrites + annotations: + message: | + Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanywrites + expr: | + avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. 
+ # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical +- name: ruler_alerts + rules: + - alert: MimirRulerTooManyFailedPushes + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerTooManyFailedQueries + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerMissedEvaluations + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + expr: | + 100 * ( + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + / + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + ) > 1 + for: 5m + labels: + severity: warning + - alert: MimirRulerFailedRingCheck + annotations: + message: | + Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + expr: | + sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + 100 * ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + / + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) > 1 + for: 5m + labels: + severity: warning +- name: gossip_alerts + rules: + - alert: MimirGossipMembersMismatch + annotations: + message: Mimir instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} sees incorrect number of gossip members. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch
+ expr: |
+ avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(alertmanager|compactor|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
+ for: 15m
+ labels:
+ severity: warning
+- name: etcd_alerts
+ rules:
+ - alert: EtcdAllocatingTooMuchMemory
+ annotations:
+ message: |
+ Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory
+ expr: |
+ (
+ container_memory_working_set_bytes{container="etcd"}
+ /
+ ( container_spec_memory_limit_bytes{container="etcd"} > 0 )
+ ) > 0.65
+ for: 15m
+ labels:
+ severity: warning
+ - alert: EtcdAllocatingTooMuchMemory
+ annotations:
+ message: |
+ Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory
+ expr: |
+ (
+ container_memory_working_set_bytes{container="etcd"}
+ /
+ ( container_spec_memory_limit_bytes{container="etcd"} > 0 )
+ ) > 0.8
+ for: 15m
+ labels:
+ severity: critical
+- name: alertmanager_alerts
+ rules:
+ - alert: MimirAlertmanagerSyncConfigsFailing
+ annotations:
+ message: |
+ Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing
+ expr: |
+ rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
+ for: 30m
+ labels:
+ severity: critical
+ - alert: MimirAlertmanagerRingCheckFailing
+ annotations:
+ message: |
+ Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing
+ expr: |
+ rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
+ for: 10m
+ labels:
+ severity: critical
+ - alert: MimirAlertmanagerPartialStateMergeFailing
+ annotations:
+ message: |
+ Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing
+ expr: |
+ rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
+ for: 10m
+ labels:
+ severity: critical
+ - alert: MimirAlertmanagerReplicationFailing
+ annotations:
+ message: |
+ Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicate partial state to its replicas.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing
+ expr: |
+ rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
+ for: 10m
+ labels:
+ severity: critical
+ - alert: MimirAlertmanagerPersistStateFailing
+ annotations:
+ message: |
+ Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing + expr: | + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: MimirAlertmanagerInitialSyncFailed + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed + expr: | + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + labels: + severity: critical + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: warning + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: critical + - alert: MimirAlertmanagerInstanceHasNoTenants + annotations: + message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} owns no tenants. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants + expr: | + # Alert on alertmanager instances in microservices mode that own no tenants, + min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*-mimir-)?alertmanager.*"}) == 0 + # but only if other instances of the same cell do have tenants assigned. + and on (cluster, namespace) + max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 + for: 1h + labels: + severity: warning +- name: mimir_blocks_alerts + rules: + - alert: MimirIngesterHasNotShippedBlocks + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not shipped any block in the last 4 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks + expr: | + (min by(cluster, namespace, pod) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4) + and + (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) > 0) + and + # Only if the ingester has ingested samples over the last 4h. + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + and + # Only if the ingester was ingesting samples 4h ago. 
This protects against the case where the ingester replica
+ # had ingested samples in the past, then no traffic was received for a long period and then it starts
+ # receiving samples again. Without this check, the alert would fire as soon as it gets back to receiving
+ # samples, while a block shipping is expected within the next 4h.
+ (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
+ for: 15m
+ labels:
+ severity: critical
+ - alert: MimirIngesterHasNotShippedBlocksSinceStart
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} has not shipped any block in the last 4 hours.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
+ expr: |
+ (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) == 0)
+ and
+ (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
+ for: 4h
+ labels:
+ severity: critical
+ - alert: MimirIngesterHasUnshippedBlocks
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't
+ been successfully uploaded to the storage yet.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks
+ expr: |
+ (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600)
+ and
+ (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0)
+ for: 15m
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBHeadCompactionFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to compact TSDB head.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed
+ expr: |
+ rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
+ for: 15m
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBHeadTruncationFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to truncate TSDB head.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed
+ expr: |
+ rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBCheckpointCreationFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to create TSDB checkpoint.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed
+ expr: |
+ rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBCheckpointDeletionFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to delete TSDB checkpoint.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed
+ expr: |
+ rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBWALTruncationFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to truncate TSDB WAL.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed
+ expr: |
+ rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
+ labels:
+ severity: warning
+ - alert: MimirIngesterTSDBWALCorrupted
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} got a corrupted TSDB WAL.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted
+ expr: |
+ rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0
+ labels:
+ severity: critical
+ - alert: MimirIngesterTSDBWALWritesFailed
+ annotations:
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} is failing to write to TSDB WAL.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed
+ expr: |
+ rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
+ for: 3m
+ labels:
+ severity: critical
+ - alert: MimirQuerierHasNotScanTheBucket
+ annotations:
+ message: Mimir Querier {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} has not successfully scanned the bucket since {{ $value | humanizeDuration
+ }}.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirquerierhasnotscanthebucket
+ expr: |
+ (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
+ and
+ cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: MimirQuerierHighRefetchRate
+ annotations:
+ message: Mimir Queries in {{ $labels.cluster }}/{{ $labels.namespace }} are
+ refetching series from different store-gateways (because of missing blocks)
+ for {{ printf "%.0f" $value }}% of queries.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirquerierhighrefetchrate
+ expr: |
+ 100 * (
+ (
+ sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
+ -
+ sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
+ )
+ /
+ sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
+ )
+ > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: MimirStoreGatewayHasNotSyncTheBucket
+ annotations:
+ message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+ }} has not successfully synched the bucket since {{ $value | humanizeDuration
+ }}.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + expr: | + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) + and + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + for: 5m + labels: + severity: critical + - alert: MimirStoreGatewayNoSyncedTenants + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is not syncing any blocks for any tenant. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + expr: | + min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 + for: 1h + labels: + severity: warning + - alert: MimirBucketIndexNotUpdated + annotations: + message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster + }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration + }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + expr: | + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + labels: + severity: critical + - alert: MimirTenantHasPartialBlocks + annotations: + message: Mimir tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has {{ $value }} partial blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirtenanthaspartialblocks + expr: | + max by(cluster, namespace, user) (cortex_bucket_blocks_partials_count) > 0 + for: 6h + labels: + severity: warning +- name: mimir_compactor_alerts + rules: + - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully cleaned up blocks in the last 6 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) + for: 1h + labels: + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + for: 1h + labels: + reason: in-last-24h + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + cortex_compactor_last_successful_run_timestamp_seconds == 0 + for: 24h + labels: + reason: since-startup + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} failed to run 2 consecutive compactions. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + labels: + reason: consecutive-failures + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) + and + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 15m + labels: + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 24h + labels: + severity: critical + - alert: MimirCompactorSkippedBlocksWithOutOfOrderChunks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has found and ignored blocks with out of order chunks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedblockswithoutoforderchunks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total{reason="block-index-out-of-order-chunk"}[5m]) > 0 + for: 1m + labels: + severity: warning +- name: mimir_autoscaling + rules: + - alert: MimirAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler + }} in {{ $labels.namespace }} is not active. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + expr: | + ( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. + + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0 + ) + # Do not alert if metric is 0, because in that case we expect the HPA to be inactive. + unless on (cluster, namespace, metric) + (label_replace(keda_metrics_adapter_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") == 0) + for: 1h + labels: + severity: critical +- name: mimir_continuous_test + rules: + - alert: MimirContinuousTestNotRunningOnWrites + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} is not effectively running because writes are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestNotRunningOnReads + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} is not effectively running because queries are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestFailed + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + $labels.namespace }} failed when asserting query results. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + labels: + severity: warning diff --git a/src/prometheus_alert_rules/recording-rules.yaml b/src/prometheus_alert_rules/recording-rules.yaml new file mode 100644 index 0000000..acb180f --- /dev/null +++ b/src/prometheus_alert_rules/recording-rules.yaml @@ -0,0 +1,571 @@ +groups: +- name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: 
cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by + (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) + by (cluster, namespace, job, route) + record: 
cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate +- name: mimir_cache + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) + by (cluster, job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds:avg + - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds:avg + - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, 
job,
+      method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
+    record: cluster_job_method:cortex_cache_request_duration_seconds:avg
+  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
+    record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
+      method)
+    record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
+  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
+    record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
+- name: mimir_storage
+  rules:
+  - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_kv_request_duration_seconds:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_kv_request_duration_seconds:50quantile
+  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_kv_request_duration_seconds:avg
+  - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job)
+    record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
+  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
+    record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
+  - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
+- name: mimir_queries
+  rules:
+  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_query_frontend_retries:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_query_frontend_retries:50quantile
+  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
+      by (cluster, job)
+    record: cluster_job:cortex_query_frontend_retries:avg
+  - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
+    record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
+  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
+    record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
+  - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_query_frontend_retries_count:sum_rate
+  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
+  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
+      job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
+      (cluster, job)
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
+  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
+      cluster, job)
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
+  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
+      job)
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
+  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
+      job)
+    record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
+- name: mimir_ingester_queries
+  rules:
+  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_series:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_series:50quantile
+  - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
+      by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_series:avg
+  - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
+    record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_series_count:sum_rate
+  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_samples:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_samples:50quantile
+  - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
+      by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_samples:avg
+  - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
+    record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
+  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_exemplars:99quantile
+  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
+      by (le, cluster, job))
+    record: cluster_job:cortex_ingester_queried_exemplars:50quantile
+  - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
+      sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_exemplars:avg
+  - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
+      job)
+    record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
+  - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
+    record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
+- name: mimir_received_samples
+  rules:
+  - expr: |
+      sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
+    record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
+- name: mimir_exemplars_in
+  rules:
+  - expr: |
+      sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
+    record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
+- name: mimir_received_exemplars
+  rules:
+  - expr: |
+      sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
+    record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
+- name: mimir_exemplars_ingested
+  rules:
+  - expr: |
+      sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
+    record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
+- name: mimir_exemplars_appended
+  rules:
+  - expr: |
+      sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
+    record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
+- name: mimir_scaling_rules
+  rules:
+  - expr: |
+      # Convenience rule to get the number of replicas for both a deployment and a statefulset.
+      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
+      sum by (cluster, namespace, deployment) (
+        label_replace(
+          kube_deployment_spec_replicas,
+          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+          # always matches everything and the (optional) zone is not removed.
+          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+        )
+      )
+      or
+      sum by (cluster, namespace, deployment) (
+        label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
+      )
+    record: cluster_namespace_deployment:actual_replicas:count
+  - expr: |
+      ceil(
+        quantile_over_time(0.99,
+          sum by (cluster, namespace) (
+            cluster_namespace_job:cortex_distributor_received_samples:rate5m
+          )[24h:]
+        )
+        / 240000
+      )
+    labels:
+      deployment: distributor
+      reason: sample_rate
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+        * 0.59999999999999998 / 240000
+      )
+    labels:
+      deployment: distributor
+      reason: sample_rate_limits
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        quantile_over_time(0.99,
+          sum by (cluster, namespace) (
+            cluster_namespace_job:cortex_distributor_received_samples:rate5m
+          )[24h:]
+        )
+        * 3 / 80000
+      )
+    labels:
+      deployment: ingester
+      reason: sample_rate
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        quantile_over_time(0.99,
+          sum by(cluster, namespace) (
+            cortex_ingester_memory_series
+          )[24h:]
+        )
+        / 1500000
+      )
+    labels:
+      deployment: ingester
+      reason: active_series
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
+        * 3 * 0.59999999999999998 / 1500000
+      )
+    labels:
+      deployment: ingester
+      reason: active_series_limits
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+        * 0.59999999999999998 / 80000
+      )
+    labels:
+      deployment: ingester
+      reason: sample_rate_limits
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      ceil(
+        (sum by (cluster, namespace) (
+          cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
+        ) / 4)
+        /
+        avg by (cluster, namespace) (
+          memcached_limit_bytes{job=~".+/memcached"}
+        )
+      )
+    labels:
+      deployment: memcached
+      reason: active_series
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      sum by (cluster, namespace, deployment) (
+        label_replace(
+          label_replace(
+            node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
+            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+          ),
+          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+          # always matches everything and the (optional) zone is not removed.
+          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+        )
+      )
+    record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
+  - expr: |
+      # Convenience rule to get the CPU request for both a deployment and a statefulset.
+      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
+      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
+      # that remove resource metrics, ref:
+      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
+      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
+      #
+      # This is the old expression, compatible with kube-state-metrics < v2.0.0,
+      # where kube_pod_container_resource_requests_cpu_cores was removed:
+      (
+        sum by (cluster, namespace, deployment) (
+          label_replace(
+            label_replace(
+              kube_pod_container_resource_requests_cpu_cores,
+              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+            ),
+            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+            # always matches everything and the (optional) zone is not removed.
+            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+          )
+        )
+      )
+      or
+      # This expression is compatible with kube-state-metrics >= v1.4.0,
+      # where kube_pod_container_resource_requests was introduced.
+      (
+        sum by (cluster, namespace, deployment) (
+          label_replace(
+            label_replace(
+              kube_pod_container_resource_requests{resource="cpu"},
+              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+            ),
+            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+            # always matches everything and the (optional) zone is not removed.
+            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+          )
+        )
+      )
+    record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
+  - expr: |
+      # Jobs should be sized to their CPU usage.
+      # We do this by comparing 99th percentile usage over the last 24hrs to
+      # their current provisioned #replicas and resource requests.
+      ceil(
+        cluster_namespace_deployment:actual_replicas:count
+        *
+        quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
+        /
+        cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
+      )
+    labels:
+      reason: cpu_usage
+    record: cluster_namespace_deployment_reason:required_replicas:count
+  - expr: |
+      # Convenience rule to get the Memory utilization for both a deployment and a statefulset.
+      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
+      sum by (cluster, namespace, deployment) (
+        label_replace(
+          label_replace(
+            container_memory_usage_bytes{image!=""},
+            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+          ),
+          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+          # always matches everything and the (optional) zone is not removed.
+          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+        )
+      )
+    record: cluster_namespace_deployment:container_memory_usage_bytes:sum
+  - expr: |
+      # Convenience rule to get the Memory request for both a deployment and a statefulset.
+      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
+      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
+      # that remove resource metrics, ref:
+      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
+      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
+      #
+      # This is the old expression, compatible with kube-state-metrics < v2.0.0,
+      # where kube_pod_container_resource_requests_memory_bytes was removed:
+      (
+        sum by (cluster, namespace, deployment) (
+          label_replace(
+            label_replace(
+              kube_pod_container_resource_requests_memory_bytes,
+              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+            ),
+            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+            # always matches everything and the (optional) zone is not removed.
+            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+          )
+        )
+      )
+      or
+      # This expression is compatible with kube-state-metrics >= v1.4.0,
+      # where kube_pod_container_resource_requests was introduced.
+      (
+        sum by (cluster, namespace, deployment) (
+          label_replace(
+            label_replace(
+              kube_pod_container_resource_requests{resource="memory"},
+              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+            ),
+            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
+            # always matches everything and the (optional) zone is not removed.
+            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
+          )
+        )
+      )
+    record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
+  - expr: |
+      # Jobs should be sized to their Memory usage.
+      # We do this by comparing 99th percentile usage over the last 24hrs to
+      # their current provisioned #replicas and resource requests.
+      ceil(
+        cluster_namespace_deployment:actual_replicas:count
+        *
+        quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
+        /
+        cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
+      )
+    labels:
+      reason: memory_usage
+    record: cluster_namespace_deployment_reason:required_replicas:count
+- name: mimir_alertmanager_rules
+  rules:
+  - expr: |
+      sum by (cluster, job, pod) (cortex_alertmanager_alerts)
+    record: cluster_job_pod:cortex_alertmanager_alerts:sum
+  - expr: |
+      sum by (cluster, job, pod) (cortex_alertmanager_silences)
+    record: cluster_job_pod:cortex_alertmanager_silences:sum
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
+    record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
+    record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
+  - expr: |
+      sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
+    record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
+  - expr: |
+      sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
+    record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
+    record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
+    record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
+    record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
+  - expr: |
+      sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
+    record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
+- name: mimir_ingester_rules
+  rules:
+  - expr: |
+      sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
+    record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m

From a3ba2f3b91db75306e07702f9f5b8ae513312be0 Mon Sep 17 00:00:00 2001
From: Luca Bello
Date: Fri, 20 Jan 2023 14:38:02 +0000
Subject: [PATCH 11/13] address PR comments

---
 src/charm.py                              | 25 ++++---
 tests/integration/conftest.py             | 56 ++++++++++++++++
 tests/integration/helpers.py              | 14 ----
 tests/integration/test_self_monitoring.py | 17 +++--
 tests/integration/workload.py             | 80 +++++++++++++++++++++++
 tox.ini                                   |  3 +-
 6 files changed, 164 insertions(+), 31 deletions(-)
 create mode 100644 tests/integration/conftest.py
 create mode 100644 tests/integration/workload.py

diff --git a/src/charm.py b/src/charm.py
index 4d5eb8c..9207934 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -13,6 +13,7 @@
 from typing import Optional
 
 import yaml
+from charms.observability_libs.v0.juju_topology import JujuTopology
 from charms.observability_libs.v1.kubernetes_service_patch import (
     KubernetesServicePatch,
     ServicePort,
@@ -65,17 +66,21 @@ def __init__(self, *args):
 
         self.metrics_provider = MetricsEndpointProvider(
             self,
-            jobs=[{"static_configs": [{
-                "targets": [f"*:{self._http_listen_port}"],
-                "labels": {
-                    "cluster": self.topology.model_uuid,
-                    "namespace": self.topology.model,
-                    "job": f"{self.topology.model}/mimir",
-                    "pod": self.topology.unit,
+            jobs=[
+                {
+                    "static_configs": [
+                        {
+                            "targets": [f"*:{self._http_listen_port}"],
+                            "labels": {
+                                "cluster": self.topology.model_uuid,
+                                "namespace": self.topology.model,
+                                "job": f"{self.topology.model}/mimir",
+                                "pod": self.topology.unit,
+                            },
+                        }
+                    ],
+                    "scrape_interval": "15s"
                 }
-            }]}],
-            refresh_event=[
-                self.on.update_status,
             ],
         )
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 0000000..382d043
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,56 @@
+# Copyright 2021 Canonical Ltd.
+# See LICENSE file for licensing details.
+import functools
+import logging
+from collections import defaultdict
+from datetime import datetime
+
+import pytest
+from pytest_operator.plugin import OpsTest
+
+logger = logging.getLogger(__name__)
+
+
+class Store(defaultdict):
+    def __init__(self):
+        super(Store, self).__init__(Store)
+
+    def __getattr__(self, key):
+        """Override __getattr__ so dot syntax works on keys."""
+        try:
+            return self[key]
+        except KeyError:
+            raise AttributeError(key)
+
+    def __setattr__(self, key, value):
+        """Override __setattr__ so dot syntax works on keys."""
+        self[key] = value
+
+
+store = Store()
+
+
+def timed_memoizer(func):
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        fname = func.__qualname__
+        logger.info("Started: %s" % fname)
+        start_time = datetime.now()
+        if fname in store.keys():
+            ret = store[fname]
+        else:
+            logger.info("Return for {} not cached".format(fname))
+            ret = await func(*args, **kwargs)
+            store[fname] = ret
+        logger.info("Finished: {} in: {} seconds".format(fname, datetime.now() - start_time))
+        return ret
+
+    return wrapper
+
+
+@pytest.fixture(scope="module")
+@timed_memoizer
+async def mimir_charm(ops_test: OpsTest):
+    """Mimir charm used for integration testing."""
+    charm = await ops_test.build_charm(".")
+    return charm
diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py
index 57cc94e..b56a212 100644
--- a/tests/integration/helpers.py
+++ b/tests/integration/helpers.py
@@ -3,9 +3,7 @@
 import logging
 from pathlib import Path
-from urllib.parse import urljoin
 
-import requests
 import yaml
 
 logger = logging.getLogger(__name__)
@@ -16,18 +14,6 @@ async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str:
     return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"]
 
 
-async def mimir_endpoint_request(ops_test, app_name: str, endpoint: str, unit_num: int = 0):
-    address = await get_unit_address(ops_test, app_name, unit_num)
-    url = urljoin(f"http://{address}:9009/", endpoint)
-    try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            return response.text
-        return ""
-    except requests.exceptions.RequestException:
-        return ""
-
-
 def oci_image(metadata_file: str, image_name: str) -> str:
     """Find upstream source for a container image.
diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py
index a2f1ab5..e2b018e 100644
--- a/tests/integration/test_self_monitoring.py
+++ b/tests/integration/test_self_monitoring.py
@@ -7,8 +7,9 @@
 
 import pytest
 import requests
-from helpers import get_unit_address, mimir_endpoint_request, oci_image
+from helpers import get_unit_address, oci_image
 from pytest_operator.plugin import OpsTest
+from workload import Mimir
 
 logger = logging.getLogger(__name__)
 
@@ -17,14 +18,13 @@
 
 
 @pytest.mark.abort_on_fail
-async def test_deploy_and_relate_charms(ops_test: OpsTest):
+async def test_deploy_and_relate_charms(ops_test: OpsTest, mimir_charm):
     """Test that Mimir can be related with Prometheus over prometheus_scrape."""
     # Build charm from local source folder
-    mimir_charm = await ops_test.build_charm(".")
-
+    # (The charm is now built once by the "mimir_charm" fixture in conftest.py.)
     await asyncio.gather(
         ops_test.model.deploy(
-            mimir_charm,
+            await mimir_charm,
             resources={"mimir-image": oci_image("./metadata.yaml", "mimir-image")},
             application_name=MIMIR,
             trust=True,
@@ -43,7 +43,9 @@
 
 
 async def test_metrics_are_available(ops_test):
-    metrics = await mimir_endpoint_request(ops_test, MIMIR, "metrics", 0)
+    address = await get_unit_address(ops_test, MIMIR, 0)
+    mimir = Mimir(host=address)
+    metrics = await mimir.api_request("/metrics")
     assert len(metrics) > 0
 
 
@@ -54,5 +56,8 @@
     try:
         response = requests.get(url, params=params)
         assert response.json()["status"] == "success"
+        print(response.json())
+        for result in response.json()["data"]["result"]:
+            assert "1" in result["value"]
     except requests.exceptions.RequestException:
         assert False
diff --git a/tests/integration/workload.py b/tests/integration/workload.py
new file mode 100644
index 0000000..f3d67cc
--- /dev/null
+++ b/tests/integration/workload.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+import logging
+from urllib.parse import urljoin
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+
+class Mimir:
+    """A class that represents a running instance of Mimir."""
+
+    def __init__(self, host="localhost", port=9009):
+        """Utility to manage a Mimir application.
+
+        Args:
+            host: Optional; host address of Mimir application.
+            port: Optional; port on which Mimir service is exposed.
+        """
+        self.base_url = f"http://{host}:{port}"
+
+        # Set a timeout of 5 seconds - should be sufficient for all the checks here.
+        # The default (5 min) prolongs integration tests unnecessarily.
+        self.timeout = aiohttp.ClientTimeout(total=5)
+
+    async def is_ready(self) -> bool:
+        """Send a GET request to check readiness.
+
+        Returns:
+            True if Mimir is ready (returned 200 OK); False otherwise.
+        """
+        url = f"{self.base_url}/ready"
+
+        async with aiohttp.ClientSession(timeout=self.timeout) as session:
+            async with session.get(url) as response:
+                return response.status == 200
+
+    async def config(self) -> str:
+        """Send a GET request to get Mimir configuration.
+
+        Returns:
+            YAML config as a string, or an empty string on failure.
+        """
+        url = f"{self.base_url}/config"
+        # Response looks like this:
+        # {
+        #   "status": "success",
+        #   "data": {
+        #     "yaml": "global:\n
+        #       scrape_interval: 1m\n
+        #       scrape_timeout: 10s\n
+        #       evaluation_interval: 1m\n
+        #       rule_files:\n
+        #       - /etc/prometheus/rules/juju_*.rules\n
+        #       scrape_configs:\n
+        #       - job_name: prometheus\n
+        #         honor_timestamps: true\n
+        #         scrape_interval: 5s\n
+        #         scrape_timeout: 5s\n
+        #         metrics_path: /metrics\n
+        #         scheme: http\n
+        #         static_configs:\n
+        #         - targets:\n
+        #           - localhost:9090\n"
+        #   }
+        # }
+        async with aiohttp.ClientSession(timeout=self.timeout) as session:
+            async with session.get(url) as response:
+                result = await response.text()
+                return result if response.status == 200 else ""
+
+    async def api_request(self, endpoint: str):
+        url = urljoin(self.base_url, endpoint)
+        async with aiohttp.ClientSession(timeout=self.timeout) as session:
+            async with session.get(url) as response:
+                result = await response.text()
+                return result if response.status == 200 else ""
diff --git a/tox.ini b/tox.ini
index 8b91aa6..4cbcb2a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -47,7 +47,7 @@ commands =
     # uncomment the following line if this charm owns a lib
     # codespell {[vars]lib_path}
     codespell . --skip .git --skip .tox --skip build --skip lib --skip venv --skip .mypy_cache \
-      --skip icon.svg
+      --skip icon.svg --skip prometheus_alert_rules
     # pflake8 wrapper supports config from pyproject.toml
     pflake8 {[vars]all_path}
     isort --check-only --diff {[vars]all_path}
@@ -101,6 +101,7 @@ commands =
 [testenv:integration]
 description = Run integration tests
 deps =
+    aiohttp
     pytest
     juju
     pytest-operator

From 04e5b4467cbe582d42d628de591e42951d579b45 Mon Sep 17 00:00:00 2001
From: Luca Bello
Date: Fri, 20 Jan 2023 14:39:30 +0000
Subject: [PATCH 12/13] fix linting

---
 src/charm.py                              | 2 +-
 tests/integration/test_self_monitoring.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/charm.py b/src/charm.py
index 9207934..2d81974 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -79,7 +79,7 @@ def __init__(self, *args):
                             },
                         }
                     ],
-                    "scrape_interval": "15s"
+                    "scrape_interval": "15s",
                 }
             ],
         )
diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py
index e2b018e..e0e8b50 100644
--- a/tests/integration/test_self_monitoring.py
+++ b/tests/integration/test_self_monitoring.py
@@ -56,7 +56,6 @@ async def test_query_metrics_from_prometheus(ops_test):
     try:
         response = requests.get(url, params=params)
         assert response.json()["status"] == "success"
-        print(response.json())
         for result in response.json()["data"]["result"]:
             assert "1" in result["value"]
     except requests.exceptions.RequestException:
         assert False

From 897acae23c5c54b9310d402d80191f8deabf7695 Mon Sep 17 00:00:00 2001
From: Luca Bello
Date: Fri, 20 Jan 2023 15:25:39 +0000
Subject: [PATCH 13/13] remove the word master from rules

---
 src/prometheus_alert_rules/recording-rules.yaml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/prometheus_alert_rules/recording-rules.yaml b/src/prometheus_alert_rules/recording-rules.yaml
index acb180f..05e6a48 100644
--- a/src/prometheus_alert_rules/recording-rules.yaml
+++ b/src/prometheus_alert_rules/recording-rules.yaml
@@ -412,10 +412,6 @@ groups:
   - expr: |
       # Convenience rule to get the CPU request for both a deployment and a statefulset.
       # Multi-zone deployments are grouped together removing the "zone-X" suffix.
-      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
-      # that remove resource metrics, ref:
-      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
-      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
       #
       # This is the old expression, compatible with kube-state-metrics < v2.0.0,
       # where kube_pod_container_resource_requests_cpu_cores was removed:
@@ -481,10 +477,6 @@ groups:
   - expr: |
       # Convenience rule to get the Memory request for both a deployment and a statefulset.
       # Multi-zone deployments are grouped together removing the "zone-X" suffix.
-      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
-      # that remove resource metrics, ref:
-      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
-      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
       #
       # This is the old expression, compatible with kube-state-metrics < v2.0.0,
      # where kube_pod_container_resource_requests_memory_bytes was removed:
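The `label_replace` calls left untouched by this patch all rely on a non-greedy `(.*?)` to strip an optional `-zone-[a-z]` suffix, as the in-file comments explain. A minimal Python sketch of why the greedy variant would never strip the suffix follows; it is illustrative only, and uses `re.fullmatch` to approximate the fully anchored matching that Prometheus applies to `label_replace` regexes:

```python
# Illustrative only, not part of this patch series. The recording rules call
# label_replace(..., "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")
# to strip an optional "-zone-X" suffix; Prometheus anchors the regex at both
# ends, which re.fullmatch approximates here.
import re

greedy = re.compile(r"(.*)(?:-zone-[a-z])?")
non_greedy = re.compile(r"(.*?)(?:-zone-[a-z])?")

# Greedy ".*" consumes the whole string, so the optional zone group matches
# the empty string and the suffix is never removed from group 1:
assert greedy.fullmatch("ingester-zone-a").group(1) == "ingester-zone-a"

# Non-greedy ".*?" stays minimal, letting "(?:-zone-[a-z])?" claim the suffix:
assert non_greedy.fullmatch("ingester-zone-a").group(1) == "ingester"

# Deployments without a zone suffix pass through unchanged:
assert non_greedy.fullmatch("distributor").group(1) == "distributor"
```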