From 6b3cc6772c97db8af7e0d9c6ebdf40da6064112e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?=
 <939888+Abuelodelanada@users.noreply.github.com>
Date: Wed, 22 Jun 2022 12:42:02 -0300
Subject: [PATCH] Adding self-monitoring and alert rules (#312)

Co-authored-by: Simon Aronsson <simme@arcticbit.se>
Co-authored-by: Ryan Barry <ryan.barry@canonical.com>
Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>
---
 .../grafana_k8s/v0/grafana_dashboard.py       | 1544 +++++++++
 .../observability_libs/v0/juju_topology.py    |   24 +-
 lib/charms/traefik_k8s/v0/ingress_per_unit.py |  799 +++--
 metadata.yaml                                 |    9 +
 pyproject.toml                                |    2 +-
 src/charm.py                                  |   49 +-
 .../prometheus-k8s_rev1.json.tmpl             | 2972 +++++++++++++++++
 ...ometheus_configuration_reload_failure.rule |   12 +
 .../prometheus_exporters_slowly.rule          |   12 +
 .../prometheus_large_scrape.rule              |   12 +
 .../prometheus_missing.rule                   |   12 +
 .../prometheus_notifications_backlog.rule     |   12 +
 .../prometheus_rule_evaluation_slow.rule      |   13 +
 .../prometheus_target_scrape_duplicate.rule   |   15 +
 ...eus_tsdb_checkpoint_creation_failures.rule |   15 +
 ...eus_tsdb_checkpoint_deletion_failures.rule |   15 +
 .../prometheus_tsdb_compactions_failed.rule   |   15 +
 ...ometheus_tsdb_head_truncations_failed.rule |   15 +
 .../prometheus_tsdb_reload_failures.rule      |   15 +
 .../prometheus_tsdb_wal_corruptions.rule      |   15 +
 ...rometheus_tsdb_wal_truncations_failed.rule |   15 +
 tests/integration/test_check_config.py        |    4 +-
 .../test_prometheus_scrape_multiunit.py       |   13 +-
 .../test_remote_write_grafana_agent.py        |    4 +-
 .../test_remote_write_with_zinc.py            |    4 +-
 tests/unit/test_charm.py                      |   10 +-
 26 files changed, 5369 insertions(+), 258 deletions(-)
 create mode 100644 lib/charms/grafana_k8s/v0/grafana_dashboard.py
 create mode 100644 src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl
 create mode 100644 src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_exporters_slowly.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_large_scrape.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_missing.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_notifications_backlog.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule
 create mode 100644 src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule

diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
new file mode 100644
index 00000000..bbf15ed7
--- /dev/null
+++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
@@ -0,0 +1,1544 @@
+# Copyright 2021 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""## Overview.
+
+This document explains how to integrate with the Grafana charm
+for the purpose of providing a dashboard which can be used by
+end users. It also explains the structure of the data
+expected by the `grafana-dashboard` interface, and may provide a
+mechanism or reference point for providing a compatible interface
+or library by providing a definitive reference guide to the
+structure of relation data which is shared between the Grafana
+charm and any charm providing dashboards.
+
+## Provider Library Usage
+
+The Grafana charm interacts with its dashboards using its charm
+library. The goal of this library is to be as simple to use as
+possible, and instantiation of the class with or without changing
+the default arguments provides a complete use case. For the simplest
+use case of a charm which bundles dashboards and provides a
+`provides: grafana-dashboard` interface,
+
+    provides:
+      grafana-dashboard:
+        interface: grafana_dashboard
+
+creation of a `GrafanaDashboardProvider` object with the default arguments is
+sufficient.
+
+:class:`GrafanaDashboardProvider` expects that bundled dashboards should
+be included in your charm with a default path of:
+
+    path/to/charm.py
+    path/to/src/grafana_dashboards/*.{json|json.tmpl|.tmpl}
+
+Where the files are Grafana dashboard JSON data either from the
+Grafana marketplace, or directly exported from a Grafana instance.
+Refer to the [official docs](https://grafana.com/tutorials/provision-dashboards-and-data-sources/)
+for more information.
+
+When constructing a dashboard that is intended to be consumed by COS, make sure to use variables
+for your datasources, and name them "prometheusds" and "lokids". You can also use the following
+juju topology variables in your dashboards: $juju_model, $juju_model_uuid, $juju_application
+and $juju_unit. Note, however, that if metrics are coming via peripheral charms (scrape-config
+or cos-config) then topology labels would not exist.
+
+The default constructor arguments are:
+
+    `charm`: `self` from the charm instantiating this library
+    `relation_name`: grafana-dashboard
+    `dashboards_path`: "src/grafana_dashboards"
+
+If your configuration requires any changes from these defaults, they
+may be set from the class constructor. It may be instantiated as
+follows:
+
+    from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider
+
+    class FooCharm:
+        def __init__(self, *args):
+            super().__init__(*args)
+            ...
+            self.grafana_dashboard_provider = GrafanaDashboardProvider(self)
+            ...
+
+The first argument (`self`) should be a reference to the parent charm (the one
+providing dashboards), as this charm's lifecycle events will be used to re-submit
+dashboard information if the charm is upgraded, the pod is restarted, or similar.
+
+An instantiated `GrafanaDashboardProvider` validates that the path specified
+in the constructor (or the default) exists, reads the file contents, then
+compresses them with LZMA and adds them to the application relation data
+when a relation is established with Grafana.
+
+Provided dashboards will be checked by Grafana, and a series of dropdown menus
+providing the ability to select query targets by Juju Model, application instance,
+and unit will be added if they do not exist.
+
+To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation
+and rendering occurs on the other side of the relation, and relation data in
+the form of:
+
+    {
+        "event": {
+            "valid": `true|false`,
+            "errors": [],
+        }
+    }
+
+will be returned if rendering or validation fails. In this case, the
+`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event
+of the type :class:`GrafanaDashboardEvent`, which will contain information
+about the validation error.
+
+This information is added to the relation data for the charms as serialized JSON
+from a dict, with a structure of:
+```
+{
+    "application": {
+        "dashboards": {
+            "uuid": a uuid generated to ensure a relation event triggers,
+            "templates": {
+                "file:{hash}": {
+                    "content": `{compressed_template_data}`,
+                    "charm": `charm.meta.name`,
+                    "juju_topology": {
+                        "model": `charm.model.name`,
+                        "model_uuid": `charm.model.uuid`,
+                        "application": `charm.app.name`,
+                        "unit": `charm.unit.name`,
+                    }
+                },
+                "file:{other_file_hash}": {
+                    ...
+                },
+            },
+        },
+    },
+}
+```
+
+This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration.
+
+The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to
+add dashboards which are not bundled with charms.
+
+## Consumer Library Usage
+
+The `GrafanaDashboardConsumer` object may be used by Grafana
+charms to manage relations with available dashboards. For this
+purpose, a charm consuming Grafana dashboard information should do
+the following things:
+
+1. Instantiate the `GrafanaDashboardConsumer` object by providing it a
+reference to the parent (Grafana) charm and, optionally, the name of
+the relation that the Grafana charm uses to interact with dashboards.
+This relation must conform to the `grafana-dashboard` interface.
+
+For example a Grafana charm may instantiate the
+`GrafanaDashboardConsumer` in its constructor as follows
+
+    from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        ...
+        self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self)
+        ...
+
+2. A Grafana charm also needs to listen to the
+`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer`
+by adding itself as an observer for these events:
+
+    self.framework.observe(
+        self.grafana_dashboard_consumer.on.dashboards_changed,
+        self._on_dashboards_changed,
+    )
+
+Dashboards can be retrieved via :meth:`dashboards`.
+
+They are returned in the format of:
+
+```
+[
+    {
+        "id": unique_id,
+        "relation_id": relation_id,
+        "charm": the name of the charm which provided the dashboard,
+        "content": compressed_template_data
+    },
+]
+```
+
+The consuming charm should decompress the dashboard.
+"""
+
+import base64
+import json
+import logging
+import lzma
+import os
+import re
+import uuid
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from ops.charm import (
+    CharmBase,
+    HookEvent,
+    RelationBrokenEvent,
+    RelationChangedEvent,
+    RelationCreatedEvent,
+    RelationEvent,
+    RelationRole,
+)
+from ops.framework import (
+    EventBase,
+    EventSource,
+    Object,
+    ObjectEvents,
+    StoredDict,
+    StoredList,
+    StoredState,
+)
+from ops.model import Relation
+
+# The unique Charmhub library identifier, never change it
+LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 12
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_RELATION_NAME = "grafana-dashboard"
+DEFAULT_PEER_NAME = "grafana"
+RELATION_INTERFACE_NAME = "grafana_dashboard"
+
+TEMPLATE_DROPDOWNS = [
+    {
+        "allValue": None,
+        "datasource": "${prometheusds}",
+        "definition": "label_values(up,juju_model)",
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": False,
+        "label": "Juju model",
+        "multi": False,
+        "name": "juju_model",
+        "query": {
+            "query": "label_values(up,juju_model)",
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": None,
+        "datasource": "${prometheusds}",
+        "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)',
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": False,
+        "label": "Juju model uuid",
+        "multi": False,
+        "name": "juju_model_uuid",
+        "query": {
+            "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)',
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": None,
+        "datasource": "${prometheusds}",
+        "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)',
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": False,
+        "label": "Juju application",
+        "multi": False,
+        "name": "juju_application",
+        "query": {
+            "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)',
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": None,
+        "datasource": "${prometheusds}",
+        "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)',
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": False,
+        "label": "Juju unit",
+        "multi": False,
+        "name": "juju_unit",
+        "query": {
+            "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)',
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": False,
+        "label": None,
+        "multi": False,
+        "name": "prometheusds",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "type": "datasource",
+    },
+]
+
+REACTIVE_CONVERTER = {  # type: ignore
+    "allValue": None,
+    "datasource": "${prometheusds}",
+    "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)',
+    "description": None,
+    "error": None,
+    "hide": 0,
+    "includeAll": False,
+    "label": "hosts",
+    "multi": True,
+    "name": "host",
+    "options": [],
+    "query": {
+        "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)',
+        "refId": "StandardVariableQuery",
+    },
+    "refresh": 1,
+    "regex": "",
+    "skipUrlSync": False,
+    "sort": 1,
+    "tagValuesQuery": "",
+    "tags": [],
+    "tagsQuery": "",
+    "type": "query",
+    "useTags": False,
+}
+
+
+class RelationNotFoundError(Exception):
+    """Signals that the charm declares no relation under the requested name."""
+
+    def __init__(self, relation_name: str):
+        message = "No relation named '{}' found".format(relation_name)
+        super().__init__(message)
+        self.relation_name = relation_name
+        self.message = message
+
+
+class RelationInterfaceMismatchError(Exception):
+    """Signals that the named relation declares an unexpected interface."""
+
+    def __init__(
+        self,
+        relation_name: str,
+        expected_relation_interface: str,
+        actual_relation_interface: str,
+    ):
+        # Build the message first so the base class is initialised with it.
+        template = "The '{}' relation has '{}' as interface rather than the expected '{}'"
+        message = template.format(
+            relation_name, actual_relation_interface, expected_relation_interface
+        )
+        super().__init__(message)
+
+        self.relation_name = relation_name
+        self.expected_relation_interface = expected_relation_interface
+        self.actual_relation_interface = actual_relation_interface
+        self.message = message
+
+
+class RelationRoleMismatchError(Exception):
+    """Raised if the relation with the given name has a different role (direction)."""
+
+    def __init__(
+        self,
+        relation_name: str,
+        expected_relation_role: RelationRole,
+        actual_relation_role: RelationRole,
+    ):
+        self.relation_name = relation_name
+        # `expected_relation_interface` was a misnomer; kept as an alias for old callers.
+        self.expected_relation_role = self.expected_relation_interface = expected_relation_role
+        self.actual_relation_role = actual_relation_role
+        self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format(
+            relation_name, repr(actual_relation_role), repr(expected_relation_role)
+        )
+        super().__init__(self.message)
+
+
+class InvalidDirectoryPathError(Exception):
+    """Signals a missing or otherwise unusable Grafana dashboards folder."""
+
+    def __init__(
+        self,
+        grafana_dashboards_absolute_path: str,
+        message: str,
+    ):
+        # Initialise the base exception first, then expose the details as attributes.
+        super().__init__(message)
+        self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path
+        self.message = message
+
+
+def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
+    """Resolve the provided path elements against the charm directory.
+
+    The base directory is `charm.charm_dir`; if that is not an existing directory,
+    the current working directory is used instead. The path elements are joined
+    onto the absolute base directory and the resulting path is returned as a string.
+
+    Raises:
+        InvalidDirectoryPathError: if the resolved path does not exist or is not a directory.
+
+    """
+    charm_dir = Path(str(charm.charm_dir))
+    if not charm_dir.exists() or not charm_dir.is_dir():
+        # Operator Framework does not currently expose a robust
+        # way to determine the top level charm source directory
+        # that is consistent across deployed charms and unit tests.
+        # Hence for unit tests the current working directory is used.
+        # TODO: update this logic when the following ticket is resolved:
+        # https://github.com/canonical/operator/issues/643
+        charm_dir = Path(os.getcwd())
+
+    dir_path = charm_dir.absolute().joinpath(*path_elements)
+
+    if not dir_path.exists():
+        raise InvalidDirectoryPathError(str(dir_path), "directory does not exist")
+    if not dir_path.is_dir():
+        raise InvalidDirectoryPathError(str(dir_path), "is not a directory")
+
+    return str(dir_path)
+
+
+def _validate_relation_by_interface_and_direction(
+    charm: CharmBase,
+    relation_name: str,
+    expected_relation_interface: str,
+    expected_relation_role: RelationRole,
+) -> None:
+    """Verify that a relation has the necessary characteristics.
+
+    Verify that the `relation_name` provided: (1) exists in metadata.yaml,
+    (2) declares the interface passed as `expected_relation_interface`, and
+    (3) has the right "direction", i.e., it is a relation that `charm`
+    provides or requires, as given by `expected_relation_role`.
+
+    Args:
+        charm: a `CharmBase` object to scan for the matching relation.
+        relation_name: the name of the relation to be verified.
+        expected_relation_interface: the interface name to be matched by the
+            relation named `relation_name`.
+        expected_relation_role: whether the `relation_name` must be either
+            provided or required by `charm`.
+
+    Raises:
+        RelationNotFoundError: If there is no relation in the charm's metadata.yaml
+            named like the value of the `relation_name` argument.
+        RelationInterfaceMismatchError: If the interface of the relation named
+            `relation_name` does not match `expected_relation_interface`.
+        RelationRoleMismatchError: If the relation named `relation_name` has a
+            different role than `expected_relation_role`.
+        Exception: If `expected_relation_role` is neither `RelationRole.provides`
+            nor `RelationRole.requires`.
+    """
+    if relation_name not in charm.meta.relations:
+        raise RelationNotFoundError(relation_name)
+
+    relation = charm.meta.relations[relation_name]
+
+    actual_relation_interface = relation.interface_name
+    if actual_relation_interface != expected_relation_interface:
+        raise RelationInterfaceMismatchError(
+            relation_name, expected_relation_interface, actual_relation_interface
+        )
+
+    if expected_relation_role == RelationRole.provides:
+        if relation_name not in charm.meta.provides:
+            raise RelationRoleMismatchError(
+                relation_name, RelationRole.provides, RelationRole.requires
+            )
+    elif expected_relation_role == RelationRole.requires:
+        if relation_name not in charm.meta.requires:
+            raise RelationRoleMismatchError(
+                relation_name, RelationRole.requires, RelationRole.provides
+            )
+    else:
+        raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role))
+
+
+def _encode_dashboard_content(content: Union[str, bytes]) -> str:
+    """LZMA-compress `content` (UTF-8 encoding a `str` first) and return it as base64 text."""
+    raw = content.encode("utf-8") if isinstance(content, str) else content
+    compressed = lzma.compress(raw)
+    return base64.b64encode(compressed).decode("utf-8")
+
+
+def _decode_dashboard_content(encoded_content: str) -> str:  # undoes _encode_dashboard_content
+    return str(lzma.decompress(base64.b64decode(bytes(encoded_content, "utf-8"))), "utf-8")
+
+
+def _convert_dashboard_fields(content: str) -> str:
+    """Inject the Juju topology template variables into a dashboard.
+
+    Parses the dashboard JSON, adds every entry of `TEMPLATE_DROPDOWNS` that is
+    not already present, delegates datasource rewriting to
+    `_replace_template_fields` and returns the result serialised back to JSON.
+    """
+    dashboard = json.loads(content)
+    known_datasources = {}
+    had_templating = "templating" in dashboard
+
+    if had_templating:
+        variables = dashboard["templating"]["list"]
+
+        # Map each datasource variable name to its type; the "query" field holds
+        # the type name ("prometheus", "loki", "influxdb", ...).
+        for variable in variables:
+            if variable.get("type") == "datasource":
+                known_datasources[variable["name"]] = variable["query"]
+
+        # Prepend whichever of our own variables the dashboard does not define yet.
+        for dropdown in TEMPLATE_DROPDOWNS:
+            if dropdown not in variables:
+                variables.insert(0, dropdown)
+    else:
+        # Nothing was templated before, so start from a copy of our own variables.
+        dashboard["templating"] = {"list": list(TEMPLATE_DROPDOWNS)}
+
+    dashboard = _replace_template_fields(dashboard, known_datasources, had_templating)
+    return json.dumps(dashboard)
+
+
+def _replace_template_fields(  # noqa: C901
+    dict_content: dict, datasources: dict, existing_templates: bool
+) -> dict:
+    """Point panel datasources at the COS template variables.
+
+    If the dashboard already had datasource variables, panels that reference
+    one of a known type are switched to `${prometheusds}` / `${lokids}` and the
+    variable they used is dropped. If it had no templating at all, every panel
+    with a datasource is assumed to be for Prometheus.
+
+    Args:
+        dict_content: the parsed dashboard; it is modified in place.
+        datasources: mapping of datasource variable name to datasource type.
+        existing_templates: whether the dashboard shipped its own "templating".
+
+    Returns:
+        The same `dict_content` object, after modification.
+    """
+    replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"}
+    used_replacements = []
+
+    # If any existing datasources match types we know, or we didn't find
+    # any templating variables at all, template them.
+    if datasources or not existing_templates:
+        # COS only knows about Prometheus and Loki. Dashboards without a
+        # "panels" key simply have nothing to rewrite.
+        for panel in dict_content.get("panels", []):
+            current = panel.get("datasource")
+            if not current:
+                # Missing or null datasource: nothing to substitute.
+                continue
+            if not existing_templates:
+                panel["datasource"] = "${prometheusds}"
+                continue
+            if current in replacements.values() or not isinstance(current, str):
+                # Already one of our variables, or a structured reference we
+                # cannot map by name.
+                continue
+            # Strip out variable characters and maybe braces
+            ds = re.sub(r"(\$|\{|\})", "", current)
+            # A hard-coded or builtin datasource (e.g. "-- Grafana --") is not in
+            # `datasources`; leave it alone instead of raising KeyError.
+            replacement = replacements.get(datasources.get(ds, ""), "")
+            if replacement:
+                used_replacements.append(ds)
+                panel["datasource"] = replacement
+
+    # Finally, go back and pop off the templates we stubbed out
+    templates = dict_content["templating"]["list"]
+    deletions = [t for t in templates if t["name"] and t["name"] in used_replacements]
+    for tmpl in deletions:
+        templates.remove(tmpl)
+
+    return dict_content
+
+
+def _type_convert_stored(obj):
+    """Recursively turn `StoredList`/`StoredDict` wrappers into plain lists and dicts.
+
+    Anything that is neither a `StoredList` nor a `StoredDict` is returned unchanged.
+    """
+    if isinstance(obj, StoredDict):
+        # Look every value up by key on the stored object, converting as we go.
+        return {key: _type_convert_stored(obj[key]) for key in obj.keys()}
+    if isinstance(obj, StoredList):
+        return [_type_convert_stored(item) for item in obj]
+    return obj
+
+
+class GrafanaDashboardsChanged(EventBase):
+    """Event emitted when Grafana dashboards change."""
+
+    def __init__(self, handle, data=None):
+        super().__init__(handle)
+        self.data = data
+
+    def snapshot(self) -> Dict:
+        """Save the event's `data` payload."""
+        return {"data": self.data}
+
+    def restore(self, snapshot):
+        """Restore the event's `data` payload from a snapshot."""
+        self.data = snapshot["data"]
+
+
+class GrafanaDashboardEvents(ObjectEvents):
+    """Events container exposing the `dashboards_changed` event."""
+
+    dashboards_changed = EventSource(GrafanaDashboardsChanged)
+
+
+class GrafanaDashboardEvent(EventBase):
+    """Event emitted when Grafana dashboards cannot be resolved.
+
+    Enables us to set a clear status on the provider.
+    """
+
+    def __init__(self, handle, errors: Optional[List[Dict[str, str]]] = None, valid: bool = False):
+        super().__init__(handle)
+        self.errors = [] if errors is None else errors  # never share one default list
+        self.error_message = "; ".join(e["error"] for e in self.errors if "error" in e)
+        self.valid = valid
+
+    def snapshot(self) -> Dict:
+        """Save the validation result; `errors` is stored as a JSON string."""
+        return {
+            "error_message": self.error_message,
+            "valid": self.valid,
+            "errors": json.dumps(self.errors),
+        }
+
+    def restore(self, snapshot):
+        """Restore the validation result, decoding `errors` from its JSON string."""
+        self.error_message = snapshot["error_message"]
+        self.valid = snapshot["valid"]
+        self.errors = json.loads(snapshot["errors"])
+
+
+class GrafanaProviderEvents(ObjectEvents):
+    """Events raised by :class:`GrafanaDashboardProvider`."""
+
+    dashboard_status_changed = EventSource(GrafanaDashboardEvent)
+
+
+class GrafanaDashboardProvider(Object):
+    """An API to provide Grafana dashboards to a Grafana charm."""
+
+    _stored = StoredState()
+    on = GrafanaProviderEvents()
+
+    def __init__(
+        self,
+        charm: CharmBase,
+        relation_name: str = DEFAULT_RELATION_NAME,
+        dashboards_path: str = "src/grafana_dashboards",
+    ) -> None:
+        """API to provide Grafana dashboard to a Grafana charmed operator.
+
+        The :class:`GrafanaDashboardProvider` object provides an API
+        to upload dashboards to a Grafana charm. In its most streamlined
+        usage, the :class:`GrafanaDashboardProvider` is integrated in a
+        charmed operator as follows:
+
+            self.grafana = GrafanaDashboardProvider(self)
+
+        The :class:`GrafanaDashboardProvider` will look for dashboard
+        templates in the `<charm-py-directory>/grafana_dashboards` folder.
+        Additionally, dashboard templates can be uploaded programmatically
+        via the :method:`GrafanaDashboardProvider.add_dashboard` method.
+
+        To use the :class:`GrafanaDashboardProvider` API, you need a relation
+        defined in your charm operator's metadata.yaml as follows:
+
+            provides:
+                grafana-dashboard:
+                    interface: grafana_dashboard
+
+        If you would like to use relation name other than `grafana-dashboard`,
+        you will need to specify the relation name via the `relation_name`
+        argument when instantiating the :class:`GrafanaDashboardProvider` object.
+        However, it is strongly advised to keep the default relation name,
+        so that people deploying your charm will have a consistent experience
+        with all other charms that provide Grafana dashboards.
+
+        It is possible to provide a different file path for the Grafana dashboards
+        to be automatically managed by the :class:`GrafanaDashboardProvider` object
+        via the `dashboards_path` argument. This may be necessary when the directory
+        structure of your charmed operator repository is not the "usual" one as
+        generated by `charmcraft init`, for example when adding the charmed operator
+        in a Java repository managed by Maven or Gradle. However, unless there are
+        such constraints with other tooling, it is strongly advised to store the
+        Grafana dashboards in the default `<charm-py-directory>/grafana_dashboards`
+        folder, in order to provide a consistent experience for other charmed operator
+        authors.
+
+        Args:
+            charm: a :class:`CharmBase` object which manages this
+                :class:`GrafanaDashboardProvider` object. Generally this is
+                `self` in the instantiating class.
+            relation_name: a :string: name of the relation managed by this
+                :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard".
+            dashboards_path: a filesystem path relative to the charm root
+                where dashboard templates can be located. By default, the library
+                expects dashboard files to be in the `<charm-py-directory>/grafana_dashboards`
+                directory.
+        """
+        _validate_relation_by_interface_and_direction(
+            charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides
+        )
+
+        try:
+            dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path)
+        except InvalidDirectoryPathError as e:  # non-fatal: warn and keep the path as given
+            logger.warning(
+                "Invalid Grafana dashboards folder at %s: %s",
+                e.grafana_dashboards_absolute_path,
+                e.message,
+            )
+
+        super().__init__(charm, relation_name)
+
+        self._charm = charm
+        self._relation_name = relation_name
+        self._dashboards_path = dashboards_path
+
+        # Providers have no peer relation bucket to rely on, so keep StoredState here, too
+        self._stored.set_default(dashboard_templates={})
+
+        self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir)
+        self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir)
+
+        self.framework.observe(
+            self._charm.on[self._relation_name].relation_created,
+            self._on_grafana_dashboard_relation_created,
+        )
+        self.framework.observe(
+            self._charm.on[self._relation_name].relation_changed,
+            self._on_grafana_dashboard_relation_changed,
+        )
+
+    def add_dashboard(self, content: str) -> None:
+        """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`.
+
+        Args:
+            content: a string representing a Jinja template. Currently, no
+                global variables are added to the Jinja template evaluation
+                context.
+        """
+        # Update of storage must be done irrespective of leadership, so
+        # that the stored state is there when this unit becomes leader.
+        stored_dashboard_templates = self._stored.dashboard_templates
+
+        encoded_dashboard = _encode_dashboard_content(content)
+
+        # Use as id an 8-character slice near the end of the encoded dashboard,
+        # so that it is predictable across units.
+        id = "prog:{}".format(encoded_dashboard[-24:-16])
+        stored_dashboard_templates[id] = self._content_to_dashboard_object(encoded_dashboard)
+
+        if self._charm.unit.is_leader():
+            for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                self._upset_dashboards_on_relation(dashboard_relation)
+
+    def remove_non_builtin_dashboards(self) -> None:
+        """Remove from the relation all dashboards added via :method:`add_dashboard`."""
+        # Update of storage must be done irrespective of leadership, so
+        # that the stored state is there when this unit becomes leader.
+        stored_dashboard_templates = self._stored.dashboard_templates
+
+        for dashboard_id in list(stored_dashboard_templates.keys()):
+            if dashboard_id.startswith("prog:"):
+                del stored_dashboard_templates[dashboard_id]
+        self._stored.dashboard_templates = stored_dashboard_templates
+
+        if self._charm.unit.is_leader():
+            for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                self._upset_dashboards_on_relation(dashboard_relation)
+
+    def update_dashboards(self) -> None:
+        """Trigger the re-evaluation of the data on all relations."""
+        if self._charm.unit.is_leader():
+            for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                self._upset_dashboards_on_relation(dashboard_relation)
+
+    def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None:
+        """Scan the built-in dashboards and update relations with changes."""
+        # Update of storage must be done irrespective of leadership, so
+        # that the stored state is there when this unit becomes leader.
+
+        # Ensure we do not leave outdated dashboards by removing from stored all
+        # the encoded dashboards whose id starts with "file:".
+        if self._dashboards_path:
+            stored_dashboard_templates = self._stored.dashboard_templates
+
+            for dashboard_id in list(stored_dashboard_templates.keys()):
+                if dashboard_id.startswith("file:"):
+                    del stored_dashboard_templates[dashboard_id]
+
+            # Path.glob uses fnmatch on the backend, which is pretty limited, so use a
+            # custom function for the filter
+            def _is_dashboard(p: Path) -> bool:
+                return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl"))
+
+            for path in filter(_is_dashboard, Path(self._dashboards_path).glob("*")):
+                # Only regular files get here, so a directory named like a dashboard is skipped
+                dashboard_id = "file:{}".format(path.stem)
+                stored_dashboard_templates[dashboard_id] = self._content_to_dashboard_object(
+                    _encode_dashboard_content(path.read_bytes())
+                )
+
+            self._stored.dashboard_templates = stored_dashboard_templates
+
+            if self._charm.unit.is_leader():
+                for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                    self._upset_dashboards_on_relation(dashboard_relation)
+
+    def _reinitialize_dashboard_data(self) -> None:
+        """Triggers a reload of dashboard outside of an eventing workflow.
+
+        This will destroy any existing relation data.
+        """
+        try:
+            _resolve_dir_against_charm_path(self._charm, self._dashboards_path)
+            self._update_all_dashboards_from_dir()
+
+        except InvalidDirectoryPathError as e:
+            logger.warning(
+                "Invalid Grafana dashboards folder at %s: %s",
+                e.grafana_dashboards_absolute_path,
+                e.message,
+            )
+            stored_dashboard_templates = self._stored.dashboard_templates
+
+            for dashboard_id in list(stored_dashboard_templates.keys()):
+                if dashboard_id.startswith("file:"):
+                    del stored_dashboard_templates[dashboard_id]
+            self._stored.dashboard_templates = stored_dashboard_templates
+
+            # With all of the file-based dashboards cleared out, force a refresh
+            # of relation data
+            if self._charm.unit.is_leader():
+                for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                    self._upset_dashboards_on_relation(dashboard_relation)
+
+    def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None:
+        """Watch for a relation being created and automatically send dashboards.
+
+        Args:
+            event: The :class:`RelationCreatedEvent` sent when a
+                `grafana_dashboard` relationship is created
+        """
+        if self._charm.unit.is_leader():
+            self._upset_dashboards_on_relation(event.relation)
+
+    def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None:
+        """Watch for changes so we know if there's an error to signal back to the parent charm.
+
+        Args:
+            event: The `RelationChangedEvent` that triggered this handler.
+        """
+        if self._charm.unit.is_leader():
+            data = json.loads(event.relation.data[event.app].get("event", "{}"))
+
+            if not data:
+                return
+            # The payload may carry only "errors"; reported errors then imply "not valid"
+            errors = data.get("errors", [])
+            valid = bool(data.get("valid", not errors))
+            if valid and not errors:
+                self.on.dashboard_status_changed.emit(valid=valid)
+            else:
+                self.on.dashboard_status_changed.emit(valid=valid, errors=errors)
+
+    def _upset_dashboards_on_relation(self, relation: Relation) -> None:
+        """Update the dashboards in the relation data bucket."""
+        # It's completely ridiculous to add a UUID, but if we don't have some
+        # pseudo-random value, this never makes it across 'juju set-state'
+        stored_data = {
+            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "uuid": str(uuid.uuid4()),
+        }
+
+        relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+
+    def _content_to_dashboard_object(self, content: str) -> Dict:
+        return {
+            "charm": self._charm.meta.name,
+            "content": content,
+            "juju_topology": self._juju_topology,
+        }
+
+    # This is not actually used in the dashboards, but is present to provide a secondary
+    # salt to ensure uniqueness in the dict keys in case individual charm units provide
+    # dashboards
+    @property
+    def _juju_topology(self) -> Dict:
+        return {
+            "model": self._charm.model.name,
+            "model_uuid": self._charm.model.uuid,
+            "application": self._charm.app.name,
+            "unit": self._charm.unit.name,
+        }
+
+    @property
+    def dashboard_templates(self) -> List:
+        """Return a list of the known dashboard templates."""
+        return [v for v in self._stored.dashboard_templates.values()]
+
+
+class GrafanaDashboardConsumer(Object):
+    """A consumer object for working with Grafana Dashboards."""
+
+    on = GrafanaDashboardEvents()
+    _stored = StoredState()
+
+    def __init__(
+        self,
+        charm: CharmBase,
+        relation_name: str = DEFAULT_RELATION_NAME,
+    ) -> None:
+        """API to receive Grafana dashboards from charmed operators.
+
+        The :class:`GrafanaDashboardConsumer` object provides an API
+        to consume dashboards provided by a charmed operator using the
+        :class:`GrafanaDashboardProvider` library. The
+        :class:`GrafanaDashboardConsumer` is integrated in a
+        charmed operator as follows:
+
+            self.grafana = GrafanaDashboardConsumer(self)
+
+        To use this library, you need a relation defined as follows in
+        your charm operator's metadata.yaml:
+
+            requires:
+                grafana-dashboard:
+                    interface: grafana_dashboard
+
+        If you would like to use a different relation name than
+        `grafana-dashboard`, you need to specify the relation name via the
+        `relation_name` argument. However, it is strongly advised not to
+        change the default, so that people deploying your charm will have
+        a consistent experience with all other charms that consume Grafana
+        dashboards.
+
+        Args:
+            charm: a :class:`CharmBase` object which manages this
+                :class:`GrafanaProvider` object. Generally this is
+                `self` in the instantiating class.
+            relation_name: a :string: name of the relation managed by this
+                :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard".
+        """
+        _validate_relation_by_interface_and_direction(
+            charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires
+        )
+
+        super().__init__(charm, relation_name)
+        self._charm = charm
+        self._relation_name = relation_name
+
+        self._stored.set_default(dashboards=dict())
+
+        self.framework.observe(
+            self._charm.on[self._relation_name].relation_changed,
+            self._on_grafana_dashboard_relation_changed,
+        )
+        self.framework.observe(
+            self._charm.on[self._relation_name].relation_broken,
+            self._on_grafana_dashboard_relation_broken,
+        )
+        self.framework.observe(
+            self._charm.on[DEFAULT_PEER_NAME].relation_changed,
+            self._on_grafana_peer_changed,
+        )
+
+    def get_dashboards_from_relation(self, relation_id: int) -> List:
+        """Get a list of known dashboards for one instance of the monitored relation.
+
+        Args:
+            relation_id: the identifier of the relation instance, as returned by
+                :method:`ops.model.Relation.id`.
+
+        Returns: a list of known dashboards coming from the provided relation instance.
+        """
+        return [
+            self._to_external_object(relation_id, dashboard)
+            for dashboard in self._get_stored_dashboards(relation_id)
+        ]
+
+    def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None:
+        """Handle relation changes in related providers.
+
+        If there are changes in relations between Grafana dashboard consumers
+        and providers, this event handler (if the unit is the leader) will
+        get data for an incoming grafana-dashboard relation through a
+        :class:`GrafanaDashboardsChanged` event, and make the relation data
+        available in the app's datastore object. The Grafana charm can
+        then respond to the event to update its configuration.
+        """
+        changes = False
+        if self._charm.unit.is_leader():
+            changes = self._render_dashboards_and_signal_changed(event.relation)
+
+        if changes:
+            self.on.dashboards_changed.emit()
+
+    def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None:
+        """Emit dashboard events on peer events so secondary charm data updates."""
+        if self._charm.unit.is_leader():
+            return
+        self.on.dashboards_changed.emit()
+
+    def update_dashboards(self, relation: Optional[Relation] = None) -> None:
+        """Re-establish dashboards on one or more relations.
+
+        If something changes between this library and a datasource, try to re-establish
+        invalid dashboards and invalidate active ones.
+
+        Args:
+            relation: a specific relation for which the dashboards have to be
+                updated. If not specified, all relations managed by this
+                :class:`GrafanaDashboardConsumer` will be updated.
+        """
+        changes = False
+        if self._charm.unit.is_leader():
+            relations = (
+                [relation] if relation else self._charm.model.relations[self._relation_name]
+            )
+            # Accumulate each relation's outcome so the event below can actually fire
+            for rel in relations:
+                changes |= bool(self._render_dashboards_and_signal_changed(rel))
+
+        if changes:
+            self.on.dashboards_changed.emit()
+
+    def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None:
+        """Update job config when providers depart.
+
+        When a Grafana dashboard provider departs, the configuration
+        for that provider is removed from the list of dashboards
+        """
+        if not self._charm.unit.is_leader():
+            return
+
+        self._remove_all_dashboards_for_relation(event.relation)
+
+    def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:
+        """Render the dashboards of a relation and persist them in the peer data.
+
+        Decode and render every template found in the remote application data. If
+        a template cannot be decompressed or rendered, drop all stored dashboards of
+        the relation and report the errors in the "event" key of the relation data.
+
+        Args:
+            relation: Relation; The relation the dashboard is associated with.
+
+        Returns:
+            a boolean indicating whether an event should be emitted
+        """
+        other_app = relation.app
+
+        raw_data = relation.data[other_app].get("dashboards", {})
+
+        if not raw_data:
+            logger.warning(
+                "No dashboard data found in the %s:%s relation",
+                self._relation_name,
+                str(relation.id),
+            )
+            return False
+
+        data = json.loads(raw_data)
+
+        # The only piece of data needed on this side of the relations is "templates"
+        templates = data.pop("templates")
+
+        # Import only if a charmed operator uses the consumer, we don't impose these
+        # dependencies on the client
+        from jinja2 import Template  # type: ignore
+        from jinja2.exceptions import TemplateSyntaxError  # type: ignore
+
+        # The dashboards are WAY too big since this ultimately calls out to Juju to
+        # set the relation data, and it overflows the maximum argument length for
+        # subprocess, so we have to use b64, annoyingly.
+        # Worse, Python3 expects absolutely everything to be a byte, and a plain
+        # `base64.b64encode()` is still too large, so we have to go through hoops
+        # of encoding to byte, compressing with lzma, converting to base64 so it
+        # can be converted to JSON, then all the way back.
+
+        rendered_dashboards = []
+        relation_has_invalid_dashboards = False
+
+        for fname, template in templates.items():
+            decoded_content = None
+            content = None
+            error = None
+            try:
+                decoded_content = _decode_dashboard_content(template["content"])
+                content = Template(decoded_content).render()
+                content = _encode_dashboard_content(_convert_dashboard_fields(content))
+            except lzma.LZMAError as e:
+                error = str(e)
+                relation_has_invalid_dashboards = True
+            except json.JSONDecodeError:
+                # A dashboard with broken JSON is skipped; it does not fail the relation
+                logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname))
+                continue
+            except TemplateSyntaxError as e:
+                error = str(e)
+                relation_has_invalid_dashboards = True
+
+            # Prepend the relation name and ID to the dashboard ID to avoid clashes with
+            # multiple relations with apps from the same charm, or having dashboards with
+            # the same ids inside their charm operators
+            rendered_dashboards.append(
+                {
+                    "id": "{}:{}/{}".format(relation.name, relation.id, fname),
+                    "original_id": fname,
+                    "content": content if content else None,
+                    "template": template,
+                    "valid": (error is None),
+                    "error": error,
+                }
+            )
+
+        if relation_has_invalid_dashboards:
+            self._remove_all_dashboards_for_relation(relation)
+
+            invalid_templates = [
+                entry["original_id"] for entry in rendered_dashboards if not entry["valid"]
+            ]
+
+            logger.warning(
+                "Cannot add one or more Grafana dashboards from relation '{}:{}': the following "
+                "templates are invalid: {}".format(
+                    relation.name,
+                    relation.id,
+                    invalid_templates,
+                )
+            )
+
+            relation.data[self._charm.app]["event"] = json.dumps(
+                {
+                    "errors": [
+                        {
+                            "dashboard_id": rendered_dashboard["original_id"],
+                            "error": rendered_dashboard["error"],
+                        }
+                        for rendered_dashboard in rendered_dashboards
+                        if rendered_dashboard["error"]
+                    ]
+                }
+            )
+
+            # Dropping dashboards for a relation needs to be signalled
+            return True
+        else:
+            currently_stored_data = self._get_stored_dashboards(relation.id)
+            coerced_data = (
+                _type_convert_stored(currently_stored_data) if currently_stored_data else {}
+            )
+
+            if coerced_data == rendered_dashboards:
+                return False
+            # Peer data is JSON, so the key must be a string to replace the existing entry
+            stored_dashboards = self.get_peer_data("dashboards")
+            stored_dashboards[str(relation.id)] = rendered_dashboards
+            self.set_peer_data("dashboards", stored_dashboards)
+            return True
+
+    def _remove_all_dashboards_for_relation(self, relation: Relation) -> None:
+        """If an errored dashboard is in stored data, remove it and trigger a deletion."""
+        if self._get_stored_dashboards(relation.id):
+            stored_dashboards = self.get_peer_data("dashboards")
+            stored_dashboards.pop(str(relation.id))
+            self.set_peer_data("dashboards", stored_dashboards)
+            self.on.dashboards_changed.emit()
+
+    def _to_external_object(self, relation_id, dashboard):
+        return {
+            "id": dashboard["original_id"],
+            "relation_id": relation_id,
+            "charm": dashboard["template"]["charm"],
+            "content": _decode_dashboard_content(dashboard["content"]),
+        }
+
+    @property
+    def dashboards(self) -> List[Dict]:
+        """Get a list of known dashboards across all instances of the monitored relation.
+
+        Returns: a list of known dashboards. The JSON of each of the dashboards is available
+            in the `content` field of the corresponding `dict`.
+        """
+        dashboards = []
+
+        for _, (relation_id, dashboards_for_relation) in enumerate(
+            self.get_peer_data("dashboards").items()
+        ):
+            for dashboard in dashboards_for_relation:
+                dashboards.append(self._to_external_object(relation_id, dashboard))
+
+        return dashboards
+
+    def _get_stored_dashboards(self, relation_id: int) -> list:
+        """Pull the dashboards stored for one relation out of the peer data (empty list if none)."""
+        return self.get_peer_data("dashboards").get(str(relation_id), [])
+
+    def _set_default_data(self) -> None:
+        """Set defaults if they are not in peer relation data."""
+        data = {"dashboards": {}}  # type: ignore
+        for k, v in data.items():
+            if not self.get_peer_data(k):
+                self.set_peer_data(k, v)
+
+    def set_peer_data(self, key: str, data: Any) -> None:
+        """Put information into the peer data bucket instead of `StoredState`."""
+        self._charm.peers.data[self._charm.app][key] = json.dumps(data)  # type: ignore
+
+    def get_peer_data(self, key: str) -> Any:
+        """Retrieve information from the peer data bucket instead of `StoredState`."""
+        data = self._charm.peers.data[self._charm.app].get(key, "")  # type: ignore
+        return json.loads(data) if data else {}
+
+
+class GrafanaDashboardAggregator(Object):
+    """API to retrieve Grafana dashboards from machine dashboards.
+
+    The :class:`GrafanaDashboardAggregator` object provides a way to
+    collate and aggregate Grafana dashboards from reactive/machine charms
+    and transport them into Charmed Operators, using Juju topology.
+
+    For detailed usage instructions, see the documentation for
+    :module:`lma-proxy-operator`, as this class is intended for use as a
+    single point of intersection rather than use in individual charms.
+
+    Since :class:`GrafanaDashboardAggregator` serves as a bridge between
+    Canonical Observability Stack Charmed Operators and Reactive Charms,
+    deployed in a Reactive Juju model, both a target relation which is
+    used to collect events from Reactive charms and a `grafana_relation`
+    which is used to send the collected data back to the Canonical
+    Observability Stack are required.
+
+    In its most streamlined usage, :class:`GrafanaDashboardAggregator` is
+    integrated in a charmed operator as follows:
+
+        self.grafana = GrafanaDashboardAggregator(self)
+
+    Args:
+        charm: a :class:`CharmBase` object which manages this
+            :class:`GrafanaProvider` object. Generally this is
+            `self` in the instantiating class.
+        target_relation: a :string: name of a relation managed by this
+            :class:`GrafanaDashboardAggregator`, which is used to communicate
+            with reactive/machine charms it defaults to "dashboards".
+        grafana_relation: a :string: name of a relation used by this
+            :class:`GrafanaDashboardAggregator`, which is used to communicate
+            with charmed grafana. It defaults to "downstream-grafana-dashboard"
+    """
+
+    _stored = StoredState()
+    on = GrafanaProviderEvents()
+
+    def __init__(
+        self,
+        charm: CharmBase,
+        target_relation: str = "dashboards",
+        grafana_relation: str = "downstream-grafana-dashboard",
+    ):
+        super().__init__(charm, grafana_relation)
+
+        # Reactive charms may be RPC-ish and not leave reliable data around. Keep
+        # StoredState here
+        self._stored.set_default(
+            dashboard_templates={},
+            id_mappings={},
+        )
+
+        self._charm = charm
+        self._target_relation = target_relation
+        self._grafana_relation = grafana_relation
+
+        self.framework.observe(
+            self._charm.on[self._grafana_relation].relation_joined,
+            self._update_remote_grafana,
+        )
+        self.framework.observe(
+            self._charm.on[self._grafana_relation].relation_changed,
+            self._update_remote_grafana,
+        )
+        self.framework.observe(
+            self._charm.on[self._target_relation].relation_changed,
+            self.update_dashboards,
+        )
+        self.framework.observe(
+            self._charm.on[self._target_relation].relation_broken,
+            self.remove_dashboards,
+        )
+
+    def update_dashboards(self, event: RelationEvent) -> None:
+        """If we get a dashboard from a reactive charm, parse it out and update."""
+        if self._charm.unit.is_leader():
+            self._upset_dashboards_on_event(event)
+
+    def _upset_dashboards_on_event(self, event: RelationEvent) -> None:
+        """Store the dashboards found for the event and push them to the downstream relation."""
+        dashboards = self._handle_reactive_dashboards(event)
+
+        if not dashboards:
+            logger.warning(
+                "Could not find dashboard data after a relation change for {}".format(event.app)
+            )
+            return
+
+        for dashboard_id, dashboard in dashboards.items():
+            if not isinstance(dashboard, dict):  # built-ins arrive already wrapped
+                dashboard = self._content_to_dashboard_object(dashboard, event)
+            self._stored.dashboard_templates[dashboard_id] = dashboard
+
+        self._stored.id_mappings[event.app.name] = dashboards
+        self._update_remote_grafana(event)
+
+    def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None:
+        """Push dashboards to the downstream Grafana relation."""
+        # It's still ridiculous to add a UUID here, but needed
+        stored_data = {
+            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "uuid": str(uuid.uuid4()),
+        }
+
+        for grafana_relation in self.model.relations[self._grafana_relation]:
+            grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+
+    def remove_dashboards(self, event: RelationBrokenEvent) -> None:
+        """Remove the dashboards of a departing app and refresh the downstream Grafana data.
+
+        Apps that never contributed dashboards have no entry in `id_mappings`; they are
+        skipped instead of failing the relation-broken hook with a `KeyError`.
+        """
+        app_name = event.app.name
+        if app_name in self._stored.id_mappings:
+            for dashboard_id in _type_convert_stored(self._stored.id_mappings[app_name]):
+                if dashboard_id in self._stored.dashboard_templates:  # may already be gone
+                    del self._stored.dashboard_templates[dashboard_id]
+            del self._stored.id_mappings[app_name]
+
+        # Re-publish what is left through the same payload builder as the update path
+        self._update_remote_grafana(event)
+
+    # Yes, this has a fair amount of branching. It's not that complex, though
+    def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
+        """Remove existing reactive charm datasource templating out.
+
+        This method iterates through *known* places where reactive charms may set
+        data in contributed dashboards and removes them.
+
+        `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from
+        the Grafana UI. It is not present in earlier Grafana versions, and can be disabled
+        in 5.3.4 and above (optionally). If set, any values present will be substituted on
+        import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors
+        for datasources, and leaving this present results in "default" datasource values
+        which are broken.
+
+        Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to
+        set a `host` variable for use in dashboards which is not meaningful in the context
+        of Juju topology and will yield broken dashboards.
+
+        Further properties may be discovered.
+        """
+        dash = template["dashboard"]
+        try:
+            if "list" in dash["templating"]:
+                for i in range(len(dash["templating"]["list"])):
+                    if (
+                        "datasource" in dash["templating"]["list"][i]
+                        and "Juju" in dash["templating"]["list"][i]["datasource"]
+                    ):
+                        dash["templating"]["list"][i]["datasource"] = r"${prometheusds}"
+                    if (
+                        "name" in dash["templating"]["list"][i]
+                        and dash["templating"]["list"][i]["name"] == "host"
+                    ):
+                        dash["templating"]["list"][i] = REACTIVE_CONVERTER
+        except KeyError:
+            logger.debug("No existing templating data in dashboard")
+
+        if "__inputs" in dash:
+            # Filter into a new list: deleting by index while looping skips entries and
+            # eventually indexes past the end of the shrinking list
+            inputs = [item for item in dash["__inputs"] if item.get("pluginName") != "Prometheus"]
+            if inputs:
+                dash["__inputs"] = inputs
+            else:
+                # Nothing is left to substitute on import, so drop the key entirely
+                del dash["__inputs"]
+
+        template["dashboard"] = dash
+        return template
+
+    def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]:
+        """Look for a dashboard in relation data (during a reactive hook) or builtin by name."""
+        templates = []
+        id = ""
+
+        # Reactive data can reliably be pulled out of events. In theory, if we got an event,
+        # it's on the bucket, but using event explicitly keeps the mental model in
+        # place for reactive
+        for k in event.relation.data[event.unit].keys():
+            if k.startswith("request_"):
+                templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"])
+
+        for k in event.relation.data[event.app].keys():
+            if k.startswith("request_"):
+                templates.append(json.loads(event.relation.data[event.app][k])["dashboard"])
+
+        builtins = self._maybe_get_builtin_dashboards(event)
+
+        if not templates and not builtins:
+            return {}
+
+        dashboards = {}
+        for t in templates:
+            # Replace values with LMA-style templating
+            t = self._strip_existing_datasources(t)
+
+            # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON
+            # in the bucket back out to the actual "dashboard" we _need_, this is the way
+            # This is not a mistake -- there's a double nesting in reactive charms, and
+            # Grafana won't load it. We have to unbox:
+            # event.relation.data[event.<type>]["request_*"]["dashboard"]["dashboard"],
+            # and the final unboxing is below.
+            dash = json.dumps(t["dashboard"])
+
+            # Replace the old-style datasource templates
+            dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash)
+            dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash)
+
+            from jinja2 import Template
+
+            content = _encode_dashboard_content(
+                Template(dash).render(host=event.unit.name, datasource="prometheus")
+            )
+            id = "prog:{}".format(content[-24:-16])
+
+            dashboards[id] = content
+        return {**builtins, **dashboards}
+
+    def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict:
+        """Tries to match the event with an included dashboard.
+
+        Scans dashboards packed with the charm instantiating this class, and tries to match
+        one with the event. There is no guarantee that any given event will match a builtin,
+        since each charm instantiating this class may include a different set of dashboards,
+        or none.
+        """
+        builtins = {}
+        dashboards_path = None
+
+        try:
+            dashboards_path = _resolve_dir_against_charm_path(
+                self._charm, "src/grafana_dashboards"
+            )
+        except InvalidDirectoryPathError as e:
+            logger.warning(
+                "Invalid Grafana dashboards folder at %s: %s",
+                e.grafana_dashboards_absolute_path,
+                e.message,
+            )
+
+        if dashboards_path:
+
+            def _is_dashboard(p: Path) -> bool:
+                return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl"))
+
+            for path in filter(_is_dashboard, Path(dashboards_path).glob("*")):
+                # A dashboard is matched when its file name contains the remote app name
+                if event.app.name in path.name:
+                    dashboard_id = "file:{}".format(path.stem)
+                    builtins[dashboard_id] = self._content_to_dashboard_object(
+                        _encode_dashboard_content(path.read_bytes()), event
+                    )
+
+        return builtins
+
+    def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict:
+        return {
+            "charm": event.app.name,
+            "content": content,
+            "juju_topology": self._juju_topology(event),
+        }
+
+    # This is not actually used in the dashboards, but is present to provide a secondary
+    # salt to ensure uniqueness in the dict keys in case individual charm units provide
+    # dashboards
+    def _juju_topology(self, event: RelationEvent) -> Dict:
+        return {
+            "model": self._charm.model.name,
+            "model_uuid": self._charm.model.uuid,
+            "application": event.app.name,
+            "unit": event.unit.name,
+        }
diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py
index a065dd53..c985b1e7 100644
--- a/lib/charms/observability_libs/v0/juju_topology.py
+++ b/lib/charms/observability_libs/v0/juju_topology.py
@@ -76,7 +76,7 @@
 LIBID = "bced1658f20f49d28b88f61f83c2d232"
 
 LIBAPI = 0
-LIBPATCH = 1
+LIBPATCH = 2
 
 
 class InvalidUUIDError(Exception):
@@ -126,9 +126,27 @@ def __init__(
         self._unit = unit
 
     def is_valid_uuid(self, uuid):
-        """Validates the supplied UUID against the Juju Model UUID pattern."""
+        """Validate the supplied UUID against the Juju Model UUID pattern."""
+        # TODO:
+        # Harness is hardcoding a UUID that is v1, not v4: f2c1b2a6-e006-11eb-ba80-0242ac130004
+        # See: https://github.com/canonical/operator/issues/779
+        #
+        # >>> uuid.UUID("f2c1b2a6-e006-11eb-ba80-0242ac130004").version
+        # 1
+        #
+        # we changed the validation of the 3rd UUID block: 4[a-f0-9]{3} -> [a-f0-9]{4}
+        # See: https://github.com/canonical/operator/blob/main/ops/testing.py#L1094
+        #
+        # Juju in fact generates a UUID v4: https://github.com/juju/utils/blob/master/uuid.go#L62
+        # but does not validate it is actually v4:
+        # See:
+        # - https://github.com/juju/utils/blob/master/uuid.go#L22
+        # - https://github.com/juju/schema/blob/master/strings.go#L79
+        #
+        # Once Harness fixes this, we should remove this comment and refactor the regex or
+        # the entire method using the uuid module to validate UUIDs
         regex = re.compile(
-            "^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$"
+            "^[a-f0-9]{8}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$"
         )
         return bool(regex.match(uuid))
 
diff --git a/lib/charms/traefik_k8s/v0/ingress_per_unit.py b/lib/charms/traefik_k8s/v0/ingress_per_unit.py
index 21c58a98..59dfebfa 100644
--- a/lib/charms/traefik_k8s/v0/ingress_per_unit.py
+++ b/lib/charms/traefik_k8s/v0/ingress_per_unit.py
@@ -10,15 +10,13 @@
 ## Getting Started
 
 To get started using the library, you just need to fetch the library using `charmcraft`.
-**Note that you also need to add the `serialized_data_interface` dependency to your
-charm's `requirements.txt`.**
 
 ```shell
-cd some-charm
 charmcraft fetch-lib charms.traefik_k8s.v0.ingress_per_unit
-echo -e "serialized_data_interface\n" >> requirements.txt
 ```
 
+Add the `jsonschema` dependency to the `requirements.txt` of your charm.
+
 ```yaml
 requires:
     ingress:
@@ -48,285 +46,534 @@ def _handle_ingress_per_unit(self, event):
         logger.info("This unit's ingress URL: %s", self.ingress_per_unit.url)
 ```
 """
-
 import logging
-from typing import Optional
-
-from ops.charm import CharmBase, RelationBrokenEvent, RelationEvent, RelationRole
-from ops.framework import EventSource
-from ops.model import Relation, Unit
-
-try:
-    from serialized_data_interface import EndpointWrapper
-    from serialized_data_interface.errors import RelationDataError
-    from serialized_data_interface.events import EndpointWrapperEvents
-except ImportError:
-    import os
-
-    library_name = os.path.basename(__file__)
-    raise ModuleNotFoundError(
-        "To use the '{}' library, you must include "
-        "the '{}' package in your dependencies".format(library_name, "serialized_data_interface")
-    ) from None  # Suppress original ImportError
-
-try:
-    # introduced in 3.9
-    from functools import cache  # type: ignore
-except ImportError:
-    from functools import lru_cache
-
-    cache = lru_cache(maxsize=None)
+import socket
+import typing
+from typing import Dict, Optional, Union
+
+import ops.model
+import yaml
+from ops.charm import CharmBase, RelationBrokenEvent, RelationEvent
+from ops.framework import EventSource, Object, ObjectEvents
+from ops.model import (
+    ActiveStatus,
+    Application,
+    BlockedStatus,
+    Relation,
+    StatusBase,
+    Unit,
+    WaitingStatus,
+)
 
 # The unique Charmhub library identifier, never change it
-LIBID = "7ef06111da2945ed84f4f5d4eb5b353a"  # can't register a library until the charm is in the store 9_9
+LIBID = "7ef06111da2945ed84f4f5d4eb5b353a"
 
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 6
+LIBPATCH = 10
 
 log = logging.getLogger(__name__)
 
-INGRESS_SCHEMA = {
-    "v1": {
-        "requires": {
-            "unit": {
-                "type": "object",
-                "properties": {
-                    "model": {"type": "string"},
-                    "name": {"type": "string"},
-                    "host": {"type": "string"},
-                    "port": {"type": "integer"},
-                },
-                "required": ["model", "name", "host", "port"],
-            }
-        },
-        "provides": {
-            "app": {
-                "type": "object",
-                "properties": {
-                    "ingress": {
-                        "type": "object",
-                        "patternProperties": {
-                            "": {
-                                "type": "object",
-                                "properties": {"url": {"type": "string"}},
-                                "required": ["url"],
-                            }
-                        },
-                    }
-                },
-                "required": ["ingress"],
-            }
+try:
+    import jsonschema
+
+    DO_VALIDATION = True
+except ModuleNotFoundError:
+    log.warning(
+        "The `ingress_per_unit` library needs the `jsonschema` package to be able "
+        "to do runtime data validation; without it, it will still work but validation "
+        "will be disabled. \n"
+        "It is recommended to add `jsonschema` to the 'requirements.txt' of your charm, "
+        "which will enable this feature."
+    )
+    DO_VALIDATION = False
+
+# LIBRARY GLOBALS
+RELATION_INTERFACE = "ingress_per_unit"
+DEFAULT_RELATION_NAME = RELATION_INTERFACE.replace("_", "-")
+
+INGRESS_REQUIRES_UNIT_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "model": {"type": "string"},
+        "name": {"type": "string"},
+        "host": {"type": "string"},
+        "port": {"type": "integer"},
+    },
+    "required": ["model", "name", "host", "port"],
+}
+INGRESS_PROVIDES_APP_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "ingress": {
+            "type": "object",
+            "patternProperties": {
+                "": {
+                    "type": "object",
+                    "properties": {
+                        "url": {"type": "string"},
+                    },
+                    "required": ["url"],
+                }
+            },
         },
-    }
+        # Optional key for backwards compatibility
+        # with legacy requirers based on SDI
+        "_supported_versions": {"type": "string"},
+    },
+    "required": ["ingress"],
 }
 
 
-class IngressPerUnitRequestEvent(RelationEvent):
-    """Event representing an incoming request.
+# TYPES
+try:
+    from typing import TypedDict
+except ImportError:
+    from typing_extensions import TypedDict  # py35 compat
+
+
+class RequirerData(TypedDict):  # pyright: reportGeneralTypeIssues=false
+    """Model of the data a unit implementing the requirer will need to provide."""
+
+    model: str
+    name: str
+    host: str
+    port: int
+
+
+RequirerUnitData = Dict[Unit, "RequirerData"]
+KeyValueMapping = Dict[str, str]
+ProviderApplicationData = Dict[str, KeyValueMapping]
+
+
+def _validate_data(data, schema):
+    """Checks whether `data` matches `schema`.
 
-    This is equivalent to the "ready" event, but is more semantically meaningful.
+    Will raise DataValidationError if the data is not valid, else return None.
     """
+    if not DO_VALIDATION:
+        return
+    try:
+        jsonschema.validate(instance=data, schema=schema)
+    except jsonschema.ValidationError as e:
+        raise DataValidationError(data, schema) from e
 
 
-class IngressPerUnitProviderEvents(EndpointWrapperEvents):
-    """Container for IUP events."""
+# EXCEPTIONS
+class DataValidationError(RuntimeError):
+    """Raised when data validation fails on IPU relation data."""
 
-    request = EventSource(IngressPerUnitRequestEvent)
 
+class RelationException(RuntimeError):
+    """Base class for relation exceptions from this library.
 
-class IngressPerUnitProvider(EndpointWrapper):
-    """Implementation of the provider of ingress_per_unit."""
+    Attributes:
+        relation: The Relation which caused the exception.
+        entity: The Application or Unit which caused the exception.
+    """
 
-    ROLE = RelationRole.provides.name
-    INTERFACE = "ingress_per_unit"
-    SCHEMA = INGRESS_SCHEMA
+    def __init__(self, relation: Relation, entity: Union[Application, Unit]):
+        super().__init__(relation)
+        self.args = (
+            "There is an error with the relation {}:{} with {}".format(
+                relation.name, relation.id, entity.name
+            ),
+        )
+        self.relation = relation
+        self.entity = entity
 
-    on = IngressPerUnitProviderEvents()
 
-    def __init__(self, charm: CharmBase, endpoint: str = None):
-        """Constructor for IngressPerUnitProvider.
+class RelationDataMismatchError(RelationException):
+    """Data from different units do not match where they should."""
+
+
+class RelationPermissionError(RelationException):
+    """Ingress is requested to do something for which it lacks permissions."""
+
+    def __init__(self, relation: Relation, entity: Union[Application, Unit], message: str):
+        super(RelationPermissionError, self).__init__(relation, entity)
+        self.args = (
+            "Unable to write data to relation '{}:{}' with {}: {}".format(
+                relation.name, relation.id, entity.name, message
+            ),
+        )
+
+
+# EVENTS
+class RelationAvailableEvent(RelationEvent):
+    """Event triggered when a relation is ready to provide ingress."""
+
+
+class RelationFailedEvent(RelationEvent):
+    """Event triggered when something went wrong with a relation."""
+
+
+class RelationReadyEvent(RelationEvent):
+    """Event triggered when a remote relation has the expected data."""
+
+
+class IngressPerUnitEvents(ObjectEvents):
+    """Container for events for IngressPerUnit."""
+
+    available = EventSource(RelationAvailableEvent)
+    ready = EventSource(RelationReadyEvent)
+    failed = EventSource(RelationFailedEvent)
+    broken = EventSource(RelationBrokenEvent)
+
+
+class _IngressPerUnitBase(Object):
+    """Base class for IngressPerUnit interface classes."""
+
+    if typing.TYPE_CHECKING:
+
+        @property
+        def on(self) -> IngressPerUnitEvents:
+            ...  # noqa
+
+    def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME):
+        """Constructor for _IngressPerUnitBase.
 
         Args:
             charm: The charm that is instantiating the instance.
-            endpoint: The name of the relation endpoint to bind to
+            relation_name: The name of the relation endpoint to bind to
                 (defaults to "ingress-per-unit").
         """
-        super().__init__(charm, endpoint)
-        self.framework.observe(self.on.ready, self._emit_request_event)
-
-    def _emit_request_event(self, event):
-        self.on.request.emit(event.relation)
-
-    def get_request(self, relation: Relation):
-        """Get the IngressRequest for the given Relation."""
-        return IngressRequest(self, relation)
+        super().__init__(charm, relation_name)
+        self.charm: CharmBase = charm
+
+        self.relation_name = relation_name
+        self.app = self.charm.app
+        self.unit = self.charm.unit
+
+        observe = self.framework.observe
+        rel_events = charm.on[relation_name]
+        observe(rel_events.relation_created, self._handle_relation)
+        observe(rel_events.relation_joined, self._handle_relation)
+        observe(rel_events.relation_changed, self._handle_relation)
+        observe(rel_events.relation_broken, self._handle_relation_broken)
+        observe(charm.on.leader_elected, self._handle_upgrade_or_leader)
+        observe(charm.on.upgrade_charm, self._handle_upgrade_or_leader)
 
-    @cache
-    def is_failed(self, relation: Relation = None):
-        """Checks whether the given relation, or any relation if not specified, has an error."""
+    @property
+    def relations(self):
+        """The list of Relation instances associated with this relation_name."""
+        return list(self.charm.model.relations[self.relation_name])
+
+    def _handle_relation(self, event):
+        relation = event.relation
+        if self.is_ready(relation):
+            self.on.ready.emit(relation)
+        elif self.is_available(relation):
+            self.on.available.emit(relation)
+        elif self.is_failed(relation):
+            self.on.failed.emit(relation)
+        else:
+            log.debug(
+                "Relation {} is neither ready, nor available, nor failed. "
+                "Something fishy's going on...".format(relation)
+            )
+
+    def get_status(self, relation: Relation) -> StatusBase:
+        """Get the suggested status for the given Relation."""
+        if self.is_failed(relation):
+            return BlockedStatus(
+                "Error handling relation {}:{}".format(relation.name, relation.id)
+            )
+        elif not self.is_available(relation):
+            return WaitingStatus("Waiting on relation {}:{}".format(relation.name, relation.id))
+        elif not self.is_ready(relation):
+            return WaitingStatus("Waiting on relation {}:{}".format(relation.name, relation.id))
+        else:
+            return ActiveStatus()
+
+    def _handle_relation_broken(self, event):
+        self.on.broken.emit(event.relation)
+
+    def _handle_upgrade_or_leader(self, _):
+        pass
+
+    def is_available(self, relation: Optional[Relation] = None) -> bool:
+        """Check whether the given relation is available.
+
+        Or any relation if not specified.
+        """
         if relation is None:
-            return any(self.is_failed(relation) for relation in self.relations)
-        if not relation.units:
+            return any(map(self.is_available, self.relations))
+        if relation.app is None:
+            return False
+        if not relation.app.name:
+            # Juju doesn't provide JUJU_REMOTE_APP during relation-broken
+            # hooks. See https://github.com/canonical/operator/issues/693.
+            # Relation in the process of breaking cannot be available.
             return False
-        if super().is_failed(relation):
-            return True
-        data = self.unwrap(relation)
-        prev_fields = None
-        for unit in relation.units:
-            if not data[unit]:
-                continue
-            new_fields = {field: data[unit][field] for field in ("model", "port")}
-            if prev_fields is None:
-                prev_fields = new_fields
-            if new_fields != prev_fields:
-                raise RelationDataMismatchError(relation, unit)
-        return False
 
-    @property
-    def proxied_endpoints(self):
-        """Returns the ingress settings provided to units by this IngressPerUnitProvider.
+        return True
 
-        For example, when this IngressPerUnitProvider has provided the
-        `http://foo.bar/my-model.my-app-1` and `http://foo.bar/my-model.my-app-2` URLs to
-        the two units of the my-app application, the returned dictionary will be:
+    def is_ready(self, relation: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is ready.
 
-        ```
-        {
-            "my-app/1": {
-                "url": "http://foo.bar/my-model.my-app-1"
-            },
-            "my-app/2": {
-                "url": "http://foo.bar/my-model.my-app-2"
-            }
-        }
-        ```
+        Or any relation if not specified.
+        A given relation is ready if the remote side has sent valid data.
+        The base implementation does nothing but check that the relation is
+        available. It's up to subclasses to decide what it means for the
+        relation to be actually 'ready'.
         """
-        results = {}
+        if relation is None:
+            return any(map(self.is_ready, self.relations))
+        return self.is_available(relation)
 
-        for ingress_relation in self.charm.model.relations[self.endpoint]:
-            results.update(self.unwrap(ingress_relation)[self.charm.app].get("ingress", {}))
+    def is_failed(self, _: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is failed.
 
-        return results
+        Or any relation if not specified.
+        """
+        raise NotImplementedError("implement in subclass")
 
 
-class IngressRequest:
-    """A request for per-unit ingress."""
+class IngressPerUnitProvider(_IngressPerUnitBase):
+    """Implementation of the provider of ingress_per_unit."""
 
-    def __init__(self, provider: IngressPerUnitProvider, relation: Relation):
-        """Construct an IngressRequest."""
-        self._provider = provider
-        self._relation = relation
-        self._data = provider.unwrap(relation)
+    on = IngressPerUnitEvents()
 
-    @property
-    def model(self):
-        """The name of the model the request was made from."""
-        return self._get_data_from_first_unit("model")
+    def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME):
+        """Constructor for IngressPerUnitProvider.
 
-    @property
-    def app(self):
-        """The remote application."""
-        return self._relation.app
+        Args:
+            charm: The charm that is instantiating the instance.
+            relation_name: The name of the relation endpoint to bind to
+                (defaults to "ingress-per-unit").
+        """
+        super().__init__(charm, relation_name)
+        observe = self.framework.observe
+        observe(self.charm.on[relation_name].relation_joined, self._share_version_info)
 
-    @property
-    def app_name(self):
-        """The name of the remote app.
+    def _share_version_info(self, event):
+        """Backwards-compatibility shim for version negotiation.
 
-        Note: This is not the same as `self.app.name` when using CMR relations,
-        since `self.app.name` is replaced by a `remote-{UUID}` pattern.
+        Allows older versions of IPU (requirer side) to interact with this
+        provider without breaking.
+        Will be removed in a future version of this library.
+        Do not use.
         """
-        first_unit_name = self._get_data_from_first_unit("name")
+        relation = event.relation
+        if self.charm.unit.is_leader():
+            log.info("shared supported_versions shim information")
+            relation.data[self.charm.app]["_supported_versions"] = "- v1"
 
-        if first_unit_name:
-            return first_unit_name.split("/")[0]
+    def is_ready(self, relation: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is ready.
 
-        return None
+        Or any relation if not specified.
+        A given relation is ready if SOME remote side has sent valid data.
+        """
+        if relation is None:
+            return any(map(self.is_ready, self.relations))
 
-    @property
-    def units(self):
-        """The remote units."""
-        return sorted(self._relation.units, key=lambda unit: unit.name)
+        if not super().is_ready(relation):
+            return False
 
-    @property
-    def port(self):
-        """The backend port."""
-        return self._get_data_from_first_unit("port")
+        try:
+            requirer_unit_data = self._requirer_unit_data(relation)
+        except Exception:
+            log.exception("Cannot fetch ingress data for the '{}' relation".format(relation))
+            return False
 
-    def get_host(self, unit: Unit):
-        """The hostname (DNS address, ip) of the given unit."""
-        return self._get_unit_data(unit, "host")
+        return any(requirer_unit_data.values())
 
-    def get_unit_name(self, unit: Unit):
-        """The name of the remote unit.
+    def is_failed(self, relation: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is failed.
 
-        Note: This is not the same as `self.unit.name` when using CMR relations,
-        since `self.unit.name` is replaced by a `remote-{UUID}` pattern.
+        Or any relation if not specified.
         """
-        return self._get_unit_data(unit, "name")
+        if relation is None:
+            return any(map(self.is_failed, self.relations))
 
-    def _get_data_from_first_unit(self, key: str):
-        if self.units:
-            first_unit_data = self._data[self.units[0]]
+        if not relation.app.name:  # type: ignore
+            # Juju doesn't provide JUJU_REMOTE_APP during relation-broken
+            # hooks. See https://github.com/canonical/operator/issues/693
+            return False
 
-            if key in first_unit_data:
-                return first_unit_data[key]
+        if not relation.units:
+            # Relations without requiring units cannot be in failed state
+            return False
 
-        return None
+        try:
+            # grab the data and validate it; might raise
+            requirer_unit_data = self._requirer_unit_data(relation)
+        except DataValidationError as e:
+            log.warning("Failed to validate relation data for {} relation: {}".format(relation, e))
+            return True
 
-    def _get_unit_data(self, unit: Unit, key: str):
-        if self.units:
-            if unit in self.units:
-                unit_data = self._data[unit]
+        # verify that all remote units (requirer's side) publish the same model.
+        # We do not validate the port because, in case of changes to the configuration
+        # of the charm or a new version of the charmed workload, e.g. over an upgrade,
+        # the remote port may be different among units.
+        expected_model = None  # It may be none for units that have not yet written data
 
-                if key in unit_data:
-                    return unit_data[key]
+        for remote_unit, remote_unit_data in requirer_unit_data.items():
+            if "model" in remote_unit_data:
+                remote_model = remote_unit_data["model"]
+                if not expected_model:
+                    expected_model = remote_model
+                elif expected_model != remote_model:
+                    raise RelationDataMismatchError(relation, remote_unit)
 
-        return None
+        return False
+
+    def is_unit_ready(self, relation: Relation, unit: Unit) -> bool:
+        """Report whether the given unit has shared data in its unit data bag."""
+        # sanity check: this should not occur in production, but it may happen
+        # during testing: cfr https://github.com/canonical/traefik-k8s-operator/issues/39
+        assert (
+            unit in relation.units
+        ), "attempting to get ready state for unit that does not belong to relation"
+        if relation.data.get(unit, {}).get("data"):
+            # TODO consider doing schema-based validation here
+            return True
+        return False
+
+    def get_data(self, relation: Relation, unit: Unit) -> "RequirerData":
+        """Fetch the data shared by the specified unit on the relation (Requirer side)."""
+        data = yaml.safe_load(relation.data[unit]["data"])
+        _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA)
+        return data
 
-    def respond(self, unit: Unit, url: str):
-        """Send URL back for the given unit.
+    def publish_url(self, relation: Relation, unit_name: str, url: str):
+        """Place the ingress url in the application data bag for the units on the requires side.
 
-        Note: only the leader can send URLs.
+        Assumes that this unit is leader.
         """
-        # Can't use `unit.name` because with CMR it's a UUID.
-        remote_unit_name = self.get_unit_name(unit)
-        ingress = self._data[self._provider.charm.app].setdefault("ingress", {})
-        ingress.setdefault(remote_unit_name, {})["url"] = url
-        self._provider.wrap(self._relation, self._data)
+        raw_data = relation.data[self.app].get("data", None)
+        data = yaml.safe_load(raw_data) if raw_data else {"ingress": {}}
+
+        # we ensure that the application databag has the shape we think it
+        # should have; to catch any inconsistencies early on.
+        try:
+            _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA)
+        except DataValidationError as e:
+            log.error(
+                "unable to publish url to {}: corrupted application databag ({})".format(
+                    unit_name, e
+                )
+            )
+            return
+
+        # we update the data with a new url
+        data["ingress"][unit_name] = {"url": url}
+
+        # we validate the data **again**, to ensure that we respected the schema
+        # and did not accidentally corrupt our own databag.
+        _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA)
+
+        try:
+            relation.data[self.app]["data"] = yaml.safe_dump(data)
+        except ops.model.RelationDataError:
+            unit = self.unit
+            raise RelationPermissionError(
+                relation,
+                unit,
+                "failed to write application data: leader={}".format(unit.is_leader()),
+            )
+
+    def wipe_ingress_data(self, relation):
+        """Remove all published ingress data.
+
+        Assumes that this unit is leader.
+        """
+        relation.data[self.app]["data"] = ""
+
+    def _requirer_unit_data(self, relation: Relation) -> RequirerUnitData:
+        """Fetch and validate the requirer's unit databag."""
+        if not relation.app or not relation.app.name:
+            # Handle edge case where remote app name can be missing, e.g.,
+            # relation_broken events.
+            # FIXME https://github.com/canonical/traefik-k8s-operator/issues/34
+            return {}
 
+        remote_units = [unit for unit in relation.units if unit.app is not self.app]
+
+        requirer_unit_data = {}
+        for remote_unit in remote_units:
+            remote_data = relation.data[remote_unit].get("data")
+            remote_deserialized = {}
+            if remote_data:
+                remote_deserialized = yaml.safe_load(remote_data)
+                _validate_data(remote_deserialized, INGRESS_REQUIRES_UNIT_SCHEMA)
+            requirer_unit_data[remote_unit] = remote_deserialized
+        return requirer_unit_data
+
+    def _provider_app_data(self, relation: Relation) -> ProviderApplicationData:
+        """Fetch and validate the provider's app databag."""
+        if not relation.app or not relation.app.name:
+            # Handle edge case where remote app name can be missing, e.g.,
+            # relation_broken events.
+            # FIXME https://github.com/canonical/traefik-k8s-operator/issues/34
+            return {}
 
-class RelationDataMismatchError(RelationDataError):
-    """Data from different units do not match where they should."""
+        provider_app_data = {}
+        # we start by looking at the provider's app databag
+        if self.unit.is_leader():
+            # only leaders can read their app's data
+            data = relation.data[self.app].get("data")
+            deserialized = {}
+            if data:
+                deserialized = yaml.safe_load(data)
+                _validate_data(deserialized, INGRESS_PROVIDES_APP_SCHEMA)
+            provider_app_data = deserialized.get("ingress", {})
+
+        return provider_app_data
+
+    @property
+    def proxied_endpoints(self) -> dict:
+        """The ingress settings provided to units by this provider.
+
+        For example, when this IngressPerUnitProvider has provided the
+        `http://foo.bar/my-model.my-app-1` and
+        `http://foo.bar/my-model.my-app-2` URLs to the two units of the
+        my-app application, the returned dictionary will be:
+
+        ```
+        {
+            "my-app/1": {
+                "url": "http://foo.bar/my-model.my-app-1"
+            },
+            "my-app/2": {
+                "url": "http://foo.bar/my-model.my-app-2"
+            }
+        }
+        ```
+        """
+        results = {}
+
+        for ingress_relation in self.relations:
+            provider_app_data = self._provider_app_data(ingress_relation)
+            results.update(provider_app_data)
+
+        return results
 
 
 class IngressPerUnitConfigurationChangeEvent(RelationEvent):
     """Event representing a change in the data sent by the ingress."""
 
 
-class IngressPerUnitRequirerEvents(EndpointWrapperEvents):
+class IngressPerUnitRequirerEvents(IngressPerUnitEvents):
     """Container for IUP events."""
 
     ingress_changed = EventSource(IngressPerUnitConfigurationChangeEvent)
 
 
-class IngressPerUnitRequirer(EndpointWrapper):
+class IngressPerUnitRequirer(_IngressPerUnitBase):
     """Implementation of the requirer of ingress_per_unit."""
 
     on = IngressPerUnitRequirerEvents()
 
-    ROLE = RelationRole.requires.name
-    INTERFACE = "ingress_per_unit"
-    SCHEMA = INGRESS_SCHEMA
-    LIMIT = 1
-
     def __init__(
         self,
         charm: CharmBase,
-        endpoint: str = None,
+        relation_name: str = DEFAULT_RELATION_NAME,
         *,
         host: str = None,
         port: int = None,
@@ -340,29 +587,103 @@ def __init__(
 
         Args:
             charm: the charm that is instantiating the library.
-            endpoint: the name of the relation endpoint to bind to
-                (defaults to "ingress-per-unit"; relation must be of interface type
-                "ingress_per_unit" and have "limit: 1")
-            host: Hostname to be used by the ingress provider to address the requirer
-                unit; if unspecified, the pod ip of the unit will be used instead
+            relation_name: the name of the relation endpoint to bind to
+                (defaults to "ingress-per-unit"; relation must be of interface
+                type "ingress_per_unit" and have "limit: 1")
+            host: Hostname to be used by the ingress provider to address the
+                requirer unit; if unspecified, the FQDN of the unit will be
+                used instead
         Request Args:
             port: the port of the service
         """
-        super().__init__(charm, endpoint)
+        super().__init__(charm, relation_name)
+
+        # if instantiated with a port, remember the host/port pair so it can be
+        # published automatically once the relation is available.
         if port:
-            self.auto_data = self._complete_request(host or "", port)
+            self._auto_data = host, port
+        else:
+            self._auto_data = None
 
         # Workaround for SDI not marking the EndpointWrapper as not
         # ready upon a relation broken event
         self.is_relation_broken = False
 
         self.framework.observe(
-            self.charm.on[self.endpoint].relation_changed, self._emit_ingress_change_event
+            self.charm.on[self.relation_name].relation_changed, self._emit_ingress_change_event
         )
         self.framework.observe(
-            self.charm.on[self.endpoint].relation_broken, self._emit_ingress_change_event
+            self.charm.on[self.relation_name].relation_broken, self._emit_ingress_change_event
         )
 
+    def _handle_relation(self, event):
+        super()._handle_relation(event)
+        self._publish_auto_data(event.relation)
+
+    def _handle_upgrade_or_leader(self, event):
+        for relation in self.relations:
+            self._publish_auto_data(relation)
+
+    def _publish_auto_data(self, relation: Relation):
+        if self._auto_data and self.is_available(relation):
+            host, port = self._auto_data
+            self.provide_ingress_requirements(host=host, port=port)
+
+    @property
+    def relation(self) -> Optional[Relation]:
+        """The established Relation instance, or None if still unrelated."""
+        if len(self.relations) > 1:
+            raise ValueError("Multiple ingress-per-unit relations found.")
+        return self.relations[0] if self.relations else None
+
+    def is_ready(self, relation: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is ready.
+
+        Or any relation if not specified.
+        A given relation is ready if the remote side has sent valid data.
+        """
+        if super().is_ready(relation) is False:
+            return False
+
+        return bool(self.url)
+
+    def is_failed(self, relation: Optional[Relation] = None) -> bool:
+        """Checks whether the given relation is failed.
+
+        Or any relation if not specified.
+        """
+        if not self.relations:  # can't fail if you can't try
+            return False
+
+        if relation is None:
+            return any(map(self.is_failed, self.relations))
+
+        if not relation.app.name:  # type: ignore
+            # Juju doesn't provide JUJU_REMOTE_APP during relation-broken
+            # hooks. See https://github.com/canonical/operator/issues/693
+            return False
+
+        if not relation.units:
+            return False
+
+        try:
+            # grab the data and validate it; might raise
+            raw = relation.data[self.unit].get("data")
+        except Exception:
+            log.exception("Error accessing relation databag")
+            return True
+
+        if raw:
+            # validate data
+            data = yaml.safe_load(raw)
+            try:
+                _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA)
+            except DataValidationError:
+                log.exception("Error validating relation data")
+                return True
+
+        return False
+
     def _emit_ingress_change_event(self, event):
         if isinstance(event, RelationBrokenEvent):
             self.is_relation_broken = True
@@ -370,49 +691,57 @@ def _emit_ingress_change_event(self, event):
         # TODO Avoid spurious events, emit only when URL changes
         self.on.ingress_changed.emit(self.relation)
 
-    def _complete_request(self, host: Optional[str], port: int):
-        if not host:
-            binding = self.charm.model.get_binding(self.endpoint)
-            host = str(binding.network.bind_address)
-
-        return {
-            self.charm.unit: {
-                "model": self.model.name,
-                "name": self.charm.unit.name,
-                "host": host,
-                "port": port,
-            },
-        }
-
-    def request(self, *, host: str = None, port: int):
-        """Request ingress to this unit.
+    def provide_ingress_requirements(self, *, host: str = None, port: int):
+        """Publishes the data that Traefik needs to provide ingress.
 
         Args:
-            host: Hostname to be used by the ingress provider to address the requirer
-                unit; if unspecified, the pod ip of the unit will be used instead
+            host: Hostname to be used by the ingress provider to address the
+                requirer unit; if unspecified, the FQDN of the unit (as
+                reported by `socket.getfqdn()`) will be used instead
             port: the port of the service (required)
         """
-        self.wrap(self.relation, self._complete_request(host, port))
+        if not host:
+            host = socket.getfqdn()
 
-    @property
-    def relation(self):
-        """The established Relation instance, or None."""
-        return self.relations[0] if self.relations else None
+        data = {
+            "model": self.model.name,
+            "name": self.unit.name,
+            "host": host,
+            "port": port,
+        }
+        _validate_data(data, INGRESS_REQUIRES_UNIT_SCHEMA)
+
+        if not self.relation:
+            raise RuntimeError("Can't publish ingress data: no relation found.")
+        self.relation.data[self.unit]["data"] = yaml.safe_dump(data)
 
     @property
-    def urls(self):
+    def urls(self) -> dict:
         """The full ingress URLs to reach every unit.
 
         May return an empty dict if the URLs aren't available yet.
         """
-        if self.is_relation_broken or not self.is_ready():
+        relation = self.relation
+        if not relation or self.is_relation_broken:
             return {}
-        data = self.unwrap(self.relation)
-        ingress = data[self.relation.app].get("ingress", {})
+
+        raw = None
+        if relation.app.name:  # type: ignore
+            # FIXME Workaround for https://github.com/canonical/operator/issues/693
+            # The app name is missing in relation_broken hooks: skip the read then
+            raw = relation.data.get(relation.app, {}).get("data")
+
+        if not raw:
+            return {}
+
+        data = yaml.safe_load(raw)
+        _validate_data(data, INGRESS_PROVIDES_APP_SCHEMA)
+
+        ingress = data.get("ingress", {})
         return {unit_name: unit_data["url"] for unit_name, unit_data in ingress.items()}
 
     @property
-    def url(self):
+    def url(self) -> Optional[str]:
         """The full ingress URL to reach the current unit.
 
         May return None if the URL isn't available yet.
diff --git a/metadata.yaml b/metadata.yaml
index a0547038..97c85608 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -22,11 +22,17 @@ containers:
     mounts:
       - storage: database
         location: /var/lib/prometheus
+
 provides:
+  self-metrics-endpoint:
+    interface: prometheus_scrape
   grafana-source:
     interface: grafana_datasource
+  grafana-dashboard:
+    interface: grafana_dashboard
   receive-remote-write:
     interface: prometheus_remote_write
+
 requires:
   metrics-endpoint:
     interface: prometheus_scrape
@@ -35,12 +41,15 @@ requires:
   ingress:
     interface: ingress_per_unit
     limit: 1
+
 peers:
   prometheus-peers:
     interface: prometheus_peers
+
 storage:
   database:
     type: filesystem
+
 resources:
   prometheus-image:
     type: oci-image
diff --git a/pyproject.toml b/pyproject.toml
index d1c8e29d..65f207fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,7 @@ module = ["ops.*", "pytest.*", "pytest_operator.*", "prometheus_api_client.*", "
 ignore_missing_imports = true
 
 [[tool.mypy.overrides]]
-module = ["charms.grafana_k8s.*", "charms.alertmanager_k8s.*"]
+module = ["charms.grafana_k8s.*", "charms.alertmanager_k8s.*", "charms.traefik_k8s.*"]
 follow_imports = "silent"
 
 [tool.pytest.ini_options]
diff --git a/src/charm.py b/src/charm.py
index 4d7d3435..6374e382 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -14,7 +14,9 @@
 import bitmath
 import yaml
 from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerConsumer
+from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider
 from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider
+from charms.observability_libs.v0.juju_topology import JujuTopology
 from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch
 from charms.prometheus_k8s.v0.prometheus_remote_write import (
     DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME,
@@ -22,7 +24,10 @@
 from charms.prometheus_k8s.v0.prometheus_remote_write import (
     PrometheusRemoteWriteProvider,
 )
-from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
+from charms.prometheus_k8s.v0.prometheus_scrape import (
+    MetricsEndpointConsumer,
+    MetricsEndpointProvider,
+)
 from charms.traefik_k8s.v0.ingress_per_unit import IngressPerUnitRequirer
 from lightkube import Client
 from lightkube.core.exceptions import ApiError as LightkubeApiError
@@ -52,14 +57,23 @@ def __init__(self, *args):
         self._port = 9090
 
         self.service_patch = KubernetesServicePatch(self, [(f"{self.app.name}", self._port)])
+        self._topology = JujuTopology.from_charm(self)
 
         # Relation handler objects
 
+        # Self-monitoring
+        self._scraping = MetricsEndpointProvider(
+            self,
+            relation_name="self-metrics-endpoint",
+            jobs=[{"static_configs": [{"targets": [f"*:{self._port}"]}]}],
+        )
+        self.grafana_dashboard_provider = GrafanaDashboardProvider(charm=self)
+
         # Gathers scrape job information from metrics endpoints
         self.metrics_consumer = MetricsEndpointConsumer(self)
 
         # Manages ingress for this charm
-        self.ingress = IngressPerUnitRequirer(self, endpoint="ingress", port=self._port)
+        self.ingress = IngressPerUnitRequirer(self, relation_name="ingress", port=self._port)
 
         external_url = urlparse(self._external_url)
 
@@ -74,7 +88,7 @@ def __init__(self, *args):
         )
 
         # Allows Grafana to aggregate metrics
-        self.grafana_source_consumer = GrafanaSourceProvider(
+        self.grafana_source_provider = GrafanaSourceProvider(
             charm=self,
             source_type="prometheus",
             source_url=self._external_url,
@@ -163,7 +177,7 @@ def _configure(self, _):
 
         # Make sure that if the remote_write endpoint changes, it is reflected in relation data.
         self.remote_write_provider.update_endpoint()
-        self.grafana_source_consumer.update_source(self._external_url)
+        self.grafana_source_provider.update_source(self._external_url)
 
         self.unit.status = ActiveStatus()
 
@@ -452,7 +466,32 @@ def _prometheus_config(self) -> str:
             "metrics_path": "/metrics",
             "honor_timestamps": True,
             "scheme": "http",
-            "static_configs": [{"targets": [f"localhost:{self._port}"]}],
+            "static_configs": [
+                {
+                    "targets": [f"localhost:{self._port}"],
+                    "labels": {
+                        "juju_model": self._topology.model,
+                        "juju_model_uuid": self._topology.model_uuid,
+                        "juju_application": self._topology.application,
+                        "juju_unit": self._topology.charm_name,
+                        "host": "localhost",
+                    },
+                }
+            ],
+            # Replace the value of the "instance" label with a juju topology identifier
+            "relabel_configs": [
+                {
+                    "source_labels": [
+                        "juju_model",
+                        "juju_model_uuid",
+                        "juju_application",
+                        "juju_unit",
+                    ],
+                    "separator": "_",
+                    "target_label": "instance",
+                    "regex": "(.*)",
+                }
+            ],
         }
         prometheus_config["scrape_configs"].append(default_config)  # type: ignore
         scrape_jobs = self.metrics_consumer.jobs()
diff --git a/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl b/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl
new file mode 100644
index 00000000..a9db8a51
--- /dev/null
+++ b/src/grafana_dashboards/prometheus-k8s_rev1.json.tmpl
@@ -0,0 +1,2972 @@
+{
+  "annotations": {
+    "list": [
+    ]
+  },
+  "description": "Dashboard for the Prometheus Operator, powered by Juju",
+  "editable": true,
+  "gnetId": 3662,
+  "graphTooltip": 0,
+  "hideControls": false,
+  "id": null,
+  "links": [],
+  "refresh": "30s",
+  "rows": [
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "datasource": "${prometheusds}",
+          "decimals": 3,
+          "description": "Percentage of uptime during the most recent $interval period.  Change the period with the 'interval' dropdown above.",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": false
+          },
+          "id": 2,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "%",
+          "postfixFontSize": "100%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 3,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": true,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "avg(avg_over_time(up{instance=~\"$instance\",job=~\"$job\"}[$interval]) * 100)",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "",
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "90, 99",
+          "title": "Uptime [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "columns": [],
+          "datasource": "${prometheusds}",
+          "description": "Servers which are DOWN RIGHT NOW! \nFIX THEM!!",
+          "fontSize": "100%",
+          "hideTimeOverride": true,
+          "id": 25,
+          "links": [],
+          "pageSize": null,
+          "scroll": true,
+          "showHeader": true,
+          "sort": {
+            "col": 0,
+            "desc": true
+          },
+          "span": 3,
+          "styles": [
+            {
+              "alias": "Time",
+              "dateFormat": "YYYY-MM-DD HH:mm:ss",
+              "pattern": "Time",
+              "type": "hidden"
+            },
+            {
+              "alias": "",
+              "colorMode": null,
+              "colors": [
+                "rgba(245, 54, 54, 0.9)",
+                "rgba(237, 129, 40, 0.89)",
+                "rgba(50, 172, 45, 0.97)"
+              ],
+              "dateFormat": "YYYY-MM-DD HH:mm:ss",
+              "decimals": 2,
+              "pattern": "/__name__|job|Value/",
+              "thresholds": [],
+              "type": "hidden",
+              "unit": "short"
+            },
+            {
+              "alias": "   ",
+              "colorMode": "cell",
+              "colors": [
+                "rgba(255, 0, 0, 0.9)",
+                "rgba(237, 129, 40, 0.89)",
+                "rgba(255, 0, 0, 0.97)"
+              ],
+              "dateFormat": "YYYY-MM-DD HH:mm:ss",
+              "decimals": 2,
+              "link": false,
+              "pattern": "instance",
+              "thresholds": [
+                "",
+                "",
+                ""
+              ],
+              "type": "string",
+              "unit": "short"
+            }
+          ],
+          "targets": [
+            {
+              "expr": "up{instance=~\"$instance\",job=~\"$job\"} < 1",
+              "format": "table",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "timeFrom": "1s",
+          "title": "Currently Down",
+          "transform": "table",
+          "type": "table"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "Total number of time series in prometheus",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 12,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 3,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": true,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": true
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "B",
+              "step": 40
+            }
+          ],
+          "thresholds": "1000000,2000000",
+          "title": "Total Series",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": false,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "datasource": "${prometheusds}",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 14,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 3,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": true,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": true
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "B",
+              "step": 40
+            }
+          ],
+          "thresholds": "",
+          "title": "Memory Chunks",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "at a glance",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 236,
+      "panels": [
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "The total number of rule group evaluations missed due to slow rule group evaluation.",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 16,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 2,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": false,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(sum_over_time(prometheus_evaluator_iterations_missed_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "1,10",
+          "title": "Missed Iterations [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "The total number of rule group evaluations skipped due to throttled metric storage.",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 18,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 2,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": false,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(sum_over_time(prometheus_evaluator_iterations_skipped_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "1,10",
+          "title": "Skipped Iterations [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "Total number of scrapes that hit the sample limit and were rejected.",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 19,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 2,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": false,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "1,10",
+          "title": "Tardy Scrapes [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "Number of times the database failed to reload block data from disk.",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 13,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 2,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": false,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(sum_over_time(prometheus_tsdb_reloads_failures_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "1,10",
+          "title": "Reload Failures [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        },
+        {
+          "cacheTimeout": null,
+          "colorBackground": false,
+          "colorValue": true,
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "datasource": "${prometheusds}",
+          "description": "Sum of all skipped scrapes",
+          "format": "none",
+          "gauge": {
+            "maxValue": 100,
+            "minValue": 0,
+            "show": false,
+            "thresholdLabels": false,
+            "thresholdMarkers": true
+          },
+          "id": 20,
+          "interval": null,
+          "links": [],
+          "mappingType": 1,
+          "mappingTypes": [
+            {
+              "name": "value to text",
+              "value": 1
+            },
+            {
+              "name": "range to text",
+              "value": 2
+            }
+          ],
+          "maxDataPoints": 100,
+          "nullPointMode": "connected",
+          "nullText": null,
+          "postfix": "",
+          "postfixFontSize": "50%",
+          "prefix": "",
+          "prefixFontSize": "50%",
+          "rangeMaps": [
+            {
+              "from": "null",
+              "text": "N/A",
+              "to": "null"
+            }
+          ],
+          "span": 4,
+          "sparkline": {
+            "fillColor": "rgba(31, 118, 189, 0.18)",
+            "full": false,
+            "lineColor": "rgb(31, 120, 193)",
+            "show": false
+          },
+          "tableColumn": "",
+          "targets": [
+            {
+              "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) ",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "refId": "A",
+              "step": 40
+            }
+          ],
+          "thresholds": "1,10",
+          "title": "Skipped Scrapes [$interval]",
+          "type": "singlestat",
+          "valueFontSize": "100%",
+          "valueMaps": [
+            {
+              "op": "=",
+              "text": "N/A",
+              "value": "null"
+            }
+          ],
+          "valueName": "current"
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "quick numbers",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "All non-zero failures and errors",
+          "fill": 1,
+          "id": 33,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(increase(net_conntrack_dialer_conn_failed_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Failed Connections",
+              "refId": "A",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Missed Iterations",
+              "refId": "B",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Skipped Iterations",
+              "refId": "C",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Evaluation",
+              "refId": "D",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_azure_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Azure Refresh",
+              "refId": "E",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Consul RPC",
+              "refId": "F",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_dns_lookup_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "DNS Lookup",
+              "refId": "G",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_ec2_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "EC2 Refresh",
+              "refId": "H",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_gce_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "GCE Refresh",
+              "refId": "I",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Marathon Refresh",
+              "refId": "J",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "OpenStack Refresh",
+              "refId": "K",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_sd_triton_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Triton Refresh",
+              "refId": "L",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Sample Limit",
+              "refId": "M",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Duplicate Timestamp",
+              "refId": "N",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Timestamp Out of Bounds",
+              "refId": "O",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Sample Out of Order",
+              "refId": "P",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_treecache_zookeeper_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "ZooKeeper",
+              "refId": "Q",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "TSDB Compactions",
+              "refId": "R",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Series Not Found",
+              "refId": "S",
+              "step": 2
+            },
+            {
+              "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=~\"$instance\"}[5m])) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Reload",
+              "refId": "T",
+              "step": 2
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Failures and Errors",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Errors",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "errors",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": "250px",
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 1,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": true,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "up{instance=~\"$instance\",job=~\"$job\"}",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Upness (stacked)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "none",
+              "label": "Up",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 5,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Storage Memory Chunks",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Chunks",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "up",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 3,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Series Count",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Series",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 32,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [
+            {
+              "alias": "removed",
+              "transform": "negative-Y"
+            }
+          ],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum( increase(prometheus_tsdb_head_series_created_total{instance=~\"$instance\"}[5m]) )",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "created",
+              "refId": "A",
+              "step": 4
+            },
+            {
+              "expr": "sum( increase(prometheus_tsdb_head_series_removed_total{instance=~\"$instance\"}[5m]) )",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "removed",
+              "refId": "B",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Series Created / Removed",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Series Count",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "series",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {
+            "10.58.3.10:80": "#BA43A9"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "Per-second rate of samples appended to the TSDB head",
+          "fill": 1,
+          "id": 4,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[1m])",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Appended Samples per Second",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Samples / Second",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "appended samples",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "Total number of syncs that were executed on a scrape pool.",
+          "fill": 1,
+          "id": 6,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(prometheus_target_scrape_pool_sync_total{job=~\"$job\",instance=~\"$instance\"}) by (scrape_job)",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "{{scrape_job}}",
+              "refId": "B",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Scrape Sync Total",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Syncs",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "Actual interval to sync the scrape pool.",
+          "fill": 1,
+          "id": 21,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[2m])) by (scrape_job) * 1000",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{scrape_job}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Target Sync",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Milliseconds",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "sync",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 29,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "scrape_duration_seconds{instance=~\"$instance\"}",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Scrape Duration",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Seconds",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "Total number of rejected scrapes",
+          "fill": 1,
+          "id": 30,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "exceeded sample limit",
+              "refId": "A",
+              "step": 4
+            },
+            {
+              "expr": "sum(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "duplicate timestamp",
+              "refId": "B",
+              "step": 4
+            },
+            {
+              "expr": "sum(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "out of bounds",
+              "refId": "C",
+              "step": 4
+            },
+            {
+              "expr": "sum(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"})",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "out of order",
+              "refId": "D",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Rejected Scrapes",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": "Scrapes",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "scrapes",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "The duration of rule group evaluations",
+          "fill": 1,
+          "id": 10,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "1000 * rate(prometheus_evaluator_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[5m]) / rate(prometheus_evaluator_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[5m])",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "E",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Average Rule Evaluation Duration",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Milliseconds",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 11,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(http_request_duration_microseconds_count{job=~\"$job\",instance=~\"$instance\"}[1m])) by (handler) > 0",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{handler}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "HTTP Request Rate",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Requests / sec",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 15,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(prometheus_engine_query_duration_seconds_sum{job=~\"$job\",instance=~\"$instance\"}) by (slice)",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{slice}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Prometheus Engine Query Duration Seconds",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Seconds",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "Rule-group evaluations \n - total\n - missed due to slow rule group evaluation\n - skipped due to throttled metric storage",
+          "fill": 1,
+          "id": 31,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(prometheus_evaluator_iterations_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Total",
+              "refId": "B",
+              "step": 4
+            },
+            {
+              "expr": "sum(rate(prometheus_evaluator_iterations_missed_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Missed",
+              "refId": "A",
+              "step": 4
+            },
+            {
+              "expr": "sum(rate(prometheus_evaluator_iterations_skipped_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "Skipped",
+              "refId": "C",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Rule Evaluator Iterations",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "iterations",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "durations",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 22,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "rate(prometheus_notifications_sent_total{job=~\"$job\",instance=~\"$instance\"}[5m])",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Notifications Sent",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Notifications",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "notifications",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 23,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "(time() - prometheus_config_last_reload_success_timestamp_seconds{job=~\"$job\",instance=~\"$instance\"}) / 60",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Minutes Since Successful Config Reload",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Minutes",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 24,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "prometheus_config_last_reload_successful{job=~\"$job\",instance=~\"$instance\"}",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 4
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Successful Config Reload",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": "Success",
+              "logBase": 1,
+              "max": "1",
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "config",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": false,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "GC invocation durations",
+          "fill": 1,
+          "id": 28,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(go_gc_duration_seconds_sum{instance=~\"$instance\",job=~\"$job\"}[2m])) by (instance)",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "GC Rate / 2m",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "garbage collection",
+      "titleSize": "h6"
+    },
+    {
+      "collapse": true,
+      "height": 250,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "description": "This is probably wrong!  Please help.",
+          "fill": 1,
+          "id": 26,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [
+            {
+              "alias": "allocated",
+              "stack": false
+            }
+          ],
+          "spaceLength": 10,
+          "span": 6,
+          "stack": true,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(go_memstats_alloc_bytes_total{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "hide": true,
+              "intervalFactor": 2,
+              "legendFormat": "alloc_bytes_total",
+              "refId": "A",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "allocated",
+              "refId": "B",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_buck_hash_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "profiling bucket hash table",
+              "refId": "C",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_gc_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "GC metadata",
+              "refId": "D",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_heap_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap allocated",
+              "refId": "E",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_heap_idle_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap idle",
+              "refId": "F",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap in use",
+              "refId": "G",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_heap_released_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap released",
+              "refId": "H",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_heap_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap system",
+              "refId": "I",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_mcache_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "mcache in use",
+              "refId": "J",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_mcache_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "mcache sys",
+              "refId": "K",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_mspan_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "mspan in use",
+              "refId": "L",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_mspan_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "mspan sys",
+              "refId": "M",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_next_gc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "heap next gc",
+              "refId": "N",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_other_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "other sys",
+              "refId": "O",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "stack in use",
+              "refId": "P",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_stack_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "stack sys",
+              "refId": "Q",
+              "step": 10
+            },
+            {
+              "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "sys",
+              "refId": "R",
+              "step": 10
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Go Memory Usage (FIXME)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "bytes",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 9,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 3,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "prometheus_target_interval_length_seconds{instance=~\"$instance\", job=~\"$job\"}",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{quantile}} {{interval}}",
+              "refId": "A",
+              "step": 20
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Scrape Interval Length",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Seconds",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "${prometheusds}",
+          "fill": 1,
+          "id": 7,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "span": 3,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m])) by (interval)",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{interval}}",
+              "refId": "A",
+              "step": 20
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Target Scrapes / 5m",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "Scrapes",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": false,
+      "title": "Broken, ignore",
+      "titleSize": "h6"
+    }
+  ],
+  "schemaVersion": 14,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "${prometheusds}",
+        "hide": 0,
+        "includeAll": true,
+        "label": null,
+        "multi": true,
+        "name": "job",
+        "options": [],
+        "query": "query_result(prometheus_tsdb_head_samples_appended_total)",
+        "refresh": 2,
+        "regex": "/.*job=\"([^\"]+)/",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "${prometheusds}",
+        "hide": 0,
+        "includeAll": true,
+        "label": null,
+        "multi": true,
+        "name": "instance",
+        "options": [],
+        "query": "query_result(up{job=~\"$job\"})",
+        "refresh": 2,
+        "regex": "/.*instance=\"([^\"]+).*/",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "text": "1h",
+          "value": "1h"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "interval",
+        "options": [
+          {
+            "selected": true,
+            "text": "1h",
+            "value": "1h"
+          },
+          {
+            "selected": false,
+            "text": "3h",
+            "value": "3h"
+          },
+          {
+            "selected": false,
+            "text": "6h",
+            "value": "6h"
+          },
+          {
+            "selected": false,
+            "text": "12h",
+            "value": "12h"
+          },
+          {
+            "selected": false,
+            "text": "1d",
+            "value": "1d"
+          },
+          {
+            "selected": false,
+            "text": "2d",
+            "value": "2d"
+          },
+          {
+            "selected": false,
+            "text": "7d",
+            "value": "7d"
+          },
+          {
+            "selected": false,
+            "text": "30d",
+            "value": "30d"
+          },
+          {
+            "selected": false,
+            "text": "90d",
+            "value": "90d"
+          },
+          {
+            "selected": false,
+            "text": "180d",
+            "value": "180d"
+          }
+        ],
+        "query": "1h, 3h, 6h, 12h, 1d, 2d, 7d, 30d, 90d, 180d",
+        "type": "custom"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-30m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "Prometheus Operator Overview",
+  "version": 21
+}
+
diff --git a/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule b/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule
new file mode 100644
index 00000000..64a3ad75
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_configuration_reload_failure.rule
@@ -0,0 +1,12 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusConfigurationReloadFailure
+expr: prometheus_config_last_reload_successful{} != 1
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+  description: |
+    Prometheus configuration reload error
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_exporters_slowly.rule b/src/prometheus_alert_rules/prometheus_exporters_slowly.rule
new file mode 100644
index 00000000..e3c182c0
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_exporters_slowly.rule
@@ -0,0 +1,12 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTargetScrapingSlow
+expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+for: 5m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+  description: |
+    Prometheus is scraping exporters slowly
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_large_scrape.rule b/src/prometheus_alert_rules/prometheus_large_scrape.rule
new file mode 100644
index 00000000..c4a7e005
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_large_scrape.rule
@@ -0,0 +1,12 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusLargeScrape
+expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{}[10m]) > 10
+for: 5m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus large scrape (instance {{ $labels.instance }})
+  description: |
+    Prometheus has many scrapes that exceed the sample limit
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_missing.rule b/src/prometheus_alert_rules/prometheus_missing.rule
new file mode 100644
index 00000000..10c9789f
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_missing.rule
@@ -0,0 +1,12 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 (absent() output has no `instance` label, so the summary omits it)
+alert: PrometheusJobMissing
+expr: absent(up{})
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus job missing
+  description: |
+    A Prometheus job has disappeared
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_notifications_backlog.rule b/src/prometheus_alert_rules/prometheus_notifications_backlog.rule
new file mode 100644
index 00000000..76b0adb7
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_notifications_backlog.rule
@@ -0,0 +1,12 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusNotificationsBacklog
+expr: min_over_time(prometheus_notifications_queue_length{}[10m]) > 0
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+  description: |
+    The Prometheus notification queue has not been empty for 10 minutes
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule b/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule
new file mode 100644
index 00000000..88137fc1
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_rule_evaluation_slow.rule
@@ -0,0 +1,13 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusRuleEvaluationSlow
+expr: prometheus_rule_group_last_duration_seconds{} > prometheus_rule_group_interval_seconds{}
+for: 5m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+  description: |
+    Prometheus rule evaluation took more time than the scheduled interval.
+    This indicates slow storage backend access or an overly complex query.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule b/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule
new file mode 100644
index 00000000..898623ec
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_target_scrape_duplicate.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTargetScrapeDuplicate
+expr: |
+ increase(
+   prometheus_target_scrapes_sample_duplicate_timestamp_total{}[5m]
+ ) > 0
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+  description: |
+    Prometheus has many samples rejected due to duplicate timestamps but differing values
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule
new file mode 100644
index 00000000..2f8ded7a
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_creation_failures.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbCheckpointCreationFailures
+expr: |
+  increase(
+    prometheus_tsdb_checkpoint_creations_failed_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} checkpoint creation failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule
new file mode 100644
index 00000000..18cc337a
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_checkpoint_deletion_failures.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbCheckpointDeletionFailures
+expr: |
+  increase(
+    prometheus_tsdb_checkpoint_deletions_failed_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} checkpoint deletion failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule
new file mode 100644
index 00000000..693e2288
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_compactions_failed.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbCompactionsFailed
+expr: |
+  increase(
+    prometheus_tsdb_compactions_failed_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} TSDB compaction failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule
new file mode 100644
index 00000000..33c35072
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_head_truncations_failed.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbHeadTruncationsFailed
+expr: |
+  increase(
+    prometheus_tsdb_head_truncations_failed_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} TSDB head truncation failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule b/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule
new file mode 100644
index 00000000..66bcc745
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_reload_failures.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbReloadFailures
+expr: |
+  increase(
+    prometheus_tsdb_reloads_failures_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} TSDB reload failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule b/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule
new file mode 100644
index 00000000..f86ac2c2
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_wal_corruptions.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbWalCorruptions
+expr: |
+  increase(
+    prometheus_tsdb_wal_corruptions_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} TSDB WAL corruptions.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule b/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule
new file mode 100644
index 00000000..91a2e59b
--- /dev/null
+++ b/src/prometheus_alert_rules/prometheus_tsdb_wal_truncations_failed.rule
@@ -0,0 +1,15 @@
+# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1
+alert: PrometheusTsdbWalTruncationsFailed
+expr: |
+  increase(
+    prometheus_tsdb_wal_truncations_failed_total{}[1m]
+  ) > 0
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+  description: |
+    Prometheus encountered {{ $value }} TSDB WAL truncation failures.
+    VALUE = {{ $value }}
+    LABELS = {{ $labels }}
diff --git a/tests/integration/test_check_config.py b/tests/integration/test_check_config.py
index f0b8d7aa..9017b652 100644
--- a/tests/integration/test_check_config.py
+++ b/tests/integration/test_check_config.py
@@ -87,7 +87,9 @@ async def test_bad_config_sets_action_results(ops_test, prometheus_charm, promet
 
     await asyncio.gather(
         ops_test.model.add_relation(bad_scrape_tester, scrape_shim),
-        ops_test.model.add_relation(prometheus_app_name, scrape_shim),
+        ops_test.model.add_relation(
+            f"{prometheus_app_name}:metrics-endpoint", f"{scrape_shim}:metrics-endpoint"
+        ),
     )
     await ops_test.model.wait_for_idle(apps=[prometheus_app_name, scrape_shim, bad_scrape_tester])
 
diff --git a/tests/integration/test_prometheus_scrape_multiunit.py b/tests/integration/test_prometheus_scrape_multiunit.py
index 87fd92f8..b0f6c6e7 100644
--- a/tests/integration/test_prometheus_scrape_multiunit.py
+++ b/tests/integration/test_prometheus_scrape_multiunit.py
@@ -101,12 +101,17 @@ async def test_prometheus_scrape_relation_with_prometheus_tester(
         assert len(targets) == 1
         self_scrape = next(iter(targets))
         assert self_scrape["labels"]["job"] == "prometheus"
-        assert self_scrape["labels"]["instance"] == "localhost:9090"
+        assert self_scrape["labels"]["host"] == "localhost"
 
     # WHEN prometheus is related to the testers
     await asyncio.gather(
-        ops_test.model.add_relation(prometheus_app_name, scrape_tester),
-        ops_test.model.add_relation(prometheus_app_name, remote_write_tester),
+        ops_test.model.add_relation(
+            f"{prometheus_app_name}:metrics-endpoint", f"{scrape_tester}:metrics-endpoint"
+        ),
+        ops_test.model.add_relation(
+            f"{prometheus_app_name}:receive-remote-write",
+            f"{remote_write_tester}:send-remote-write",
+        ),
     )
     await ops_test.model.wait_for_idle(apps=app_names, status="active")
 
@@ -175,7 +180,7 @@ async def test_prometheus_scrape_relation_with_prometheus_tester(
 async def test_upgrade_prometheus(ops_test: OpsTest, prometheus_charm):
     """Upgrade prometheus and confirm all is still green (see also test_upgrade_charm.py)."""
     # GIVEN an existing "up" timeseries
-    query = 'count_over_time(up{instance="localhost:9090",job="prometheus"}[1y])'
+    query = 'count_over_time(up{host="localhost",job="prometheus"}[1y])'
     up_before = await asyncio.gather(
         *[run_promql(ops_test, query, prometheus_app_name, u) for u in range(num_units)]
     )
diff --git a/tests/integration/test_remote_write_grafana_agent.py b/tests/integration/test_remote_write_grafana_agent.py
index d2ab3a83..1dda16e8 100644
--- a/tests/integration/test_remote_write_grafana_agent.py
+++ b/tests/integration/test_remote_write_grafana_agent.py
@@ -36,7 +36,9 @@ async def test_remote_write_with_grafana_agent(ops_test, prometheus_charm):
     await ops_test.model.wait_for_idle(apps=apps, status="active", wait_for_units=1)
     assert await check_prometheus_is_ready(ops_test, prometheus_name, 0)
 
-    await ops_test.model.add_relation(prometheus_name, agent_name)
+    await ops_test.model.add_relation(
+        f"{prometheus_name}:receive-remote-write", f"{agent_name}:send-remote-write"
+    )
 
     # A considerable idle_period is needed to guarantee metrics show up in prometheus
     # (60 sec was not enough).
diff --git a/tests/integration/test_remote_write_with_zinc.py b/tests/integration/test_remote_write_with_zinc.py
index e772da48..7b53fce5 100644
--- a/tests/integration/test_remote_write_with_zinc.py
+++ b/tests/integration/test_remote_write_with_zinc.py
@@ -46,7 +46,9 @@ async def test_remote_write_with_zinc(ops_test, prometheus_charm):
     assert await check_prometheus_is_ready(ops_test, prometheus_name, 0)
 
     await asyncio.gather(
-        ops_test.model.add_relation(prometheus_name, agent_name),
+        ops_test.model.add_relation(
+            f"{prometheus_name}:receive-remote-write", f"{agent_name}:send-remote-write"
+        ),
         ops_test.model.add_relation(
             f"{agent_name}:metrics-endpoint", f"{zinc_name}:metrics-endpoint"
         ),
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 748637bc..e078e56b 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -4,10 +4,10 @@
 import json
 import socket
 import unittest
+import uuid
 from unittest.mock import patch
 
 import yaml
-from helpers import patch_network_get
 from ops.testing import Harness
 
 from charm import PROMETHEUS_CONFIG, PrometheusCharm
@@ -16,16 +16,14 @@
 DEFAULT_JOBS = [{"metrics_path": "/metrics"}]
 SCRAPE_METADATA = {
     "model": "provider-model",
-    "model_uuid": "abcdef",
+    "model_uuid": str(uuid.uuid4()),
     "application": "provider",
     "charm_name": "provider-charm",
 }
 
 
-@patch("charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True)
 class TestCharm(unittest.TestCase):
     @patch("charm.KubernetesServicePatch", lambda x, y: None)
-    @patch_network_get()
     def setUp(self, *unused):
         self.harness = Harness(PrometheusCharm)
         self.addCleanup(self.harness.cleanup)
@@ -33,7 +31,7 @@ def setUp(self, *unused):
         patcher = patch.object(PrometheusCharm, "_get_pvc_capacity")
         self.mock_capacity = patcher.start()
         self.addCleanup(patcher.stop)
-
+        self.harness.set_model_name("prometheus_model")
         self.mock_capacity.return_value = "1Gi"
         self.harness.begin_with_initial_hooks()
 
@@ -253,7 +251,6 @@ def setUp(self):
         self.addCleanup(patcher.stop)
 
     @patch("charm.KubernetesServicePatch", lambda x, y: None)
-    @patch_network_get()
     def test_default_maximum_retention_size_is_80_percent(self):
         """This test is here to guarantee backwards compatibility.
 
@@ -273,7 +270,6 @@ def test_default_maximum_retention_size_is_80_percent(self):
         self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
 
     @patch("charm.KubernetesServicePatch", lambda x, y: None)
-    @patch_network_get()
     def test_multiplication_factor_applied_to_pvc_capacity(self):
         """The `--storage.tsdb.retention.size` arg must be multiplied by maximum_retention_size."""
         # GIVEN a capacity limit in binary notation (k8s notation)