From 19c9951c438cc0ce7df9b090564e656fcb3f8918 Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Mon, 3 Jun 2024 21:18:44 +0200 Subject: [PATCH] [wip] Use shell of mimir coordinator charm as tempo --- .github/workflows/issues.yaml | 2 +- README.md | 8 +- metadata.yaml | 20 +- src/charm.py | 88 ++-- ...rtmanager.json => tempo-alertmanager.json} | 12 +- ...ir-compactor.json => tempo-compactor.json} | 12 +- .../{mimir-config.json => tempo-config.json} | 8 +- ...ect-store.json => tempo-object-store.json} | 12 +- ...ir-overrides.json => tempo-overrides.json} | 12 +- ...imir-overview.json => tempo-overview.json} | 34 +- ...{mimir-queries.json => tempo-queries.json} | 10 +- .../{mimir-ruler.json => tempo-ruler.json} | 8 +- src/nginx.py | 8 +- .../alerts.yaml | 450 +++++++++--------- .../rules.yaml | 34 +- src/{mimir_cluster.py => tempo_cluster.py} | 89 ++-- src/{mimir_config.py => tempo_config.py} | 10 +- ...ir_coordinator.py => tempo_coordinator.py} | 106 ++--- tests/integration/conftest.py | 4 +- tests/integration/test_self_monitoring.py | 12 +- tests/integration/test_tls.py | 16 +- ...ace.py => test_tempo_cluster_interface.py} | 44 +- tests/unit/test_charm.py | 4 +- tests/unit/test_coherence.py | 26 +- tests/unit/test_config.py | 28 +- 25 files changed, 527 insertions(+), 530 deletions(-) rename src/grafana_dashboards/{mimir-alertmanager.json => tempo-alertmanager.json} (99%) rename src/grafana_dashboards/{mimir-compactor.json => tempo-compactor.json} (99%) rename src/grafana_dashboards/{mimir-config.json => tempo-config.json} (99%) rename src/grafana_dashboards/{mimir-object-store.json => tempo-object-store.json} (99%) rename src/grafana_dashboards/{mimir-overrides.json => tempo-overrides.json} (97%) rename src/grafana_dashboards/{mimir-overview.json => tempo-overview.json} (95%) rename src/grafana_dashboards/{mimir-queries.json => tempo-queries.json} (99%) rename src/grafana_dashboards/{mimir-ruler.json => tempo-ruler.json} (99%) rename src/prometheus_alert_rules/{mimir_workers => tempo_workers}/alerts.yaml (64%) rename src/prometheus_alert_rules/{mimir_workers => tempo_workers}/rules.yaml (98%) rename src/{mimir_cluster.py => tempo_cluster.py} (76%) rename src/{mimir_config.py => tempo_config.py} (94%) rename src/{mimir_coordinator.py => tempo_coordinator.py} (80%) rename tests/scenario/{test_mimir_cluster_interface.py => test_tempo_cluster_interface.py} (64%) diff --git a/.github/workflows/issues.yaml b/.github/workflows/issues.yaml index 610cb70..ac34730 100644 --- a/.github/workflows/issues.yaml +++ b/.github/workflows/issues.yaml @@ -8,4 +8,4 @@ jobs: uses: canonical/observability/.github/workflows/issues.yaml@main secrets: inherit with: - component: mimir-coordinator + component: tempo-coordinator diff --git a/README.md b/README.md index 4745f46..1151ce7 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# Mimir Coordinator charm for Kubernetes +# Tempo Coordinator charm for Kubernetes -[![CharmHub Badge](https://charmhub.io/mimir-coordinator-k8s/badge.svg)](https://charmhub.io/mimir-coordinator-k8s) -[![Release](https://github.com/canonical/mimir-coordinator-k8s-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/mimir-coordinator-k8s-operator/actions/workflows/release.yaml) +[![CharmHub Badge](https://charmhub.io/tempo-coordinator-k8s/badge.svg)](https://charmhub.io/tempo-coordinator-k8s) +[![Release](https://github.com/canonical/tempo-coordinator-k8s-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/tempo-coordinator-k8s-operator/actions/workflows/release.yaml) [![Discourse Status](https://img.shields.io/discourse/status?server=https%3A%2F%2Fdiscourse.charmhub.io&style=flat&label=CharmHub%20Discourse)](https://discourse.charmhub.io) ## Description -This charm serves as a coordinator for a Mimir HA deployment, together with the [mimir-worker-k8s](https://github.com/canonical/mimir-worker-k8s-operator) charm. +This charm serves as a coordinator for a Tempo HA deployment, together with the [tempo-worker-k8s](https://github.com/canonical/tempo-worker-k8s-operator) charm. ## Contributing diff --git a/metadata.yaml b/metadata.yaml index 945fe4c..0c56085 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -1,13 +1,13 @@ -name: mimir-coordinator-k8s +name: tempo-coordinator-k8s assumes: - k8s-api -docs: https://discourse.charmhub.io/t/mimir-coordinator-index/10531 +docs: # TODO fill with docs -summary: Mimir coordinator +summary: Tempo coordinator description: | - Mimir coordinator. + Tempo coordinator. containers: nginx: @@ -34,14 +34,14 @@ requires: interface: s3 limit: 1 description: | - The coordinator obtains and shares storage details with workers, enabling Mimir's access to an S3 bucket for data storage. + The coordinator obtains and shares storage details with workers, enabling Tempo's access to an S3 bucket for data storage. logging-consumer: interface: loki_push_api description: | Forward workers' built-in logging rules to the external Loki (the coordinator, not the worker, owns all rule files). - Obtain rules and Loki's API endpoint to later send them to the mimir ruler over another + Obtain rules and Loki's API endpoint to later send them to the tempo ruler over another relation. ingress: @@ -54,7 +54,7 @@ requires: interface: tls-certificates limit: 1 description: | - Certificate and key files for securing Mimir communications with TLS. + Certificate and key files for securing Tempo communications with TLS. tracing: interface: tracing @@ -63,10 +63,10 @@ requires: Enables sending traces to the tracing backend. provides: - mimir-cluster: - interface: mimir_cluster + tempo-cluster: + interface: tempo_cluster description: | - The coordinator sends the Mimir configuration to the workers, obtaining + The coordinator sends the Tempo configuration to the workers, obtaining the roles they are configured to take on and their addressing information. receive-remote-write: diff --git a/src/charm.py b/src/charm.py index 65f2a8d..87e1412 100755 --- a/src/charm.py +++ b/src/charm.py @@ -35,9 +35,9 @@ from charms.traefik_k8s.v2.ingress import IngressPerAppReadyEvent, IngressPerAppRequirer from cosl import JujuTopology from cosl.rules import AlertRules -from mimir_cluster import MimirClusterProvider -from mimir_config import BUCKET_NAME, S3_RELATION_NAME, _S3ConfigData -from mimir_coordinator import MimirCoordinator +from tempo_cluster import TempoClusterProvider +from tempo_config import BUCKET_NAME, S3_RELATION_NAME, _S3ConfigData +from tempo_coordinator import TempoCoordinator from nginx import CA_CERT_PATH, CERT_PATH, KEY_PATH, Nginx from nginx_prometheus_exporter import NGINX_PROMETHEUS_EXPORTER_PORT, NginxPrometheusExporter from ops.charm import CollectStatusEvent @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) NGINX_ORIGINAL_ALERT_RULES_PATH = "./src/prometheus_alert_rules/nginx" -WORKER_ORIGINAL_ALERT_RULES_PATH = "./src/prometheus_alert_rules/mimir_workers" +WORKER_ORIGINAL_ALERT_RULES_PATH = "./src/prometheus_alert_rules/tempo_workers" CONSOLIDATED_ALERT_RULES_PATH = "./src/prometheus_alert_rules/consolidated_rules" @@ -57,12 +57,12 @@ server_cert="server_cert_path", extra_types=[ S3Requirer, - MimirClusterProvider, - MimirCoordinator, + TempoClusterProvider, + TempoCoordinator, Nginx, ], ) -class MimirCoordinatorK8SOperatorCharm(ops.CharmBase): +class TempoCoordinatorK8SOperatorCharm(ops.CharmBase): """Charm the service.""" def __init__(self, *args: Any): @@ -77,12 +77,12 @@ def __init__(self, *args: Any): ) self.server_cert = CertHandler( charm=self, - key="mimir-server-cert", + key="tempo-server-cert", sans=[self.hostname], ) self.s3_requirer = S3Requirer(self, S3_RELATION_NAME, BUCKET_NAME) - self.cluster_provider = MimirClusterProvider(self) - self.coordinator = MimirCoordinator( + self.cluster_provider = TempoClusterProvider(self) + self.coordinator = TempoCoordinator( cluster_provider=self.cluster_provider, tls_requirer=self.server_cert, ) @@ -94,7 +94,7 @@ def __init__(self, *args: Any): self.nginx_prometheus_exporter = NginxPrometheusExporter(self) self.remote_write_provider = PrometheusRemoteWriteProvider( charm=self, - server_url_func=lambda: MimirCoordinatorK8SOperatorCharm.external_url.fget(self), # type: ignore + server_url_func=lambda: TempoCoordinatorK8SOperatorCharm.external_url.fget(self), # type: ignore endpoint_path="/api/v1/push", ) self.tracing = TracingEndpointRequirer(self) @@ -118,10 +118,10 @@ def __init__(self, *args: Any): alert_rules_path=CONSOLIDATED_ALERT_RULES_PATH, jobs=self._scrape_jobs, refresh_event=[ - self.on.mimir_cluster_relation_joined, - self.on.mimir_cluster_relation_changed, - self.on.mimir_cluster_relation_departed, - self.on.mimir_cluster_relation_broken, + self.on.tempo_cluster_relation_joined, + self.on.tempo_cluster_relation_changed, + self.on.tempo_cluster_relation_departed, + self.on.tempo_cluster_relation_broken, ], ) self.ingress = IngressPerAppRequirer(charm=self, strip_prefix=True) @@ -137,18 +137,18 @@ def __init__(self, *args: Any): self._on_nginx_prometheus_exporter_pebble_ready, ) self.framework.observe(self.server_cert.on.cert_changed, self._on_server_cert_changed) - # Mimir Cluster + # Tempo Cluster self.framework.observe( - self.on.mimir_cluster_relation_joined, self._on_mimir_cluster_joined + self.on.tempo_cluster_relation_joined, self._on_tempo_cluster_joined ) self.framework.observe( - self.on.mimir_cluster_relation_changed, self._on_mimir_cluster_changed + self.on.tempo_cluster_relation_changed, self._on_tempo_cluster_changed ) self.framework.observe( - self.on.mimir_cluster_relation_departed, self._on_mimir_cluster_changed + self.on.tempo_cluster_relation_departed, self._on_tempo_cluster_changed ) self.framework.observe( - self.on.mimir_cluster_relation_broken, self._on_mimir_cluster_changed + self.on.tempo_cluster_relation_broken, self._on_tempo_cluster_changed ) # S3 Requirer self.framework.observe(self.s3_requirer.on.credentials_changed, self._on_s3_changed) @@ -171,30 +171,30 @@ def _on_config_changed(self, _: ops.ConfigChangedEvent): """Handle changed configuration.""" self.nginx.configure_pebble_layer(tls=self._is_tls_ready) self._render_workers_alert_rules() - self._update_mimir_cluster() + self._update_tempo_cluster() def _on_server_cert_changed(self, _): self._update_cert() self.nginx.configure_pebble_layer(tls=self._is_tls_ready) - self._update_mimir_cluster() + self._update_tempo_cluster() - def _on_mimir_cluster_joined(self, _): + def _on_tempo_cluster_joined(self, _): self.nginx.configure_pebble_layer(tls=self._is_tls_ready) self._render_workers_alert_rules() - self._update_mimir_cluster() + self._update_tempo_cluster() - def _on_mimir_cluster_changed(self, _): + def _on_tempo_cluster_changed(self, _): self.nginx.configure_pebble_layer(tls=self._is_tls_ready) self._render_workers_alert_rules() - self._update_mimir_cluster() + self._update_tempo_cluster() - def _on_mimir_cluster_departed(self, _): + def _on_tempo_cluster_departed(self, _): self.nginx.configure_pebble_layer(tls=self._is_tls_ready) self._render_workers_alert_rules() - self._update_mimir_cluster() + self._update_tempo_cluster() def _on_s3_changed(self, _): - self._update_mimir_cluster() + self._update_tempo_cluster() def _on_collect_status(self, event: CollectStatusEvent): """Handle start event.""" @@ -202,7 +202,7 @@ def _on_collect_status(self, event: CollectStatusEvent): missing_roles = [role.value for role in self.coordinator.missing_roles()] event.add_status( ops.BlockedStatus( - f"Incoherent deployment: you are lacking some required Mimir roles " + f"Incoherent deployment: you are lacking some required Tempo roles " f"({missing_roles})" ) ) @@ -210,7 +210,7 @@ def _on_collect_status(self, event: CollectStatusEvent): if not s3_config_data and self.has_multiple_workers(): event.add_status( ops.BlockedStatus( - "When multiple units of Mimir are deployed, you must add a valid S3 relation. S3 relation missing/invalid." + "When multiple units of Tempo are deployed, you must add a valid S3 relation. S3 relation missing/invalid." ) ) @@ -221,7 +221,7 @@ def _on_collect_status(self, event: CollectStatusEvent): event.add_status(ops.ActiveStatus()) def _on_loki_relation_changed(self, _): - self._update_mimir_cluster() + self._update_tempo_cluster() def _on_nginx_pebble_ready(self, _) -> None: self.nginx.configure_pebble_layer(tls=self._is_tls_ready) @@ -271,9 +271,9 @@ def _is_tls_ready(self) -> bool: ) @property - def mimir_worker_relations(self) -> List[ops.Relation]: + def tempo_worker_relations(self) -> List[ops.Relation]: """Returns the list of worker relations.""" - return self.model.relations.get("mimir_worker", []) + return self.model.relations.get("tempo_worker", []) @property def _workers_scrape_jobs(self) -> List[Dict[str, Any]]: @@ -291,7 +291,7 @@ def _workers_scrape_jobs(self) -> List[Dict[str, Any]]: # replaced by the coordinator topology # https://github.com/canonical/prometheus-k8s-operator/issues/571 "relabel_configs": [ - {"target_label": "juju_charm", "replacement": "mimir-worker-k8s"}, + {"target_label": "juju_charm", "replacement": "tempo-worker-k8s"}, {"target_label": "juju_unit", "replacement": worker["unit"]}, {"target_label": "juju_application", "replacement": worker["app"]}, {"target_label": "juju_model", "replacement": self.model.name}, @@ -384,7 +384,7 @@ def _render_workers_alert_rules(self): "model_uuid": self.model.uuid, "application": worker["app"], "unit": worker["unit"], - "charm_name": "mimir-worker-k8s", + "charm_name": "tempo-worker-k8s", } topology = JujuTopology.from_dict(topology_dict) alert_rules = AlertRules(query_type="promql", topology=topology) @@ -405,7 +405,7 @@ def _consolidate_nginx_rules(self): for filename in glob.glob(os.path.join(NGINX_ORIGINAL_ALERT_RULES_PATH, "*.*")): shutil.copy(filename, f"{CONSOLIDATED_ALERT_RULES_PATH}/") - def _update_mimir_cluster(self): # common exit hook + def _update_tempo_cluster(self): # common exit hook """Build the config and publish everything to the application databag.""" if not self.coordinator.is_coherent(): return @@ -416,7 +416,7 @@ def _update_mimir_cluster(self): # common exit hook # On every function call, we always publish everything to the databag; however, if there # are no changes, Juju will safely ignore the updates self.cluster_provider.publish_data( - mimir_config=self.coordinator.build_config( + tempo_config=self.coordinator.build_config( s3_config_data=s3_config_data, tls_enabled=tls ), loki_endpoints=self.loki_endpoints_by_unit, @@ -426,23 +426,23 @@ def _update_mimir_cluster(self): # common exit hook self.publish_grant_secrets() def has_multiple_workers(self) -> bool: - """Return True if there are multiple workers forming the Mimir cluster.""" - mimir_cluster_relations = self.model.relations.get("mimir-cluster", []) + """Return True if there are multiple workers forming the Tempo cluster.""" + tempo_cluster_relations = self.model.relations.get("tempo-cluster", []) remote_units_count = sum( len(relation.units) - for relation in mimir_cluster_relations + for relation in tempo_cluster_relations if relation.app != self.model.app ) return remote_units_count > 1 def publish_grant_secrets(self) -> None: - """Publish and Grant secrets to the mimir-cluster relation.""" + """Publish and Grant secrets to the tempo-cluster relation.""" secrets = { "private_key_secret_id": self.server_cert.private_key_secret_id, "ca_server_cert_secret_id": self.server_cert.ca_server_cert_secret_id, } - relations = self.model.relations["mimir-cluster"] + relations = self.model.relations["tempo-cluster"] for relation in relations: relation.data[self.model.app]["secrets"] = json.dumps(secrets) logger.debug("Secrets published") @@ -508,4 +508,4 @@ def _update_cert(self): if __name__ == "__main__": # pragma: nocover - ops.main.main(MimirCoordinatorK8SOperatorCharm) + ops.main.main(TempoCoordinatorK8SOperatorCharm) diff --git a/src/grafana_dashboards/mimir-alertmanager.json b/src/grafana_dashboards/tempo-alertmanager.json similarity index 99% rename from src/grafana_dashboards/mimir-alertmanager.json rename to src/grafana_dashboards/tempo-alertmanager.json index cc36957..6847649 100644 --- a/src/grafana_dashboards/mimir-alertmanager.json +++ b/src/grafana_dashboards/tempo-alertmanager.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -3334,7 +3334,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -3462,10 +3462,10 @@ "current": { "selected": true, "text": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ], "value": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ] }, "hide": 0, @@ -3512,7 +3512,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Alertmanager", + "title": "Tempo / Alertmanager", "uid": "b0d38d318bbddd80476246d4930f9e55", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-compactor.json b/src/grafana_dashboards/tempo-compactor.json similarity index 99% rename from src/grafana_dashboards/mimir-compactor.json rename to src/grafana_dashboards/tempo-compactor.json index 88db7e7..32b3015 100644 --- a/src/grafana_dashboards/mimir-compactor.json +++ b/src/grafana_dashboards/tempo-compactor.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -2846,7 +2846,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -2974,10 +2974,10 @@ "current": { "selected": true, "text": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ], "value": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ] }, "hide": 0, @@ -3024,7 +3024,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Compactor", + "title": "Tempo / Compactor", "uid": "1b3443aea86db629e6efdb7d05c53823", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-config.json b/src/grafana_dashboards/tempo-config.json similarity index 99% rename from src/grafana_dashboards/mimir-config.json rename to src/grafana_dashboards/tempo-config.json index e231ab5..381aa17 100644 --- a/src/grafana_dashboards/mimir-config.json +++ b/src/grafana_dashboards/tempo-config.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -277,7 +277,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -456,7 +456,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Config", + "title": "Tempo / Config", "uid": "5d9d0b4724c0f80d68467088ec61e003", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-object-store.json b/src/grafana_dashboards/tempo-object-store.json similarity index 99% rename from src/grafana_dashboards/mimir-object-store.json rename to src/grafana_dashboards/tempo-object-store.json index 7d0acb7..d8da615 100644 --- a/src/grafana_dashboards/mimir-object-store.json +++ b/src/grafana_dashboards/tempo-object-store.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -1149,7 +1149,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -1157,10 +1157,10 @@ "current": { "selected": true, "text": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ], "value": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ] }, "hide": 0, @@ -1327,7 +1327,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Object Store", + "title": "Tempo / Object Store", "uid": "e1324ee2a434f4158c00a9ee279d3292", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-overrides.json b/src/grafana_dashboards/tempo-overrides.json similarity index 97% rename from src/grafana_dashboards/mimir-overrides.json rename to src/grafana_dashboards/tempo-overrides.json index 3f6dbde..47edc5e 100644 --- a/src/grafana_dashboards/mimir-overrides.json +++ b/src/grafana_dashboards/tempo-overrides.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -284,7 +284,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -292,10 +292,10 @@ "current": { "selected": true, "text": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ], "value": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ] }, "hide": 0, @@ -482,7 +482,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Overrides", + "title": "Tempo / Overrides", "uid": "1e2c358600ac53f09faea133f811b5bb", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-overview.json b/src/grafana_dashboards/tempo-overview.json similarity index 95% rename from src/grafana_dashboards/mimir-overview.json rename to src/grafana_dashboards/tempo-overview.json index 5d88f0b..7f38671 100644 --- a/src/grafana_dashboards/mimir-overview.json +++ b/src/grafana_dashboards/tempo-overview.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -63,7 +63,7 @@ "refId": "A" } ], - "title": "Mimir cluster health", + "title": "Tempo cluster health", "type": "row" }, { @@ -85,7 +85,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "The 'Status' panel shows an overview on the cluster health over the time.\nTo investigate failures, see a specific dashboard:\n\n- Writes\n- Reads\n- Rule evaluations\n- Alerting notifications\n- Object storage\n", + "content": "The 'Status' panel shows an overview on the cluster health over the time.\nTo investigate failures, see a specific dashboard:\n\n- Writes\n- Reads\n- Rule evaluations\n- Alerting notifications\n- Object storage\n", "mode": "markdown" }, "pluginVersion": "9.2.1", @@ -164,7 +164,7 @@ "uid": "$prometheusds" }, "exemplar": false, - "expr": "(sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((distributor|cortex|mimir|mimir-write.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or vector(0)) / sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((distributor|cortex|mimir|mimir-write.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))", + "expr": "(sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((distributor|cortex|tempo|tempo-write.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or vector(0)) / sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((distributor|cortex|tempo|tempo-write.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))", "instant": false, "legendFormat": "Writes", "range": true, @@ -175,7 +175,7 @@ "uid": "$prometheusds" }, "exemplar": false, - "expr": "(sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval])) or vector(0)) / sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))", + "expr": "(sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((query-frontend.*|cortex|tempo|tempo-read.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval])) or vector(0)) / sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/((query-frontend.*|cortex|tempo|tempo-read.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\",route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))", "instant": false, "legendFormat": "Reads", "range": true, @@ -186,7 +186,7 @@ "uid": "$prometheusds" }, "exemplar": false, - "expr": "((sum(rate(cortex_prometheus_rule_evaluation_failures_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval])) + sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) / sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))", + "expr": "((sum(rate(cortex_prometheus_rule_evaluation_failures_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval])) + sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) / sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))", "instant": false, "legendFormat": "Rule evaluations", "range": true, @@ -197,7 +197,7 @@ "uid": "$prometheusds" }, "exemplar": false, - "expr": "(((sum(rate(cortex_prometheus_notifications_errors_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) + ((sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{cluster=~\"$cluster\",job=~\"($namespace)/((alertmanager|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"})) or vector(0))) / (((sum(rate(cortex_prometheus_notifications_sent_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) + ((sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{cluster=~\"$cluster\",job=~\"($namespace)/((alertmanager|cortex|mimir|mimir-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"})) or vector(0)))", + "expr": "(((sum(rate(cortex_prometheus_notifications_errors_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) + ((sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{cluster=~\"$cluster\",job=~\"($namespace)/((alertmanager|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"})) or vector(0))) / (((sum(rate(cortex_prometheus_notifications_sent_total{cluster=~\"$cluster\",job=~\"($namespace)/((ruler|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval]))) or vector(0)) + ((sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{cluster=~\"$cluster\",job=~\"($namespace)/((alertmanager|cortex|tempo|tempo-backend.*))\",juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"})) or vector(0)))", "instant": false, "legendFormat": "Alerting notifications", "range": true, @@ -232,7 +232,7 @@ "id": 3, "options": { "alertInstanceLabelFilter": "juju_application=\"$juju_application\", juju_model=\"$juju_model\",juju_model_uuid=\"$juju_model_uuid\",juju_unit=\"$juju_unit\"", - "alertName": "Mimir", + "alertName": "Tempo", "dashboardAlerts": false, "groupBy": [], "groupMode": "default", @@ -304,7 +304,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "These panels show an overview on the write path. \nTo examine the write path in detail, see a specific dashboard:\n\n- Writes\n- Writes resources\n- Writes networking\n- Overview resources\n- Overview networking\n", + "content": "These panels show an overview on the write path. \nTo examine the write path in detail, see a specific dashboard:\n\n- Writes\n- Writes resources\n- Writes networking\n- Overview resources\n- Overview networking\n", "mode": "markdown" }, "pluginVersion": "9.2.1", @@ -682,7 +682,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "These panels show an overview on the read path. \nTo examine the read path in detail, see a specific dashboard:\n\n- Reads\n- Reads resources\n- Reads networking\n- Overview resources\n- Overview networking\n- Queries\n- Compactor\n", + "content": "These panels show an overview on the read path. \nTo examine the read path in detail, see a specific dashboard:\n\n- Reads\n- Reads resources\n- Reads networking\n- Overview resources\n- Overview networking\n- Queries\n- Compactor\n", "mode": "markdown" }, "pluginVersion": "9.2.1", @@ -1188,7 +1188,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "These panels show an overview on the recording and alerting rules evaluation.\nTo examine the rules evaluation and alerts notifications in detail, see a specific dashboard:\n\n- Ruler\n- Alertmanager\n- Alertmanager resources\n- Overview resources\n- Overview networking\n", + "content": "These panels show an overview on the recording and alerting rules evaluation.\nTo examine the rules evaluation and alerts notifications in detail, see a specific dashboard:\n\n- Ruler\n- Alertmanager\n- Alertmanager resources\n- Overview resources\n- Overview networking\n", "mode": "markdown" }, "pluginVersion": "9.2.1", @@ -1564,7 +1564,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "These panels show an overview on the long-term storage (object storage).\nTo examine the storage in detail, see a specific dashboard:\n\n- Object store\n- Compactor\n", + "content": "These panels show an overview on the long-term storage (object storage).\nTo examine the storage in detail, see a specific dashboard:\n\n- Object store\n- Compactor\n", "mode": "markdown" }, "pluginVersion": "9.2.1", @@ -1900,7 +1900,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -1908,10 +1908,10 @@ "current": { "selected": true, "text": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ], "value": [ - "juju_mimir-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" + "juju_tempo-self_ad905178-0b39-4676-891e-f2d33079ce73_prometheus_0" ] }, "hide": 0, @@ -2078,7 +2078,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Overview", + "title": "Tempo / Overview", "uid": "ffcd83628d7d4b5a03d1cafd159e6c9c", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-queries.json b/src/grafana_dashboards/tempo-queries.json similarity index 99% rename from src/grafana_dashboards/mimir-queries.json rename to src/grafana_dashboards/tempo-queries.json index b4c4711..16bad0c 100644 --- a/src/grafana_dashboards/mimir-queries.json +++ b/src/grafana_dashboards/tempo-queries.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -1205,7 +1205,7 @@ "uid": "${prometheusds}" }, "editorMode": "code", - "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|tempo|tempo-read.*))\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", @@ -3787,7 +3787,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -3979,7 +3979,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Queries", + "title": "Tempo / Queries", "uid": "b3abe8d5c040395cc36615cb4334c92d", "version": 1, "weekStart": "" diff --git a/src/grafana_dashboards/mimir-ruler.json b/src/grafana_dashboards/tempo-ruler.json similarity index 99% rename from src/grafana_dashboards/mimir-ruler.json rename to src/grafana_dashboards/tempo-ruler.json index a089ce9..6cf76f4 100644 --- a/src/grafana_dashboards/mimir-ruler.json +++ b/src/grafana_dashboards/tempo-ruler.json @@ -31,10 +31,10 @@ "includeVars": true, "keepTime": true, "tags": [ - "mimir" + "tempo" ], "targetBlank": false, - "title": "Mimir dashboards", + "title": "Tempo dashboards", "type": "dashboards" } ], @@ -2356,7 +2356,7 @@ "schemaVersion": 37, "style": "dark", "tags": [ - "mimir" + "tempo" ], "templating": { "list": [ @@ -2548,7 +2548,7 @@ ] }, "timezone": "utc", - "title": "Mimir / Ruler", + "title": "Tempo / Ruler", "uid": "631e15d5d85afb2ca8e35d62984eeaa0", "version": 1, "weekStart": "" diff --git a/src/nginx.py b/src/nginx.py index ea46692..c85ced0 100644 --- a/src/nginx.py +++ b/src/nginx.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Set import crossplane -from mimir_cluster import MimirClusterProvider +from tempo_cluster import TempoClusterProvider from ops import CharmBase from ops.pebble import Layer @@ -187,7 +187,7 @@ class Nginx: config_path = NGINX_CONFIG - def __init__(self, charm: CharmBase, cluster_provider: MimirClusterProvider, server_name: str): + def __init__(self, charm: CharmBase, cluster_provider: TempoClusterProvider, server_name: str): self._charm = charm self.cluster_provider = cluster_provider self.server_name = server_name @@ -240,7 +240,7 @@ def config(self, tls: bool = False) -> str: ], }, *self._log_verbose(verbose=False), - # mimir-related + # tempo-related {"directive": "sendfile", "args": ["on"]}, {"directive": "tcp_nopush", "args": ["on"]}, *self._resolver(custom_resolver=None), @@ -335,7 +335,7 @@ def _resolver(self, custom_resolver: Optional[List[Any]] = None) -> List[Dict[st def _basic_auth(self, enabled: bool) -> List[Optional[Dict[str, Any]]]: if enabled: return [ - {"directive": "auth_basic", "args": ['"Mimir"']}, + {"directive": "auth_basic", "args": ['"Tempo"']}, { "directive": "auth_basic_user_file", "args": ["/etc/nginx/secrets/.htpasswd"], diff --git a/src/prometheus_alert_rules/mimir_workers/alerts.yaml b/src/prometheus_alert_rules/tempo_workers/alerts.yaml similarity index 64% rename from src/prometheus_alert_rules/mimir_workers/alerts.yaml rename to src/prometheus_alert_rules/tempo_workers/alerts.yaml index 18253c9..b6706f1 100644 --- a/src/prometheus_alert_rules/mimir_workers/alerts.yaml +++ b/src/prometheus_alert_rules/tempo_workers/alerts.yaml @@ -1,24 +1,24 @@ # Obtained from: -# https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml +# https://github.com/grafana/tempo/blob/main/operations/tempo-mixin-compiled/alerts.yaml groups: -- name: mimir_alerts +- name: tempo_alerts rules: - - alert: MimirIngesterUnhealthy + - alert: TempoIngesterUnhealthy annotations: - message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ + message: Tempo cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ printf "%f" $value }} unhealthy ingester(s). - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterunhealthy expr: | min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 for: 15m labels: severity: critical - - alert: MimirRequestErrors + - alert: TempoRequestErrors annotations: message: | The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporequesterrors expr: | 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m])) / @@ -27,11 +27,11 @@ groups: for: 15m labels: severity: critical - - alert: MimirRequestLatency + - alert: TempoRequestLatency annotations: message: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporequestlatency expr: | cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} > @@ -39,11 +39,11 @@ groups: for: 15m labels: severity: warning - - alert: MimirQueriesIncorrect + - alert: TempoQueriesIncorrect annotations: message: | - The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect + The Tempo cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoqueriesincorrect expr: | 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) / @@ -51,52 +51,52 @@ groups: for: 15m labels: severity: warning - - alert: MimirInconsistentRuntimeConfig + - alert: TempoInconsistentRuntimeConfig annotations: message: | An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoinconsistentruntimeconfig expr: | count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 for: 1h labels: severity: critical - - alert: MimirBadRuntimeConfig + - alert: TempoBadRuntimeConfig annotations: message: | {{ $labels.job }} failed to reload runtime config. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempobadruntimeconfig expr: | # The metric value is reset to 0 on error while reloading the config at runtime. cortex_runtime_config_last_reload_successful == 0 for: 5m labels: severity: critical - - alert: MimirFrontendQueriesStuck + - alert: TempoFrontendQueriesStuck annotations: message: | There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempofrontendqueriesstuck expr: | sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 for: 5m labels: severity: critical - - alert: MimirSchedulerQueriesStuck + - alert: TempoSchedulerQueriesStuck annotations: message: | There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temposchedulerqueriesstuck expr: | sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 for: 7m labels: severity: critical - - alert: MimirCacheRequestErrors + - alert: TempoCacheRequestErrors annotations: message: | - The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors + The cache {{ $labels.name }} used by Tempo {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocacherequesterrors expr: | ( sum by(cluster, namespace, name, operation) ( @@ -114,15 +114,15 @@ groups: for: 5m labels: severity: warning - - alert: MimirIngesterRestarts + - alert: TempoIngesterRestarts annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterrestarts expr: | ( sum by(cluster, namespace, pod) ( - increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) + increase(kube_pod_container_status_restarts_total{container=~"(ingester|tempo-write)"}[30m]) ) >= 2 ) @@ -132,11 +132,11 @@ groups: ) labels: severity: warning - - alert: MimirKVStoreFailure + - alert: TempoKVStoreFailure annotations: message: | - Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + Tempo {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempokvstorefailure expr: | ( sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) @@ -148,21 +148,21 @@ groups: for: 5m labels: severity: critical - - alert: MimirMemoryMapAreasTooHigh + - alert: TempoMemoryMapAreasTooHigh annotations: message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas close to the limit.' - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempomemorymapareastoohigh expr: | - process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 + process_memory_map_areas{job=~".*/(ingester.*|cortex|tempo|tempo-write.*|store-gateway.*|cortex|tempo|tempo-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|tempo|tempo-write.*|store-gateway.*|cortex|tempo|tempo-backend.*)"} > 0.8 for: 5m labels: severity: critical - - alert: MimirIngesterInstanceHasNoTenants + - alert: TempoIngesterInstanceHasNoTenants annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no tenants assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterinstancehasnotenants expr: | (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) and on (cluster, namespace) @@ -175,14 +175,14 @@ groups: for: 1h labels: severity: warning - - alert: MimirRulerInstanceHasNoRuleGroups + - alert: TempoRulerInstanceHasNoRuleGroups annotations: - message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no rule groups assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulerinstancehasnorulegroups expr: | # Alert on ruler instances in microservices mode that have no rule groups assigned, - min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*mimir-)?ruler.*"}) == 0 + min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*tempo-)?ruler.*"}) == 0 # but only if other ruler instances of the same cell do have rule groups assigned and on (cluster, namespace) (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) @@ -192,11 +192,11 @@ groups: for: 1h labels: severity: warning - - alert: MimirIngestedDataTooFarInTheFuture + - alert: TempoIngestedDataTooFarInTheFuture annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has ingested samples with timestamps more than 1h in the future. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesteddatatoofarinthefuture expr: | max by(cluster, namespace, pod) ( cortex_ingester_tsdb_head_max_timestamp_seconds - time() @@ -206,26 +206,26 @@ groups: for: 5m labels: severity: warning - - alert: MimirStoreGatewayTooManyFailedOperations + - alert: TempoStoreGatewayTooManyFailedOperations annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempostoregatewaytoomanyfailedoperations expr: | sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 for: 5m labels: severity: warning - - alert: MimirRingMembersMismatch + - alert: TempoRingMembersMismatch annotations: message: | - Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + Number of members in Tempo ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporingmembersmismatch expr: | ( - avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) - != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) + avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|tempo|tempo-write.*)"})) + != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|tempo|tempo-write.*)"}) ) and ( @@ -235,13 +235,13 @@ groups: labels: component: ingester severity: warning -- name: mimir_instance_limits_alerts +- name: tempo_instance_limits_alerts rules: - - alert: MimirIngesterReachingSeriesLimit + - alert: TempoIngesterReachingSeriesLimit annotations: message: | Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterreachingserieslimit expr: | ( (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) @@ -251,11 +251,11 @@ groups: for: 3h labels: severity: warning - - alert: MimirIngesterReachingSeriesLimit + - alert: TempoIngesterReachingSeriesLimit annotations: message: | Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterreachingserieslimit expr: | ( (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) @@ -265,11 +265,11 @@ groups: for: 5m labels: severity: critical - - alert: MimirIngesterReachingTenantsLimit + - alert: TempoIngesterReachingTenantsLimit annotations: message: | Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterreachingtenantslimit expr: | ( (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) @@ -279,11 +279,11 @@ groups: for: 5m labels: severity: warning - - alert: MimirIngesterReachingTenantsLimit + - alert: TempoIngesterReachingTenantsLimit annotations: message: | Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterreachingtenantslimit expr: | ( (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) @@ -293,22 +293,22 @@ groups: for: 5m labels: severity: critical - - alert: MimirReachingTCPConnectionsLimit + - alert: TempoReachingTCPConnectionsLimit annotations: message: | - Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + Tempo instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporeachingtcpconnectionslimit expr: | cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and cortex_tcp_connections_limit > 0 for: 5m labels: severity: critical - - alert: MimirDistributorReachingInflightPushRequestLimit + - alert: TempoDistributorReachingInflightPushRequestLimit annotations: message: | Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempodistributorreachinginflightpushrequestlimit expr: | ( (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) @@ -318,13 +318,13 @@ groups: for: 5m labels: severity: critical -- name: mimir-rollout-alerts +- name: tempo-rollout-alerts rules: - - alert: MimirRolloutStuck + - alert: TempoRolloutStuck annotations: message: | The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporolloutstuck expr: | ( max without (revision) ( @@ -348,11 +348,11 @@ groups: labels: severity: warning workload_type: statefulset - - alert: MimirRolloutStuck + - alert: TempoRolloutStuck annotations: message: | The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporolloutstuck expr: | ( sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) @@ -372,47 +372,47 @@ groups: annotations: message: | Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#rolloutoperatornotreconciling expr: | max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 for: 5m labels: severity: critical -- name: mimir-provisioning +- name: tempo-provisioning rules: - - alert: MimirAllocatingTooMuchMemory + - alert: TempoAllocatingTooMuchMemory annotations: message: | Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoallocatingtoomuchmemory expr: | ( # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + # See: https://github.com/grafana/tempo/issues/2466 + container_memory_rss{container=~"(ingester|tempo-write|tempo-backend)"} / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ( container_spec_memory_limit_bytes{container=~"(ingester|tempo-write|tempo-backend)"} > 0 ) ) - # Match only Mimir namespaces. + # Match only Tempo namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) > 0.65 for: 15m labels: severity: warning - - alert: MimirAllocatingTooMuchMemory + - alert: TempoAllocatingTooMuchMemory annotations: message: | Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoallocatingtoomuchmemory expr: | ( # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + # See: https://github.com/grafana/tempo/issues/2466 + container_memory_rss{container=~"(ingester|tempo-write|tempo-backend)"} / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ( container_spec_memory_limit_bytes{container=~"(ingester|tempo-write|tempo-backend)"} > 0 ) ) - # Match only Mimir namespaces. + # Match only Tempo namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) > 0.8 for: 15m @@ -420,11 +420,11 @@ groups: severity: critical - name: ruler_alerts rules: - - alert: MimirRulerTooManyFailedPushes + - alert: TempoRulerTooManyFailedPushes annotations: message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + Tempo Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulertoomanyfailedpushes expr: | 100 * ( sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) @@ -434,11 +434,11 @@ groups: for: 5m labels: severity: critical - - alert: MimirRulerTooManyFailedQueries + - alert: TempoRulerTooManyFailedQueries annotations: message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + Tempo Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulertoomanyfailedqueries expr: | 100 * ( sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) @@ -448,11 +448,11 @@ groups: for: 5m labels: severity: critical - - alert: MimirRulerMissedEvaluations + - alert: TempoRulerMissedEvaluations annotations: message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + Tempo Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulermissedevaluations expr: | 100 * ( sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) @@ -462,22 +462,22 @@ groups: for: 5m labels: severity: warning - - alert: MimirRulerFailedRingCheck + - alert: TempoRulerFailedRingCheck annotations: message: | - Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + Tempo Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulerfailedringcheck expr: | sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) > 0 for: 5m labels: severity: critical - - alert: MimirRulerRemoteEvaluationFailing + - alert: TempoRulerRemoteEvaluationFailing annotations: message: | - Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + Tempo rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#temporulerremoteevaluationfailing expr: | 100 * ( sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) @@ -489,27 +489,27 @@ groups: severity: warning - name: gossip_alerts rules: - - alert: MimirGossipMembersTooHigh + - alert: TempoGossipMembersTooHigh annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace + message: One or more Tempo instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a higher than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempogossipmemberstoohigh expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|tempo|tempo-write.*|tempo-read.*|tempo-backend.*)"}) + 10) for: 20m labels: severity: warning - - alert: MimirGossipMembersTooLow + - alert: TempoGossipMembersTooLow annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace + message: One or more Tempo instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a lower than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempogossipmemberstoolow expr: | min by (cluster, namespace) (memberlist_client_cluster_members_count) < - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|tempo|tempo-write.*|tempo-read.*|tempo-backend.*)"}) * 0.5) for: 20m labels: severity: warning @@ -519,7 +519,7 @@ groups: annotations: message: | Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#etcdallocatingtoomuchmemory expr: | ( container_memory_working_set_bytes{container="etcd"} @@ -533,7 +533,7 @@ groups: annotations: message: | Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#etcdallocatingtoomuchmemory expr: | ( container_memory_working_set_bytes{container="etcd"} @@ -545,70 +545,70 @@ groups: severity: critical - name: alertmanager_alerts rules: - - alert: MimirAlertmanagerSyncConfigsFailing + - alert: TempoAlertmanagerSyncConfigsFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagersyncconfigsfailing expr: | rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 for: 30m labels: severity: critical - - alert: MimirAlertmanagerRingCheckFailing + - alert: TempoAlertmanagerRingCheckFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerringcheckfailing expr: | rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 for: 10m labels: severity: critical - - alert: MimirAlertmanagerPartialStateMergeFailing + - alert: TempoAlertmanagerPartialStateMergeFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerpartialstatemergefailing expr: | rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 for: 10m labels: severity: critical - - alert: MimirAlertmanagerReplicationFailing + - alert: TempoAlertmanagerReplicationFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicating partial state to its replicas. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicating partial state to its replicas. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerreplicationfailing expr: | rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 for: 10m labels: severity: critical - - alert: MimirAlertmanagerPersistStateFailing + - alert: TempoAlertmanagerPersistStateFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerpersiststatefailing expr: | rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 for: 1h labels: severity: critical - - alert: MimirAlertmanagerInitialSyncFailed + - alert: TempoAlertmanagerInitialSyncFailed annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed + Tempo Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerinitialsyncfailed expr: | increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 labels: severity: critical - - alert: MimirAlertmanagerAllocatingTooMuchMemory + - alert: TempoAlertmanagerAllocatingTooMuchMemory annotations: message: | Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerallocatingtoomuchmemory expr: | (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 and @@ -616,11 +616,11 @@ groups: for: 15m labels: severity: warning - - alert: MimirAlertmanagerAllocatingTooMuchMemory + - alert: TempoAlertmanagerAllocatingTooMuchMemory annotations: message: | Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerallocatingtoomuchmemory expr: | (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 and @@ -628,27 +628,27 @@ groups: for: 15m labels: severity: critical - - alert: MimirAlertmanagerInstanceHasNoTenants + - alert: TempoAlertmanagerInstanceHasNoTenants annotations: - message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} owns no tenants. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoalertmanagerinstancehasnotenants expr: | # Alert on alertmanager instances in microservices mode that own no tenants, - min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*mimir-)?alertmanager.*"}) == 0 + min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*tempo-)?alertmanager.*"}) == 0 # but only if other instances of the same cell do have tenants assigned. and on (cluster, namespace) max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 for: 1h labels: severity: warning -- name: mimir_blocks_alerts +- name: tempo_blocks_alerts rules: - - alert: MimirIngesterHasNotShippedBlocks + - alert: TempoIngesterHasNotShippedBlocks annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterhasnotshippedblocks expr: | (min by(cluster, namespace, pod) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) and @@ -665,11 +665,11 @@ groups: for: 15m labels: severity: critical - - alert: MimirIngesterHasNotShippedBlocksSinceStart + - alert: TempoIngesterHasNotShippedBlocksSinceStart annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterhasnotshippedblockssincestart expr: | (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) and @@ -677,12 +677,12 @@ groups: for: 4h labels: severity: critical - - alert: MimirIngesterHasUnshippedBlocks + - alert: TempoIngesterHasUnshippedBlocks annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingesterhasunshippedblocks expr: | (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) and @@ -690,57 +690,57 @@ groups: for: 15m labels: severity: critical - - alert: MimirIngesterTSDBHeadCompactionFailed + - alert: TempoIngesterTSDBHeadCompactionFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbheadcompactionfailed expr: | rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 for: 15m labels: severity: critical - - alert: MimirIngesterTSDBHeadTruncationFailed + - alert: TempoIngesterTSDBHeadTruncationFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbheadtruncationfailed expr: | rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 labels: severity: critical - - alert: MimirIngesterTSDBCheckpointCreationFailed + - alert: TempoIngesterTSDBCheckpointCreationFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbcheckpointcreationfailed expr: | rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 labels: severity: critical - - alert: MimirIngesterTSDBCheckpointDeletionFailed + - alert: TempoIngesterTSDBCheckpointDeletionFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbcheckpointdeletionfailed expr: | rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 labels: severity: critical - - alert: MimirIngesterTSDBWALTruncationFailed + - alert: TempoIngesterTSDBWALTruncationFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbwaltruncationfailed expr: | rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 labels: severity: warning - - alert: MimirIngesterTSDBWALCorrupted + - alert: TempoIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbwalcorrupted expr: | # alert when there are more than one corruptions count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 @@ -750,11 +750,11 @@ groups: labels: deployment: single-zone severity: critical - - alert: MimirIngesterTSDBWALCorrupted + - alert: TempoIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbwalcorrupted expr: | # alert when there are more than one corruptions count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 @@ -764,22 +764,22 @@ groups: labels: deployment: multi-zone severity: critical - - alert: MimirIngesterTSDBWALWritesFailed + - alert: TempoIngesterTSDBWALWritesFailed annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoingestertsdbwalwritesfailed expr: | rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 for: 3m labels: severity: critical - - alert: MimirStoreGatewayHasNotSyncTheBucket + - alert: TempoStoreGatewayHasNotSyncTheBucket annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully synched the bucket since {{ $value | humanizeDuration }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempostoregatewayhasnotsyncthebucket expr: | (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) and @@ -787,33 +787,33 @@ groups: for: 5m labels: severity: critical - - alert: MimirStoreGatewayNoSyncedTenants + - alert: TempoStoreGatewayNoSyncedTenants annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not syncing any blocks for any tenant. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempostoregatewaynosyncedtenants expr: | min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 for: 1h labels: severity: warning - - alert: MimirBucketIndexNotUpdated + - alert: TempoBucketIndexNotUpdated annotations: - message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster + message: Tempo bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempobucketindexnotupdated expr: | min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 labels: severity: critical -- name: mimir_compactor_alerts +- name: tempo_compactor_alerts rules: - - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + - alert: TempoCompactorHasNotSuccessfullyCleanedUpBlocks annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully cleaned up blocks in the last 6 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotsuccessfullycleanedupblocks expr: | # The "last successful run" metric is updated even if the compactor owns no tenants, # so this alert correctly doesn't fire if compactor has nothing to do. @@ -821,11 +821,11 @@ groups: for: 1h labels: severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction + - alert: TempoCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotsuccessfullyruncompaction expr: | # The "last successful run" metric is updated even if the compactor owns no tenants, # so this alert correctly doesn't fire if compactor has nothing to do. @@ -836,11 +836,11 @@ groups: labels: reason: in-last-24h severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction + - alert: TempoCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotsuccessfullyruncompaction expr: | # The "last successful run" metric is updated even if the compactor owns no tenants, # so this alert correctly doesn't fire if compactor has nothing to do. @@ -849,80 +849,80 @@ groups: labels: reason: since-startup severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction + - alert: TempoCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed to run 2 consecutive compactions. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotsuccessfullyruncompaction expr: | increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 labels: reason: consecutive-failures severity: critical - - alert: MimirCompactorHasNotUploadedBlocks + - alert: TempoCompactorHasNotUploadedBlocks annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotuploadedblocks expr: | (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) and (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) and # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). + # (e.g. there are more replicas than required because running as part of tempo-backend). (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) for: 15m labels: severity: critical time_period: 24h - - alert: MimirCompactorHasNotUploadedBlocks + - alert: TempoCompactorHasNotUploadedBlocks annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block since its start. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorhasnotuploadedblocks expr: | (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) and # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). + # (e.g. there are more replicas than required because running as part of tempo-backend). (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) for: 24h labels: severity: critical time_period: since-start - - alert: MimirCompactorSkippedUnhealthyBlocks + - alert: TempoCompactorSkippedUnhealthyBlocks annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorskippedunhealthyblocks expr: | increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 for: 1m labels: severity: warning - - alert: MimirCompactorSkippedUnhealthyBlocks + - alert: TempoCompactorSkippedUnhealthyBlocks annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Tempo Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocompactorskippedunhealthyblocks expr: | increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 for: 30m labels: severity: critical -- name: mimir_autoscaling +- name: tempo_autoscaling rules: - - alert: MimirAutoscalerNotActive + - alert: TempoAutoscalerNotActive annotations: message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler }} in {{ $labels.namespace }} is not active. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoautoscalernotactive expr: | ( label_replace(( kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} - # Match only Mimir namespaces. + # Match only Tempo namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) # Add "metric" label. + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") @@ -938,50 +938,50 @@ groups: for: 1h labels: severity: critical - - alert: MimirAutoscalerKedaFailing + - alert: TempoAutoscalerKedaFailing annotations: message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace }} is experiencing errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempoautoscalerkedafailing expr: | ( # Find KEDA scalers reporting errors. label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") - # Match only Mimir namespaces. + # Match only Tempo namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) ) > 0 for: 1h labels: severity: critical -- name: mimir_continuous_test +- name: tempo_continuous_test rules: - - alert: MimirContinuousTestNotRunningOnWrites + - alert: TempoContinuousTestNotRunningOnWrites annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + message: Tempo continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because writes are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocontinuoustestnotrunningonwrites expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + sum by(cluster, namespace, test) (rate(tempo_continuous_test_writes_failed_total[5m])) > 0 for: 1h labels: severity: warning - - alert: MimirContinuousTestNotRunningOnReads + - alert: TempoContinuousTestNotRunningOnReads annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + message: Tempo continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because queries are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocontinuoustestnotrunningonreads expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + sum by(cluster, namespace, test) (rate(tempo_continuous_test_queries_failed_total[5m])) > 0 for: 1h labels: severity: warning - - alert: MimirContinuousTestFailed + - alert: TempoContinuousTestFailed annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ + message: Tempo continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed when asserting query results. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + runbook_url: https://grafana.com/docs/tempo/latest/operators-guide/tempo-runbooks/#tempocontinuoustestfailed expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + sum by(cluster, namespace, test) (rate(tempo_continuous_test_query_result_checks_failed_total[10m])) > 0 labels: severity: warning diff --git a/src/prometheus_alert_rules/mimir_workers/rules.yaml b/src/prometheus_alert_rules/tempo_workers/rules.yaml similarity index 98% rename from src/prometheus_alert_rules/mimir_workers/rules.yaml rename to src/prometheus_alert_rules/tempo_workers/rules.yaml index b8b6165..a02e6b6 100644 --- a/src/prometheus_alert_rules/mimir_workers/rules.yaml +++ b/src/prometheus_alert_rules/tempo_workers/rules.yaml @@ -1,8 +1,8 @@ # Obtained from: -# https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/rules.yaml +# https://github.com/grafana/tempo/blob/main/operations/tempo-mixin-compiled/rules.yaml groups: -- name: mimir_api_1 +- name: tempo_api_1 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) @@ -19,7 +19,7 @@ groups: record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) record: cluster_job:cortex_request_duration_seconds_count:sum_rate -- name: mimir_api_2 +- name: tempo_api_2 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) @@ -37,7 +37,7 @@ groups: record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate -- name: mimir_api_3 +- name: tempo_api_3 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) @@ -58,7 +58,7 @@ groups: - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate -- name: mimir_querier_api +- name: tempo_querier_api rules: - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) @@ -117,7 +117,7 @@ groups: - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate -- name: mimir_cache +- name: tempo_cache rules: - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)) @@ -174,7 +174,7 @@ groups: - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method) record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate -- name: mimir_storage +- name: tempo_storage rules: - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) @@ -192,7 +192,7 @@ groups: record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate -- name: mimir_queries +- name: tempo_queries rules: - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) @@ -228,7 +228,7 @@ groups: - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate -- name: mimir_ingester_queries +- name: tempo_ingester_queries rules: - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) @@ -276,32 +276,32 @@ groups: record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate -- name: mimir_received_samples +- name: tempo_received_samples rules: - expr: | sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) record: cluster_namespace_job:cortex_distributor_received_samples:rate5m -- name: mimir_exemplars_in +- name: tempo_exemplars_in rules: - expr: | sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m -- name: mimir_received_exemplars +- name: tempo_received_exemplars rules: - expr: | sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m -- name: mimir_exemplars_ingested +- name: tempo_exemplars_ingested rules: - expr: | sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m -- name: mimir_exemplars_appended +- name: tempo_exemplars_appended rules: - expr: | sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m -- name: mimir_scaling_rules +- name: tempo_scaling_rules rules: - expr: | # Convenience rule to get the number of replicas for both a deployment and a statefulset. @@ -535,7 +535,7 @@ groups: labels: reason: memory_usage record: cluster_namespace_deployment_reason:required_replicas:count -- name: mimir_alertmanager_rules +- name: tempo_alertmanager_rules rules: - expr: | sum by (cluster, job, pod) (cortex_alertmanager_alerts) @@ -567,7 +567,7 @@ groups: - expr: | sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m -- name: mimir_ingester_rules +- name: tempo_ingester_rules rules: - expr: | sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) diff --git a/src/mimir_cluster.py b/src/tempo_cluster.py similarity index 76% rename from src/mimir_cluster.py rename to src/tempo_cluster.py index 7e7519d..28b19d0 100644 --- a/src/mimir_cluster.py +++ b/src/tempo_cluster.py @@ -2,9 +2,9 @@ # Copyright 2024 Canonical # See LICENSE file for licensing details. -"""This module contains an endpoint wrapper class for the provider side of the ``mimir-cluster`` relation. +"""This module contains an endpoint wrapper class for the provider side of the ``tempo-cluster`` relation. -As this relation is cluster-internal and not intended for third-party charms to interact with `mimir-coordinator-k8s`, its only user will be the mimir-coordinator-k8s charm. As such, it does not live in a charm lib as most other relation endpoint wrappers do. +As this relation is cluster-internal and not intended for third-party charms to interact with `tempo-coordinator-k8s`, its only user will be the tempo-coordinator-k8s charm. As such, it does not live in a charm lib as most other relation endpoint wrappers do. """ @@ -19,18 +19,18 @@ from ops import Object from pydantic import BaseModel, ConfigDict -log = logging.getLogger("mimir_cluster") +log = logging.getLogger("tempo_cluster") -DEFAULT_ENDPOINT_NAME = "mimir-cluster" +DEFAULT_ENDPOINT_NAME = "tempo-cluster" BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} -MIMIR_CONFIG_FILE = "/etc/mimir/mimir-config.yaml" -MIMIR_CERT_FILE = "/etc/mimir/server.cert" -MIMIR_KEY_FILE = "/etc/mimir/private.key" -MIMIR_CLIENT_CA_FILE = "/etc/mimir/ca.cert" +TEMPO_CONFIG_FILE = "/etc/tempo/tempo-config.yaml" +TEMPO_CERT_FILE = "/etc/tempo/server.cert" +TEMPO_KEY_FILE = "/etc/tempo/private.key" +TEMPO_CLIENT_CA_FILE = "/etc/tempo/ca.cert" -class MimirRole(str, Enum): - """Mimir component role names.""" +class TempoRole(str, Enum): + """Tempo component role names.""" overrides_exporter = "overrides-exporter" query_scheduler = "query-scheduler" @@ -52,21 +52,21 @@ class MimirRole(str, Enum): META_ROLES = { - MimirRole.read: (MimirRole.query_frontend, MimirRole.querier), - MimirRole.write: (MimirRole.distributor, MimirRole.ingester), - MimirRole.backend: ( - MimirRole.store_gateway, - MimirRole.compactor, - MimirRole.ruler, - MimirRole.alertmanager, - MimirRole.query_scheduler, - MimirRole.overrides_exporter, + TempoRole.read: (TempoRole.query_frontend, TempoRole.querier), + TempoRole.write: (TempoRole.distributor, TempoRole.ingester), + TempoRole.backend: ( + TempoRole.store_gateway, + TempoRole.compactor, + TempoRole.ruler, + TempoRole.alertmanager, + TempoRole.query_scheduler, + TempoRole.overrides_exporter, ), - MimirRole.all: list(MimirRole), + TempoRole.all: list(TempoRole), } -def expand_roles(roles: Iterable[MimirRole]) -> Set[MimirRole]: +def expand_roles(roles: Iterable[TempoRole]) -> Set[TempoRole]: """Expand any meta roles to their 'atomic' equivalents.""" expanded_roles = set() for role in roles: @@ -77,8 +77,8 @@ def expand_roles(roles: Iterable[MimirRole]) -> Set[MimirRole]: return expanded_roles -class MimirClusterProvider(Object): - """``mimir-cluster`` provider endpoint wrapper.""" +class TempoClusterProvider(Object): + """``tempo-cluster`` provider endpoint wrapper.""" def __init__( self, @@ -92,25 +92,25 @@ def __init__( def publish_data( self, - mimir_config: Dict[str, Any], + tempo_config: Dict[str, Any], loki_endpoints: Optional[Dict[str, str]] = None, ) -> None: - """Publish the mimir config and loki endpoints to all related mimir worker clusters.""" + """Publish the tempo config and loki endpoints to all related tempo worker clusters.""" for relation in self._relations: if relation: - local_app_databag = MimirClusterProviderAppData( - mimir_config=mimir_config, loki_endpoints=loki_endpoints + local_app_databag = TempoClusterProviderAppData( + tempo_config=tempo_config, loki_endpoints=loki_endpoints ) local_app_databag.dump(relation.data[self.model.app]) - def gather_roles(self) -> Dict[MimirRole, int]: + def gather_roles(self) -> Dict[TempoRole, int]: """Go through the worker's app databags and sum the available application roles.""" data = {} for relation in self._relations: if relation.app: remote_app_databag = relation.data[relation.app] try: - worker_roles: List[MimirRole] = MimirClusterRequirerAppData.load( + worker_roles: List[TempoRole] = TempoClusterRequirerAppData.load( remote_app_databag ).roles except DataValidationError as e: @@ -135,7 +135,7 @@ def gather_addresses_by_role(self) -> Dict[str, Set[str]]: continue try: - worker_app_data = MimirClusterRequirerAppData.load(relation.data[relation.app]) + worker_app_data = TempoClusterRequirerAppData.load(relation.data[relation.app]) worker_roles = set(worker_app_data.roles) except DataValidationError as e: log.info(f"invalid databag contents: {e}") @@ -143,7 +143,7 @@ def gather_addresses_by_role(self) -> Dict[str, Set[str]]: for worker_unit in relation.units: try: - worker_data = MimirClusterRequirerUnitData.load(relation.data[worker_unit]) + worker_data = TempoClusterRequirerUnitData.load(relation.data[worker_unit]) unit_address = worker_data.address for role in worker_roles: data[role].add(unit_address) @@ -177,7 +177,7 @@ def gather_topology(self) -> List[Dict[str, str]]: for worker_unit in relation.units: try: - worker_data = MimirClusterRequirerUnitData.load(relation.data[worker_unit]) + worker_data = TempoClusterRequirerUnitData.load(relation.data[worker_unit]) unit_address = worker_data.address except DataValidationError as e: log.info(f"invalid databag contents: {e}") @@ -255,36 +255,33 @@ class JujuTopology(pydantic.BaseModel): # ... -class MimirClusterProviderAppData(DatabagModel): - """MimirClusterProviderAppData.""" +class TempoClusterProviderAppData(DatabagModel): + """TempoClusterProviderAppData.""" - mimir_config: Dict[str, Any] + tempo_config: Dict[str, Any] loki_endpoints: Optional[Dict[str, str]] = None - # todo: validate with - # https://grafana.com/docs/mimir/latest/configure/about-configurations/#:~:text=Validate%20a%20configuration,or%20in%20a%20CI%20environment. - # caveat: only the requirer node can do it -class MimirClusterRequirerAppData(DatabagModel): - """MimirClusterRequirerAppData.""" +class TempoClusterRequirerAppData(DatabagModel): + """TempoClusterRequirerAppData.""" - roles: List[MimirRole] + roles: List[TempoRole] -class MimirClusterRequirerUnitData(DatabagModel): - """MimirClusterRequirerUnitData.""" +class TempoClusterRequirerUnitData(DatabagModel): + """TempoClusterRequirerUnitData.""" juju_topology: JujuTopology address: str -class MimirClusterError(Exception): +class TempoClusterError(Exception): """Base class for exceptions raised by this module.""" -class DataValidationError(MimirClusterError): +class DataValidationError(TempoClusterError): """Raised when relation databag validation fails.""" -class DatabagAccessPermissionError(MimirClusterError): +class DatabagAccessPermissionError(TempoClusterError): """Raised when a follower attempts to write leader settings.""" diff --git a/src/mimir_config.py b/src/tempo_config.py similarity index 94% rename from src/mimir_config.py rename to src/tempo_config.py index b892021..53ba27d 100644 --- a/src/mimir_config.py +++ b/src/tempo_config.py @@ -1,7 +1,7 @@ # Copyright 2023 Canonical # See LICENSE file for licensing details. -"""Helper module for interacting with the Mimir configuration.""" +"""Helper module for interacting with the Tempo configuration.""" import logging import re @@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass as pydantic_dataclass S3_RELATION_NAME = "s3" -BUCKET_NAME = "mimir" +BUCKET_NAME = "tempo" logger = logging.getLogger(__name__) @@ -146,12 +146,12 @@ def __post_init__(self): asdict(self).get("s3", "") and not self.backend != "s3" ): raise InvalidConfigurationError( - "Mimir `backend` type must include a configuration block which matches that type" + "Tempo `backend` type must include a configuration block which matches that type" ) -class MimirBaseConfig(BaseModel): - """Base class for mimir config schema.""" +class TempoBaseConfig(BaseModel): + """Base class for tempo config schema.""" target: str memberlist: Memberlist diff --git a/src/mimir_coordinator.py b/src/tempo_coordinator.py similarity index 80% rename from src/mimir_coordinator.py rename to src/tempo_coordinator.py index ed0c259..01b60ee 100644 --- a/src/mimir_coordinator.py +++ b/src/tempo_coordinator.py @@ -2,52 +2,52 @@ # Copyright 2023 Canonical # See LICENSE file for licensing details. -"""Mimir coordinator.""" +"""Tempo coordinator.""" import logging from collections import Counter from pathlib import Path from typing import Any, Dict, Iterable, Optional, Set -from mimir_cluster import ( - MIMIR_CERT_FILE, - MIMIR_CLIENT_CA_FILE, - MIMIR_KEY_FILE, - MimirClusterProvider, - MimirRole, +from tempo_cluster import ( + TEMPO_CERT_FILE, + TEMPO_CLIENT_CA_FILE, + TEMPO_KEY_FILE, + TempoClusterProvider, + TempoRole, ) -from mimir_config import _S3ConfigData +from tempo_config import _S3ConfigData logger = logging.getLogger(__name__) MINIMAL_DEPLOYMENT = { # from official docs: - MimirRole.compactor: 1, - MimirRole.distributor: 1, - MimirRole.ingester: 1, - MimirRole.querier: 1, - MimirRole.query_frontend: 1, - MimirRole.query_scheduler: 1, - MimirRole.store_gateway: 1, + TempoRole.compactor: 1, + TempoRole.distributor: 1, + TempoRole.ingester: 1, + TempoRole.querier: 1, + TempoRole.query_frontend: 1, + TempoRole.query_scheduler: 1, + TempoRole.store_gateway: 1, # we add: - MimirRole.ruler: 1, - MimirRole.alertmanager: 1, + TempoRole.ruler: 1, + TempoRole.alertmanager: 1, } """The minimal set of roles that need to be allocated for the -deployment to be considered consistent (otherwise we set blocked). On top of what mimir itself lists as required, +deployment to be considered consistent (otherwise we set blocked). On top of what tempo itself lists as required, we add alertmanager.""" RECOMMENDED_DEPLOYMENT = Counter( { - MimirRole.ingester: 3, - MimirRole.querier: 2, - MimirRole.query_scheduler: 2, - MimirRole.alertmanager: 1, - MimirRole.query_frontend: 1, - MimirRole.ruler: 1, - MimirRole.store_gateway: 1, - MimirRole.compactor: 1, - MimirRole.distributor: 1, + TempoRole.ingester: 3, + TempoRole.querier: 2, + TempoRole.query_scheduler: 2, + TempoRole.alertmanager: 1, + TempoRole.query_frontend: 1, + TempoRole.ruler: 1, + TempoRole.store_gateway: 1, + TempoRole.compactor: 1, + TempoRole.distributor: 1, } ) """The set of roles that need to be allocated for the @@ -60,12 +60,12 @@ DEFAULT_REPLICATION = 3 -class MimirCoordinator: - """Mimir coordinator.""" +class TempoCoordinator: + """Tempo coordinator.""" def __init__( self, - cluster_provider: MimirClusterProvider, + cluster_provider: TempoClusterProvider, # TODO: use and import tls requirer obj tls_requirer: Any = None, # TODO: use and import s3 requirer obj @@ -81,13 +81,13 @@ def __init__( self._recovery_data_dir = recovery_data_dir def is_coherent(self) -> bool: - """Return True if the roles list makes up a coherent mimir deployment.""" - roles: Iterable[MimirRole] = self._cluster_provider.gather_roles().keys() + """Return True if the roles list makes up a coherent tempo deployment.""" + roles: Iterable[TempoRole] = self._cluster_provider.gather_roles().keys() return set(roles).issuperset(MINIMAL_DEPLOYMENT) - def missing_roles(self) -> Set[MimirRole]: + def missing_roles(self) -> Set[TempoRole]: """If the coordinator is incoherent, return the roles that are missing for it to become so.""" - roles: Iterable[MimirRole] = self._cluster_provider.gather_roles().keys() + roles: Iterable[TempoRole] = self._cluster_provider.gather_roles().keys() return set(MINIMAL_DEPLOYMENT).difference(roles) def is_recommended(self) -> bool: @@ -95,7 +95,7 @@ def is_recommended(self) -> bool: I.E. If all required roles are assigned, and each role has the recommended amount of units. """ - roles: Dict[MimirRole, int] = self._cluster_provider.gather_roles() + roles: Dict[TempoRole, int] = self._cluster_provider.gather_roles() # python>=3.11 would support roles >= RECOMMENDED_DEPLOYMENT for role, min_n in RECOMMENDED_DEPLOYMENT.items(): if roles.get(role, 0) < min_n: @@ -105,11 +105,11 @@ def is_recommended(self) -> bool: def build_config( self, s3_config_data: Optional[_S3ConfigData], tls_enabled: bool = False ) -> Dict[str, Any]: - """Generate shared config file for mimir. + """Generate shared config file for tempo. - Reference: https://grafana.com/docs/mimir/latest/configure/ + Reference: https://grafana.com/docs/tempo/latest/configure/ """ - mimir_config: Dict[str, Any] = { + tempo_config: Dict[str, Any] = { "common": {}, "alertmanager": self._build_alertmanager_config(), "alertmanager_storage": self._build_alertmanager_storage_config(), @@ -123,22 +123,22 @@ def build_config( } if s3_config_data: - mimir_config["common"]["storage"] = self._build_s3_storage_config(s3_config_data) - self._update_s3_storage_config(mimir_config["blocks_storage"], "blocks") - self._update_s3_storage_config(mimir_config["ruler_storage"], "rules") - self._update_s3_storage_config(mimir_config["alertmanager_storage"], "alerts") + tempo_config["common"]["storage"] = self._build_s3_storage_config(s3_config_data) + self._update_s3_storage_config(tempo_config["blocks_storage"], "blocks") + self._update_s3_storage_config(tempo_config["ruler_storage"], "rules") + self._update_s3_storage_config(tempo_config["alertmanager_storage"], "alerts") # todo: TLS config for memberlist if tls_enabled: - mimir_config["server"] = self._build_tls_config() + tempo_config["server"] = self._build_tls_config() - return mimir_config + return tempo_config def _build_tls_config(self) -> Dict[str, Any]: tls_config = { - "cert_file": MIMIR_CERT_FILE, - "key_file": MIMIR_KEY_FILE, - "client_ca_file": MIMIR_CLIENT_CA_FILE, + "cert_file": TEMPO_CERT_FILE, + "key_file": TEMPO_KEY_FILE, + "client_ca_file": TEMPO_CLIENT_CA_FILE, "client_auth_type": "RequestClientCert", } return { @@ -147,14 +147,14 @@ def _build_tls_config(self) -> Dict[str, Any]: } # data_dir: - # The Mimir Alertmanager stores the alerts state on local disk at the location configured using -alertmanager.storage.path. + # The Tempo Alertmanager stores the alerts state on local disk at the location configured using -alertmanager.storage.path. # Should be persisted if not replicated # sharding_ring.replication_factor: int # (advanced) The replication factor to use when sharding the alertmanager. def _build_alertmanager_config(self) -> Dict[str, Any]: alertmanager_scale = len( - self._cluster_provider.gather_addresses_by_role().get(MimirRole.alertmanager, []) + self._cluster_provider.gather_addresses_by_role().get(TempoRole.alertmanager, []) ) return { "data_dir": str(self._root_data_dir / "data-alertmanager"), @@ -166,7 +166,7 @@ def _build_alertmanager_config(self) -> Dict[str, Any]: } # filesystem: dir - # The Mimir Alertmanager also periodically stores the alert state in the storage backend configured with -alertmanager-storage.backend (For Recovery) + # The Tempo Alertmanager also periodically stores the alert state in the storage backend configured with -alertmanager-storage.backend (For Recovery) def _build_alertmanager_storage_config(self) -> Dict[str, Any]: return { "filesystem": { @@ -188,7 +188,7 @@ def _build_compactor_config(self) -> Dict[str, Any]: # microservices mode. def _build_ingester_config(self) -> Dict[str, Any]: ingester_scale = len( - self._cluster_provider.gather_addresses_by_role().get(MimirRole.ingester, []) + self._cluster_provider.gather_addresses_by_role().get(TempoRole.ingester, []) ) return { "ring": { @@ -212,7 +212,7 @@ def _build_ruler_config(self) -> Dict[str, Any]: # microservices mode. def _build_store_gateway_config(self) -> Dict[str, Any]: store_gateway_scale = len( - self._cluster_provider.gather_addresses_by_role().get(MimirRole.store_gateway, []) + self._cluster_provider.gather_addresses_by_role().get(TempoRole.store_gateway, []) ) return { "sharding_ring": { @@ -237,7 +237,7 @@ def _build_ruler_storage_config(self) -> Dict[str, Any]: # required to be persisted between restarts, but it's highly recommended # filesystem: dir - # Mimir upload blocks (of metrics) to the object storage at period interval. + # Tempo upload blocks (of metrics) to the object storage at period interval. # tsdb: dir # Directory to store TSDBs (including WAL) in the ingesters. diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ef1536b..6c08a60 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -51,7 +51,7 @@ async def wrapper(*args, **kwargs): @pytest.fixture(scope="module") @timed_memoizer -async def mimir_charm(ops_test): - """Mimir charm used for integration testing.""" +async def tempo_charm(ops_test): + """Tempo charm used for integration testing.""" charm = await ops_test.build_charm(".") return charm diff --git a/tests/integration/test_self_monitoring.py b/tests/integration/test_self_monitoring.py index a2605b7..f86f919 100644 --- a/tests/integration/test_self_monitoring.py +++ b/tests/integration/test_self_monitoring.py @@ -44,7 +44,7 @@ async def test_build_and_deploy(ops_test: OpsTest): scale: 1 trust: true read: - charm: mimir-worker-k8s + charm: tempo-worker-k8s channel: edge scale: 1 constraints: arch=amd64 @@ -58,7 +58,7 @@ async def test_build_and_deploy(ops_test: OpsTest): store-gateway: true trust: true write: - charm: mimir-worker-k8s + charm: tempo-worker-k8s channel: edge scale: 1 constraints: arch=amd64 @@ -70,10 +70,10 @@ async def test_build_and_deploy(ops_test: OpsTest): relations: - - prom:metrics-endpoint - coord:self-metrics-endpoint - - - coord:mimir-cluster - - read:mimir-cluster - - - coord:mimir-cluster - - write:mimir-cluster + - - coord:tempo-cluster + - read:tempo-cluster + - - coord:tempo-cluster + - write:tempo-cluster """ ) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 82c5a26..4e83b7a 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -17,23 +17,23 @@ METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) mc = SimpleNamespace(name="mc") -mimir_app_name = "coordinator" +tempo_app_name = "coordinator" ca_app_name = "ca" -app_names = [mimir_app_name, ca_app_name] +app_names = [tempo_app_name, ca_app_name] def get_nginx_config(ops_test: OpsTest): return get_workload_file( - ops_test.model_name, mimir_app_name, 0, "nginx", "/etc/nginx/nginx.conf" + ops_test.model_name, tempo_app_name, 0, "nginx", "/etc/nginx/nginx.conf" ) @pytest.mark.abort_on_fail async def test_nginx_config_has_ssl(ops_test: OpsTest): - mimir_charm = await ops_test.build_charm(".") + tempo_charm = await ops_test.build_charm(".") await asyncio.gather( ops_test.model.deploy( - mimir_charm, + tempo_charm, resources={ "nginx-image": oci_image("./metadata.yaml", "nginx-image"), "nginx-prometheus-exporter-image": oci_image( @@ -52,12 +52,12 @@ async def test_nginx_config_has_ssl(ops_test: OpsTest): ) await asyncio.gather( - ops_test.model.wait_for_idle(apps=[mimir_app_name], status="blocked"), + ops_test.model.wait_for_idle(apps=[tempo_app_name], status="blocked"), ops_test.model.wait_for_idle(apps=[ca_app_name], status="active"), ) - await ops_test.model.add_relation(mimir_app_name, ca_app_name) + await ops_test.model.add_relation(tempo_app_name, ca_app_name) await asyncio.gather( - ops_test.model.wait_for_idle(apps=[mimir_app_name], status="blocked"), + ops_test.model.wait_for_idle(apps=[tempo_app_name], status="blocked"), ops_test.model.wait_for_idle(apps=[ca_app_name], status="active"), ) diff --git a/tests/scenario/test_mimir_cluster_interface.py b/tests/scenario/test_tempo_cluster_interface.py similarity index 64% rename from tests/scenario/test_mimir_cluster_interface.py rename to tests/scenario/test_tempo_cluster_interface.py index 2f53b06..93f76e6 100644 --- a/tests/scenario/test_mimir_cluster_interface.py +++ b/tests/scenario/test_tempo_cluster_interface.py @@ -2,11 +2,11 @@ import ops import pytest -from mimir_cluster import ( - MimirClusterProvider, - MimirClusterRequirerAppData, - MimirClusterRequirerUnitData, - MimirRole, +from tempo_cluster import ( + TempoClusterProvider, + TempoClusterRequirerAppData, + TempoClusterRequirerUnitData, + TempoRole, ) from ops import Framework from scenario import Context, Relation, State @@ -15,44 +15,44 @@ class MyCharm(ops.CharmBase): META = { "name": "lukasz", - "requires": {"mimir-cluster-require": {"interface": "mimir_cluster"}}, - "provides": {"mimir-cluster-provide": {"interface": "mimir_cluster"}}, + "requires": {"tempo-cluster-require": {"interface": "tempo_cluster"}}, + "provides": {"tempo-cluster-provide": {"interface": "tempo_cluster"}}, } def __init__(self, framework: Framework): super().__init__(framework) - self.provider = MimirClusterProvider(self, endpoint="mimir-cluster-provide") + self.provider = TempoClusterProvider(self, endpoint="tempo-cluster-provide") @pytest.mark.parametrize( "workers_roles, expected", ( ( - (({MimirRole.overrides_exporter}, 1), ({MimirRole.overrides_exporter}, 1)), - ({MimirRole.overrides_exporter: 2}), + (({TempoRole.overrides_exporter}, 1), ({TempoRole.overrides_exporter}, 1)), + ({TempoRole.overrides_exporter: 2}), ), ( - (({MimirRole.query_frontend}, 1), ({MimirRole.overrides_exporter}, 1)), - ({MimirRole.overrides_exporter: 1, MimirRole.query_frontend: 1}), + (({TempoRole.query_frontend}, 1), ({TempoRole.overrides_exporter}, 1)), + ({TempoRole.overrides_exporter: 1, TempoRole.query_frontend: 1}), ), - ((({MimirRole.querier}, 2), ({MimirRole.querier}, 1)), ({MimirRole.querier: 3})), + ((({TempoRole.querier}, 2), ({TempoRole.querier}, 1)), ({TempoRole.querier: 3})), ( ( - ({MimirRole.alertmanager}, 2), - ({MimirRole.alertmanager}, 2), - ({MimirRole.alertmanager, MimirRole.querier}, 1), + ({TempoRole.alertmanager}, 2), + ({TempoRole.alertmanager}, 2), + ({TempoRole.alertmanager, TempoRole.querier}, 1), ), - ({MimirRole.alertmanager: 5, MimirRole.querier: 1}), + ({TempoRole.alertmanager: 5, TempoRole.querier: 1}), ), ), ) def test_role_collection(workers_roles, expected): relations = [] for worker_roles, scale in workers_roles: - data = MimirClusterRequirerAppData(roles=worker_roles).dump() + data = TempoClusterRequirerAppData(roles=worker_roles).dump() relations.append( Relation( - "mimir-cluster-provide", + "tempo-cluster-provide", remote_app_data=data, remote_units_data={i: {} for i in range(scale)}, ) @@ -82,15 +82,15 @@ def test_role_collection(workers_roles, expected): def test_address_collection(workers_addresses): relations = [] topo = {"unit": "foo/0", "model": "bar"} - remote_app_data = MimirClusterRequirerAppData(roles=[MimirRole.alertmanager]).dump() + remote_app_data = TempoClusterRequirerAppData(roles=[TempoRole.alertmanager]).dump() for worker_addresses in workers_addresses: units_data = { - i: MimirClusterRequirerUnitData(address=address, juju_topology=topo).dump() + i: TempoClusterRequirerUnitData(address=address, juju_topology=topo).dump() for i, address in enumerate(worker_addresses) } relations.append( Relation( - "mimir-cluster-provide", + "tempo-cluster-provide", remote_units_data=units_data, remote_app_data=remote_app_data, ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 134a415..630bb80 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -6,7 +6,7 @@ import os import unittest -from charm import MimirCoordinatorK8SOperatorCharm +from charm import TempoCoordinatorK8SOperatorCharm from ops.model import BlockedStatus from ops.testing import Harness @@ -14,7 +14,7 @@ class TestCharm(unittest.TestCase): def setUp(self): os.environ["JUJU_VERSION"] = "3.0.3" - self.harness = Harness(MimirCoordinatorK8SOperatorCharm) + self.harness = Harness(TempoCoordinatorK8SOperatorCharm) self.harness.set_can_connect("nginx", True) self.addCleanup(self.harness.cleanup) self.harness.begin_with_initial_hooks() diff --git a/tests/unit/test_coherence.py b/tests/unit/test_coherence.py index 85087be..ee018bc 100644 --- a/tests/unit/test_coherence.py +++ b/tests/unit/test_coherence.py @@ -1,27 +1,27 @@ from unittest.mock import MagicMock import pytest as pytest -from mimir_coordinator import ( +from tempo_coordinator import ( MINIMAL_DEPLOYMENT, RECOMMENDED_DEPLOYMENT, - MimirCoordinator, - MimirRole, + TempoCoordinator, + TempoRole, ) -def _to_endpoint_name(role: MimirRole): +def _to_endpoint_name(role: TempoRole): return role.value.replace("_", "-") -ALL_MIMIR_RELATION_NAMES = list(map(_to_endpoint_name, MimirRole)) +ALL_TEMPO_RELATION_NAMES = list(map(_to_endpoint_name, TempoRole)) @pytest.mark.parametrize( "roles, expected", ( - ({MimirRole.ruler: 1}, False), - ({MimirRole.distributor: 1}, False), - ({MimirRole.distributor: 1, MimirRole.ingester: 1}, False), + ({TempoRole.ruler: 1}, False), + ({TempoRole.distributor: 1}, False), + ({TempoRole.distributor: 1, TempoRole.ingester: 1}, False), (MINIMAL_DEPLOYMENT, True), (RECOMMENDED_DEPLOYMENT, True), ), @@ -29,16 +29,16 @@ def _to_endpoint_name(role: MimirRole): def test_coherent(roles, expected): mock = MagicMock() mock.gather_roles = MagicMock(return_value=roles) - mc = MimirCoordinator(mock) + mc = TempoCoordinator(mock) assert mc.is_coherent() is expected @pytest.mark.parametrize( "roles, expected", ( - ({MimirRole.ruler: 1}, False), - ({MimirRole.distributor: 1}, False), - ({MimirRole.distributor: 1, MimirRole.ingester: 1}, False), + ({TempoRole.ruler: 1}, False), + ({TempoRole.distributor: 1}, False), + ({TempoRole.distributor: 1, TempoRole.ingester: 1}, False), (MINIMAL_DEPLOYMENT, False), (RECOMMENDED_DEPLOYMENT, True), ), @@ -46,5 +46,5 @@ def test_coherent(roles, expected): def test_recommended(roles, expected): mock = MagicMock() mock.gather_roles = MagicMock(return_value=roles) - mc = MimirCoordinator(mock) + mc = TempoCoordinator(mock) assert mc.is_recommended() is expected diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 8652384..36845de 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -2,15 +2,15 @@ from unittest.mock import MagicMock from deepdiff import DeepDiff -from mimir_config import _S3ConfigData -from mimir_coordinator import MimirCoordinator +from tempo_config import _S3ConfigData +from tempo_coordinator import TempoCoordinator -class TestMimirConfig(unittest.TestCase): +class TestTempoConfig(unittest.TestCase): def setUp(self): self.cluster_provider = MagicMock() self.tls_requirer = MagicMock() - self.coordinator = MimirCoordinator( + self.coordinator = TempoCoordinator( cluster_provider=self.cluster_provider, tls_requirer=self.tls_requirer, ) @@ -61,16 +61,16 @@ def test_build_config_with_s3_data(self): "region": "your_region", } s3_config_data = _S3ConfigData(**raw_s3_config_data) - mimir_config = self.coordinator.build_config(s3_config_data) + tempo_config = self.coordinator.build_config(s3_config_data) self.assertEqual( - mimir_config["common"]["storage"], + tempo_config["common"]["storage"], self.coordinator._build_s3_storage_config(s3_config_data), ) def test_build_config_without_s3_data(self): s3_config_data = None - mimir_config = self.coordinator.build_config(s3_config_data) - self.assertNotIn("storage", mimir_config["common"]) + tempo_config = self.coordinator.build_config(s3_config_data) + self.assertNotIn("storage", tempo_config["common"]) def test_build_s3_storage_config(self): raw_s3_config_data = { @@ -136,15 +136,15 @@ def test_build_tls_config(self): tls_config = self.coordinator._build_tls_config() expected_config = { "http_tls_config": { - "cert_file": "/etc/mimir/server.cert", - "key_file": "/etc/mimir/private.key", - "client_ca_file": "/etc/mimir/ca.cert", + "cert_file": "/etc/tempo/server.cert", + "key_file": "/etc/tempo/private.key", + "client_ca_file": "/etc/tempo/ca.cert", "client_auth_type": "RequestClientCert", }, "grpc_tls_config": { - "cert_file": "/etc/mimir/server.cert", - "key_file": "/etc/mimir/private.key", - "client_ca_file": "/etc/mimir/ca.cert", + "cert_file": "/etc/tempo/server.cert", + "key_file": "/etc/tempo/private.key", + "client_ca_file": "/etc/tempo/ca.cert", "client_auth_type": "RequestClientCert", }, }