From a274b08a5f6bf8ee8c50fbd100939d0b0d1d98de Mon Sep 17 00:00:00 2001
From: Noctua <webops+observability-noctua-bot@canonical.com>
Date: Thu, 5 Dec 2024 12:04:58 +0000
Subject: [PATCH] chore: update charm libraries

---
 .../v0/certificate_transfer.py                |  11 +-
 .../observability_libs/v1/cert_handler.py     |  55 ++-
 .../v1/kubernetes_service_patch.py            | 154 +------
 .../tempo_coordinator_k8s/v0/charm_tracing.py | 398 ++++++++++++++++--
 .../tempo_coordinator_k8s/v0/tracing.py       |  24 +-
 .../v3/tls_certificates.py                    |  30 +-
 6 files changed, 465 insertions(+), 207 deletions(-)

diff --git a/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py b/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py
index caa6e228..72cc9a26 100644
--- a/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py
+++ b/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py
@@ -101,6 +101,7 @@ def _on_certificate_removed(self, event: CertificateRemovedEvent):
 from typing import List, Mapping
 
 from jsonschema import exceptions, validate  # type: ignore[import-untyped]
+from ops import Relation
 from ops.charm import CharmBase, CharmEvents, RelationBrokenEvent, RelationChangedEvent
 from ops.framework import EventBase, EventSource, Handle, Object
 
@@ -112,7 +113,7 @@ def _on_certificate_removed(self, event: CertificateRemovedEvent):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 8
+LIBPATCH = 9
 
 PYDEPS = ["jsonschema"]
 
@@ -391,3 +392,11 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
             None
         """
         self.on.certificate_removed.emit(relation_id=event.relation.id)
+
+    def is_ready(self, relation: Relation) -> bool:
+        """Return True if the app databag of `relation` holds data that passes schema validation."""
+        relation_data = _load_relation_data(relation.data[relation.app])
+        if not self._relation_data_is_valid(relation_data):
+            logger.warning("Provider relation data did not pass JSON Schema validation")
+            return False
+        return True
diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py
index 4a1940b9..7fcc3258 100644
--- a/lib/charms/observability_libs/v1/cert_handler.py
+++ b/lib/charms/observability_libs/v1/cert_handler.py
@@ -32,6 +32,7 @@
 Since this library uses [Juju Secrets](https://juju.is/docs/juju/secret) it requires Juju >= 3.0.3.
 """
 import abc
+import hashlib
 import ipaddress
 import json
 import socket
@@ -67,7 +68,7 @@
 
 LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a"
 LIBAPI = 1
-LIBPATCH = 13
+LIBPATCH = 15
 
 VAULT_SECRET_LABEL = "cert-handler-private-vault"
 
@@ -126,7 +127,7 @@ class _RelationVaultBackend(_VaultBackend):
     _NEST_UNDER = "lib.charms.observability_libs.v1.cert_handler::vault"
     # This key needs to be relation-unique. If someone ever creates multiple Vault(_RelationVaultBackend)
     # instances backed by the same (peer) relation, they'll need to set different _NEST_UNDERs
-    # for each _RelationVaultBackend instance or they'll be fighting over it.
+    # for each _RelationVaultBackend instance, or they'll be fighting over it.
 
     def __init__(self, charm: CharmBase, relation_name: str):
         self.charm = charm
@@ -301,14 +302,11 @@ def __init__(
                 Must match metadata.yaml.
             cert_subject: Custom subject. Name collisions are under the caller's responsibility.
             sans: DNS names. If none are given, use FQDN.
-            refresh_events: an optional list of bound events which
-                will be observed to replace the current CSR with a new one
-                if there are changes in the CSR's DNS SANs or IP SANs.
-                Then, subsequently, replace its corresponding certificate with a new one.
+            refresh_events: [DEPRECATED].
         """
         super().__init__(charm, key)
         # use StoredState to store the hash of the CSR
-        # to potentially trigger a CSR renewal on `refresh_events`
+        # to potentially trigger a CSR renewal
         self._stored.set_default(
             csr_hash=None,
         )
@@ -320,8 +318,9 @@ def __init__(
 
         # Use fqdn only if no SANs were given, and drop empty/duplicate SANs
         sans = list(set(filter(None, (sans or [socket.getfqdn()]))))
-        self.sans_ip = list(filter(is_ip_address, sans))
-        self.sans_dns = list(filterfalse(is_ip_address, sans))
+        # sort SANS lists to avoid unnecessary csr renewals during reconciliation
+        self.sans_ip = sorted(filter(is_ip_address, sans))
+        self.sans_dns = sorted(filterfalse(is_ip_address, sans))
 
         if self._check_juju_supports_secrets():
             vault_backend = _SecretVaultBackend(charm, secret_label=VAULT_SECRET_LABEL)
@@ -345,6 +344,13 @@ def __init__(
             self.charm.on[self.certificates_relation_name].relation_joined,  # pyright: ignore
             self._on_certificates_relation_joined,
         )
+        # The following observer is a workaround. The tls-certificates lib sometimes fails to emit the custom
+        # "certificate_available" event on relation changed. Not sure why this was happening. We certainly have some
+        # tech debt here to address, but this workaround proved to work.
+        self.framework.observe(
+            self.charm.on[self.certificates_relation_name].relation_changed,  # pyright: ignore
+            self._on_certificate_available,
+        )
         self.framework.observe(
             self.certificates.on.certificate_available,  # pyright: ignore
             self._on_certificate_available,
@@ -367,13 +373,15 @@ def __init__(
         )
 
         if refresh_events:
-            for ev in refresh_events:
-                self.framework.observe(ev, self._on_refresh_event)
+            logger.warning(
+                "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary."
+            )
 
-    def _on_refresh_event(self, _):
-        """Replace the latest current CSR with a new one if there are any SANs changes."""
-        if self._stored.csr_hash != self._csr_hash:
-            self._generate_csr(renew=True)
+        self._reconcile()
+
+    def _reconcile(self):
+        """Run all logic that is independent of what event we're processing."""
+        self._refresh_csr_if_needed()
 
     def _on_upgrade_charm(self, _):
         has_privkey = self.vault.get_value("private-key")
@@ -388,6 +396,11 @@ def _on_upgrade_charm(self, _):
             # this will call `self.private_key` which will generate a new privkey.
             self._generate_csr(renew=True)
 
+    def _refresh_csr_if_needed(self):
+        """Refresh the current CSR with a new one if there are any SANs changes."""
+        if self._stored.csr_hash is not None and self._stored.csr_hash != self._csr_hash:
+            self._generate_csr(renew=True)
+
     def _migrate_vault(self):
         peer_backend = _RelationVaultBackend(self.charm, relation_name="peers")
 
@@ -423,7 +436,7 @@ def enabled(self) -> bool:
         See also the `available` property.
         """
         # We need to check for units as a temporary workaround because of https://bugs.launchpad.net/juju/+bug/2024583
-        # This could in theory not work correctly on scale down to 0 but it is necessary for the moment.
+        # This could in theory not work correctly on scale down to 0, but it is necessary for the moment.
 
         if not self.relation:
             return False
@@ -440,13 +453,17 @@ def enabled(self) -> bool:
         return True
 
     @property
-    def _csr_hash(self) -> int:
+    def _csr_hash(self) -> str:
         """A hash of the config that constructs the CSR.
 
         Only include here the config options that, should they change, should trigger a renewal of
         the CSR.
         """
-        return hash(
+
+        def _stable_hash(data):
+            return hashlib.sha256(str(data).encode()).hexdigest()
+
+        return _stable_hash(
             (
                 tuple(self.sans_dns),
                 tuple(self.sans_ip),
@@ -626,7 +643,7 @@ def _on_all_certificates_invalidated(self, _: AllCertificatesInvalidatedEvent) -
         # Note: assuming "limit: 1" in metadata
         # The "certificates_relation_broken" event is converted to "all invalidated" custom
         # event by the tls-certificates library. Per convention, we let the lib manage the
-        # relation and we do not observe "certificates_relation_broken" directly.
+        # relation, and we do not observe "certificates_relation_broken" directly.
         self.vault.clear()
         # We do not generate a CSR here because the relation is gone.
         self.on.cert_changed.emit()  # pyright: ignore
diff --git a/lib/charms/observability_libs/v1/kubernetes_service_patch.py b/lib/charms/observability_libs/v1/kubernetes_service_patch.py
index e85834be..4d37a38d 100644
--- a/lib/charms/observability_libs/v1/kubernetes_service_patch.py
+++ b/lib/charms/observability_libs/v1/kubernetes_service_patch.py
@@ -1,146 +1,13 @@
 # Copyright 2021 Canonical Ltd.
 # See LICENSE file for licensing details.
 
-"""# KubernetesServicePatch Library.
-
-This library is designed to enable developers to more simply patch the Kubernetes Service created
-by Juju during the deployment of a sidecar charm. When sidecar charms are deployed, Juju creates a
-service named after the application in the namespace (named after the Juju model). This service by
-default contains a "placeholder" port, which is 65535/TCP.
-
-When modifying the default set of resources managed by Juju, one must consider the lifecycle of the
-charm. In this case, any modifications to the default service (created during deployment), will be
-overwritten during a charm upgrade.
-
-When initialised, this library binds a handler to the parent charm's `install` and `upgrade_charm`
-events which applies the patch to the cluster. This should ensure that the service ports are
-correct throughout the charm's life.
-
-The constructor simply takes a reference to the parent charm, and a list of
-[`lightkube`](https://github.com/gtsystem/lightkube) ServicePorts that each define a port for the
-service. For information regarding the `lightkube` `ServicePort` model, please visit the
-`lightkube` [docs](https://gtsystem.github.io/lightkube-models/1.23/models/core_v1/#serviceport).
-
-Optionally, a name of the service (in case service name needs to be patched as well), labels,
-selectors, and annotations can be provided as keyword arguments.
-
-## Getting Started
-
-To get started using the library, you just need to fetch the library using `charmcraft`. **Note
-that you also need to add `lightkube` and `lightkube-models` to your charm's `requirements.txt`.**
-
-```shell
-cd some-charm
-charmcraft fetch-lib charms.observability_libs.v1.kubernetes_service_patch
-cat << EOF >> requirements.txt
-lightkube
-lightkube-models
-EOF
-```
-
-Then, to initialise the library:
-
-For `ClusterIP` services:
-
-```python
-# ...
-from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
-from lightkube.models.core_v1 import ServicePort
-
-class SomeCharm(CharmBase):
-  def __init__(self, *args):
-    # ...
-    port = ServicePort(443, name=f"{self.app.name}")
-    self.service_patcher = KubernetesServicePatch(self, [port])
-    # ...
-```
-
-For `LoadBalancer`/`NodePort` services:
-
-```python
-# ...
-from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
-from lightkube.models.core_v1 import ServicePort
-
-class SomeCharm(CharmBase):
-  def __init__(self, *args):
-    # ...
-    port = ServicePort(443, name=f"{self.app.name}", targetPort=443, nodePort=30666)
-    self.service_patcher = KubernetesServicePatch(
-        self, [port], "LoadBalancer"
-    )
-    # ...
-```
-
-Port protocols can also be specified. Valid protocols are `"TCP"`, `"UDP"`, and `"SCTP"`
-
-```python
-# ...
-from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
-from lightkube.models.core_v1 import ServicePort
-
-class SomeCharm(CharmBase):
-  def __init__(self, *args):
-    # ...
-    tcp = ServicePort(443, name=f"{self.app.name}-tcp", protocol="TCP")
-    udp = ServicePort(443, name=f"{self.app.name}-udp", protocol="UDP")
-    sctp = ServicePort(443, name=f"{self.app.name}-sctp", protocol="SCTP")
-    self.service_patcher = KubernetesServicePatch(self, [tcp, udp, sctp])
-    # ...
-```
-
-Bound with custom events by providing `refresh_event` argument:
-For example, you would like to have a configurable port in your charm and want to apply
-service patch every time charm config is changed.
-
-```python
-from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
-from lightkube.models.core_v1 import ServicePort
-
-class SomeCharm(CharmBase):
-  def __init__(self, *args):
-    # ...
-    port = ServicePort(int(self.config["charm-config-port"]), name=f"{self.app.name}")
-    self.service_patcher = KubernetesServicePatch(
-        self,
-        [port],
-        refresh_event=self.on.config_changed
-    )
-    # ...
-```
-
-Creating a new k8s lb service instead of patching the one created by juju
-Service name is optional. If not provided, it defaults to {app_name}-lb.
-If provided and equal to app_name, it also defaults to {app_name}-lb to prevent conflicts with the Juju default service.
-```python
-from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
-from lightkube.models.core_v1 import ServicePort
-
-class SomeCharm(CharmBase):
-  def __init__(self, *args):
-    # ...
-    port = ServicePort(int(self.config["charm-config-port"]), name=f"{self.app.name}")
-    self.service_patcher = KubernetesServicePatch(
-        self,
-        [port],
-        service_type="LoadBalancer",
-        service_name="application-lb"
-    )
-    # ...
-```
-
-Additionally, you may wish to use mocks in your charm's unit testing to ensure that the library
-does not try to make any API calls, or open any files during testing that are unlikely to be
-present, and could break your tests. The easiest way to do this is during your test `setUp`:
-
-```python
-# ...
-
-@patch("charm.KubernetesServicePatch", lambda x, y: None)
-def setUp(self, *unused):
-    self.harness = Harness(SomeCharm)
-    # ...
-```
+"""# [DEPRECATED!] KubernetesServicePatch Library.
+
+The `kubernetes_service_patch` library is DEPRECATED and will be removed in October 2025.
+
+For patching the Kubernetes service created by Juju during the deployment of a charm,
+`ops.Unit.set_ports` functionality should be used instead.
+
 """
 
 import logging
@@ -167,7 +34,7 @@ def setUp(self, *unused):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 12
+LIBPATCH = 13
 
 ServiceType = Literal["ClusterIP", "LoadBalancer"]
 
@@ -205,6 +72,11 @@ def __init__(
                 will be observed to re-apply the patch (e.g. on port change).
                 The `install` and `upgrade-charm` events would be observed regardless.
         """
+        logger.warning(
+            "The ``kubernetes_service_patch v1`` library is DEPRECATED and will be removed "
+            "in October 2025. For patching the Kubernetes service created by Juju during "
+            "the deployment of a charm, ``ops.Unit.set_ports`` functionality should be used instead."
+        )
         super().__init__(charm, "kubernetes-service-patch")
         self.charm = charm
         self.service_name = service_name or self._app
diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
index 2604c39e..cf8def11 100644
--- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
+++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
@@ -69,6 +69,9 @@ def my_tracing_endpoint(self) -> Optional[str]:
     - every event as a span (including custom events)
     - every charm method call (except dunders) as a span
 
+We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests
+go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down.
+
 
 ## TLS support
 If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm),
@@ -114,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer:
 See the official opentelemetry Python SDK documentation for usage:
 https://opentelemetry-python.readthedocs.io/en/latest/
 
+
+## Caching traces
+The `trace_charm` machinery will buffer any traces collected during charm execution and store them
+to a file on the charm container until a tracing backend becomes available. At that point, it will
+flush them to the tracing receiver.
+
+By default, the buffer is configured to start dropping old traces if any of these conditions apply:
+
+- the storage size exceeds 10 MiB
+- the number of buffered events exceeds 100
+
+You can configure this by, for example:
+
+```python
+@trace_charm(
+    tracing_endpoint="my_tracing_endpoint",
+    server_cert="_server_cert",
+    # only cache up to 42 events
+    buffer_max_events=42,
+    # only cache up to 42 MiB
+    buffer_max_size_mib=42,  # minimum 10!
+)
+class MyCharm(CharmBase):
+    ...
+```
+
+Note that setting `buffer_max_events` to 0 will effectively disable the buffer.
+
+The path of the buffer file is by default in the charm's execution root, which for k8s charms means
+that in case of pod churn, the cache will be lost. The recommended solution is to use an existing storage
+(or add a new one) such as:
+
+```yaml
+storage:
+  data:
+    type: filesystem
+    location: /charm-traces
+```
+
+and then configure the `@trace_charm` decorator to use it as path for storing the buffer:
+```python
+@trace_charm(
+    tracing_endpoint="my_tracing_endpoint",
+    server_cert="_server_cert",
+    # store traces to a PVC so they're not lost on pod restart.
+    buffer_path="/charm-traces/buffer.file",
+)
+class MyCharm(CharmBase):
+    ...
+```
+
 ## Upgrading from `v0`
 
 If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already
@@ -171,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]:
 3) If you were passing a certificate (str) using `server_cert`, you need to change it to
 provide an *absolute* path to the certificate file instead.
 """
+import typing
+
+from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import (
+    encode_spans,
+)
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
 
 
 def _remove_stale_otel_sdk_packages():
@@ -222,6 +282,9 @@ def _remove_stale_otel_sdk_packages():
     otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ")
 
 
+# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm.
+# it could be trouble if someone ever decides to implement their own tracer parallel to
+# ours and before the charm has inited. We assume they won't.
 _remove_stale_otel_sdk_packages()
 
 import functools
@@ -235,6 +298,7 @@ def _remove_stale_otel_sdk_packages():
     Any,
     Callable,
     Generator,
+    List,
     Optional,
     Sequence,
     Type,
@@ -247,8 +311,12 @@ def _remove_stale_otel_sdk_packages():
 import ops
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import Span, TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    SpanExporter,
+    SpanExportResult,
+)
 from opentelemetry.trace import INVALID_SPAN, Tracer
 from opentelemetry.trace import get_current_span as otlp_get_current_span
 from opentelemetry.trace import (
@@ -269,7 +337,7 @@ def _remove_stale_otel_sdk_packages():
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
 
-LIBPATCH = 1
+LIBPATCH = 4
 
 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"]
 
@@ -277,7 +345,7 @@ def _remove_stale_otel_sdk_packages():
 dev_logger = logging.getLogger("tracing-dev")
 
 # set this to 0 if you are debugging/developing this library source
-dev_logger.setLevel(logging.CRITICAL)
+dev_logger.setLevel(logging.ERROR)
 
 _CharmType = Type[CharmBase]  # the type CharmBase and any subclass thereof
 _C = TypeVar("_C", bound=_CharmType)
@@ -287,6 +355,186 @@ def _remove_stale_otel_sdk_packages():
 _GetterType = Union[Callable[[_CharmType], Optional[str]], property]
 
 CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED"
+BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw"
+# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in
+# any portable format. Json dumping is supported, but loading isn't.
+# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003
+
+BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10
+_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10
+BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100
+_MiB_TO_B = 2**20  # mebibyte to byte conversion factor
+_OTLP_SPAN_EXPORTER_TIMEOUT = 1
+"""Timeout in seconds that the OTLP span exporter has to push traces to the backend."""
+
+
+class _Buffer:
+    """Handles buffering for spans emitted while no tracing backend is configured or available.
+
+    Use the buffer_max_events and buffer_max_size_mib params of @trace_charm to tune
+    the amount of storage that this will hog on your units.
+
+    The buffer file is a bespoke byte dump: serialized span batches, oldest first, joined by
+    _SPANSEP. We cannot store them as json because that is not well-supported by the sdk
+    (see https://github.com/open-telemetry/opentelemetry-python/issues/3364).
+    """
+
+    _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__"
+
+    def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int):
+        self._db_file = db_file
+        self._max_event_history_length = max_event_history_length
+        self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN)
+
+        # set by caller
+        self.exporter: Optional[OTLPSpanExporter] = None
+
+    def save(self, spans: typing.Sequence[ReadableSpan]):
+        """Save the spans to the cache file, dropping the oldest entries if over the history limit.
+
+        This method should be as fail-safe as possible.
+        """
+        if self._max_event_history_length < 1:
+            dev_logger.debug("buffer disabled: max history length < 1")
+            return
+
+        current_history_length = len(self.load())
+        new_history_length = current_history_length + len(spans)  # NOTE(review): mixes entry and span counts
+        if (overflow := new_history_length - self._max_event_history_length) > 0:
+            self.drop(overflow)  # drop() takes a positive count of oldest entries to discard
+        self._save(spans)
+
+    def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes:
+        # encode the spans, then serialize the encoded message to raw bytes
+        return encode_spans(spans).SerializeToString()
+
+    def _save(self, spans: Sequence[ReadableSpan], replace: bool = False):
+        dev_logger.debug(f"saving {len(spans)} new spans to buffer")
+        old = [] if replace else self.load()
+        new = self._serialize(spans)
+
+        try:
+            # if the buffer exceeds the size limit, we start dropping old spans until it fits.
+            # the new batch goes last, after a separator, so that load() can tell entries apart
+            while len(self._SPANSEP.join([*old, new])) > (self._max_buffer_size_mib * _MiB_TO_B):
+                if not old:
+                    # if we've already dropped all spans and still we can't get under the
+                    # size limit, we can't save this span
+                    logger.error(
+                        f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); "
+                        f"buffering FAILED"
+                    )
+                    return
+
+                old = old[1:]
+                logger.warning(
+                    f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... "
+                    f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed."
+                )
+
+            self._db_file.write_bytes(self._SPANSEP.join([*old, new]))
+        except Exception:
+            logger.exception("error buffering spans")
+
+    def load(self) -> List[bytes]:
+        """Load currently buffered spans from the cache file, oldest first.
+
+        This method should be as fail-safe as possible.
+        """
+        if not self._db_file.exists():
+            dev_logger.debug("buffer file not found. buffer empty.")
+            return []
+        try:
+            # an empty file splits into [b""]: skip empty chunks so they don't count as entries
+            return [s for s in self._db_file.read_bytes().split(self._SPANSEP) if s]
+        except Exception:
+            logger.exception(f"error parsing {self._db_file}")
+            return []
+
+    def drop(self, n_spans: Optional[int] = None):
+        """Drop the n_spans oldest buffered entries from the cache file (all of them if None)."""
+        current = self.load()
+        if n_spans is None:
+            dev_logger.debug("emptying buffer")
+            new = []
+        else:
+            dev_logger.debug(f"dropping {n_spans} spans from buffer")
+            new = current[n_spans:]
+
+        self._db_file.write_bytes(self._SPANSEP.join(new))
+
+    def flush(self) -> Optional[bool]:
+        """Export all buffered spans to the given exporter, then clear the buffer.
+
+        Returns whether the flush succeeded (stops at first failure); None if nothing to flush.
+        """
+        if not self.exporter:
+            dev_logger.debug("no exporter set; skipping buffer flush")
+            return False
+
+        buffered_spans = self.load()
+        if not buffered_spans:
+            dev_logger.debug("nothing to flush; buffer empty")
+            return None
+
+        errors = False
+        for span in buffered_spans:
+            try:
+                out = self.exporter._export(span)  # type: ignore
+                errors = not (200 <= out.status_code < 300)  # take any 2xx status code as a success
+            except ConnectionError:
+                dev_logger.debug(
+                    "failed exporting buffered span; backend might be down or still starting"
+                )
+                errors = True
+            except Exception:
+                logger.exception("unexpected error while flushing span batch from buffer")
+                errors = True
+            if errors:
+                # the whole buffer is kept on any failure, so don't keep hitting a failing backend
+                break
+        if not errors:
+            self.drop()
+        else:
+            logger.error("failed flushing spans; buffer preserved")
+        return not errors
+
+    @property
+    def is_empty(self):
+        """Utility to check whether the buffer has any stored spans.
+
+        This is more efficient than attempting a load() given how large the buffer might be.
+        """
+        return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0)
+
+
+class _OTLPSpanExporter(OTLPSpanExporter):
+    """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster."""
+
+    # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured,
+    # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't
+    # flush the traces and keeps retrying for 'too long'
+
+    _MAX_RETRY_TIMEOUT = 4
+    # we give the exporter 4 seconds in total to succeed pushing the traces to tempo
+    # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk.
+    # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait
+
+
+class _BufferedExporter(InMemorySpanExporter):
+    def __init__(self, buffer: _Buffer) -> None:
+        super().__init__()
+        self._buffer = buffer  # every batch passed to export() is also saved here
+
+    def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult:
+        self._buffer.save(spans)  # persist to the buffer before delegating to the parent exporter
+        return super().export(spans)
+
+    def force_flush(self, timeout_millis: int = 0) -> bool:
+        # parent implementation is fake, so the timeout_millis arg is not doing anything.
+        result = super().force_flush(timeout_millis)
+        self._buffer.save(self.get_finished_spans())  # NOTE(review): may re-save spans already saved by export()
+        return result
 
 
 def is_enabled() -> bool:
@@ -371,10 +619,6 @@ class UntraceableObjectError(TracingError):
     """Raised when an object you're attempting to instrument cannot be autoinstrumented."""
 
 
-class TLSError(TracingError):
-    """Raised when the tracing endpoint is https but we don't have a cert yet."""
-
-
 def _get_tracing_endpoint(
     tracing_endpoint_attr: str,
     charm_instance: object,
@@ -427,7 +671,10 @@ def _setup_root_span_initializer(
     charm_type: _CharmType,
     tracing_endpoint_attr: str,
     server_cert_attr: Optional[str],
-    service_name: Optional[str] = None,
+    service_name: Optional[str],
+    buffer_path: Optional[Path],
+    buffer_max_events: int,
+    buffer_max_size_mib: int,
 ):
     """Patch the charm's initializer."""
     original_init = charm_type.__init__
@@ -446,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
             logger.info("Tracing DISABLED: skipping root span initialization")
             return
 
-        # already init some attrs that will be reinited later by calling original_init:
-        # self.framework = framework
-        # self.handle = Handle(None, self.handle_kind, None)
-
         original_event_context = framework._event_context
         # default service name isn't just app name because it could conflict with the workload service name
         _service_name = service_name or f"{self.app.name}-charm"
 
         unit_name = self.unit.name
-        # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm.
-        # it could be trouble if someone ever decides to implement their own tracer parallel to
-        # ours and before the charm has inited. We assume they won't.
         resource = Resource.create(
             attributes={
                 "service.name": _service_name,
@@ -475,28 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
         # if anything goes wrong with retrieving the endpoint, we let the exception bubble up.
         tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type)
 
+        buffer_only = False
+        # whether we're only exporting to buffer, or also to the otlp exporter.
+
         if not tracing_endpoint:
             # tracing is off if tracing_endpoint is None
-            return
+            # however we can buffer things until tracing comes online
+            buffer_only = True
 
         server_cert: Optional[Union[str, Path]] = (
             _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None
         )
 
-        if tracing_endpoint.startswith("https://") and not server_cert:
-            raise TLSError(
+        if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert:
+            logger.error(
                 "Tracing endpoint is https, but no server_cert has been passed."
-                "Please point @trace_charm to a `server_cert` attr."
+                "Please point @trace_charm to a `server_cert` attr. "
+                "This might also mean that the tracing provider is related to a "
+                "certificates provider, but this application is not (yet). "
+                "In that case, you might just have to wait a bit for the certificates "
+                "integration to settle. This span will be buffered."
             )
+            buffer_only = True
 
-        exporter = OTLPSpanExporter(
-            endpoint=tracing_endpoint,
-            certificate_file=str(Path(server_cert).absolute()) if server_cert else None,
-            timeout=2,
+        buffer = _Buffer(
+            db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME,
+            max_event_history_length=buffer_max_events,
+            max_buffer_size_mib=buffer_max_size_mib,
         )
+        previous_spans_buffered = not buffer.is_empty
+
+        exporters: List[SpanExporter] = []
+        if buffer_only:
+            # we have to buffer because we're missing necessary backend configuration
+            dev_logger.debug("buffering mode: ON")
+            exporters.append(_BufferedExporter(buffer))
+
+        else:
+            dev_logger.debug("buffering mode: FALLBACK")
+            # in principle, we have the right configuration to be pushing traces,
+            # but if we fail for whatever reason, we will put everything in the buffer
+            # and retry the next time
+            otlp_exporter = _OTLPSpanExporter(
+                endpoint=tracing_endpoint,
+                certificate_file=str(Path(server_cert).absolute()) if server_cert else None,
+                timeout=_OTLP_SPAN_EXPORTER_TIMEOUT,  # give individual requests 1 second to succeed
+            )
+            exporters.append(otlp_exporter)
+            exporters.append(_BufferedExporter(buffer))
+            buffer.exporter = otlp_exporter
+
+        for exporter in exporters:
+            processor = BatchSpanProcessor(exporter)
+            provider.add_span_processor(processor)
 
-        processor = BatchSpanProcessor(exporter)
-        provider.add_span_processor(processor)
         set_tracer_provider(provider)
         _tracer = get_tracer(_service_name)  # type: ignore
         _tracer_token = tracer.set(_tracer)
@@ -520,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
 
         @contextmanager
         def wrap_event_context(event_name: str):
-            dev_logger.info(f"entering event context: {event_name}")
+            dev_logger.debug(f"entering event context: {event_name}")
             # when the framework enters an event context, we create a span.
             with _span("event: " + event_name) as event_context_span:
                 if event_context_span:
@@ -534,12 +806,50 @@ def wrap_event_context(event_name: str):
 
         @functools.wraps(original_close)
         def wrap_close():
-            dev_logger.info("tearing down tracer and flushing traces")
+            dev_logger.debug("tearing down tracer and flushing traces")
             span.end()
             opentelemetry.context.detach(span_token)  # type: ignore
             tracer.reset(_tracer_token)
             tp = cast(TracerProvider, get_tracer_provider())
-            tp.force_flush(timeout_millis=1000)  # don't block for too long
+            flush_successful = tp.force_flush(timeout_millis=1000)  # don't block for too long
+
+            if buffer_only:
+                # if we're in buffer_only mode, it means we couldn't even set up the exporter for
+                # tempo as we're missing some data.
+                # so attempting to flush the buffer doesn't make sense
+                dev_logger.debug("tracing backend unavailable: all spans pushed to buffer")
+
+            else:
+                dev_logger.debug("tracing backend found: attempting to flush buffer...")
+
+                # if we do have an exporter for tempo, and we could send traces to it,
+                # we can attempt to flush the buffer as well.
+                if not flush_successful:
+                    logger.error("flushing FAILED: unable to push traces to backend.")
+                else:
+                    dev_logger.debug("flush succeeded.")
+
+                    # the backend has accepted the spans generated during this event,
+                    if not previous_spans_buffered:
+                        # if the buffer was empty to begin with, any spans we collected now can be discarded
+                        buffer.drop()
+                        dev_logger.debug("buffer dropped: this trace has been sent already")
+                    else:
+                        # if the buffer was nonempty, we can attempt to flush it
+                        dev_logger.debug("attempting buffer flush...")
+                        buffer_flush_successful = buffer.flush()
+                        if buffer_flush_successful:
+                            dev_logger.debug("buffer flush OK")
+                        elif buffer_flush_successful is None:
+                            # TODO is this even possible?
+                            dev_logger.debug("buffer flush OK; empty: nothing to flush")
+                        else:
+                            # this situation is pretty weird, I'm not even sure it can happen,
+                            # because it would mean that we did manage
+                            # to push traces directly to the tempo exporter (flush_successful),
+                            # but the buffer flush failed to push to the same exporter!
+                            logger.error("buffer flush FAILED")
+
             tp.shutdown()
             original_close()
 
@@ -554,6 +864,9 @@ def trace_charm(
     server_cert: Optional[str] = None,
     service_name: Optional[str] = None,
     extra_types: Sequence[type] = (),
+    buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH,
+    buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB,
+    buffer_path: Optional[Union[str, Path]] = None,
 ) -> Callable[[_T], _T]:
     """Autoinstrument the decorated charm with tracing telemetry.
 
@@ -595,6 +908,10 @@ def trace_charm(
         Defaults to the juju application name this charm is deployed under.
     :param extra_types: pass any number of types that you also wish to autoinstrument.
         For example, charm libs, relation endpoint wrappers, workload abstractions, ...
+    :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering.
+    :param buffer_max_size_mib: max size of the buffer file, in MiB. When exceeded, spans will
+        be dropped. Minimum 10MiB.
+    :param buffer_path: path to buffer file to use for saving buffered spans.
     """
 
     def _decorator(charm_type: _T) -> _T:
@@ -605,6 +922,9 @@ def _decorator(charm_type: _T) -> _T:
             server_cert_attr=server_cert,
             service_name=service_name,
             extra_types=extra_types,
+            buffer_path=Path(buffer_path) if buffer_path else None,
+            buffer_max_size_mib=buffer_max_size_mib,
+            buffer_max_events=buffer_max_events,
         )
         return charm_type
 
@@ -617,6 +937,9 @@ def _autoinstrument(
     server_cert_attr: Optional[str] = None,
     service_name: Optional[str] = None,
     extra_types: Sequence[type] = (),
+    buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH,
+    buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB,
+    buffer_path: Optional[Path] = None,
 ) -> _T:
     """Set up tracing on this charm class.
 
@@ -649,13 +972,20 @@ def _autoinstrument(
         Defaults to the juju application name this charm is deployed under.
     :param extra_types: pass any number of types that you also wish to autoinstrument.
         For example, charm libs, relation endpoint wrappers, workload abstractions, ...
+    :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering.
+    :param buffer_max_size_mib: max size of the buffer file, in MiB. When exceeded, spans will
+        be dropped. Minimum 10MiB.
+    :param buffer_path: path to buffer file to use for saving buffered spans.
     """
-    dev_logger.info(f"instrumenting {charm_type}")
+    dev_logger.debug(f"instrumenting {charm_type}")
     _setup_root_span_initializer(
         charm_type,
         tracing_endpoint_attr,
         server_cert_attr=server_cert_attr,
         service_name=service_name,
+        buffer_path=buffer_path,
+        buffer_max_events=buffer_max_events,
+        buffer_max_size_mib=buffer_max_size_mib,
     )
     trace_type(charm_type)
     for type_ in extra_types:
@@ -671,12 +1001,12 @@ def trace_type(cls: _T) -> _T:
     It assumes that this class is only instantiated after a charm type decorated with `@trace_charm`
     has been instantiated.
     """
-    dev_logger.info(f"instrumenting {cls}")
+    dev_logger.debug(f"instrumenting {cls}")
     for name, method in inspect.getmembers(cls, predicate=inspect.isfunction):
-        dev_logger.info(f"discovered {method}")
+        dev_logger.debug(f"discovered {method}")
 
         if method.__name__.startswith("__"):
-            dev_logger.info(f"skipping {method} (dunder)")
+            dev_logger.debug(f"skipping {method} (dunder)")
             continue
 
         # the span title in the general case should be:
@@ -722,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F:
 
 
 def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F:
-    dev_logger.info(f"instrumenting {callable}")
+    dev_logger.debug(f"instrumenting {callable}")
 
     # sig = inspect.signature(callable)
     @functools.wraps(callable)
diff --git a/lib/charms/tempo_coordinator_k8s/v0/tracing.py b/lib/charms/tempo_coordinator_k8s/v0/tracing.py
index 4af379a5..2035dffd 100644
--- a/lib/charms/tempo_coordinator_k8s/v0/tracing.py
+++ b/lib/charms/tempo_coordinator_k8s/v0/tracing.py
@@ -34,7 +34,7 @@ def __init__(self, *args):
 `TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method.
 Using this method also allows you to use per-relation protocols.
 
-Units of provider charms obtain the tempo endpoint to which they will push their traces by calling
+Units of requirer charms obtain the tempo endpoint to which they will push their traces by calling
 `TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example:
 - `otlp_grpc`
 - `otlp_http`
@@ -44,7 +44,10 @@ def __init__(self, *args):
 If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time,
 the library will raise an error.
 
-## Requirer Library Usage
+We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests
+go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down.
+
+## Provider Library Usage
 
 The `TracingEndpointProvider` object may be used by charms to manage relations with their
 trace sources. For this purposes a Tempo-like charm needs to do two things
@@ -107,7 +110,7 @@ def __init__(self, *args):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 1
+LIBPATCH = 3
 
 PYDEPS = ["pydantic"]
 
@@ -985,11 +988,16 @@ def charm_tracing_config(
     is_https = endpoint.startswith("https://")
 
     if is_https:
-        if cert_path is None:
-            raise TracingError("Cannot send traces to an https endpoint without a certificate.")
-        elif not Path(cert_path).exists():
-            # if endpoint is https BUT we don't have a server_cert yet:
-            # disable charm tracing until we do to prevent tls errors
+        if cert_path is None or not Path(cert_path).exists():
+            # disable charm tracing until we obtain a cert to prevent tls errors
+            logger.error(
+                "Tracing endpoint is https, but no server_cert has been passed."
+                "Please point @trace_charm to a `server_cert` attr. "
+                "This might also mean that the tracing provider is related to a "
+                "certificates provider, but this application is not (yet). "
+                "In that case, you might just have to wait a bit for the certificates "
+                "integration to settle. "
+            )
             return None, None
         return endpoint, str(cert_path)
     else:
diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
index da7fa95e..141412b0 100644
--- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py
+++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 20
+LIBPATCH = 23
 
 PYDEPS = ["cryptography", "jsonschema"]
 
@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
                     )
                 else:
                     try:
+                        secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
                         logger.debug(
                             "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
                         )
-                        secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
+                        # Juju < 3.6 will create a new revision even if the content is the same
+                        if (
+                            secret.get_content(refresh=True).get("certificate", "")
+                            == certificate.certificate
+                        ):
+                            logger.debug(
+                                "Secret %s with correct certificate already exists",
+                                f"{LIBID}-{csr_in_sha256_hex}",
+                            )
+                            continue
                         secret.set_content(
                             {"certificate": certificate.certificate, "csr": certificate.csr}
                         )
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
         provider_certificate = self._find_certificate_in_relation_data(csr)
         if not provider_certificate:
             # A secret expired but we did not find matching certificate. Cleaning up
+            logger.warning(
+                "Failed to find matching certificate for csr, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
         if not provider_certificate.expiry_time:
             # A secret expired but matching certificate is invalid. Cleaning up
+            logger.warning(
+                "Certificate matching csr is invalid, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
             return provider_certificate
         return None
 
-    def _get_csr_from_secret(self, secret: Secret) -> str:
+    def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
         """Extract the CSR from the secret label or content.
 
         This function is a workaround to maintain backwards compatibility
         and fix the issue reported in
         https://github.com/canonical/tls-certificates-interface/issues/228
         """
-        if not (csr := secret.get_content().get("csr", "")):
+        try:
+            content = secret.get_content(refresh=True)
+        except SecretNotFoundError:
+            return None
+        if not (csr := content.get("csr", None)):
             # In versions <14 of the Lib we were storing the CSR in the label of the secret
             # The CSR now is stored int the content of the secret, which was a breaking change
             # Here we get the CSR if the secret was created by an app using libpatch 14 or lower