From 1cd66e314c81f551acef34a7df95ef57f1bdb97c Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Thu, 24 Oct 2024 17:13:28 -0400 Subject: [PATCH 1/6] Add terraform module --- terraform/README.md | 4 ++++ terraform/main.tf | 12 ++++++++++++ terraform/outputs.tf | 24 ++++++++++++++++++++++++ terraform/variables.tf | 42 ++++++++++++++++++++++++++++++++++++++++++ terraform/versions.tf | 9 +++++++++ 5 files changed, 91 insertions(+) create mode 100644 terraform/README.md create mode 100644 terraform/main.tf create mode 100644 terraform/outputs.tf create mode 100644 terraform/variables.tf create mode 100644 terraform/versions.tf diff --git a/terraform/README.md b/terraform/README.md new file mode 100644 index 0000000..a66a8a6 --- /dev/null +++ b/terraform/README.md @@ -0,0 +1,4 @@ +# Terraform module for grafana-agent-k8s + + +This module is in experimental status. It is not yet ready for production. diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 0000000..6628dbb --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,12 @@ +resource "juju_application" "grafana_agent" { + name = var.app_name + model = var.model_name + trust = true + charm { + name = "grafana-agent-k8s" + channel = var.channel + revision = var.revision + } + units = var.units + config = var.config +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf new file mode 100644 index 0000000..6b9442c --- /dev/null +++ b/terraform/outputs.tf @@ -0,0 +1,24 @@ +output "app_name" { + value = juju_application.grafana_agent.name +} + +output "requires" { + value = { + certificates = "certificates", + send_remote_write = "send-remote-write", + metrics_endpoint = "metrics-endpoint", + logging_consumer = "logging-consumer", + grafana_dashboards_consumer = "grafana-dashboards-consumer", + grafana_cloud_config = "grafana-cloud-config", + receive_ca_cert = "receive-ca-cert", + tracing = "tracing", + } +} + +output "provides" { + value = { + tracing_provider = "tracing-provider", + logging_provider = "logging-provider", + grafana_dashboards_provider = "grafana-dashboards-provider", + } +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000..ac8d649 --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,42 @@ +variable "app_name" { + description = "Application name" + type = string +} + +variable "channel" { + description = "Charm channel" + type = string + default = "latest/stable" +} + +variable "config" { + description = "Config options as in the ones we pass in juju config" + type = map(string) + default = {} +} + +# We use constraints to set AntiAffinity in K8s +# https://discourse.charmhub.io/t/pod-priority-and-affinity-in-juju-charms/4091/13?u=jose +variable "constraints" { + description = "Constraints to be applied" + type = string + default = "" +} + +variable "model_name" { + description = "Model name" + type = string +} + +variable "revision" { + description = "Charm revision" + type = number + nullable = true + default = null +} + +variable "units" { + description = "Number of units" + type = number + default = 1 +} \ No newline at end of file diff --git a/terraform/versions.tf b/terraform/versions.tf new file mode 100644 index 0000000..77b6440 --- /dev/null +++ b/terraform/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_version = ">= 1.5" + required_providers { + juju = { + source = "juju/juju" + version = "~> 0.14" + } + } +} \ No newline at end of file From ab166b146638d0067a8a6c83f48c7f64678f0854 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 12 Nov 2024 18:15:42 -0300 Subject: [PATCH 2/6] improve README --- terraform/README.md | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/terraform/README.md b/terraform/README.md index a66a8a6..acdae36 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -1,4 +1,38 @@ -# Terraform module for grafana-agent-k8s +# Terraform module for grafana-agent -This module is in experimental status. It is not yet ready for production. +This is a Terraform module facilitating the deployment of the grafana-agent-k8s charm, using the [Terraform juju provider](https://github.com/juju/terraform-provider-juju/). For more information, refer to the provider [documentation](https://registry.terraform.io/providers/juju/juju/latest/docs). + + +## Requirements +This module requires a `juju` model to be available. Refer to the [usage section](#usage) below for more details. + +## API + +### Inputs +The module offers the following configurable inputs: + +| Name | Type | Description | Default | +| - | - | - | - | +| `app_name`| string | Application name | | +| `channel`| string | Channel that the charm is deployed from | latest/stable | +| `config`| map(string) | Map of the charm configuration options | {} | +| `constraints`| string | Constraints for the Juju deployment | "" | +| `model_name`| string | Name of the model that the charm is deployed on | | +| `revision`| number | Revision number of the charm | null | +| `units`| number | Number of units to deploy | 1 | + +### Outputs +Once applied, the module exports the following outputs: + +| Name | Description | +| - | - | +| `app_name`| Application name | +| `provides`| Map of `provides` endpoints | +| `requires`| Map of `requires` endpoints | + +## Usage + +Users should ensure that Terraform is aware of the `juju_model` dependency of the charm module, for example:
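+For illustration, a minimal root module wiring this module to a Juju model might look like the sketch below (the module `source` path, the model name, and the application name are placeholders to adapt to your setup):
+
+```hcl
+# Declare the Juju model the charm will be deployed into.
+resource "juju_model" "observability" {
+  name = "observability"
+}
+
+module "grafana_agent" {
+  # Path to this module; adjust to wherever the module lives in your repository.
+  source = "./terraform"
+
+  # Referencing the resource (rather than a hard-coded string) makes the
+  # dependency explicit, so Terraform creates the model before the charm.
+  model_name = juju_model.observability.name
+
+  app_name = "grafana-agent"
+  channel  = "latest/stable"
+  units    = 1
+}
+```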
+ +To deploy this module with its needed dependency, you can run `terraform apply -var="model_name=" -auto-approve` From 95b1538142df57b24b703e73cfbf806529a267f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 12 Nov 2024 18:15:51 -0300 Subject: [PATCH 3/6] add terraform ignores to .gitignore --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index b8ae2d8..09a8664 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,13 @@ __pycache__/ tests/integration/*-tester/lib/ .env cos-tool* + +# Terraform +*.tfstate +*.tfstate.* +*.tfplan +.terraform.lock.hcl +crash.log +.terraform/ +terraform.tfvars +terraform.tfvars.json \ No newline at end of file From 03ff3b8762cd3eb5406445e16ca55906e6f824a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 12 Nov 2024 18:16:21 -0300 Subject: [PATCH 4/6] update libs --- .../observability_libs/v1/cert_handler.py | 42 +- .../tempo_coordinator_k8s/v0/charm_tracing.py | 389 ++++++- .../tempo_coordinator_k8s/v0/tracing.py | 9 +- lib/charms/tempo_k8s/v1/charm_tracing.py | 759 +------------ lib/charms/tempo_k8s/v2/tracing.py | 1001 +---------------- .../v3/tls_certificates.py | 30 +- 6 files changed, 448 insertions(+), 1782 deletions(-) diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py index 4a1940b..26be879 100644 --- a/lib/charms/observability_libs/v1/cert_handler.py +++ b/lib/charms/observability_libs/v1/cert_handler.py @@ -32,6 +32,7 @@ Since this library uses [Juju Secrets](https://juju.is/docs/juju/secret) it requires Juju >= 3.0.3. """ import abc +import hashlib import ipaddress import json import socket @@ -67,7 +68,7 @@ LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a" LIBAPI = 1 -LIBPATCH = 13 +LIBPATCH = 14 VAULT_SECRET_LABEL = "cert-handler-private-vault" @@ -301,14 +302,11 @@ def __init__( Must match metadata.yaml. cert_subject: Custom subject. Name collisions are under the caller's responsibility. sans: DNS names. If none are given, use FQDN. - refresh_events: an optional list of bound events which - will be observed to replace the current CSR with a new one - if there are changes in the CSR's DNS SANs or IP SANs. - Then, subsequently, replace its corresponding certificate with a new one. + refresh_events: [DEPRECATED]. """ super().__init__(charm, key) # use StoredState to store the hash of the CSR - # to potentially trigger a CSR renewal on `refresh_events` + # to potentially trigger a CSR renewal self._stored.set_default( csr_hash=None, ) @@ -320,8 +318,9 @@ def __init__( # Use fqdn only if no SANs were given, and drop empty/duplicate SANs sans = list(set(filter(None, (sans or [socket.getfqdn()])))) - self.sans_ip = list(filter(is_ip_address, sans)) - self.sans_dns = list(filterfalse(is_ip_address, sans)) + # sort SANS lists to avoid unnecessary csr renewals during reconciliation + self.sans_ip = sorted(filter(is_ip_address, sans)) + self.sans_dns = sorted(filterfalse(is_ip_address, sans)) if self._check_juju_supports_secrets(): vault_backend = _SecretVaultBackend(charm, secret_label=VAULT_SECRET_LABEL) @@ -367,13 +366,15 @@ def __init__( ) if refresh_events: - for ev in refresh_events: - self.framework.observe(ev, self._on_refresh_event) + logger.warn( + "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary." 
+ ) - def _on_refresh_event(self, _): - """Replace the latest current CSR with a new one if there are any SANs changes.""" - if self._stored.csr_hash != self._csr_hash: - self._generate_csr(renew=True) + self._reconcile() + + def _reconcile(self): + """Run all logic that is independent of what event we're processing.""" + self._refresh_csr_if_needed() def _on_upgrade_charm(self, _): has_privkey = self.vault.get_value("private-key") @@ -388,6 +389,11 @@ def _on_upgrade_charm(self, _): # this will call `self.private_key` which will generate a new privkey. self._generate_csr(renew=True) + def _refresh_csr_if_needed(self): + """Refresh the current CSR with a new one if there are any SANs changes.""" + if self._stored.csr_hash is not None and self._stored.csr_hash != self._csr_hash: + self._generate_csr(renew=True) + def _migrate_vault(self): peer_backend = _RelationVaultBackend(self.charm, relation_name="peers") @@ -440,13 +446,17 @@ def enabled(self) -> bool: return True @property - def _csr_hash(self) -> int: + def _csr_hash(self) -> str: """A hash of the config that constructs the CSR. Only include here the config options that, should they change, should trigger a renewal of the CSR. """ - return hash( + + def _stable_hash(data): + return hashlib.sha256(str(data).encode()).hexdigest() + + return _stable_hash( ( tuple(self.sans_dns), tuple(self.sans_ip), diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py index 1e7ff84..cf8def1 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py @@ -69,6 +69,9 @@ def my_tracing_endpoint(self) -> Optional[str]: - every event as a span (including custom events) - every charm method call (except dunders) as a span +We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests +go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. + ## TLS support If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), @@ -114,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer: See the official opentelemetry Python SDK documentation for usage: https://opentelemetry-python.readthedocs.io/en/latest/ + +## Caching traces +The `trace_charm` machinery will buffer any traces collected during charm execution and store them +to a file on the charm container until a tracing backend becomes available. At that point, it will +flush them to the tracing receiver. + +By default, the buffer is configured to start dropping old traces if any of these conditions apply: + +- the storage size exceeds 10 MiB +- the number of buffered events exceeds 100 + +You can configure this by, for example: + +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # only cache up to 42 events + buffer_max_events=42, + # only cache up to 42 MiB + buffer_max_size_mib=42, # minimum 10! +) +class MyCharm(CharmBase): + ... +``` + +Note that setting `buffer_max_events` to 0 will effectively disable the buffer. + +The path of the buffer file is by default in the charm's execution root, which for k8s charms means +that in case of pod churn, the cache will be lost. 
The recommended solution is to use an existing storage +(or add a new one) such as: + +```yaml +storage: + data: + type: filesystem + location: /charm-traces +``` + +and then configure the `@trace_charm` decorator to use it as path for storing the buffer: +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # store traces to a PVC so they're not lost on pod restart. + buffer_path="/charm-traces/buffer.file", +) +class MyCharm(CharmBase): + ... +``` + ## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already @@ -171,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]: 3) If you were passing a certificate (str) using `server_cert`, you need to change it to provide an *absolute* path to the certificate file instead. """ +import typing + +from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import ( + encode_spans, +) +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter def _remove_stale_otel_sdk_packages(): @@ -222,6 +282,9 @@ def _remove_stale_otel_sdk_packages(): otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") +# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. +# it could be trouble if someone ever decides to implement their own tracer parallel to +# ours and before the charm has inited. We assume they won't. _remove_stale_otel_sdk_packages() import functools @@ -235,6 +298,7 @@ def _remove_stale_otel_sdk_packages(): Any, Callable, Generator, + List, Optional, Sequence, Type, @@ -247,8 +311,12 @@ def _remove_stale_otel_sdk_packages(): import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, + SpanExporter, + SpanExportResult, +) from opentelemetry.trace import INVALID_SPAN, Tracer from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( @@ -269,7 +337,7 @@ def _remove_stale_otel_sdk_packages(): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 4 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -277,7 +345,7 @@ def _remove_stale_otel_sdk_packages(): dev_logger = logging.getLogger("tracing-dev") # set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) +dev_logger.setLevel(logging.ERROR) _CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof _C = TypeVar("_C", bound=_CharmType) @@ -287,6 +355,186 @@ def _remove_stale_otel_sdk_packages(): _GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" +BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw" +# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in +# any portable format. Json dumping is supported, but loading isn't. 
+# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003 + +BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10 +_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10 +BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100 +_MiB_TO_B = 2**20 # megabyte to byte conversion rate +_OTLP_SPAN_EXPORTER_TIMEOUT = 1 +"""Timeout in seconds that the OTLP span exporter has to push traces to the backend.""" + + +class _Buffer: + """Handles buffering for spans emitted while no tracing backend is configured or available. + + Use the max_event_history_length_buffering param of @trace_charm to tune + the amount of memory that this will hog on your units. + + The buffer is formatted as a bespoke byte dump (protobuf limitation). + We cannot store them as json because that is not well-supported by the sdk + (see https://github.com/open-telemetry/opentelemetry-python/issues/3364). + """ + + _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__" + + def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int): + self._db_file = db_file + self._max_event_history_length = max_event_history_length + self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN) + + # set by caller + self.exporter: Optional[OTLPSpanExporter] = None + + def save(self, spans: typing.Sequence[ReadableSpan]): + """Save the spans collected by this exporter to the cache file. + + This method should be as fail-safe as possible. + """ + if self._max_event_history_length < 1: + dev_logger.debug("buffer disabled: max history length < 1") + return + + current_history_length = len(self.load()) + new_history_length = current_history_length + len(spans) + if (diff := self._max_event_history_length - new_history_length) < 0: + self.drop(diff) + self._save(spans) + + def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes: + # encode because otherwise we can't json-dump them + return encode_spans(spans).SerializeToString() + + def _save(self, spans: Sequence[ReadableSpan], replace: bool = False): + dev_logger.debug(f"saving {len(spans)} new spans to buffer") + old = [] if replace else self.load() + new = self._serialize(spans) + + try: + # if the buffer exceeds the size limit, we start dropping old spans until it does + + while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B): + if not old: + # if we've already dropped all spans and still we can't get under the + # size limit, we can't save this span + logger.error( + f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); " + f"buffering FAILED" + ) + return + + old = old[1:] + logger.warning( + f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... " + f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed." + ) + + self._db_file.write_bytes(new + self._SPANSEP.join(old)) + except Exception: + logger.exception("error buffering spans") + + def load(self) -> List[bytes]: + """Load currently buffered spans from the cache file. + + This method should be as fail-safe as possible. + """ + if not self._db_file.exists(): + dev_logger.debug("buffer file not found. 
buffer empty.") + return [] + try: + spans = self._db_file.read_bytes().split(self._SPANSEP) + except Exception: + logger.exception(f"error parsing {self._db_file}") + return [] + return spans + + def drop(self, n_spans: Optional[int] = None): + """Drop some currently buffered spans from the cache file.""" + current = self.load() + if n_spans: + dev_logger.debug(f"dropping {n_spans} spans from buffer") + new = current[n_spans:] + else: + dev_logger.debug("emptying buffer") + new = [] + + self._db_file.write_bytes(self._SPANSEP.join(new)) + + def flush(self) -> Optional[bool]: + """Export all buffered spans to the given exporter, then clear the buffer. + + Returns whether the flush was successful, and None if there was nothing to flush. + """ + if not self.exporter: + dev_logger.debug("no exporter set; skipping buffer flush") + return False + + buffered_spans = self.load() + if not buffered_spans: + dev_logger.debug("nothing to flush; buffer empty") + return None + + errors = False + for span in buffered_spans: + try: + out = self.exporter._export(span) # type: ignore + if not (200 <= out.status_code < 300): + # take any 2xx status code as a success + errors = True + except ConnectionError: + dev_logger.debug( + "failed exporting buffered span; backend might be down or still starting" + ) + errors = True + except Exception: + logger.exception("unexpected error while flushing span batch from buffer") + errors = True + + if not errors: + self.drop() + else: + logger.error("failed flushing spans; buffer preserved") + return not errors + + @property + def is_empty(self): + """Utility to check whether the buffer has any stored spans. + + This is more efficient than attempting a load() given how large the buffer might be. + """ + return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0) + + +class _OTLPSpanExporter(OTLPSpanExporter): + """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster.""" + + # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured, + # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't + # flush the traces and keeps retrying for 'too long' + + _MAX_RETRY_TIMEOUT = 4 + # we give the exporter 4 seconds in total to succeed pushing the traces to tempo + # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk. + # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait + + +class _BufferedExporter(InMemorySpanExporter): + def __init__(self, buffer: _Buffer) -> None: + super().__init__() + self._buffer = buffer + + def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: + self._buffer.save(spans) + return super().export(spans) + + def force_flush(self, timeout_millis: int = 0) -> bool: + # parent implementation is fake, so the timeout_millis arg is not doing anything. 
+ result = super().force_flush(timeout_millis) + self._buffer.save(self.get_finished_spans()) + return result def is_enabled() -> bool: @@ -423,7 +671,10 @@ def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, server_cert_attr: Optional[str], - service_name: Optional[str] = None, + service_name: Optional[str], + buffer_path: Optional[Path], + buffer_max_events: int, + buffer_max_size_mib: int, ): """Patch the charm's initializer.""" original_init = charm_type.__init__ @@ -442,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): logger.info("Tracing DISABLED: skipping root span initialization") return - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - original_event_context = framework._event_context # default service name isn't just app name because it could conflict with the workload service name _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. resource = Resource.create( attributes={ "service.name": _service_name, @@ -471,33 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) + buffer_only = False + # whether we're only exporting to buffer, or also to the otlp exporter. + if not tracing_endpoint: # tracing is off if tracing_endpoint is None - return + # however we can buffer things until tracing comes online + buffer_only = True server_cert: Optional[Union[str, Path]] = ( _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) - if tracing_endpoint.startswith("https://") and not server_cert: + if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert: logger.error( "Tracing endpoint is https, but no server_cert has been passed." "Please point @trace_charm to a `server_cert` attr. " "This might also mean that the tracing provider is related to a " "certificates provider, but this application is not (yet). " "In that case, you might just have to wait a bit for the certificates " - "integration to settle. " + "integration to settle. This span will be buffered." 
) - return + buffer_only = True - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, + buffer = _Buffer( + db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME, + max_event_history_length=buffer_max_events, + max_buffer_size_mib=buffer_max_size_mib, ) + previous_spans_buffered = not buffer.is_empty + + exporters: List[SpanExporter] = [] + if buffer_only: + # we have to buffer because we're missing necessary backend configuration + dev_logger.debug("buffering mode: ON") + exporters.append(_BufferedExporter(buffer)) + + else: + dev_logger.debug("buffering mode: FALLBACK") + # in principle, we have the right configuration to be pushing traces, + # but if we fail for whatever reason, we will put everything in the buffer + # and retry the next time + otlp_exporter = _OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=_OTLP_SPAN_EXPORTER_TIMEOUT, # give individual requests 1 second to succeed + ) + exporters.append(otlp_exporter) + exporters.append(_BufferedExporter(buffer)) + buffer.exporter = otlp_exporter + + for exporter in exporters: + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) set_tracer_provider(provider) _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) @@ -521,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") + dev_logger.debug(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -535,12 +806,50 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") + dev_logger.debug("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long + flush_successful = tp.force_flush(timeout_millis=1000) # don't block for too long + + if buffer_only: + # if we're in buffer_only mode, it means we couldn't even set up the exporter for + # tempo as we're missing some data. + # so attempting to flush the buffer doesn't make sense + dev_logger.debug("tracing backend unavailable: all spans pushed to buffer") + + else: + dev_logger.debug("tracing backend found: attempting to flush buffer...") + + # if we do have an exporter for tempo, and we could send traces to it, + # we can attempt to flush the buffer as well. 
+ if not flush_successful: + logger.error("flushing FAILED: unable to push traces to backend.") + else: + dev_logger.debug("flush succeeded.") + + # the backend has accepted the spans generated during this event, + if not previous_spans_buffered: + # if the buffer was empty to begin with, any spans we collected now can be discarded + buffer.drop() + dev_logger.debug("buffer dropped: this trace has been sent already") + else: + # if the buffer was nonempty, we can attempt to flush it + dev_logger.debug("attempting buffer flush...") + buffer_flush_successful = buffer.flush() + if buffer_flush_successful: + dev_logger.debug("buffer flush OK") + elif buffer_flush_successful is None: + # TODO is this even possible? + dev_logger.debug("buffer flush OK; empty: nothing to flush") + else: + # this situation is pretty weird, I'm not even sure it can happen, + # because it would mean that we did manage + # to push traces directly to the tempo exporter (flush_successful), + # but the buffer flush failed to push to the same exporter! + logger.error("buffer flush FAILED") + tp.shutdown() original_close() @@ -555,6 +864,9 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Union[str, Path]] = None, ) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. @@ -596,6 +908,10 @@ def trace_charm( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ def _decorator(charm_type: _T) -> _T: @@ -606,6 +922,9 @@ def _decorator(charm_type: _T) -> _T: server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, + buffer_path=Path(buffer_path) if buffer_path else None, + buffer_max_size_mib=buffer_max_size_mib, + buffer_max_events=buffer_max_events, ) return charm_type @@ -618,6 +937,9 @@ def _autoinstrument( server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Path] = None, ) -> _T: """Set up tracing on this charm class. @@ -650,13 +972,20 @@ def _autoinstrument( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. 
""" - dev_logger.info(f"instrumenting {charm_type}") + dev_logger.debug(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, tracing_endpoint_attr, server_cert_attr=server_cert_attr, service_name=service_name, + buffer_path=buffer_path, + buffer_max_events=buffer_max_events, + buffer_max_size_mib=buffer_max_size_mib, ) trace_type(charm_type) for type_ in extra_types: @@ -672,12 +1001,12 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. """ - dev_logger.info(f"instrumenting {cls}") + dev_logger.debug(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") + dev_logger.debug(f"discovered {method}") if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") + dev_logger.debug(f"skipping {method} (dunder)") continue # the span title in the general case should be: @@ -723,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F: def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") + dev_logger.debug(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable) diff --git a/lib/charms/tempo_coordinator_k8s/v0/tracing.py b/lib/charms/tempo_coordinator_k8s/v0/tracing.py index 1f92867..2035dff 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/tracing.py @@ -34,7 +34,7 @@ def __init__(self, *args): `TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. Using this method also allows you to use per-relation protocols. -Units of provider charms obtain the tempo endpoint to which they will push their traces by calling +Units of requirer charms obtain the tempo endpoint to which they will push their traces by calling `TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: - `otlp_grpc` - `otlp_http` @@ -44,7 +44,10 @@ def __init__(self, *args): If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, the library will raise an error. -## Requirer Library Usage +We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests +go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. + +## Provider Library Usage The `TracingEndpointProvider` object may be used by charms to manage relations with their trace sources. For this purposes a Tempo-like charm needs to do two things @@ -107,7 +110,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 3 PYDEPS = ["pydantic"] diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 2dbdddd..cfb2dbe 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -2,759 +2,28 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -"""This charm library contains utilities to instrument your Charm with opentelemetry tracing data collection. +"""This charm library has been transferred to the HA version of this charm. -(yes! charm code, not workload code!) 
+The new owner is the `tempo-coordinator-k8s` charm: +- [github](https://github.com/canonical/tempo-coordinator-k8s/) +- [charmhub](https://charmhub.io/tempo-coordinator-k8s/) -This means that, if your charm is related to, for example, COS' Tempo charm, you will be able to inspect -in real time from the Grafana dashboard the execution flow of your charm. +The new library (with its major version reset to 0) can be found at -# Quickstart -Fetch the following charm libs (and ensure the minimum version/revision numbers are satisfied): +https://charmhub.io/tempo-coordinator-k8s/libraries/charm_tracing - charmcraft fetch-lib charms.tempo_k8s.v2.tracing # >= 1.10 - charmcraft fetch-lib charms.tempo_k8s.v1.charm_tracing # >= 2.7 +to install it: -Then edit your charm code to include: +> charmcraft fetch-lib charms.tempo_coordinator_k8s.v0.charm_tracing -```python -# import the necessary charm libs -from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config -from charms.tempo_k8s.v1.charm_tracing import charm_tracing - -# decorate your charm class with charm_tracing: -@charm_tracing( - # forward-declare the instance attributes that the instrumentor will look up to obtain the - # tempo endpoint and server certificate - tracing_endpoint="tracing_endpoint", - server_cert="server_cert" -) -class MyCharm(CharmBase): - _path_to_cert = "/path/to/cert.crt" - # path to cert file **in the charm container**. Its presence will be used to determine whether - # the charm is ready to use tls for encrypting charm traces. If your charm does not support tls, - # you can ignore this and pass None to charm_tracing_config. - # If you do support TLS, you'll need to make sure that the server cert is copied to this location - # and kept up to date so the instrumentor can use it. - - def __init__(self, ...): - ... - self.tracing = TracingEndpointRequirer(self, ...) - self.tracing_endpoint, self.server_cert = charm_tracing_config(self.tracing, self._path_to_cert) -``` - -# Detailed usage -To use this library, you need to do two things: -1) decorate your charm class with - -`@trace_charm(tracing_endpoint="my_tracing_endpoint")` - -2) add to your charm a "my_tracing_endpoint" (you can name this attribute whatever you like) -**property**, **method** or **instance attribute** that returns an otlp http/https endpoint url. -If you are using the ``charms.tempo_k8s.v2.tracing.TracingEndpointRequirer`` as -``self.tracing = TracingEndpointRequirer(self)``, the implementation could be: - -``` - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.get_endpoint("otlp_http") - else: - return None -``` - -At this point your charm will be automatically instrumented so that: -- charm execution starts a trace, containing - - every event as a span (including custom events) - - every charm method call (except dunders) as a span - - -## TLS support -If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), -you can configure ``charm_tracing`` to use TLS by passing a ``server_cert`` parameter to the decorator. - -If your charm is not trusting the same CA as the Tempo endpoint it is sending traces to, -you'll need to implement a cert-transfer relation to obtain the CA certificate from the same -CA that Tempo is using. 
- -For example: -``` -from charms.tempo_k8s.v1.charm_tracing import trace_charm -@trace_charm( - tracing_endpoint="my_tracing_endpoint", - server_cert="_server_cert" -) -class MyCharm(CharmBase): - self._server_cert = "/path/to/server.crt" - ... - - def on_tls_changed(self, e) -> Optional[str]: - # update the server cert on the charm container for charm tracing - Path(self._server_cert).write_text(self.get_server_cert()) - - def on_tls_broken(self, e) -> Optional[str]: - # remove the server cert so charm_tracing won't try to use tls anymore - Path(self._server_cert).unlink() -``` - - -## More fine-grained manual instrumentation -if you wish to add more spans to the trace, you can do so by getting a hold of the tracer like so: -``` -import opentelemetry -... -def get_tracer(self) -> opentelemetry.trace.Tracer: - return opentelemetry.trace.get_tracer(type(self).__name__) -``` - -By default, the tracer is named after the charm type. If you wish to override that, you can pass -a different ``service_name`` argument to ``trace_charm``. - -See the official opentelemetry Python SDK documentation for usage: -https://opentelemetry-python.readthedocs.io/en/latest/ - -## Upgrading from `v0` - -If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already -have the newest version of the library in your charm): -1) If you need the dependency for your tests, add the following dependency to your charm project -(or, if your project had a dependency on `opentelemetry-exporter-otlp-proto-grpc` only because -of `charm_tracing` v0, you can replace it with): - -`opentelemetry-exporter-otlp-proto-http>=1.21.0`. - -2) Update the charm method referenced to from ``@trace`` and ``@trace_charm``, -to return from ``TracingEndpointRequirer.get_endpoint("otlp_http")`` instead of ``grpc_http``. -For example: - -``` - from charms.tempo_k8s.v0.charm_tracing import trace_charm - - @trace_charm( - tracing_endpoint="my_tracing_endpoint", - ) - class MyCharm(CharmBase): - - ... - - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.otlp_grpc_endpoint() # OLD API, DEPRECATED. - else: - return None -``` - -needs to be replaced with: - -``` - from charms.tempo_k8s.v1.charm_tracing import trace_charm - - @trace_charm( - tracing_endpoint="my_tracing_endpoint", - ) - class MyCharm(CharmBase): - - ... - - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.get_endpoint("otlp_http") # NEW API, use this. - else: - return None -``` - -3) If you were passing a certificate (str) using `server_cert`, you need to change it to -provide an *absolute* path to the certificate file instead. +The API is unchanged, so you can search and replace the path to swap the old lib with the new one. """ - -def _remove_stale_otel_sdk_packages(): - """Hack to remove stale opentelemetry sdk packages from the charm's python venv. - - See https://github.com/canonical/grafana-agent-operator/issues/146 and - https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after - this juju issue is resolved and sufficient time has passed to expect most users of this library - have migrated to the patched version of juju. When this patch is removed, un-ignore rule E402 for this file in the pyproject.toml (see setting - [tool.ruff.lint.per-file-ignores] in pyproject.toml). 
- - This only has an effect if executed on an upgrade-charm event. - """ - # all imports are local to keep this function standalone, side-effect-free, and easy to revert later - import os - - if os.getenv("JUJU_DISPATCH_PATH") != "hooks/upgrade-charm": - return - - import logging - import shutil - from collections import defaultdict - - from importlib_metadata import distributions - - otel_logger = logging.getLogger("charm_tracing_otel_patcher") - otel_logger.debug("Applying _remove_stale_otel_sdk_packages patch on charm upgrade") - # group by name all distributions starting with "opentelemetry_" - otel_distributions = defaultdict(list) - for distribution in distributions(): - name = distribution._normalized_name # type: ignore - if name.startswith("opentelemetry_"): - otel_distributions[name].append(distribution) - - otel_logger.debug(f"Found {len(otel_distributions)} opentelemetry distributions") - - # If we have multiple distributions with the same name, remove any that have 0 associated files - for name, distributions_ in otel_distributions.items(): - if len(distributions_) <= 1: - continue - - otel_logger.debug(f"Package {name} has multiple ({len(distributions_)}) distributions.") - for distribution in distributions_: - if not distribution.files: # Not None or empty list - path = distribution._path # type: ignore - otel_logger.info(f"Removing empty distribution of {name} at {path}.") - shutil.rmtree(path) - - otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") - - -_remove_stale_otel_sdk_packages() - -import functools -import inspect -import logging -import os -from contextlib import contextmanager -from contextvars import Context, ContextVar, copy_context -from pathlib import Path -from typing import ( - Any, - Callable, - Generator, - Optional, - Sequence, - Type, - TypeVar, - Union, - cast, -) - -import opentelemetry -import ops -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.trace import ( - INVALID_SPAN, - Tracer, - get_tracer, - get_tracer_provider, - set_span_in_context, - set_tracer_provider, -) -from opentelemetry.trace import get_current_span as otlp_get_current_span -from ops.charm import CharmBase -from ops.framework import Framework - -# The unique Charmhub library identifier, never change it LIBID = "cb1705dcd1a14ca09b2e60187d1215c7" - -# Increment this major API version when introducing breaking changes LIBAPI = 1 +LIBPATCH = 17 -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version - -LIBPATCH = 15 - -PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] - -logger = logging.getLogger("tracing") -dev_logger = logging.getLogger("tracing-dev") - -# set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) - -_CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof -_C = TypeVar("_C", bound=_CharmType) -_T = TypeVar("_T", bound=type) -_F = TypeVar("_F", bound=Type[Callable]) -tracer: ContextVar[Tracer] = ContextVar("tracer") -_GetterType = Union[Callable[[_CharmType], Optional[str]], property] - -CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" - - -def is_enabled() -> bool: - """Whether charm tracing is enabled.""" - return os.getenv(CHARM_TRACING_ENABLED, "1") == "1" - - 
-@contextmanager -def charm_tracing_disabled(): - """Contextmanager to temporarily disable charm tracing. - - For usage in tests. - """ - previous = os.getenv(CHARM_TRACING_ENABLED, "1") - os.environ[CHARM_TRACING_ENABLED] = "0" - yield - os.environ[CHARM_TRACING_ENABLED] = previous - - -def get_current_span() -> Union[Span, None]: - """Return the currently active Span, if there is one, else None. - - If you'd rather keep your logic unconditional, you can use opentelemetry.trace.get_current_span, - which will return an object that behaves like a span but records no data. - """ - span = otlp_get_current_span() - if span is INVALID_SPAN: - return None - return cast(Span, span) - - -def _get_tracer_from_context(ctx: Context) -> Optional[ContextVar]: - tracers = [v for v in ctx if v is not None and v.name == "tracer"] - if tracers: - return tracers[0] - return None - - -def _get_tracer() -> Optional[Tracer]: - """Find tracer in context variable and as a fallback locate it in the full context.""" - try: - return tracer.get() - except LookupError: - # fallback: this course-corrects for a user error where charm_tracing symbols are imported - # from different paths (typically charms.tempo_k8s... and lib.charms.tempo_k8s...) - try: - ctx: Context = copy_context() - if context_tracer := _get_tracer_from_context(ctx): - logger.warning( - "Tracer not found in `tracer` context var. " - "Verify that you're importing all `charm_tracing` symbols from the same module path. \n" - "For example, DO" - ": `from charms.lib...charm_tracing import foo, bar`. \n" - "DONT: \n" - " \t - `from charms.lib...charm_tracing import foo` \n" - " \t - `from lib...charm_tracing import bar` \n" - "For more info: https://python-notes.curiousefficiency.org/en/latest/python" - "_concepts/import_traps.html#the-double-import-trap" - ) - return context_tracer.get() - else: - return None - except LookupError: - return None - - -@contextmanager -def _span(name: str) -> Generator[Optional[Span], Any, Any]: - """Context to create a span if there is a tracer, otherwise do nothing.""" - if tracer := _get_tracer(): - with tracer.start_as_current_span(name) as span: - yield cast(Span, span) - else: - yield None - - -class TracingError(RuntimeError): - """Base class for errors raised by this module.""" - - -class UntraceableObjectError(TracingError): - """Raised when an object you're attempting to instrument cannot be autoinstrumented.""" - - -class TLSError(TracingError): - """Raised when the tracing endpoint is https but we don't have a cert yet.""" - - -def _get_tracing_endpoint( - tracing_endpoint_attr: str, - charm_instance: object, - charm_type: type, -): - _tracing_endpoint = getattr(charm_instance, tracing_endpoint_attr) - if callable(_tracing_endpoint): - tracing_endpoint = _tracing_endpoint() - else: - tracing_endpoint = _tracing_endpoint - - if tracing_endpoint is None: - return - - elif not isinstance(tracing_endpoint, str): - raise TypeError( - f"{charm_type.__name__}.{tracing_endpoint_attr} should resolve to a tempo endpoint (string); " - f"got {tracing_endpoint} instead." 
- ) - - dev_logger.debug(f"Setting up span exporter to endpoint: {tracing_endpoint}/v1/traces") - return f"{tracing_endpoint}/v1/traces" - - -def _get_server_cert( - server_cert_attr: str, - charm_instance: ops.CharmBase, - charm_type: Type[ops.CharmBase], -): - _server_cert = getattr(charm_instance, server_cert_attr) - if callable(_server_cert): - server_cert = _server_cert() - else: - server_cert = _server_cert - - if server_cert is None: - logger.warning( - f"{charm_type}.{server_cert_attr} is None; sending traces over INSECURE connection." - ) - return - elif not Path(server_cert).is_absolute(): - raise ValueError( - f"{charm_type}.{server_cert_attr} should resolve to a valid tls cert absolute path (string | Path)); " - f"got {server_cert} instead." - ) - return server_cert - - -def _setup_root_span_initializer( - charm_type: _CharmType, - tracing_endpoint_attr: str, - server_cert_attr: Optional[str], - service_name: Optional[str] = None, -): - """Patch the charm's initializer.""" - original_init = charm_type.__init__ - - @functools.wraps(original_init) - def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): - # we're using 'self' here because this is charm init code, makes sense to read what's below - # from the perspective of the charm. Self.unit.name... - - original_init(self, framework, *args, **kwargs) - # we call this from inside the init context instead of, say, _autoinstrument, because we want it to - # be checked on a per-charm-instantiation basis, not on a per-type-declaration one. - if not is_enabled(): - # this will only happen during unittesting, hopefully, so it's fine to log a - # bit more verbosely - logger.info("Tracing DISABLED: skipping root span initialization") - return - - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - - original_event_context = framework._event_context - # default service name isn't just app name because it could conflict with the workload service name - _service_name = service_name or f"{self.app.name}-charm" - - unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. - resource = Resource.create( - attributes={ - "service.name": _service_name, - "compose_service": _service_name, - "charm_type": type(self).__name__, - # juju topology - "juju_unit": unit_name, - "juju_application": self.app.name, - "juju_model": self.model.name, - "juju_model_uuid": self.model.uuid, - } - ) - provider = TracerProvider(resource=resource) - - # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. - tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) - - if not tracing_endpoint: - # tracing is off if tracing_endpoint is None - return - - server_cert: Optional[Union[str, Path]] = ( - _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None - ) - - if tracing_endpoint.startswith("https://") and not server_cert: - raise TLSError( - "Tracing endpoint is https, but no server_cert has been passed." - "Please point @trace_charm to a `server_cert` attr." 
- ) - - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, - ) - - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) - set_tracer_provider(provider) - _tracer = get_tracer(_service_name) # type: ignore - _tracer_token = tracer.set(_tracer) - - dispatch_path = os.getenv("JUJU_DISPATCH_PATH", "") # something like hooks/install - event_name = dispatch_path.split("/")[1] if "/" in dispatch_path else dispatch_path - root_span_name = f"{unit_name}: {event_name} event" - span = _tracer.start_span(root_span_name, attributes={"juju.dispatch_path": dispatch_path}) - - # all these shenanigans are to work around the fact that the opentelemetry tracing API is built - # on the assumption that spans will be used as contextmanagers. - # Since we don't (as we need to close the span on framework.commit), - # we need to manually set the root span as current. - ctx = set_span_in_context(span) - - # log a trace id, so we can pick it up from the logs (and jhack) to look it up in tempo. - root_trace_id = hex(span.get_span_context().trace_id)[2:] # strip 0x prefix - logger.debug(f"Starting root trace with id={root_trace_id!r}.") - - span_token = opentelemetry.context.attach(ctx) # type: ignore - - @contextmanager - def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") - # when the framework enters an event context, we create a span. - with _span("event: " + event_name) as event_context_span: - if event_context_span: - # todo: figure out how to inject event attrs in here - event_context_span.add_event(event_name) - yield original_event_context(event_name) - - framework._event_context = wrap_event_context # type: ignore - - original_close = framework.close - - @functools.wraps(original_close) - def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") - span.end() - opentelemetry.context.detach(span_token) # type: ignore - tracer.reset(_tracer_token) - tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long - tp.shutdown() - original_close() - - framework.close = wrap_close - return - - charm_type.__init__ = wrap_init # type: ignore - - -def trace_charm( - tracing_endpoint: str, - server_cert: Optional[str] = None, - service_name: Optional[str] = None, - extra_types: Sequence[type] = (), -) -> Callable[[_T], _T]: - """Autoinstrument the decorated charm with tracing telemetry. - - Use this function to get out-of-the-box traces for all events emitted on this charm and all - method calls on instances of this class. - - Usage: - >>> from charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from charms.tempo_k8s.v1.tracing import TracingEndpointRequirer - >>> from ops import CharmBase - >>> - >>> @trace_charm( - >>> tracing_endpoint="tempo_otlp_http_endpoint", - >>> ) - >>> class MyCharm(CharmBase): - >>> - >>> def __init__(self, framework: Framework): - >>> ... - >>> self.tracing = TracingEndpointRequirer(self) - >>> - >>> @property - >>> def tempo_otlp_http_endpoint(self) -> Optional[str]: - >>> if self.tracing.is_ready(): - >>> return self.tracing.otlp_http_endpoint() - >>> else: - >>> return None - >>> - - :param tracing_endpoint: name of a method, property or attribute on the charm type that returns an - optional (fully resolvable) tempo url to which the charm traces will be pushed. - If None, tracing will be effectively disabled. 
- :param server_cert: name of a method, property or attribute on the charm type that returns an - optional absolute path to a CA certificate file to be used when sending traces to a remote server. - If it returns None, an _insecure_ connection will be used. To avoid errors in transient - situations where the endpoint is already https but there is no certificate on disk yet, it - is recommended to disable tracing (by returning None from the tracing_endpoint) altogether - until the cert has been written to disk. - :param service_name: service name tag to attach to all traces generated by this charm. - Defaults to the juju application name this charm is deployed under. - :param extra_types: pass any number of types that you also wish to autoinstrument. - For example, charm libs, relation endpoint wrappers, workload abstractions, ... - """ - - def _decorator(charm_type: _T) -> _T: - """Autoinstrument the wrapped charmbase type.""" - _autoinstrument( - charm_type, - tracing_endpoint_attr=tracing_endpoint, - server_cert_attr=server_cert, - service_name=service_name, - extra_types=extra_types, - ) - return charm_type - - return _decorator - - -def _autoinstrument( - charm_type: _T, - tracing_endpoint_attr: str, - server_cert_attr: Optional[str] = None, - service_name: Optional[str] = None, - extra_types: Sequence[type] = (), -) -> _T: - """Set up tracing on this charm class. - - Use this function to get out-of-the-box traces for all events emitted on this charm and all - method calls on instances of this class. - - Usage: - - >>> from charms.tempo_k8s.v1.charm_tracing import _autoinstrument - >>> from ops.main import main - >>> _autoinstrument( - >>> MyCharm, - >>> tracing_endpoint_attr="tempo_otlp_http_endpoint", - >>> service_name="MyCharm", - >>> extra_types=(Foo, Bar) - >>> ) - >>> main(MyCharm) - - :param charm_type: the CharmBase subclass to autoinstrument. - :param tracing_endpoint_attr: name of a method, property or attribute on the charm type that returns an - optional (fully resolvable) tempo url to which the charm traces will be pushed. - If None, tracing will be effectively disabled. - :param server_cert_attr: name of a method, property or attribute on the charm type that returns an - optional absolute path to a CA certificate file to be used when sending traces to a remote server. - If it returns None, an _insecure_ connection will be used. To avoid errors in transient - situations where the endpoint is already https but there is no certificate on disk yet, it - is recommended to disable tracing (by returning None from the tracing_endpoint) altogether - until the cert has been written to disk. - :param service_name: service name tag to attach to all traces generated by this charm. - Defaults to the juju application name this charm is deployed under. - :param extra_types: pass any number of types that you also wish to autoinstrument. - For example, charm libs, relation endpoint wrappers, workload abstractions, ... - """ - dev_logger.info(f"instrumenting {charm_type}") - _setup_root_span_initializer( - charm_type, - tracing_endpoint_attr, - server_cert_attr=server_cert_attr, - service_name=service_name, - ) - trace_type(charm_type) - for type_ in extra_types: - trace_type(type_) - - return charm_type - - -def trace_type(cls: _T) -> _T: - """Set up tracing on this class. - - Use this decorator to get out-of-the-box traces for all method calls on instances of this class. 
- It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` - has been instantiated. - """ - dev_logger.info(f"instrumenting {cls}") - for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") - - if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") - continue - - # the span title in the general case should be: - # method call: MyCharmWrappedMethods.b - # if the method has a name (functools.wrapped or regular method), let - # _trace_callable use its default algorithm to determine what name to give the span. - trace_method_name = None - try: - qualname_c0 = method.__qualname__.split(".")[0] - if not hasattr(cls, method.__name__): - # if the callable doesn't have a __name__ (probably a decorated method), - # it probably has a bad qualname too (such as my_decorator..wrapper) which is not - # great for finding out what the trace is about. So we use the method name instead and - # add a reference to the decorator name. Result: - # method call: @my_decorator(MyCharmWrappedMethods.b) - trace_method_name = f"@{qualname_c0}({cls.__name__}.{name})" - except Exception: # noqa: failsafe - pass - - new_method = trace_method(method, name=trace_method_name) - - if isinstance(inspect.getattr_static(cls, name), staticmethod): - new_method = staticmethod(new_method) - setattr(cls, name, new_method) - - return cls - - -def trace_method(method: _F, name: Optional[str] = None) -> _F: - """Trace this method. - - A span will be opened when this method is called and closed when it returns. - """ - return _trace_callable(method, "method", name=name) - - -def trace_function(function: _F, name: Optional[str] = None) -> _F: - """Trace this function. - - A span will be opened when this function is called and closed when it returns. - """ - return _trace_callable(function, "function", name=name) - - -def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") - - # sig = inspect.signature(callable) - @functools.wraps(callable) - def wrapped_function(*args, **kwargs): # type: ignore - name_ = name or getattr( - callable, "__qualname__", getattr(callable, "__name__", str(callable)) - ) - with _span(f"{qualifier} call: {name_}"): # type: ignore - return callable(*args, **kwargs) # type: ignore - - # wrapped_function.__signature__ = sig - return wrapped_function # type: ignore - - -def trace(obj: Union[Type, Callable]): - """Trace this object and send the resulting spans to Tempo. - - It will dispatch to ``trace_type`` if the decorated object is a class, otherwise - ``trace_function``. - """ - if isinstance(obj, type): - if issubclass(obj, CharmBase): - raise ValueError( - "cannot use @trace on CharmBase subclasses: use @trace_charm instead " - "(we need some arguments!)" - ) - return trace_type(obj) - else: - try: - return trace_function(obj) - except Exception: - raise UntraceableObjectError( - f"cannot create span from {type(obj)}; instrument {obj} manually." - ) +raise DeprecationWarning( + "this charm lib is deprecated; please use charms.tempo_coordinator_k8s.v0.charm_tracing instead. " + "see https://charmhub.io/tempo-coordinator-k8s/libraries/charm_tracing" +) diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index 81bf1f1..ce142b8 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -1,996 +1,29 @@ -# Copyright 2024 Canonical Ltd. 
+#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -"""## Overview. -This document explains how to integrate with the Tempo charm for the purpose of pushing traces to a -tracing endpoint provided by Tempo. It also explains how alternative implementations of the Tempo charm -may maintain the same interface and be backward compatible with all currently integrated charms. +"""This charm library has been transferred to the HA version of this charm. -## Requirer Library Usage +The new owner is the `tempo-coordinator-k8s` charm: +- [github](https://github.com/canonical/tempo-coordinator-k8s/) +- [charmhub](https://charmhub.io/tempo-coordinator-k8s/) -Charms seeking to push traces to Tempo, must do so using the `TracingEndpointRequirer` -object from this charm library. For the simplest use cases, using the `TracingEndpointRequirer` -object only requires instantiating it, typically in the constructor of your charm. The -`TracingEndpointRequirer` constructor requires the name of the relation over which a tracing endpoint - is exposed by the Tempo charm, and a list of protocols it intends to send traces with. - This relation must use the `tracing` interface. - The `TracingEndpointRequirer` object may be instantiated as follows +The new library (with its major version reset to 0) can be found at - from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer +https://charmhub.io/tempo-coordinator-k8s/libraries/tracing - def __init__(self, *args): - super().__init__(*args) - # ... - self.tracing = TracingEndpointRequirer(self, - protocols=['otlp_grpc', 'otlp_http', 'jaeger_http_thrift'] - ) - # ... +to install it: -Note that the first argument (`self`) to `TracingEndpointRequirer` is always a reference to the -parent charm. +> charmcraft fetch-lib charms.tempo_coordinator_k8s.v0.tracing -Alternatively to providing the list of requested protocols at init time, the charm can do it at -any point in time by calling the -`TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. -Using this method also allows you to use per-relation protocols. - -Units of provider charms obtain the tempo endpoint to which they will push their traces by calling -`TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: -- `otlp_grpc` -- `otlp_http` -- `zipkin` -- `tempo` - -If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, -the library will raise an error. - -## Requirer Library Usage - -The `TracingEndpointProvider` object may be used by charms to manage relations with their -trace sources. For this purposes a Tempo-like charm needs to do two things - -1. Instantiate the `TracingEndpointProvider` object by providing it a -reference to the parent (Tempo) charm and optionally the name of the relation that the Tempo charm -uses to interact with its trace sources. This relation must conform to the `tracing` interface -and it is strongly recommended that this relation be named `tracing` which is its -default value. - -For example a Tempo charm may instantiate the `TracingEndpointProvider` in its constructor as -follows - - from charms.tempo_k8s.v2.tracing import TracingEndpointProvider - - def __init__(self, *args): - super().__init__(*args) - # ... - self.tracing = TracingEndpointProvider(self) - # ... 
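Since the replacement module docstring above states that the API is unchanged and only the import path moves, migrating a consuming charm is a search-and-replace of the library path. A minimal, hypothetical sketch of the swap — the charm class, the endpoint attributes, and the assumption that `trace_charm`, `TracingEndpointRequirer`, and `charm_tracing_config` keep their v1/v2 names in the new v0 libraries are illustrative, not taken from this patch:

```python
# Before (deprecated):
#   from charms.tempo_k8s.v1.charm_tracing import trace_charm
#   from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config
# After (assumed drop-in equivalents, per the deprecation notices above):
from charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
from charms.tempo_coordinator_k8s.v0.tracing import TracingEndpointRequirer, charm_tracing_config

import ops


@trace_charm(tracing_endpoint="tracing_endpoint", server_cert="server_cert_path")
class MyCharm(ops.CharmBase):  # illustrative charm, not part of this patch
    def __init__(self, framework: ops.Framework):
        super().__init__(framework)
        self.tracing = TracingEndpointRequirer(self, protocols=["otlp_http"])
        # returns (None, None) until the relation is ready, which disables charm tracing
        self.tracing_endpoint, self.server_cert_path = charm_tracing_config(
            self.tracing, "/path/to/ca.crt"  # illustrative cert path
        )
```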
- - - -""" # noqa: W505 -import enum -import json -import logging -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Literal, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, - cast, -) - -import pydantic -from ops.charm import ( - CharmBase, - CharmEvents, - RelationBrokenEvent, - RelationEvent, - RelationRole, -) -from ops.framework import EventSource, Object -from ops.model import ModelError, Relation -from pydantic import BaseModel, Field +The API is unchanged, so you can search and replace the path to swap the old lib with the new one. +""" -# The unique Charmhub library identifier, never change it LIBID = "12977e9aa0b34367903d8afeb8c3d85d" - -# Increment this major API version when introducing breaking changes LIBAPI = 2 +LIBPATCH = 11 -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 10 - -PYDEPS = ["pydantic"] - -logger = logging.getLogger(__name__) - -DEFAULT_RELATION_NAME = "tracing" -RELATION_INTERFACE_NAME = "tracing" - -# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 -ReceiverProtocol = Literal[ - "zipkin", - "otlp_grpc", - "otlp_http", - "jaeger_grpc", - "jaeger_thrift_http", -] - -RawReceiver = Tuple[ReceiverProtocol, str] -"""Helper type. A raw receiver is defined as a tuple consisting of the protocol name, and the (external, if available), -(secured, if available) resolvable server url. -""" - -BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} - - -class TransportProtocolType(str, enum.Enum): - """Receiver Type.""" - - http = "http" - grpc = "grpc" - - -receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { - "zipkin": TransportProtocolType.http, - "otlp_grpc": TransportProtocolType.grpc, - "otlp_http": TransportProtocolType.http, - "jaeger_thrift_http": TransportProtocolType.http, - "jaeger_grpc": TransportProtocolType.grpc, -} -"""A mapping between telemetry protocols and their corresponding transport protocol. 
-""" - - -class TracingError(Exception): - """Base class for custom errors raised by this library.""" - - -class NotReadyError(TracingError): - """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" - - -class ProtocolNotRequestedError(TracingError): - """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" - - -class DataValidationError(TracingError): - """Raised when data validation fails on IPU relation data.""" - - -class AmbiguousRelationUsageError(TracingError): - """Raised when one wrongly assumes that there can only be one relation on an endpoint.""" - - -if int(pydantic.version.VERSION.split(".")[0]) < 2: - - class DatabagModel(BaseModel): # type: ignore - """Base databag model.""" - - class Config: - """Pydantic config.""" - - # ignore any extra fields in the databag - extra = "ignore" - """Ignore any extra fields in the databag.""" - allow_population_by_field_name = True - """Allow instantiating this class by field name (instead of forcing alias).""" - - _NEST_UNDER = None - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - if cls._NEST_UNDER: - return cls.parse_obj(json.loads(databag[cls._NEST_UNDER])) - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {f.alias for f in cls.__fields__.values()} - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - logger.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.parse_raw(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - logger.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - if self._NEST_UNDER: - databag[self._NEST_UNDER] = self.json(by_alias=True) - return databag - - dct = self.dict() - for key, field in self.__fields__.items(): # type: ignore - value = dct[key] - databag[field.alias or key] = json.dumps(value) - - return databag - -else: - from pydantic import ConfigDict - - class DatabagModel(BaseModel): - """Base databag model.""" - - model_config = ConfigDict( - # ignore any extra fields in the databag - extra="ignore", - # Allow instantiating this class by field name (instead of forcing alias). - populate_by_name=True, - # Custom config key: whether to nest the whole datastructure (as json) - # under a field or spread it out at the toplevel. - _NEST_UNDER=None, # type: ignore - ) - """Pydantic config.""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - nest_under = cls.model_config.get("_NEST_UNDER") # type: ignore - if nest_under: - return cls.model_validate(json.loads(databag[nest_under])) # type: ignore - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {(f.alias or n) for n, f in cls.__fields__.items()} - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. 
{databag}" - logger.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.model_validate_json(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - logger.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - nest_under = self.model_config.get("_NEST_UNDER") - if nest_under: - databag[nest_under] = self.model_dump_json( # type: ignore - by_alias=True, - # skip keys whose values are default - exclude_defaults=True, - ) - return databag - - dct = self.model_dump() # type: ignore - for key, field in self.model_fields.items(): # type: ignore - value = dct[key] - if value == field.default: - continue - databag[field.alias or key] = json.dumps(value) - - return databag - - -# todo use models from charm-relation-interfaces -if int(pydantic.version.VERSION.split(".")[0]) < 2: - - class ProtocolType(BaseModel): # type: ignore - """Protocol Type.""" - - class Config: - """Pydantic config.""" - - use_enum_values = True - """Allow serializing enum values.""" - - name: str = Field( - ..., - description="Receiver protocol name. What protocols are supported (and what they are called) " - "may differ per provider.", - examples=["otlp_grpc", "otlp_http", "tempo_http"], - ) - - type: TransportProtocolType = Field( - ..., - description="The transport protocol used by this receiver.", - examples=["http", "grpc"], - ) - -else: - - class ProtocolType(BaseModel): - """Protocol Type.""" - - model_config = ConfigDict( # type: ignore - # Allow serializing enum values. - use_enum_values=True - ) - """Pydantic config.""" - - name: str = Field( - ..., - description="Receiver protocol name. What protocols are supported (and what they are called) " - "may differ per provider.", - examples=["otlp_grpc", "otlp_http", "tempo_http"], - ) - - type: TransportProtocolType = Field( - ..., - description="The transport protocol used by this receiver.", - examples=["http", "grpc"], - ) - - -class Receiver(BaseModel): - """Specification of an active receiver.""" - - protocol: ProtocolType = Field(..., description="Receiver protocol name and type.") - url: str = Field( - ..., - description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. - Otherwise, it would be the service's fqdn or internal IP. - If the protocol type is grpc, the url will not contain a scheme.""", - examples=[ - "http://traefik_address:2331", - "https://traefik_address:2331", - "http://tempo_public_ip:2331", - "https://tempo_public_ip:2331", - "tempo_public_ip:2331", - ], - ) - - -class TracingProviderAppData(DatabagModel): # noqa: D101 - """Application databag model for the tracing provider.""" - - receivers: List[Receiver] = Field( - ..., - description="List of all receivers enabled on the tracing provider.", - ) - - -class TracingRequirerAppData(DatabagModel): # noqa: D101 - """Application databag model for the tracing requirer.""" - - receivers: List[ReceiverProtocol] - """Requested receivers.""" - - -class _AutoSnapshotEvent(RelationEvent): - __args__: Tuple[str, ...] 
= () - __optional_kwargs__: Dict[str, Any] = {} - - @classmethod - def __attrs__(cls): - return cls.__args__ + tuple(cls.__optional_kwargs__.keys()) - - def __init__(self, handle, relation, *args, **kwargs): - super().__init__(handle, relation) - - if not len(self.__args__) == len(args): - raise TypeError("expected {} args, got {}".format(len(self.__args__), len(args))) - - for attr, obj in zip(self.__args__, args): - setattr(self, attr, obj) - for attr, default in self.__optional_kwargs__.items(): - obj = kwargs.get(attr, default) - setattr(self, attr, obj) - - def snapshot(self) -> dict: - dct = super().snapshot() - for attr in self.__attrs__(): - obj = getattr(self, attr) - try: - dct[attr] = obj - except ValueError as e: - raise ValueError( - "cannot automagically serialize {}: " - "override this method and do it " - "manually.".format(obj) - ) from e - - return dct - - def restore(self, snapshot: dict) -> None: - super().restore(snapshot) - for attr, obj in snapshot.items(): - setattr(self, attr, obj) - - -class RelationNotFoundError(Exception): - """Raised if no relation with the given name is found.""" - - def __init__(self, relation_name: str): - self.relation_name = relation_name - self.message = "No relation named '{}' found".format(relation_name) - super().__init__(self.message) - - -class RelationInterfaceMismatchError(Exception): - """Raised if the relation with the given name has an unexpected interface.""" - - def __init__( - self, - relation_name: str, - expected_relation_interface: str, - actual_relation_interface: str, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_interface - self.actual_relation_interface = actual_relation_interface - self.message = ( - "The '{}' relation has '{}' as interface rather than the expected '{}'".format( - relation_name, actual_relation_interface, expected_relation_interface - ) - ) - - super().__init__(self.message) - - -class RelationRoleMismatchError(Exception): - """Raised if the relation with the given name has a different role than expected.""" - - def __init__( - self, - relation_name: str, - expected_relation_role: RelationRole, - actual_relation_role: RelationRole, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_role - self.actual_relation_role = actual_relation_role - self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( - relation_name, repr(actual_relation_role), repr(expected_relation_role) - ) - - super().__init__(self.message) - - -def _validate_relation_by_interface_and_direction( - charm: CharmBase, - relation_name: str, - expected_relation_interface: str, - expected_relation_role: RelationRole, -): - """Validate a relation. - - Verifies that the `relation_name` provided: (1) exists in metadata.yaml, - (2) declares as interface the interface name passed as `relation_interface` - and (3) has the right "direction", i.e., it is a relation that `charm` - provides or requires. - - Args: - charm: a `CharmBase` object to scan for the matching relation. - relation_name: the name of the relation to be verified. - expected_relation_interface: the interface name to be matched by the - relation named `relation_name`. - expected_relation_role: whether the `relation_name` must be either - provided or required by `charm`. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. 
- RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the same relation interface - as specified via the `expected_relation_interface` argument. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the same role as specified - via the `expected_relation_role` argument. - """ - if relation_name not in charm.meta.relations: - raise RelationNotFoundError(relation_name) - - relation = charm.meta.relations[relation_name] - - # fixme: why do we need to cast here? - actual_relation_interface = cast(str, relation.interface_name) - - if actual_relation_interface != expected_relation_interface: - raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface - ) - - if expected_relation_role is RelationRole.provides: - if relation_name not in charm.meta.provides: - raise RelationRoleMismatchError( - relation_name, RelationRole.provides, RelationRole.requires - ) - elif expected_relation_role is RelationRole.requires: - if relation_name not in charm.meta.requires: - raise RelationRoleMismatchError( - relation_name, RelationRole.requires, RelationRole.provides - ) - else: - raise TypeError("Unexpected RelationDirection: {}".format(expected_relation_role)) - - -class RequestEvent(RelationEvent): - """Event emitted when a remote requests a tracing endpoint.""" - - @property - def requested_receivers(self) -> List[ReceiverProtocol]: - """List of receiver protocols that have been requested.""" - relation = self.relation - app = relation.app - if not app: - raise NotReadyError("relation.app is None") - - return TracingRequirerAppData.load(relation.data[app]).receivers - - -class BrokenEvent(RelationBrokenEvent): - """Event emitted when a relation on tracing is broken.""" - - -class TracingEndpointProviderEvents(CharmEvents): - """TracingEndpointProvider events.""" - - request = EventSource(RequestEvent) - broken = EventSource(BrokenEvent) - - -class TracingEndpointProvider(Object): - """Class representing a trace receiver service.""" - - on = TracingEndpointProviderEvents() # type: ignore - - def __init__( - self, - charm: CharmBase, - external_url: Optional[str] = None, - relation_name: str = DEFAULT_RELATION_NAME, - ): - """Initialize. - - Args: - charm: a `CharmBase` instance that manages this instance of the Tempo service. - external_url: external address of the node hosting the tempo server, - if an ingress is present. - relation_name: an optional string name of the relation between `charm` - and the Tempo charmed service. The default is "tracing". - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `tracing` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.requires` - role. 
- """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides - ) - - super().__init__(charm, relation_name + "tracing-provider") - self._charm = charm - self._external_url = external_url - self._relation_name = relation_name - self.framework.observe( - self._charm.on[relation_name].relation_joined, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_created, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_changed, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_broken, self._on_relation_broken_event - ) - - def _on_relation_broken_event(self, e: RelationBrokenEvent): - """Handle relation broken events.""" - self.on.broken.emit(e.relation) - - def _on_relation_event(self, e: RelationEvent): - """Handle relation created/joined/changed events.""" - if self.is_requirer_ready(e.relation): - self.on.request.emit(e.relation) - - def is_requirer_ready(self, relation: Relation): - """Attempt to determine if requirer has already populated app data.""" - try: - self._get_requested_protocols(relation) - except NotReadyError: - return False - return True - - @staticmethod - def _get_requested_protocols(relation: Relation): - app = relation.app - if not app: - raise NotReadyError("relation.app is None") - - try: - databag = TracingRequirerAppData.load(relation.data[app]) - except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): - logger.info(f"relation {relation} is not ready to talk tracing") - raise NotReadyError() - return databag.receivers - - def requested_protocols(self): - """All receiver protocols that have been requested by our related apps.""" - requested_protocols = set() - for relation in self.relations: - try: - protocols = self._get_requested_protocols(relation) - except NotReadyError: - continue - requested_protocols.update(protocols) - return requested_protocols - - @property - def relations(self) -> List[Relation]: - """All relations active on this endpoint.""" - return self._charm.model.relations[self._relation_name] - - def publish_receivers(self, receivers: Sequence[RawReceiver]): - """Let all requirers know that these receivers are active and listening.""" - if not self._charm.unit.is_leader(): - raise RuntimeError("only leader can do this") - - for relation in self.relations: - try: - TracingProviderAppData( - receivers=[ - Receiver( - url=url, - protocol=ProtocolType( - name=protocol, - type=receiver_protocol_to_transport_protocol[protocol], - ), - ) - for protocol, url in receivers - ], - ).dump(relation.data[self._charm.app]) - - except ModelError as e: - # args are bytes - msg = e.args[0] - if isinstance(msg, bytes): - if msg.startswith( - b"ERROR cannot read relation application settings: permission denied" - ): - logger.error( - f"encountered error {e} while attempting to update_relation_data." - f"The relation must be gone." 
- ) - continue - raise - - -class EndpointRemovedEvent(RelationBrokenEvent): - """Event representing a change in one of the receiver endpoints.""" - - -class EndpointChangedEvent(_AutoSnapshotEvent): - """Event representing a change in one of the receiver endpoints.""" - - __args__ = ("_receivers",) - - if TYPE_CHECKING: - _receivers = [] # type: List[dict] - - @property - def receivers(self) -> List[Receiver]: - """Cast receivers back from dict.""" - return [Receiver(**i) for i in self._receivers] - - -class TracingEndpointRequirerEvents(CharmEvents): - """TracingEndpointRequirer events.""" - - endpoint_changed = EventSource(EndpointChangedEvent) - endpoint_removed = EventSource(EndpointRemovedEvent) - - -class TracingEndpointRequirer(Object): - """A tracing endpoint for Tempo.""" - - on = TracingEndpointRequirerEvents() # type: ignore - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - protocols: Optional[List[ReceiverProtocol]] = None, - ): - """Construct a tracing requirer for a Tempo charm. - - If your application supports pushing traces to a distributed tracing backend, the - `TracingEndpointRequirer` object enables your charm to easily access endpoint information - exchanged over a `tracing` relation interface. - - Args: - charm: a `CharmBase` object that manages this - `TracingEndpointRequirer` object. Typically, this is `self` in the instantiating - class. - relation_name: an optional string name of the relation between `charm` - and the Tempo charmed service. The default is "tracing". It is strongly - advised not to change the default, so that people deploying your charm will have a - consistent experience with all other charms that provide tracing endpoints. - protocols: optional list of protocols that the charm intends to send traces with. - The provider will enable receivers for these and only these protocols, - so be sure to enable all protocols the charm or its workload are going to need. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `tracing` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.provides` - role. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires - ) - - super().__init__(charm, relation_name) - - self._is_single_endpoint = charm.meta.relations[relation_name].limit == 1 - - self._charm = charm - self._relation_name = relation_name - - events = self._charm.on[self._relation_name] - self.framework.observe(events.relation_changed, self._on_tracing_relation_changed) - self.framework.observe(events.relation_broken, self._on_tracing_relation_broken) - - if protocols: - self.request_protocols(protocols) - - def request_protocols( - self, protocols: Sequence[ReceiverProtocol], relation: Optional[Relation] = None - ): - """Publish the list of protocols which the provider should activate.""" - # todo: should we check if _is_single_endpoint and len(self.relations) > 1 and raise, here? - relations = [relation] if relation else self.relations - - if not protocols: - # empty sequence - raise ValueError( - "You need to pass a nonempty sequence of protocols to `request_protocols`." 
- ) - - try: - if self._charm.unit.is_leader(): - for relation in relations: - TracingRequirerAppData( - receivers=list(protocols), - ).dump(relation.data[self._charm.app]) - - except ModelError as e: - # args are bytes - msg = e.args[0] - if isinstance(msg, bytes): - if msg.startswith( - b"ERROR cannot read relation application settings: permission denied" - ): - logger.error( - f"encountered error {e} while attempting to request_protocols." - f"The relation must be gone." - ) - return - raise - - @property - def relations(self) -> List[Relation]: - """The tracing relations associated with this endpoint.""" - return self._charm.model.relations[self._relation_name] - - @property - def _relation(self) -> Optional[Relation]: - """If this wraps a single endpoint, the relation bound to it, if any.""" - if not self._is_single_endpoint: - objname = type(self).__name__ - raise AmbiguousRelationUsageError( - f"This {objname} wraps a {self._relation_name} endpoint that has " - "limit != 1. We can't determine what relation, of the possibly many, you are " - f"talking about. Please pass a relation instance while calling {objname}, " - "or set limit=1 in the charm metadata." - ) - relations = self.relations - return relations[0] if relations else None - - def is_ready(self, relation: Optional[Relation] = None): - """Is this endpoint ready?""" - relation = relation or self._relation - if not relation: - logger.debug(f"no relation on {self._relation_name !r}: tracing not ready") - return False - if relation.data is None: - logger.error(f"relation data is None for {relation}") - return False - if not relation.app: - logger.error(f"{relation} event received but there is no relation.app") - return False - try: - databag = dict(relation.data[relation.app]) - TracingProviderAppData.load(databag) - - except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): - logger.info(f"failed validating relation data for {relation}") - return False - return True - - def _on_tracing_relation_changed(self, event): - """Notify the providers that there is new endpoint information available.""" - relation = event.relation - if not self.is_ready(relation): - self.on.endpoint_removed.emit(relation) # type: ignore - return - - data = TracingProviderAppData.load(relation.data[relation.app]) - self.on.endpoint_changed.emit(relation, [i.dict() for i in data.receivers]) # type: ignore - - def _on_tracing_relation_broken(self, event: RelationBrokenEvent): - """Notify the providers that the endpoint is broken.""" - relation = event.relation - self.on.endpoint_removed.emit(relation) # type: ignore - - def get_all_endpoints( - self, relation: Optional[Relation] = None - ) -> Optional[TracingProviderAppData]: - """Unmarshalled relation data.""" - relation = relation or self._relation - if not self.is_ready(relation): - return - return TracingProviderAppData.load(relation.data[relation.app]) # type: ignore - - def _get_endpoint( - self, relation: Optional[Relation], protocol: ReceiverProtocol - ) -> Optional[str]: - app_data = self.get_all_endpoints(relation) - if not app_data: - return None - receivers: List[Receiver] = list( - filter(lambda i: i.protocol.name == protocol, app_data.receivers) - ) - if not receivers: - logger.error(f"no receiver found with protocol={protocol!r}") - return - if len(receivers) > 1: - logger.error( - f"too many receivers with protocol={protocol!r}; using first one. 
Found: {receivers}" - ) - return - - receiver = receivers[0] - return receiver.url - - def get_endpoint( - self, protocol: ReceiverProtocol, relation: Optional[Relation] = None - ) -> Optional[str]: - """Receiver endpoint for the given protocol. - - It could happen that this function gets called before the provider publishes the endpoints. - In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to - restricted access. To prevent this, this function needs to be guarded by the `is_ready` check. - - Raises: - ProtocolNotRequestedError: - If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request. - """ - endpoint = self._get_endpoint(relation or self._relation, protocol=protocol) - if not endpoint: - requested_protocols = set() - relations = [relation] if relation else self.relations - for relation in relations: - try: - databag = TracingRequirerAppData.load(relation.data[self._charm.app]) - except DataValidationError: - continue - - requested_protocols.update(databag.receivers) - - if protocol not in requested_protocols: - raise ProtocolNotRequestedError(protocol, relation) - - return None - return endpoint - - -def charm_tracing_config( - endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]] -) -> Tuple[Optional[str], Optional[str]]: - """Return the charm_tracing config you likely want. - - If no endpoint is provided: - disable charm tracing. - If https endpoint is provided but cert_path is not found on disk: - disable charm tracing. - If https endpoint is provided and cert_path is None: - ERROR - Else: - proceed with charm tracing (with or without tls, as appropriate) - - Usage: - If you are using charm_tracing >= v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config - >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") - >>> class MyCharm(...): - >>> _cert_path = "/path/to/cert/on/charm/container.crt" - >>> def __init__(self, ...): - >>> self.tracing = TracingEndpointRequirer(...) - >>> self.my_endpoint, self.cert_path = charm_tracing_config( - ... self.tracing, self._cert_path) - - If you are using charm_tracing < v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config - >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") - >>> class MyCharm(...): - >>> _cert_path = "/path/to/cert/on/charm/container.crt" - >>> def __init__(self, ...): - >>> self.tracing = TracingEndpointRequirer(...) - >>> self._my_endpoint, self._cert_path = charm_tracing_config( - ... 
self.tracing, self._cert_path) - >>> @property - >>> def my_endpoint(self): - >>> return self._my_endpoint - >>> @property - >>> def cert_path(self): - >>> return self._cert_path - - """ - if not endpoint_requirer.is_ready(): - return None, None - - endpoint = endpoint_requirer.get_endpoint("otlp_http") - if not endpoint: - return None, None - - is_https = endpoint.startswith("https://") - - if is_https: - if cert_path is None: - raise TracingError("Cannot send traces to an https endpoint without a certificate.") - elif not Path(cert_path).exists(): - # if endpoint is https BUT we don't have a server_cert yet: - # disable charm tracing until we do to prevent tls errors - return None, None - return endpoint, str(cert_path) - else: - return endpoint, None +raise DeprecationWarning( + "this charm lib is deprecated; please use charms.tempo_coordinator_k8s.v0.tracing instead. " + "see https://charmhub.io/tempo-coordinator-k8s/libraries/tracing" +) diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py index da7fa95..141412b 100644 --- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py @@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 20 +LIBPATCH = 23 PYDEPS = ["cryptography", "jsonschema"] @@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ) else: try: + secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") logger.debug( "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" ) - secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") + # Juju < 3.6 will create a new revision even if the content is the same + if ( + secret.get_content(refresh=True).get("certificate", "") + == certificate.certificate + ): + logger.debug( + "Secret %s with correct certificate already exists", + f"{LIBID}-{csr_in_sha256_hex}", + ) + continue secret.set_content( {"certificate": certificate.certificate, "csr": certificate.csr} ) @@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None: provider_certificate = self._find_certificate_in_relation_data(csr) if not provider_certificate: # A secret expired but we did not find matching certificate. Cleaning up + logger.warning( + "Failed to find matching certificate for csr, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return if not provider_certificate.expiry_time: # A secret expired but matching certificate is invalid. Cleaning up + logger.warning( + "Certificate matching csr is invalid, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return @@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti return provider_certificate return None - def _get_csr_from_secret(self, secret: Secret) -> str: + def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]: """Extract the CSR from the secret label or content. 
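The `_on_relation_changed` hunk above guards against Juju < 3.6 minting a new secret revision on every `set_content()` call even when the payload is identical, by comparing the refreshed content first. A minimal sketch of that guard in isolation, assuming hypothetical names (`label`, `cert_pem`, `csr_pem`) and eliding the secret-creation path handled elsewhere in the library:

```python
import ops


def update_certificate_secret(model: ops.Model, label: str, cert_pem: str, csr_pem: str) -> None:
    """Write the certificate into a secret only if its content actually changed."""
    try:
        secret = model.get_secret(label=label)
    except ops.SecretNotFoundError:
        # the real library creates the secret in this branch; elided in this sketch
        return
    # Juju < 3.6 creates a new revision on every set_content(), even for identical
    # content, so compare the current (refreshed) content and skip no-op writes.
    if secret.get_content(refresh=True).get("certificate", "") == cert_pem:
        return
    secret.set_content({"certificate": cert_pem, "csr": csr_pem})
```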
This function is a workaround to maintain backwards compatibility and fix the issue reported in https://github.com/canonical/tls-certificates-interface/issues/228 """ - if not (csr := secret.get_content().get("csr", "")): + try: + content = secret.get_content(refresh=True) + except SecretNotFoundError: + return None + if not (csr := content.get("csr", None)): # In versions <14 of the Lib we were storing the CSR in the label of the secret # The CSR now is stored int the content of the secret, which was a breaking change # Here we get the CSR if the secret was created by an app using libpatch 14 or lower From 42950ed240449d69f32bfc797c26ddec970da491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 12 Nov 2024 18:41:52 -0300 Subject: [PATCH 5/6] Revert "update libs" This reverts commit 03ff3b8762cd3eb5406445e16ca55906e6f824a5. --- .../observability_libs/v1/cert_handler.py | 42 +- .../tempo_coordinator_k8s/v0/charm_tracing.py | 389 +------ .../tempo_coordinator_k8s/v0/tracing.py | 9 +- lib/charms/tempo_k8s/v1/charm_tracing.py | 759 ++++++++++++- lib/charms/tempo_k8s/v2/tracing.py | 1001 ++++++++++++++++- .../v3/tls_certificates.py | 30 +- 6 files changed, 1782 insertions(+), 448 deletions(-) diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py index 26be879..4a1940b 100644 --- a/lib/charms/observability_libs/v1/cert_handler.py +++ b/lib/charms/observability_libs/v1/cert_handler.py @@ -32,7 +32,6 @@ Since this library uses [Juju Secrets](https://juju.is/docs/juju/secret) it requires Juju >= 3.0.3. """ import abc -import hashlib import ipaddress import json import socket @@ -68,7 +67,7 @@ LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a" LIBAPI = 1 -LIBPATCH = 14 +LIBPATCH = 13 VAULT_SECRET_LABEL = "cert-handler-private-vault" @@ -302,11 +301,14 @@ def __init__( Must match metadata.yaml. cert_subject: Custom subject. Name collisions are under the caller's responsibility. sans: DNS names. If none are given, use FQDN. - refresh_events: [DEPRECATED]. + refresh_events: an optional list of bound events which + will be observed to replace the current CSR with a new one + if there are changes in the CSR's DNS SANs or IP SANs. + Then, subsequently, replace its corresponding certificate with a new one. """ super().__init__(charm, key) # use StoredState to store the hash of the CSR - # to potentially trigger a CSR renewal + # to potentially trigger a CSR renewal on `refresh_events` self._stored.set_default( csr_hash=None, ) @@ -318,9 +320,8 @@ def __init__( # Use fqdn only if no SANs were given, and drop empty/duplicate SANs sans = list(set(filter(None, (sans or [socket.getfqdn()])))) - # sort SANS lists to avoid unnecessary csr renewals during reconciliation - self.sans_ip = sorted(filter(is_ip_address, sans)) - self.sans_dns = sorted(filterfalse(is_ip_address, sans)) + self.sans_ip = list(filter(is_ip_address, sans)) + self.sans_dns = list(filterfalse(is_ip_address, sans)) if self._check_juju_supports_secrets(): vault_backend = _SecretVaultBackend(charm, secret_label=VAULT_SECRET_LABEL) @@ -366,15 +367,13 @@ def __init__( ) if refresh_events: - logger.warn( - "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary." 
- ) - - self._reconcile() + for ev in refresh_events: + self.framework.observe(ev, self._on_refresh_event) - def _reconcile(self): - """Run all logic that is independent of what event we're processing.""" - self._refresh_csr_if_needed() + def _on_refresh_event(self, _): + """Replace the latest current CSR with a new one if there are any SANs changes.""" + if self._stored.csr_hash != self._csr_hash: + self._generate_csr(renew=True) def _on_upgrade_charm(self, _): has_privkey = self.vault.get_value("private-key") @@ -389,11 +388,6 @@ def _on_upgrade_charm(self, _): # this will call `self.private_key` which will generate a new privkey. self._generate_csr(renew=True) - def _refresh_csr_if_needed(self): - """Refresh the current CSR with a new one if there are any SANs changes.""" - if self._stored.csr_hash is not None and self._stored.csr_hash != self._csr_hash: - self._generate_csr(renew=True) - def _migrate_vault(self): peer_backend = _RelationVaultBackend(self.charm, relation_name="peers") @@ -446,17 +440,13 @@ def enabled(self) -> bool: return True @property - def _csr_hash(self) -> str: + def _csr_hash(self) -> int: """A hash of the config that constructs the CSR. Only include here the config options that, should they change, should trigger a renewal of the CSR. """ - - def _stable_hash(data): - return hashlib.sha256(str(data).encode()).hexdigest() - - return _stable_hash( + return hash( ( tuple(self.sans_dns), tuple(self.sans_ip), diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py index cf8def1..1e7ff84 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py @@ -69,9 +69,6 @@ def my_tracing_endpoint(self) -> Optional[str]: - every event as a span (including custom events) - every charm method call (except dunders) as a span -We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests -go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. - ## TLS support If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), @@ -117,57 +114,6 @@ def get_tracer(self) -> opentelemetry.trace.Tracer: See the official opentelemetry Python SDK documentation for usage: https://opentelemetry-python.readthedocs.io/en/latest/ - -## Caching traces -The `trace_charm` machinery will buffer any traces collected during charm execution and store them -to a file on the charm container until a tracing backend becomes available. At that point, it will -flush them to the tracing receiver. - -By default, the buffer is configured to start dropping old traces if any of these conditions apply: - -- the storage size exceeds 10 MiB -- the number of buffered events exceeds 100 - -You can configure this by, for example: - -```python -@trace_charm( - tracing_endpoint="my_tracing_endpoint", - server_cert="_server_cert", - # only cache up to 42 events - buffer_max_events=42, - # only cache up to 42 MiB - buffer_max_size_mib=42, # minimum 10! -) -class MyCharm(CharmBase): - ... -``` - -Note that setting `buffer_max_events` to 0 will effectively disable the buffer. - -The path of the buffer file is by default in the charm's execution root, which for k8s charms means -that in case of pod churn, the cache will be lost. 
The recommended solution is to use an existing storage -(or add a new one) such as: - -```yaml -storage: - data: - type: filesystem - location: /charm-traces -``` - -and then configure the `@trace_charm` decorator to use it as path for storing the buffer: -```python -@trace_charm( - tracing_endpoint="my_tracing_endpoint", - server_cert="_server_cert", - # store traces to a PVC so they're not lost on pod restart. - buffer_path="/charm-traces/buffer.file", -) -class MyCharm(CharmBase): - ... -``` - ## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already @@ -225,12 +171,6 @@ def my_tracing_endpoint(self) -> Optional[str]: 3) If you were passing a certificate (str) using `server_cert`, you need to change it to provide an *absolute* path to the certificate file instead. """ -import typing - -from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import ( - encode_spans, -) -from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter def _remove_stale_otel_sdk_packages(): @@ -282,9 +222,6 @@ def _remove_stale_otel_sdk_packages(): otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") -# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. -# it could be trouble if someone ever decides to implement their own tracer parallel to -# ours and before the charm has inited. We assume they won't. _remove_stale_otel_sdk_packages() import functools @@ -298,7 +235,6 @@ def _remove_stale_otel_sdk_packages(): Any, Callable, Generator, - List, Optional, Sequence, Type, @@ -311,12 +247,8 @@ def _remove_stale_otel_sdk_packages(): import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider -from opentelemetry.sdk.trace.export import ( - BatchSpanProcessor, - SpanExporter, - SpanExportResult, -) +from opentelemetry.sdk.trace import Span, TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.trace import INVALID_SPAN, Tracer from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( @@ -337,7 +269,7 @@ def _remove_stale_otel_sdk_packages(): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 2 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -345,7 +277,7 @@ def _remove_stale_otel_sdk_packages(): dev_logger = logging.getLogger("tracing-dev") # set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.ERROR) +dev_logger.setLevel(logging.CRITICAL) _CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof _C = TypeVar("_C", bound=_CharmType) @@ -355,186 +287,6 @@ def _remove_stale_otel_sdk_packages(): _GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" -BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw" -# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in -# any portable format. Json dumping is supported, but loading isn't. 
-# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003 - -BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10 -_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10 -BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100 -_MiB_TO_B = 2**20 # megabyte to byte conversion rate -_OTLP_SPAN_EXPORTER_TIMEOUT = 1 -"""Timeout in seconds that the OTLP span exporter has to push traces to the backend.""" - - -class _Buffer: - """Handles buffering for spans emitted while no tracing backend is configured or available. - - Use the max_event_history_length_buffering param of @trace_charm to tune - the amount of memory that this will hog on your units. - - The buffer is formatted as a bespoke byte dump (protobuf limitation). - We cannot store them as json because that is not well-supported by the sdk - (see https://github.com/open-telemetry/opentelemetry-python/issues/3364). - """ - - _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__" - - def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int): - self._db_file = db_file - self._max_event_history_length = max_event_history_length - self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN) - - # set by caller - self.exporter: Optional[OTLPSpanExporter] = None - - def save(self, spans: typing.Sequence[ReadableSpan]): - """Save the spans collected by this exporter to the cache file. - - This method should be as fail-safe as possible. - """ - if self._max_event_history_length < 1: - dev_logger.debug("buffer disabled: max history length < 1") - return - - current_history_length = len(self.load()) - new_history_length = current_history_length + len(spans) - if (diff := self._max_event_history_length - new_history_length) < 0: - self.drop(diff) - self._save(spans) - - def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes: - # encode because otherwise we can't json-dump them - return encode_spans(spans).SerializeToString() - - def _save(self, spans: Sequence[ReadableSpan], replace: bool = False): - dev_logger.debug(f"saving {len(spans)} new spans to buffer") - old = [] if replace else self.load() - new = self._serialize(spans) - - try: - # if the buffer exceeds the size limit, we start dropping old spans until it does - - while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B): - if not old: - # if we've already dropped all spans and still we can't get under the - # size limit, we can't save this span - logger.error( - f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); " - f"buffering FAILED" - ) - return - - old = old[1:] - logger.warning( - f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... " - f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed." - ) - - self._db_file.write_bytes(new + self._SPANSEP.join(old)) - except Exception: - logger.exception("error buffering spans") - - def load(self) -> List[bytes]: - """Load currently buffered spans from the cache file. - - This method should be as fail-safe as possible. - """ - if not self._db_file.exists(): - dev_logger.debug("buffer file not found. 
buffer empty.") - return [] - try: - spans = self._db_file.read_bytes().split(self._SPANSEP) - except Exception: - logger.exception(f"error parsing {self._db_file}") - return [] - return spans - - def drop(self, n_spans: Optional[int] = None): - """Drop some currently buffered spans from the cache file.""" - current = self.load() - if n_spans: - dev_logger.debug(f"dropping {n_spans} spans from buffer") - new = current[n_spans:] - else: - dev_logger.debug("emptying buffer") - new = [] - - self._db_file.write_bytes(self._SPANSEP.join(new)) - - def flush(self) -> Optional[bool]: - """Export all buffered spans to the given exporter, then clear the buffer. - - Returns whether the flush was successful, and None if there was nothing to flush. - """ - if not self.exporter: - dev_logger.debug("no exporter set; skipping buffer flush") - return False - - buffered_spans = self.load() - if not buffered_spans: - dev_logger.debug("nothing to flush; buffer empty") - return None - - errors = False - for span in buffered_spans: - try: - out = self.exporter._export(span) # type: ignore - if not (200 <= out.status_code < 300): - # take any 2xx status code as a success - errors = True - except ConnectionError: - dev_logger.debug( - "failed exporting buffered span; backend might be down or still starting" - ) - errors = True - except Exception: - logger.exception("unexpected error while flushing span batch from buffer") - errors = True - - if not errors: - self.drop() - else: - logger.error("failed flushing spans; buffer preserved") - return not errors - - @property - def is_empty(self): - """Utility to check whether the buffer has any stored spans. - - This is more efficient than attempting a load() given how large the buffer might be. - """ - return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0) - - -class _OTLPSpanExporter(OTLPSpanExporter): - """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster.""" - - # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured, - # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't - # flush the traces and keeps retrying for 'too long' - - _MAX_RETRY_TIMEOUT = 4 - # we give the exporter 4 seconds in total to succeed pushing the traces to tempo - # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk. - # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait - - -class _BufferedExporter(InMemorySpanExporter): - def __init__(self, buffer: _Buffer) -> None: - super().__init__() - self._buffer = buffer - - def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: - self._buffer.save(spans) - return super().export(spans) - - def force_flush(self, timeout_millis: int = 0) -> bool: - # parent implementation is fake, so the timeout_millis arg is not doing anything. 
- result = super().force_flush(timeout_millis) - self._buffer.save(self.get_finished_spans()) - return result def is_enabled() -> bool: @@ -671,10 +423,7 @@ def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, server_cert_attr: Optional[str], - service_name: Optional[str], - buffer_path: Optional[Path], - buffer_max_events: int, - buffer_max_size_mib: int, + service_name: Optional[str] = None, ): """Patch the charm's initializer.""" original_init = charm_type.__init__ @@ -693,11 +442,18 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): logger.info("Tracing DISABLED: skipping root span initialization") return + # already init some attrs that will be reinited later by calling original_init: + # self.framework = framework + # self.handle = Handle(None, self.handle_kind, None) + original_event_context = framework._event_context # default service name isn't just app name because it could conflict with the workload service name _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name + # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. + # it could be trouble if someone ever decides to implement their own tracer parallel to + # ours and before the charm has inited. We assume they won't. resource = Resource.create( attributes={ "service.name": _service_name, @@ -715,60 +471,33 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) - buffer_only = False - # whether we're only exporting to buffer, or also to the otlp exporter. - if not tracing_endpoint: # tracing is off if tracing_endpoint is None - # however we can buffer things until tracing comes online - buffer_only = True + return server_cert: Optional[Union[str, Path]] = ( _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) - if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert: + if tracing_endpoint.startswith("https://") and not server_cert: logger.error( "Tracing endpoint is https, but no server_cert has been passed." "Please point @trace_charm to a `server_cert` attr. " "This might also mean that the tracing provider is related to a " "certificates provider, but this application is not (yet). " "In that case, you might just have to wait a bit for the certificates " - "integration to settle. This span will be buffered." + "integration to settle. 
" ) - buffer_only = True + return - buffer = _Buffer( - db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME, - max_event_history_length=buffer_max_events, - max_buffer_size_mib=buffer_max_size_mib, + exporter = OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=2, ) - previous_spans_buffered = not buffer.is_empty - - exporters: List[SpanExporter] = [] - if buffer_only: - # we have to buffer because we're missing necessary backend configuration - dev_logger.debug("buffering mode: ON") - exporters.append(_BufferedExporter(buffer)) - - else: - dev_logger.debug("buffering mode: FALLBACK") - # in principle, we have the right configuration to be pushing traces, - # but if we fail for whatever reason, we will put everything in the buffer - # and retry the next time - otlp_exporter = _OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=_OTLP_SPAN_EXPORTER_TIMEOUT, # give individual requests 1 second to succeed - ) - exporters.append(otlp_exporter) - exporters.append(_BufferedExporter(buffer)) - buffer.exporter = otlp_exporter - - for exporter in exporters: - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) set_tracer_provider(provider) _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) @@ -792,7 +521,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): - dev_logger.debug(f"entering event context: {event_name}") + dev_logger.info(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -806,50 +535,12 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): - dev_logger.debug("tearing down tracer and flushing traces") + dev_logger.info("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) tp = cast(TracerProvider, get_tracer_provider()) - flush_successful = tp.force_flush(timeout_millis=1000) # don't block for too long - - if buffer_only: - # if we're in buffer_only mode, it means we couldn't even set up the exporter for - # tempo as we're missing some data. - # so attempting to flush the buffer doesn't make sense - dev_logger.debug("tracing backend unavailable: all spans pushed to buffer") - - else: - dev_logger.debug("tracing backend found: attempting to flush buffer...") - - # if we do have an exporter for tempo, and we could send traces to it, - # we can attempt to flush the buffer as well. 
- if not flush_successful: - logger.error("flushing FAILED: unable to push traces to backend.") - else: - dev_logger.debug("flush succeeded.") - - # the backend has accepted the spans generated during this event, - if not previous_spans_buffered: - # if the buffer was empty to begin with, any spans we collected now can be discarded - buffer.drop() - dev_logger.debug("buffer dropped: this trace has been sent already") - else: - # if the buffer was nonempty, we can attempt to flush it - dev_logger.debug("attempting buffer flush...") - buffer_flush_successful = buffer.flush() - if buffer_flush_successful: - dev_logger.debug("buffer flush OK") - elif buffer_flush_successful is None: - # TODO is this even possible? - dev_logger.debug("buffer flush OK; empty: nothing to flush") - else: - # this situation is pretty weird, I'm not even sure it can happen, - # because it would mean that we did manage - # to push traces directly to the tempo exporter (flush_successful), - # but the buffer flush failed to push to the same exporter! - logger.error("buffer flush FAILED") - + tp.force_flush(timeout_millis=1000) # don't block for too long tp.shutdown() original_close() @@ -864,9 +555,6 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), - buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, - buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, - buffer_path: Optional[Union[str, Path]] = None, ) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. @@ -908,10 +596,6 @@ def trace_charm( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... - :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. - :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. - Minimum 10MiB. - :param buffer_path: path to buffer file to use for saving buffered spans. """ def _decorator(charm_type: _T) -> _T: @@ -922,9 +606,6 @@ def _decorator(charm_type: _T) -> _T: server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, - buffer_path=Path(buffer_path) if buffer_path else None, - buffer_max_size_mib=buffer_max_size_mib, - buffer_max_events=buffer_max_events, ) return charm_type @@ -937,9 +618,6 @@ def _autoinstrument( server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), - buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, - buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, - buffer_path: Optional[Path] = None, ) -> _T: """Set up tracing on this charm class. @@ -972,20 +650,13 @@ def _autoinstrument( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... - :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. - :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. - Minimum 10MiB. - :param buffer_path: path to buffer file to use for saving buffered spans. 
""" - dev_logger.debug(f"instrumenting {charm_type}") + dev_logger.info(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, tracing_endpoint_attr, server_cert_attr=server_cert_attr, service_name=service_name, - buffer_path=buffer_path, - buffer_max_events=buffer_max_events, - buffer_max_size_mib=buffer_max_size_mib, ) trace_type(charm_type) for type_ in extra_types: @@ -1001,12 +672,12 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. """ - dev_logger.debug(f"instrumenting {cls}") + dev_logger.info(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.debug(f"discovered {method}") + dev_logger.info(f"discovered {method}") if method.__name__.startswith("__"): - dev_logger.debug(f"skipping {method} (dunder)") + dev_logger.info(f"skipping {method} (dunder)") continue # the span title in the general case should be: @@ -1052,7 +723,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F: def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.debug(f"instrumenting {callable}") + dev_logger.info(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable) diff --git a/lib/charms/tempo_coordinator_k8s/v0/tracing.py b/lib/charms/tempo_coordinator_k8s/v0/tracing.py index 2035dff..1f92867 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/tracing.py @@ -34,7 +34,7 @@ def __init__(self, *args): `TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. Using this method also allows you to use per-relation protocols. -Units of requirer charms obtain the tempo endpoint to which they will push their traces by calling +Units of provider charms obtain the tempo endpoint to which they will push their traces by calling `TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: - `otlp_grpc` - `otlp_http` @@ -44,10 +44,7 @@ def __init__(self, *args): If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, the library will raise an error. -We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests -go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. - -## Provider Library Usage +## Requirer Library Usage The `TracingEndpointProvider` object may be used by charms to manage relations with their trace sources. For this purposes a Tempo-like charm needs to do two things @@ -110,7 +107,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 2 PYDEPS = ["pydantic"] diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index cfb2dbe..2dbdddd 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -2,28 +2,759 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -"""This charm library has been transferred to the HA version of this charm. +"""This charm library contains utilities to instrument your Charm with opentelemetry tracing data collection. 
-The new owner is the `tempo-coordinator-k8s` charm: -- [github](https://github.com/canonical/tempo-coordinator-k8s/) -- [charmhub](https://charmhub.io/tempo-coordinator-k8s/) +(yes! charm code, not workload code!) -The new library (with its major version reset to 0) can be found at +This means that, if your charm is related to, for example, COS' Tempo charm, you will be able to inspect +in real time from the Grafana dashboard the execution flow of your charm. -https://charmhub.io/tempo-coordinator-k8s/libraries/charm_tracing +# Quickstart +Fetch the following charm libs (and ensure the minimum version/revision numbers are satisfied): -to install it: + charmcraft fetch-lib charms.tempo_k8s.v2.tracing # >= 1.10 + charmcraft fetch-lib charms.tempo_k8s.v1.charm_tracing # >= 2.7 -> charmcraft fetch-lib charms.tempo_coordinator_k8s.v0.charm_tracing +Then edit your charm code to include: -The API is unchanged, so you can search and replace the path to swap the old lib with the new one. +```python +# import the necessary charm libs +from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config +from charms.tempo_k8s.v1.charm_tracing import charm_tracing + +# decorate your charm class with charm_tracing: +@charm_tracing( + # forward-declare the instance attributes that the instrumentor will look up to obtain the + # tempo endpoint and server certificate + tracing_endpoint="tracing_endpoint", + server_cert="server_cert" +) +class MyCharm(CharmBase): + _path_to_cert = "/path/to/cert.crt" + # path to cert file **in the charm container**. Its presence will be used to determine whether + # the charm is ready to use tls for encrypting charm traces. If your charm does not support tls, + # you can ignore this and pass None to charm_tracing_config. + # If you do support TLS, you'll need to make sure that the server cert is copied to this location + # and kept up to date so the instrumentor can use it. + + def __init__(self, ...): + ... + self.tracing = TracingEndpointRequirer(self, ...) + self.tracing_endpoint, self.server_cert = charm_tracing_config(self.tracing, self._path_to_cert) +``` + +# Detailed usage +To use this library, you need to do two things: +1) decorate your charm class with + +`@trace_charm(tracing_endpoint="my_tracing_endpoint")` + +2) add to your charm a "my_tracing_endpoint" (you can name this attribute whatever you like) +**property**, **method** or **instance attribute** that returns an otlp http/https endpoint url. +If you are using the ``charms.tempo_k8s.v2.tracing.TracingEndpointRequirer`` as +``self.tracing = TracingEndpointRequirer(self)``, the implementation could be: + +``` + @property + def my_tracing_endpoint(self) -> Optional[str]: + '''Tempo endpoint for charm tracing''' + if self.tracing.is_ready(): + return self.tracing.get_endpoint("otlp_http") + else: + return None +``` + +At this point your charm will be automatically instrumented so that: +- charm execution starts a trace, containing + - every event as a span (including custom events) + - every charm method call (except dunders) as a span + + +## TLS support +If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), +you can configure ``charm_tracing`` to use TLS by passing a ``server_cert`` parameter to the decorator. + +If your charm is not trusting the same CA as the Tempo endpoint it is sending traces to, +you'll need to implement a cert-transfer relation to obtain the CA certificate from the same +CA that Tempo is using. 
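+(Typically this means a certificate-transfer style integration whose handler writes the received CA
+certificate to the path that your ``server_cert`` attribute points at; the exact relation used will
+depend on your charm.)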
+ +For example: +``` +from charms.tempo_k8s.v1.charm_tracing import trace_charm +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert" +) +class MyCharm(CharmBase): + self._server_cert = "/path/to/server.crt" + ... + + def on_tls_changed(self, e) -> Optional[str]: + # update the server cert on the charm container for charm tracing + Path(self._server_cert).write_text(self.get_server_cert()) + + def on_tls_broken(self, e) -> Optional[str]: + # remove the server cert so charm_tracing won't try to use tls anymore + Path(self._server_cert).unlink() +``` + + +## More fine-grained manual instrumentation +if you wish to add more spans to the trace, you can do so by getting a hold of the tracer like so: +``` +import opentelemetry +... +def get_tracer(self) -> opentelemetry.trace.Tracer: + return opentelemetry.trace.get_tracer(type(self).__name__) +``` + +By default, the tracer is named after the charm type. If you wish to override that, you can pass +a different ``service_name`` argument to ``trace_charm``. + +See the official opentelemetry Python SDK documentation for usage: +https://opentelemetry-python.readthedocs.io/en/latest/ + +## Upgrading from `v0` + +If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already +have the newest version of the library in your charm): +1) If you need the dependency for your tests, add the following dependency to your charm project +(or, if your project had a dependency on `opentelemetry-exporter-otlp-proto-grpc` only because +of `charm_tracing` v0, you can replace it with): + +`opentelemetry-exporter-otlp-proto-http>=1.21.0`. + +2) Update the charm method referenced to from ``@trace`` and ``@trace_charm``, +to return from ``TracingEndpointRequirer.get_endpoint("otlp_http")`` instead of ``grpc_http``. +For example: + +``` + from charms.tempo_k8s.v0.charm_tracing import trace_charm + + @trace_charm( + tracing_endpoint="my_tracing_endpoint", + ) + class MyCharm(CharmBase): + + ... + + @property + def my_tracing_endpoint(self) -> Optional[str]: + '''Tempo endpoint for charm tracing''' + if self.tracing.is_ready(): + return self.tracing.otlp_grpc_endpoint() # OLD API, DEPRECATED. + else: + return None +``` + +needs to be replaced with: + +``` + from charms.tempo_k8s.v1.charm_tracing import trace_charm + + @trace_charm( + tracing_endpoint="my_tracing_endpoint", + ) + class MyCharm(CharmBase): + + ... + + @property + def my_tracing_endpoint(self) -> Optional[str]: + '''Tempo endpoint for charm tracing''' + if self.tracing.is_ready(): + return self.tracing.get_endpoint("otlp_http") # NEW API, use this. + else: + return None +``` + +3) If you were passing a certificate (str) using `server_cert`, you need to change it to +provide an *absolute* path to the certificate file instead. """ + +def _remove_stale_otel_sdk_packages(): + """Hack to remove stale opentelemetry sdk packages from the charm's python venv. + + See https://github.com/canonical/grafana-agent-operator/issues/146 and + https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after + this juju issue is resolved and sufficient time has passed to expect most users of this library + have migrated to the patched version of juju. When this patch is removed, un-ignore rule E402 for this file in the pyproject.toml (see setting + [tool.ruff.lint.per-file-ignores] in pyproject.toml). + + This only has an effect if executed on an upgrade-charm event. 
+ """ + # all imports are local to keep this function standalone, side-effect-free, and easy to revert later + import os + + if os.getenv("JUJU_DISPATCH_PATH") != "hooks/upgrade-charm": + return + + import logging + import shutil + from collections import defaultdict + + from importlib_metadata import distributions + + otel_logger = logging.getLogger("charm_tracing_otel_patcher") + otel_logger.debug("Applying _remove_stale_otel_sdk_packages patch on charm upgrade") + # group by name all distributions starting with "opentelemetry_" + otel_distributions = defaultdict(list) + for distribution in distributions(): + name = distribution._normalized_name # type: ignore + if name.startswith("opentelemetry_"): + otel_distributions[name].append(distribution) + + otel_logger.debug(f"Found {len(otel_distributions)} opentelemetry distributions") + + # If we have multiple distributions with the same name, remove any that have 0 associated files + for name, distributions_ in otel_distributions.items(): + if len(distributions_) <= 1: + continue + + otel_logger.debug(f"Package {name} has multiple ({len(distributions_)}) distributions.") + for distribution in distributions_: + if not distribution.files: # Not None or empty list + path = distribution._path # type: ignore + otel_logger.info(f"Removing empty distribution of {name} at {path}.") + shutil.rmtree(path) + + otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") + + +_remove_stale_otel_sdk_packages() + +import functools +import inspect +import logging +import os +from contextlib import contextmanager +from contextvars import Context, ContextVar, copy_context +from pathlib import Path +from typing import ( + Any, + Callable, + Generator, + Optional, + Sequence, + Type, + TypeVar, + Union, + cast, +) + +import opentelemetry +import ops +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import Span, TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.trace import ( + INVALID_SPAN, + Tracer, + get_tracer, + get_tracer_provider, + set_span_in_context, + set_tracer_provider, +) +from opentelemetry.trace import get_current_span as otlp_get_current_span +from ops.charm import CharmBase +from ops.framework import Framework + +# The unique Charmhub library identifier, never change it LIBID = "cb1705dcd1a14ca09b2e60187d1215c7" + +# Increment this major API version when introducing breaking changes LIBAPI = 1 -LIBPATCH = 17 -raise DeprecationWarning( - "this charm lib is deprecated; please use charms.tempo_coordinator_k8s.v0.charm_tracing instead. 
" - "see https://charmhub.io/tempo-coordinator-k8s/libraries/charm_tracing" -) +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version + +LIBPATCH = 15 + +PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] + +logger = logging.getLogger("tracing") +dev_logger = logging.getLogger("tracing-dev") + +# set this to 0 if you are debugging/developing this library source +dev_logger.setLevel(logging.CRITICAL) + +_CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof +_C = TypeVar("_C", bound=_CharmType) +_T = TypeVar("_T", bound=type) +_F = TypeVar("_F", bound=Type[Callable]) +tracer: ContextVar[Tracer] = ContextVar("tracer") +_GetterType = Union[Callable[[_CharmType], Optional[str]], property] + +CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" + + +def is_enabled() -> bool: + """Whether charm tracing is enabled.""" + return os.getenv(CHARM_TRACING_ENABLED, "1") == "1" + + +@contextmanager +def charm_tracing_disabled(): + """Contextmanager to temporarily disable charm tracing. + + For usage in tests. + """ + previous = os.getenv(CHARM_TRACING_ENABLED, "1") + os.environ[CHARM_TRACING_ENABLED] = "0" + yield + os.environ[CHARM_TRACING_ENABLED] = previous + + +def get_current_span() -> Union[Span, None]: + """Return the currently active Span, if there is one, else None. + + If you'd rather keep your logic unconditional, you can use opentelemetry.trace.get_current_span, + which will return an object that behaves like a span but records no data. + """ + span = otlp_get_current_span() + if span is INVALID_SPAN: + return None + return cast(Span, span) + + +def _get_tracer_from_context(ctx: Context) -> Optional[ContextVar]: + tracers = [v for v in ctx if v is not None and v.name == "tracer"] + if tracers: + return tracers[0] + return None + + +def _get_tracer() -> Optional[Tracer]: + """Find tracer in context variable and as a fallback locate it in the full context.""" + try: + return tracer.get() + except LookupError: + # fallback: this course-corrects for a user error where charm_tracing symbols are imported + # from different paths (typically charms.tempo_k8s... and lib.charms.tempo_k8s...) + try: + ctx: Context = copy_context() + if context_tracer := _get_tracer_from_context(ctx): + logger.warning( + "Tracer not found in `tracer` context var. " + "Verify that you're importing all `charm_tracing` symbols from the same module path. \n" + "For example, DO" + ": `from charms.lib...charm_tracing import foo, bar`. 
\n" + "DONT: \n" + " \t - `from charms.lib...charm_tracing import foo` \n" + " \t - `from lib...charm_tracing import bar` \n" + "For more info: https://python-notes.curiousefficiency.org/en/latest/python" + "_concepts/import_traps.html#the-double-import-trap" + ) + return context_tracer.get() + else: + return None + except LookupError: + return None + + +@contextmanager +def _span(name: str) -> Generator[Optional[Span], Any, Any]: + """Context to create a span if there is a tracer, otherwise do nothing.""" + if tracer := _get_tracer(): + with tracer.start_as_current_span(name) as span: + yield cast(Span, span) + else: + yield None + + +class TracingError(RuntimeError): + """Base class for errors raised by this module.""" + + +class UntraceableObjectError(TracingError): + """Raised when an object you're attempting to instrument cannot be autoinstrumented.""" + + +class TLSError(TracingError): + """Raised when the tracing endpoint is https but we don't have a cert yet.""" + + +def _get_tracing_endpoint( + tracing_endpoint_attr: str, + charm_instance: object, + charm_type: type, +): + _tracing_endpoint = getattr(charm_instance, tracing_endpoint_attr) + if callable(_tracing_endpoint): + tracing_endpoint = _tracing_endpoint() + else: + tracing_endpoint = _tracing_endpoint + + if tracing_endpoint is None: + return + + elif not isinstance(tracing_endpoint, str): + raise TypeError( + f"{charm_type.__name__}.{tracing_endpoint_attr} should resolve to a tempo endpoint (string); " + f"got {tracing_endpoint} instead." + ) + + dev_logger.debug(f"Setting up span exporter to endpoint: {tracing_endpoint}/v1/traces") + return f"{tracing_endpoint}/v1/traces" + + +def _get_server_cert( + server_cert_attr: str, + charm_instance: ops.CharmBase, + charm_type: Type[ops.CharmBase], +): + _server_cert = getattr(charm_instance, server_cert_attr) + if callable(_server_cert): + server_cert = _server_cert() + else: + server_cert = _server_cert + + if server_cert is None: + logger.warning( + f"{charm_type}.{server_cert_attr} is None; sending traces over INSECURE connection." + ) + return + elif not Path(server_cert).is_absolute(): + raise ValueError( + f"{charm_type}.{server_cert_attr} should resolve to a valid tls cert absolute path (string | Path)); " + f"got {server_cert} instead." + ) + return server_cert + + +def _setup_root_span_initializer( + charm_type: _CharmType, + tracing_endpoint_attr: str, + server_cert_attr: Optional[str], + service_name: Optional[str] = None, +): + """Patch the charm's initializer.""" + original_init = charm_type.__init__ + + @functools.wraps(original_init) + def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): + # we're using 'self' here because this is charm init code, makes sense to read what's below + # from the perspective of the charm. Self.unit.name... + + original_init(self, framework, *args, **kwargs) + # we call this from inside the init context instead of, say, _autoinstrument, because we want it to + # be checked on a per-charm-instantiation basis, not on a per-type-declaration one. 
+ if not is_enabled(): + # this will only happen during unittesting, hopefully, so it's fine to log a + # bit more verbosely + logger.info("Tracing DISABLED: skipping root span initialization") + return + + # already init some attrs that will be reinited later by calling original_init: + # self.framework = framework + # self.handle = Handle(None, self.handle_kind, None) + + original_event_context = framework._event_context + # default service name isn't just app name because it could conflict with the workload service name + _service_name = service_name or f"{self.app.name}-charm" + + unit_name = self.unit.name + # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. + # it could be trouble if someone ever decides to implement their own tracer parallel to + # ours and before the charm has inited. We assume they won't. + resource = Resource.create( + attributes={ + "service.name": _service_name, + "compose_service": _service_name, + "charm_type": type(self).__name__, + # juju topology + "juju_unit": unit_name, + "juju_application": self.app.name, + "juju_model": self.model.name, + "juju_model_uuid": self.model.uuid, + } + ) + provider = TracerProvider(resource=resource) + + # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. + tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) + + if not tracing_endpoint: + # tracing is off if tracing_endpoint is None + return + + server_cert: Optional[Union[str, Path]] = ( + _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None + ) + + if tracing_endpoint.startswith("https://") and not server_cert: + raise TLSError( + "Tracing endpoint is https, but no server_cert has been passed." + "Please point @trace_charm to a `server_cert` attr." + ) + + exporter = OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=2, + ) + + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) + set_tracer_provider(provider) + _tracer = get_tracer(_service_name) # type: ignore + _tracer_token = tracer.set(_tracer) + + dispatch_path = os.getenv("JUJU_DISPATCH_PATH", "") # something like hooks/install + event_name = dispatch_path.split("/")[1] if "/" in dispatch_path else dispatch_path + root_span_name = f"{unit_name}: {event_name} event" + span = _tracer.start_span(root_span_name, attributes={"juju.dispatch_path": dispatch_path}) + + # all these shenanigans are to work around the fact that the opentelemetry tracing API is built + # on the assumption that spans will be used as contextmanagers. + # Since we don't (as we need to close the span on framework.commit), + # we need to manually set the root span as current. + ctx = set_span_in_context(span) + + # log a trace id, so we can pick it up from the logs (and jhack) to look it up in tempo. + root_trace_id = hex(span.get_span_context().trace_id)[2:] # strip 0x prefix + logger.debug(f"Starting root trace with id={root_trace_id!r}.") + + span_token = opentelemetry.context.attach(ctx) # type: ignore + + @contextmanager + def wrap_event_context(event_name: str): + dev_logger.info(f"entering event context: {event_name}") + # when the framework enters an event context, we create a span. 
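+            # (this span is opened while the root span set up above is still attached to the
+            # context, so it shows up as a child of "<unit>: <event> event" in the resulting trace)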
+ with _span("event: " + event_name) as event_context_span: + if event_context_span: + # todo: figure out how to inject event attrs in here + event_context_span.add_event(event_name) + yield original_event_context(event_name) + + framework._event_context = wrap_event_context # type: ignore + + original_close = framework.close + + @functools.wraps(original_close) + def wrap_close(): + dev_logger.info("tearing down tracer and flushing traces") + span.end() + opentelemetry.context.detach(span_token) # type: ignore + tracer.reset(_tracer_token) + tp = cast(TracerProvider, get_tracer_provider()) + tp.force_flush(timeout_millis=1000) # don't block for too long + tp.shutdown() + original_close() + + framework.close = wrap_close + return + + charm_type.__init__ = wrap_init # type: ignore + + +def trace_charm( + tracing_endpoint: str, + server_cert: Optional[str] = None, + service_name: Optional[str] = None, + extra_types: Sequence[type] = (), +) -> Callable[[_T], _T]: + """Autoinstrument the decorated charm with tracing telemetry. + + Use this function to get out-of-the-box traces for all events emitted on this charm and all + method calls on instances of this class. + + Usage: + >>> from charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from charms.tempo_k8s.v1.tracing import TracingEndpointRequirer + >>> from ops import CharmBase + >>> + >>> @trace_charm( + >>> tracing_endpoint="tempo_otlp_http_endpoint", + >>> ) + >>> class MyCharm(CharmBase): + >>> + >>> def __init__(self, framework: Framework): + >>> ... + >>> self.tracing = TracingEndpointRequirer(self) + >>> + >>> @property + >>> def tempo_otlp_http_endpoint(self) -> Optional[str]: + >>> if self.tracing.is_ready(): + >>> return self.tracing.otlp_http_endpoint() + >>> else: + >>> return None + >>> + + :param tracing_endpoint: name of a method, property or attribute on the charm type that returns an + optional (fully resolvable) tempo url to which the charm traces will be pushed. + If None, tracing will be effectively disabled. + :param server_cert: name of a method, property or attribute on the charm type that returns an + optional absolute path to a CA certificate file to be used when sending traces to a remote server. + If it returns None, an _insecure_ connection will be used. To avoid errors in transient + situations where the endpoint is already https but there is no certificate on disk yet, it + is recommended to disable tracing (by returning None from the tracing_endpoint) altogether + until the cert has been written to disk. + :param service_name: service name tag to attach to all traces generated by this charm. + Defaults to the juju application name this charm is deployed under. + :param extra_types: pass any number of types that you also wish to autoinstrument. + For example, charm libs, relation endpoint wrappers, workload abstractions, ... + """ + + def _decorator(charm_type: _T) -> _T: + """Autoinstrument the wrapped charmbase type.""" + _autoinstrument( + charm_type, + tracing_endpoint_attr=tracing_endpoint, + server_cert_attr=server_cert, + service_name=service_name, + extra_types=extra_types, + ) + return charm_type + + return _decorator + + +def _autoinstrument( + charm_type: _T, + tracing_endpoint_attr: str, + server_cert_attr: Optional[str] = None, + service_name: Optional[str] = None, + extra_types: Sequence[type] = (), +) -> _T: + """Set up tracing on this charm class. + + Use this function to get out-of-the-box traces for all events emitted on this charm and all + method calls on instances of this class. 
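+    It is the imperative counterpart of the ``trace_charm`` decorator, which simply delegates to it.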
+ + Usage: + + >>> from charms.tempo_k8s.v1.charm_tracing import _autoinstrument + >>> from ops.main import main + >>> _autoinstrument( + >>> MyCharm, + >>> tracing_endpoint_attr="tempo_otlp_http_endpoint", + >>> service_name="MyCharm", + >>> extra_types=(Foo, Bar) + >>> ) + >>> main(MyCharm) + + :param charm_type: the CharmBase subclass to autoinstrument. + :param tracing_endpoint_attr: name of a method, property or attribute on the charm type that returns an + optional (fully resolvable) tempo url to which the charm traces will be pushed. + If None, tracing will be effectively disabled. + :param server_cert_attr: name of a method, property or attribute on the charm type that returns an + optional absolute path to a CA certificate file to be used when sending traces to a remote server. + If it returns None, an _insecure_ connection will be used. To avoid errors in transient + situations where the endpoint is already https but there is no certificate on disk yet, it + is recommended to disable tracing (by returning None from the tracing_endpoint) altogether + until the cert has been written to disk. + :param service_name: service name tag to attach to all traces generated by this charm. + Defaults to the juju application name this charm is deployed under. + :param extra_types: pass any number of types that you also wish to autoinstrument. + For example, charm libs, relation endpoint wrappers, workload abstractions, ... + """ + dev_logger.info(f"instrumenting {charm_type}") + _setup_root_span_initializer( + charm_type, + tracing_endpoint_attr, + server_cert_attr=server_cert_attr, + service_name=service_name, + ) + trace_type(charm_type) + for type_ in extra_types: + trace_type(type_) + + return charm_type + + +def trace_type(cls: _T) -> _T: + """Set up tracing on this class. + + Use this decorator to get out-of-the-box traces for all method calls on instances of this class. + It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` + has been instantiated. + """ + dev_logger.info(f"instrumenting {cls}") + for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): + dev_logger.info(f"discovered {method}") + + if method.__name__.startswith("__"): + dev_logger.info(f"skipping {method} (dunder)") + continue + + # the span title in the general case should be: + # method call: MyCharmWrappedMethods.b + # if the method has a name (functools.wrapped or regular method), let + # _trace_callable use its default algorithm to determine what name to give the span. + trace_method_name = None + try: + qualname_c0 = method.__qualname__.split(".")[0] + if not hasattr(cls, method.__name__): + # if the callable doesn't have a __name__ (probably a decorated method), + # it probably has a bad qualname too (such as my_decorator..wrapper) which is not + # great for finding out what the trace is about. So we use the method name instead and + # add a reference to the decorator name. Result: + # method call: @my_decorator(MyCharmWrappedMethods.b) + trace_method_name = f"@{qualname_c0}({cls.__name__}.{name})" + except Exception: # noqa: failsafe + pass + + new_method = trace_method(method, name=trace_method_name) + + if isinstance(inspect.getattr_static(cls, name), staticmethod): + new_method = staticmethod(new_method) + setattr(cls, name, new_method) + + return cls + + +def trace_method(method: _F, name: Optional[str] = None) -> _F: + """Trace this method. + + A span will be opened when this method is called and closed when it returns. 
+ """ + return _trace_callable(method, "method", name=name) + + +def trace_function(function: _F, name: Optional[str] = None) -> _F: + """Trace this function. + + A span will be opened when this function is called and closed when it returns. + """ + return _trace_callable(function, "function", name=name) + + +def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: + dev_logger.info(f"instrumenting {callable}") + + # sig = inspect.signature(callable) + @functools.wraps(callable) + def wrapped_function(*args, **kwargs): # type: ignore + name_ = name or getattr( + callable, "__qualname__", getattr(callable, "__name__", str(callable)) + ) + with _span(f"{qualifier} call: {name_}"): # type: ignore + return callable(*args, **kwargs) # type: ignore + + # wrapped_function.__signature__ = sig + return wrapped_function # type: ignore + + +def trace(obj: Union[Type, Callable]): + """Trace this object and send the resulting spans to Tempo. + + It will dispatch to ``trace_type`` if the decorated object is a class, otherwise + ``trace_function``. + """ + if isinstance(obj, type): + if issubclass(obj, CharmBase): + raise ValueError( + "cannot use @trace on CharmBase subclasses: use @trace_charm instead " + "(we need some arguments!)" + ) + return trace_type(obj) + else: + try: + return trace_function(obj) + except Exception: + raise UntraceableObjectError( + f"cannot create span from {type(obj)}; instrument {obj} manually." + ) diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index ce142b8..81bf1f1 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -1,29 +1,996 @@ -#!/usr/bin/env python3 -# Copyright 2022 Canonical Ltd. +# Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +"""## Overview. -"""This charm library has been transferred to the HA version of this charm. +This document explains how to integrate with the Tempo charm for the purpose of pushing traces to a +tracing endpoint provided by Tempo. It also explains how alternative implementations of the Tempo charm +may maintain the same interface and be backward compatible with all currently integrated charms. -The new owner is the `tempo-coordinator-k8s` charm: -- [github](https://github.com/canonical/tempo-coordinator-k8s/) -- [charmhub](https://charmhub.io/tempo-coordinator-k8s/) +## Requirer Library Usage -The new library (with its major version reset to 0) can be found at +Charms seeking to push traces to Tempo, must do so using the `TracingEndpointRequirer` +object from this charm library. For the simplest use cases, using the `TracingEndpointRequirer` +object only requires instantiating it, typically in the constructor of your charm. The +`TracingEndpointRequirer` constructor requires the name of the relation over which a tracing endpoint + is exposed by the Tempo charm, and a list of protocols it intends to send traces with. + This relation must use the `tracing` interface. + The `TracingEndpointRequirer` object may be instantiated as follows -https://charmhub.io/tempo-coordinator-k8s/libraries/tracing + from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer -to install it: + def __init__(self, *args): + super().__init__(*args) + # ... + self.tracing = TracingEndpointRequirer(self, + protocols=['otlp_grpc', 'otlp_http', 'jaeger_http_thrift'] + ) + # ... 
-> charmcraft fetch-lib charms.tempo_coordinator_k8s.v0.tracing +Note that the first argument (`self`) to `TracingEndpointRequirer` is always a reference to the +parent charm. -The API is unchanged, so you can search and replace the path to swap the old lib with the new one. -""" +Alternatively to providing the list of requested protocols at init time, the charm can do it at +any point in time by calling the +`TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. +Using this method also allows you to use per-relation protocols. + +Units of provider charms obtain the tempo endpoint to which they will push their traces by calling +`TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: +- `otlp_grpc` +- `otlp_http` +- `zipkin` +- `tempo` + +If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, +the library will raise an error. + +## Requirer Library Usage + +The `TracingEndpointProvider` object may be used by charms to manage relations with their +trace sources. For this purposes a Tempo-like charm needs to do two things + +1. Instantiate the `TracingEndpointProvider` object by providing it a +reference to the parent (Tempo) charm and optionally the name of the relation that the Tempo charm +uses to interact with its trace sources. This relation must conform to the `tracing` interface +and it is strongly recommended that this relation be named `tracing` which is its +default value. + +For example a Tempo charm may instantiate the `TracingEndpointProvider` in its constructor as +follows + + from charms.tempo_k8s.v2.tracing import TracingEndpointProvider + + def __init__(self, *args): + super().__init__(*args) + # ... + self.tracing = TracingEndpointProvider(self) + # ... + + + +""" # noqa: W505 +import enum +import json +import logging +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Literal, + MutableMapping, + Optional, + Sequence, + Tuple, + Union, + cast, +) +import pydantic +from ops.charm import ( + CharmBase, + CharmEvents, + RelationBrokenEvent, + RelationEvent, + RelationRole, +) +from ops.framework import EventSource, Object +from ops.model import ModelError, Relation +from pydantic import BaseModel, Field + +# The unique Charmhub library identifier, never change it LIBID = "12977e9aa0b34367903d8afeb8c3d85d" + +# Increment this major API version when introducing breaking changes LIBAPI = 2 -LIBPATCH = 11 -raise DeprecationWarning( - "this charm lib is deprecated; please use charms.tempo_coordinator_k8s.v0.tracing instead. " - "see https://charmhub.io/tempo-coordinator-k8s/libraries/tracing" -) +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 10 + +PYDEPS = ["pydantic"] + +logger = logging.getLogger(__name__) + +DEFAULT_RELATION_NAME = "tracing" +RELATION_INTERFACE_NAME = "tracing" + +# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 +ReceiverProtocol = Literal[ + "zipkin", + "otlp_grpc", + "otlp_http", + "jaeger_grpc", + "jaeger_thrift_http", +] + +RawReceiver = Tuple[ReceiverProtocol, str] +"""Helper type. A raw receiver is defined as a tuple consisting of the protocol name, and the (external, if available), +(secured, if available) resolvable server url. 
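+For example, a raw receiver could look like ``("otlp_http", "http://traefik_address:2331")``.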
+""" + +BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} + + +class TransportProtocolType(str, enum.Enum): + """Receiver Type.""" + + http = "http" + grpc = "grpc" + + +receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { + "zipkin": TransportProtocolType.http, + "otlp_grpc": TransportProtocolType.grpc, + "otlp_http": TransportProtocolType.http, + "jaeger_thrift_http": TransportProtocolType.http, + "jaeger_grpc": TransportProtocolType.grpc, +} +"""A mapping between telemetry protocols and their corresponding transport protocol. +""" + + +class TracingError(Exception): + """Base class for custom errors raised by this library.""" + + +class NotReadyError(TracingError): + """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" + + +class ProtocolNotRequestedError(TracingError): + """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" + + +class DataValidationError(TracingError): + """Raised when data validation fails on IPU relation data.""" + + +class AmbiguousRelationUsageError(TracingError): + """Raised when one wrongly assumes that there can only be one relation on an endpoint.""" + + +if int(pydantic.version.VERSION.split(".")[0]) < 2: + + class DatabagModel(BaseModel): # type: ignore + """Base databag model.""" + + class Config: + """Pydantic config.""" + + # ignore any extra fields in the databag + extra = "ignore" + """Ignore any extra fields in the databag.""" + allow_population_by_field_name = True + """Allow instantiating this class by field name (instead of forcing alias).""" + + _NEST_UNDER = None + + @classmethod + def load(cls, databag: MutableMapping): + """Load this model from a Juju databag.""" + if cls._NEST_UNDER: + return cls.parse_obj(json.loads(databag[cls._NEST_UNDER])) + + try: + data = { + k: json.loads(v) + for k, v in databag.items() + # Don't attempt to parse model-external values + if k in {f.alias for f in cls.__fields__.values()} + } + except json.JSONDecodeError as e: + msg = f"invalid databag contents: expecting json. {databag}" + logger.error(msg) + raise DataValidationError(msg) from e + + try: + return cls.parse_raw(json.dumps(data)) # type: ignore + except pydantic.ValidationError as e: + msg = f"failed to validate databag: {databag}" + logger.debug(msg, exc_info=True) + raise DataValidationError(msg) from e + + def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): + """Write the contents of this model to Juju databag. + + :param databag: the databag to write the data to. + :param clear: ensure the databag is cleared before writing it. + """ + if clear and databag: + databag.clear() + + if databag is None: + databag = {} + + if self._NEST_UNDER: + databag[self._NEST_UNDER] = self.json(by_alias=True) + return databag + + dct = self.dict() + for key, field in self.__fields__.items(): # type: ignore + value = dct[key] + databag[field.alias or key] = json.dumps(value) + + return databag + +else: + from pydantic import ConfigDict + + class DatabagModel(BaseModel): + """Base databag model.""" + + model_config = ConfigDict( + # ignore any extra fields in the databag + extra="ignore", + # Allow instantiating this class by field name (instead of forcing alias). + populate_by_name=True, + # Custom config key: whether to nest the whole datastructure (as json) + # under a field or spread it out at the toplevel. 
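+            # (None, the default here, means the fields are spread at the toplevel as individual
+            # JSON values; `load` and `dump` below read this key to decide which layout to use)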
+ _NEST_UNDER=None, # type: ignore + ) + """Pydantic config.""" + + @classmethod + def load(cls, databag: MutableMapping): + """Load this model from a Juju databag.""" + nest_under = cls.model_config.get("_NEST_UNDER") # type: ignore + if nest_under: + return cls.model_validate(json.loads(databag[nest_under])) # type: ignore + + try: + data = { + k: json.loads(v) + for k, v in databag.items() + # Don't attempt to parse model-external values + if k in {(f.alias or n) for n, f in cls.__fields__.items()} + } + except json.JSONDecodeError as e: + msg = f"invalid databag contents: expecting json. {databag}" + logger.error(msg) + raise DataValidationError(msg) from e + + try: + return cls.model_validate_json(json.dumps(data)) # type: ignore + except pydantic.ValidationError as e: + msg = f"failed to validate databag: {databag}" + logger.debug(msg, exc_info=True) + raise DataValidationError(msg) from e + + def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): + """Write the contents of this model to Juju databag. + + :param databag: the databag to write the data to. + :param clear: ensure the databag is cleared before writing it. + """ + if clear and databag: + databag.clear() + + if databag is None: + databag = {} + nest_under = self.model_config.get("_NEST_UNDER") + if nest_under: + databag[nest_under] = self.model_dump_json( # type: ignore + by_alias=True, + # skip keys whose values are default + exclude_defaults=True, + ) + return databag + + dct = self.model_dump() # type: ignore + for key, field in self.model_fields.items(): # type: ignore + value = dct[key] + if value == field.default: + continue + databag[field.alias or key] = json.dumps(value) + + return databag + + +# todo use models from charm-relation-interfaces +if int(pydantic.version.VERSION.split(".")[0]) < 2: + + class ProtocolType(BaseModel): # type: ignore + """Protocol Type.""" + + class Config: + """Pydantic config.""" + + use_enum_values = True + """Allow serializing enum values.""" + + name: str = Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) + +else: + + class ProtocolType(BaseModel): + """Protocol Type.""" + + model_config = ConfigDict( # type: ignore + # Allow serializing enum values. + use_enum_values=True + ) + """Pydantic config.""" + + name: str = Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) + + +class Receiver(BaseModel): + """Specification of an active receiver.""" + + protocol: ProtocolType = Field(..., description="Receiver protocol name and type.") + url: str = Field( + ..., + description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. + Otherwise, it would be the service's fqdn or internal IP. 
+ If the protocol type is grpc, the url will not contain a scheme.""", + examples=[ + "http://traefik_address:2331", + "https://traefik_address:2331", + "http://tempo_public_ip:2331", + "https://tempo_public_ip:2331", + "tempo_public_ip:2331", + ], + ) + + +class TracingProviderAppData(DatabagModel): # noqa: D101 + """Application databag model for the tracing provider.""" + + receivers: List[Receiver] = Field( + ..., + description="List of all receivers enabled on the tracing provider.", + ) + + +class TracingRequirerAppData(DatabagModel): # noqa: D101 + """Application databag model for the tracing requirer.""" + + receivers: List[ReceiverProtocol] + """Requested receivers.""" + + +class _AutoSnapshotEvent(RelationEvent): + __args__: Tuple[str, ...] = () + __optional_kwargs__: Dict[str, Any] = {} + + @classmethod + def __attrs__(cls): + return cls.__args__ + tuple(cls.__optional_kwargs__.keys()) + + def __init__(self, handle, relation, *args, **kwargs): + super().__init__(handle, relation) + + if not len(self.__args__) == len(args): + raise TypeError("expected {} args, got {}".format(len(self.__args__), len(args))) + + for attr, obj in zip(self.__args__, args): + setattr(self, attr, obj) + for attr, default in self.__optional_kwargs__.items(): + obj = kwargs.get(attr, default) + setattr(self, attr, obj) + + def snapshot(self) -> dict: + dct = super().snapshot() + for attr in self.__attrs__(): + obj = getattr(self, attr) + try: + dct[attr] = obj + except ValueError as e: + raise ValueError( + "cannot automagically serialize {}: " + "override this method and do it " + "manually.".format(obj) + ) from e + + return dct + + def restore(self, snapshot: dict) -> None: + super().restore(snapshot) + for attr, obj in snapshot.items(): + setattr(self, attr, obj) + + +class RelationNotFoundError(Exception): + """Raised if no relation with the given name is found.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has an unexpected interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different role than expected.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +): + """Validate a relation. 
+ + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the same relation interface + as specified via the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the same role as specified + via the `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + # fixme: why do we need to cast here? + actual_relation_interface = cast(str, relation.interface_name) + + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role is RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role is RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise TypeError("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +class RequestEvent(RelationEvent): + """Event emitted when a remote requests a tracing endpoint.""" + + @property + def requested_receivers(self) -> List[ReceiverProtocol]: + """List of receiver protocols that have been requested.""" + relation = self.relation + app = relation.app + if not app: + raise NotReadyError("relation.app is None") + + return TracingRequirerAppData.load(relation.data[app]).receivers + + +class BrokenEvent(RelationBrokenEvent): + """Event emitted when a relation on tracing is broken.""" + + +class TracingEndpointProviderEvents(CharmEvents): + """TracingEndpointProvider events.""" + + request = EventSource(RequestEvent) + broken = EventSource(BrokenEvent) + + +class TracingEndpointProvider(Object): + """Class representing a trace receiver service.""" + + on = TracingEndpointProviderEvents() # type: ignore + + def __init__( + self, + charm: CharmBase, + external_url: Optional[str] = None, + relation_name: str = DEFAULT_RELATION_NAME, + ): + """Initialize. + + Args: + charm: a `CharmBase` instance that manages this instance of the Tempo service. + external_url: external address of the node hosting the tempo server, + if an ingress is present. + relation_name: an optional string name of the relation between `charm` + and the Tempo charmed service. The default is "tracing". 
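+                It is strongly recommended to keep this default so that requirer charms get a
+                consistent experience across tracing providers.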
+ + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `tracing` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + super().__init__(charm, relation_name + "tracing-provider") + self._charm = charm + self._external_url = external_url + self._relation_name = relation_name + self.framework.observe( + self._charm.on[relation_name].relation_joined, self._on_relation_event + ) + self.framework.observe( + self._charm.on[relation_name].relation_created, self._on_relation_event + ) + self.framework.observe( + self._charm.on[relation_name].relation_changed, self._on_relation_event + ) + self.framework.observe( + self._charm.on[relation_name].relation_broken, self._on_relation_broken_event + ) + + def _on_relation_broken_event(self, e: RelationBrokenEvent): + """Handle relation broken events.""" + self.on.broken.emit(e.relation) + + def _on_relation_event(self, e: RelationEvent): + """Handle relation created/joined/changed events.""" + if self.is_requirer_ready(e.relation): + self.on.request.emit(e.relation) + + def is_requirer_ready(self, relation: Relation): + """Attempt to determine if requirer has already populated app data.""" + try: + self._get_requested_protocols(relation) + except NotReadyError: + return False + return True + + @staticmethod + def _get_requested_protocols(relation: Relation): + app = relation.app + if not app: + raise NotReadyError("relation.app is None") + + try: + databag = TracingRequirerAppData.load(relation.data[app]) + except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): + logger.info(f"relation {relation} is not ready to talk tracing") + raise NotReadyError() + return databag.receivers + + def requested_protocols(self): + """All receiver protocols that have been requested by our related apps.""" + requested_protocols = set() + for relation in self.relations: + try: + protocols = self._get_requested_protocols(relation) + except NotReadyError: + continue + requested_protocols.update(protocols) + return requested_protocols + + @property + def relations(self) -> List[Relation]: + """All relations active on this endpoint.""" + return self._charm.model.relations[self._relation_name] + + def publish_receivers(self, receivers: Sequence[RawReceiver]): + """Let all requirers know that these receivers are active and listening.""" + if not self._charm.unit.is_leader(): + raise RuntimeError("only leader can do this") + + for relation in self.relations: + try: + TracingProviderAppData( + receivers=[ + Receiver( + url=url, + protocol=ProtocolType( + name=protocol, + type=receiver_protocol_to_transport_protocol[protocol], + ), + ) + for protocol, url in receivers + ], + ).dump(relation.data[self._charm.app]) + + except ModelError as e: + # args are bytes + msg = e.args[0] + if isinstance(msg, bytes): + if msg.startswith( + b"ERROR cannot read relation application settings: permission denied" + ): + logger.error( + f"encountered error {e} while attempting to update_relation_data." + f"The relation must be gone." 
+ ) + continue + raise + + +class EndpointRemovedEvent(RelationBrokenEvent): + """Event representing a change in one of the receiver endpoints.""" + + +class EndpointChangedEvent(_AutoSnapshotEvent): + """Event representing a change in one of the receiver endpoints.""" + + __args__ = ("_receivers",) + + if TYPE_CHECKING: + _receivers = [] # type: List[dict] + + @property + def receivers(self) -> List[Receiver]: + """Cast receivers back from dict.""" + return [Receiver(**i) for i in self._receivers] + + +class TracingEndpointRequirerEvents(CharmEvents): + """TracingEndpointRequirer events.""" + + endpoint_changed = EventSource(EndpointChangedEvent) + endpoint_removed = EventSource(EndpointRemovedEvent) + + +class TracingEndpointRequirer(Object): + """A tracing endpoint for Tempo.""" + + on = TracingEndpointRequirerEvents() # type: ignore + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + protocols: Optional[List[ReceiverProtocol]] = None, + ): + """Construct a tracing requirer for a Tempo charm. + + If your application supports pushing traces to a distributed tracing backend, the + `TracingEndpointRequirer` object enables your charm to easily access endpoint information + exchanged over a `tracing` relation interface. + + Args: + charm: a `CharmBase` object that manages this + `TracingEndpointRequirer` object. Typically, this is `self` in the instantiating + class. + relation_name: an optional string name of the relation between `charm` + and the Tempo charmed service. The default is "tracing". It is strongly + advised not to change the default, so that people deploying your charm will have a + consistent experience with all other charms that provide tracing endpoints. + protocols: optional list of protocols that the charm intends to send traces with. + The provider will enable receivers for these and only these protocols, + so be sure to enable all protocols the charm or its workload are going to need. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `tracing` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.provides` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + + self._is_single_endpoint = charm.meta.relations[relation_name].limit == 1 + + self._charm = charm + self._relation_name = relation_name + + events = self._charm.on[self._relation_name] + self.framework.observe(events.relation_changed, self._on_tracing_relation_changed) + self.framework.observe(events.relation_broken, self._on_tracing_relation_broken) + + if protocols: + self.request_protocols(protocols) + + def request_protocols( + self, protocols: Sequence[ReceiverProtocol], relation: Optional[Relation] = None + ): + """Publish the list of protocols which the provider should activate.""" + # todo: should we check if _is_single_endpoint and len(self.relations) > 1 and raise, here? + relations = [relation] if relation else self.relations + + if not protocols: + # empty sequence + raise ValueError( + "You need to pass a nonempty sequence of protocols to `request_protocols`." 
+            )
+
+        try:
+            if self._charm.unit.is_leader():
+                for relation in relations:
+                    TracingRequirerAppData(
+                        receivers=list(protocols),
+                    ).dump(relation.data[self._charm.app])
+
+        except ModelError as e:
+            # args are bytes
+            msg = e.args[0]
+            if isinstance(msg, bytes):
+                if msg.startswith(
+                    b"ERROR cannot read relation application settings: permission denied"
+                ):
+                    logger.error(
+                        f"encountered error {e} while attempting to request_protocols. "
+                        f"The relation must be gone."
+                    )
+                    return
+                raise
+
+    @property
+    def relations(self) -> List[Relation]:
+        """The tracing relations associated with this endpoint."""
+        return self._charm.model.relations[self._relation_name]
+
+    @property
+    def _relation(self) -> Optional[Relation]:
+        """If this wraps a single endpoint, the relation bound to it, if any."""
+        if not self._is_single_endpoint:
+            objname = type(self).__name__
+            raise AmbiguousRelationUsageError(
+                f"This {objname} wraps a {self._relation_name} endpoint that has "
+                "limit != 1. We can't determine what relation, of the possibly many, you are "
+                f"talking about. Please pass a relation instance while calling {objname}, "
+                "or set limit=1 in the charm metadata."
+            )
+        relations = self.relations
+        return relations[0] if relations else None
+
+    def is_ready(self, relation: Optional[Relation] = None):
+        """Is this endpoint ready?"""
+        relation = relation or self._relation
+        if not relation:
+            logger.debug(f"no relation on {self._relation_name !r}: tracing not ready")
+            return False
+        if relation.data is None:
+            logger.error(f"relation data is None for {relation}")
+            return False
+        if not relation.app:
+            logger.error(f"{relation} event received but there is no relation.app")
+            return False
+        try:
+            databag = dict(relation.data[relation.app])
+            TracingProviderAppData.load(databag)
+
+        except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError):
+            logger.info(f"failed validating relation data for {relation}")
+            return False
+        return True
+
+    def _on_tracing_relation_changed(self, event):
+        """Notify the charm that there is new endpoint information available."""
+        relation = event.relation
+        if not self.is_ready(relation):
+            self.on.endpoint_removed.emit(relation)  # type: ignore
+            return
+
+        data = TracingProviderAppData.load(relation.data[relation.app])
+        self.on.endpoint_changed.emit(relation, [i.dict() for i in data.receivers])  # type: ignore
+
+    def _on_tracing_relation_broken(self, event: RelationBrokenEvent):
+        """Notify the charm that the endpoint is broken."""
+        relation = event.relation
+        self.on.endpoint_removed.emit(relation)  # type: ignore
+
+    def get_all_endpoints(
+        self, relation: Optional[Relation] = None
+    ) -> Optional[TracingProviderAppData]:
+        """Unmarshalled relation data."""
+        relation = relation or self._relation
+        if not self.is_ready(relation):
+            return
+        return TracingProviderAppData.load(relation.data[relation.app])  # type: ignore
+
+    def _get_endpoint(
+        self, relation: Optional[Relation], protocol: ReceiverProtocol
+    ) -> Optional[str]:
+        app_data = self.get_all_endpoints(relation)
+        if not app_data:
+            return None
+        receivers: List[Receiver] = list(
+            filter(lambda i: i.protocol.name == protocol, app_data.receivers)
+        )
+        if not receivers:
+            logger.error(f"no receiver found with protocol={protocol!r}")
+            return
+        if len(receivers) > 1:
+            logger.error(
+                f"too many receivers with protocol={protocol!r}; expected at most one. 
Found: {receivers}" + ) + return + + receiver = receivers[0] + return receiver.url + + def get_endpoint( + self, protocol: ReceiverProtocol, relation: Optional[Relation] = None + ) -> Optional[str]: + """Receiver endpoint for the given protocol. + + It could happen that this function gets called before the provider publishes the endpoints. + In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to + restricted access. To prevent this, this function needs to be guarded by the `is_ready` check. + + Raises: + ProtocolNotRequestedError: + If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request. + """ + endpoint = self._get_endpoint(relation or self._relation, protocol=protocol) + if not endpoint: + requested_protocols = set() + relations = [relation] if relation else self.relations + for relation in relations: + try: + databag = TracingRequirerAppData.load(relation.data[self._charm.app]) + except DataValidationError: + continue + + requested_protocols.update(databag.receivers) + + if protocol not in requested_protocols: + raise ProtocolNotRequestedError(protocol, relation) + + return None + return endpoint + + +def charm_tracing_config( + endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]] +) -> Tuple[Optional[str], Optional[str]]: + """Return the charm_tracing config you likely want. + + If no endpoint is provided: + disable charm tracing. + If https endpoint is provided but cert_path is not found on disk: + disable charm tracing. + If https endpoint is provided and cert_path is None: + ERROR + Else: + proceed with charm tracing (with or without tls, as appropriate) + + Usage: + If you are using charm_tracing >= v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.tracing = TracingEndpointRequirer(...) + >>> self.my_endpoint, self.cert_path = charm_tracing_config( + ... self.tracing, self._cert_path) + + If you are using charm_tracing < v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.tracing = TracingEndpointRequirer(...) + >>> self._my_endpoint, self._cert_path = charm_tracing_config( + ... 
self.tracing, self._cert_path) + >>> @property + >>> def my_endpoint(self): + >>> return self._my_endpoint + >>> @property + >>> def cert_path(self): + >>> return self._cert_path + + """ + if not endpoint_requirer.is_ready(): + return None, None + + endpoint = endpoint_requirer.get_endpoint("otlp_http") + if not endpoint: + return None, None + + is_https = endpoint.startswith("https://") + + if is_https: + if cert_path is None: + raise TracingError("Cannot send traces to an https endpoint without a certificate.") + elif not Path(cert_path).exists(): + # if endpoint is https BUT we don't have a server_cert yet: + # disable charm tracing until we do to prevent tls errors + return None, None + return endpoint, str(cert_path) + else: + return endpoint, None diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py index 141412b..da7fa95 100644 --- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py @@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 23 +LIBPATCH = 20 PYDEPS = ["cryptography", "jsonschema"] @@ -1902,20 +1902,10 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ) else: try: - secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") logger.debug( "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" ) - # Juju < 3.6 will create a new revision even if the content is the same - if ( - secret.get_content(refresh=True).get("certificate", "") - == certificate.certificate - ): - logger.debug( - "Secret %s with correct certificate already exists", - f"{LIBID}-{csr_in_sha256_hex}", - ) - continue + secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") secret.set_content( {"certificate": certificate.certificate, "csr": certificate.csr} ) @@ -1996,19 +1986,11 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None: provider_certificate = self._find_certificate_in_relation_data(csr) if not provider_certificate: # A secret expired but we did not find matching certificate. Cleaning up - logger.warning( - "Failed to find matching certificate for csr, cleaning up secret %s", - event.secret.label, - ) event.secret.remove_all_revisions() return if not provider_certificate.expiry_time: # A secret expired but matching certificate is invalid. Cleaning up - logger.warning( - "Certificate matching csr is invalid, cleaning up secret %s", - event.secret.label, - ) event.secret.remove_all_revisions() return @@ -2041,18 +2023,14 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti return provider_certificate return None - def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]: + def _get_csr_from_secret(self, secret: Secret) -> str: """Extract the CSR from the secret label or content. 
This function is a workaround to maintain backwards compatibility and fix the issue reported in https://github.com/canonical/tls-certificates-interface/issues/228 """ - try: - content = secret.get_content(refresh=True) - except SecretNotFoundError: - return None - if not (csr := content.get("csr", None)): + if not (csr := secret.get_content().get("csr", "")): # In versions <14 of the Lib we were storing the CSR in the label of the secret # The CSR now is stored int the content of the secret, which was a breaking change # Here we get the CSR if the secret was created by an app using libpatch 14 or lower From a32e7ae65479b41a06eaff4c207fb7c53510926f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 12 Nov 2024 18:49:17 -0300 Subject: [PATCH 6/6] avoid checking tester charms --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7c16514..ad38b9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ ignore-words-list = "assertIn" # Linting tools configuration [tool.ruff] line-length = 99 -exclude = ["__pycache__", "*.egg_info"] +exclude = ["__pycache__", "*.egg_info", "prometheus-tester", "loki-tester"] [tool.ruff.lint] select = ["E", "W", "F", "C", "N", "R", "D", "I001"]
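
For orientation, a minimal sketch of how a consumer charm might wire up the requirer side of the tracing library vendored above together with `charm_tracing_config`. The charm class name, certificate path, and import location are illustrative assumptions only; they are not part of these patches.

    from ops.charm import CharmBase
    from ops.main import main

    # Import path assumes the library is vendored under lib/charms/tempo_k8s/v2/.
    from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config

    CA_CERT_PATH = "/usr/local/share/ca-certificates/tracing-ca.crt"  # assumed location


    class MyCharm(CharmBase):
        """Hypothetical charm that ships its own traces to a tracing backend."""

        def __init__(self, *args):
            super().__init__(*args)
            # Request only the receiver protocols this charm will actually use.
            self.tracing = TracingEndpointRequirer(self, protocols=["otlp_http"])
            # charm_tracing_config returns (None, None) until the provider has published
            # an otlp_http endpoint (and, for https endpoints, until the CA certificate
            # exists on disk), so charm tracing stays disabled instead of erroring out.
            self.tracing_endpoint, self.server_cert = charm_tracing_config(
                self.tracing, CA_CERT_PATH
            )


    if __name__ == "__main__":
        main(MyCharm)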