diff --git a/.gitignore b/.gitignore index ea4fe3d..9ccb50d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__/ .idea .vscode/ *.egg-info/ -cos-tool-* \ No newline at end of file +cos-tool-* +src/prometheus_alert_rules/consolidated_rules/** \ No newline at end of file diff --git a/charmcraft.yaml b/charmcraft.yaml index fdcbc28..fd98faa 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -67,6 +67,11 @@ provides: requires: + self-tracing: + interface: tracing + description: | + Integration to enable Tempo to send its own traces to another Tempo instance. + limit: 1 s3: interface: s3 limit: 1 @@ -136,3 +141,11 @@ config: description: Force-enable the receiver for the 'otlp_http' protocol in Tempo, even if there is no integration currently requesting it. type: boolean default: false + always_enable_jaeger_thrift_http: + description: Force-enable the receiver for the 'jaeger_thrift_http' protocol in Tempo, even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_jaeger_grpc: + description: Force-enable the receiver for the 'jaeger_grpc' protocol in Tempo, even if there is no integration currently requesting it. + type: boolean + default: false diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index 8b9fb4f..dfb2336 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -107,7 +107,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 8 PYDEPS = ["pydantic"] @@ -116,14 +116,13 @@ def __init__(self, *args): DEFAULT_RELATION_NAME = "tracing" RELATION_INTERFACE_NAME = "tracing" +# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 ReceiverProtocol = Literal[ "zipkin", - "kafka", - "opencensus", - "tempo_http", - "tempo_grpc", "otlp_grpc", "otlp_http", + "jaeger_grpc", + "jaeger_thrift_http", ] RawReceiver = Tuple[ReceiverProtocol, str] @@ -141,14 +140,12 @@ class TransportProtocolType(str, enum.Enum): grpc = "grpc" -receiver_protocol_to_transport_protocol = { +receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { "zipkin": TransportProtocolType.http, - "kafka": TransportProtocolType.http, - "opencensus": TransportProtocolType.http, - "tempo_http": TransportProtocolType.http, - "tempo_grpc": TransportProtocolType.grpc, "otlp_grpc": TransportProtocolType.grpc, "otlp_http": TransportProtocolType.http, + "jaeger_thrift_http": TransportProtocolType.http, + "jaeger_grpc": TransportProtocolType.grpc, } """A mapping between telemetry protocols and their corresponding transport protocol. 
""" diff --git a/scripts/tracegen.py b/scripts/tracegen.py index 3505be4..75e5e67 100644 --- a/scripts/tracegen.py +++ b/scripts/tracegen.py @@ -1,11 +1,15 @@ import os import time from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, get_args +import requests from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter +from opentelemetry.exporter.zipkin.json import ZipkinExporter +from opentelemetry.exporter.jaeger.thrift import JaegerExporter as JaegerThriftHttpExporter +from opentelemetry.exporter.jaeger.proto.grpc import JaegerExporter as JaegerGRPCExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import ( @@ -13,35 +17,82 @@ ConsoleSpanExporter, ) +ReceiverProtocol = Literal[ + "zipkin", + "otlp_grpc", + "otlp_http", + "jaeger_grpc", + "jaeger_thrift_http", +] + +def set_envvars(cert: Path = None): + ca_cert_path = str(Path(cert).absolute()) if cert else "" + os.environ['OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE'] = ca_cert_path + os.environ['OTEL_EXPORTER_JAEGER_CERTIFICATE'] = ca_cert_path + # jaeger thrift http exporter does not expose a parameter to set path for CA verification + os.environ['SSL_CERT_FILE'] = ca_cert_path + os.environ["REQUESTS_CA_BUNDLE"] = ca_cert_path + +def initialize_exporter(protocol: str, endpoint: str, cert: Path = None): + # ip:4317 + if protocol == "otlp_grpc": + return GRPCExporter( + endpoint=endpoint, + insecure=not cert, + ) + # scheme://ip:4318/v1/traces + elif protocol == "otlp_http": + return HTTPExporter( + endpoint=endpoint, + ) + # scheme://ip:9411/v1/traces + elif protocol == "zipkin": + # zipkin does not expose an arg to pass certificate + session = requests.Session() + if cert: + session.verify = cert + return ZipkinExporter( + endpoint=endpoint, + session=session, + ) + # scheme://ip:14268/api/traces?format=jaeger.thrift + elif protocol == "jaeger_thrift_http": + return JaegerThriftHttpExporter( + collector_endpoint=endpoint, + ) + # ip:14250 + elif protocol == "jaeger_grpc": + return JaegerGRPCExporter( + collector_endpoint = endpoint, + insecure=not cert, + ) + else: + raise ValueError(f"Unsupported protocol: {protocol}") def emit_trace( endpoint: str, log_trace_to_console: bool = False, cert: Path = None, - protocol: Literal["grpc", "http", "ALL"] = "grpc", + protocol: ReceiverProtocol = "otlp_http", nonce: Any = None ): - os.environ['OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE'] = str(Path(cert).absolute()) if cert else "" + if protocol == "ALL": + for proto in get_args(protocol): + emit_trace(endpoint, log_trace_to_console, cert, proto, nonce=nonce) + else: + set_envvars(cert) + span_exporter = initialize_exporter(protocol, endpoint, cert) + return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce, protocol = protocol) - if protocol == "grpc": - span_exporter = GRPCExporter( - endpoint=endpoint, - insecure=not cert, - ) - elif protocol == "http": - span_exporter = HTTPExporter( - endpoint=endpoint, - ) - else: # ALL - return (emit_trace(endpoint, log_trace_to_console, cert, "grpc", nonce=nonce) and - emit_trace(endpoint, log_trace_to_console, cert, "http", nonce=nonce)) + - return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce) + + -def _export_trace(span_exporter, 
log_trace_to_console: bool = False, nonce: Any = None): +def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: ReceiverProtocol = "otlp_http"): resource = Resource.create(attributes={ - "service.name": "tracegen", + "service.name": f"tracegen-{protocol}", "nonce": str(nonce) } ) diff --git a/src/charm.py b/src/charm.py index 276f4f1..67c9cd0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -3,46 +3,39 @@ # See LICENSE file for licensing details. """Charmed Operator for Tempo; a lightweight object storage based tracing backend.""" -import json import logging import socket from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple, get_args +from subprocess import CalledProcessError, getoutput +from typing import Dict, Optional, Set, Tuple, cast, get_args import ops -from charms.data_platform_libs.v0.s3 import S3Requirer -from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider -from charms.observability_libs.v1.cert_handler import VAULT_SECRET_LABEL, CertHandler -from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from charms.tempo_k8s.v1.charm_tracing import trace_charm from charms.tempo_k8s.v2.tracing import ( ReceiverProtocol, RequestEvent, TracingEndpointProvider, + TransportProtocolType, + receiver_protocol_to_transport_protocol, ) from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer -from ops.charm import CharmBase, CollectStatusEvent, RelationEvent +from cosl.coordinated_workers.coordinator import Coordinator +from cosl.coordinated_workers.nginx import CA_CERT_PATH, CERT_PATH, KEY_PATH +from ops.charm import CharmBase, RelationEvent from ops.main import main -from ops.model import ActiveStatus, BlockedStatus, Relation, WaitingStatus -from coordinator import TempoCoordinator -from nginx import Nginx -from nginx_prometheus_exporter import NginxPrometheusExporter +from nginx_config import NginxConfig from tempo import Tempo -from tempo_cluster import TempoClusterProvider +from tempo_config import TempoRolesConfig logger = logging.getLogger(__name__) -class S3NotFoundError(Exception): - """Raised when the s3 integration is not present or not ready.""" - - @trace_charm( tracing_endpoint="tempo_otlp_http_endpoint", - server_cert="server_cert", - extra_types=(Tempo, TracingEndpointProvider), + server_cert="server_ca_cert", + extra_types=(Tempo, TracingEndpointProvider, Coordinator, TempoRolesConfig), ) class TempoCoordinatorCharm(CharmBase): """Charmed Operator for Tempo; a distributed tracing backend.""" @@ -51,29 +44,28 @@ def __init__(self, *args): super().__init__(*args) self.ingress = TraefikRouteRequirer(self, self.model.get_relation("ingress"), "ingress") # type: ignore - self.tempo_cluster = TempoClusterProvider(self) - self.coordinator = TempoCoordinator(self.tempo_cluster) - - # keep this above Tempo instantiation, as we need it in self.tls_enabled - self.cert_handler = CertHandler( - self, - key="tempo-server-cert", - sans=[self.hostname], - ) - - self.tempo = tempo = Tempo( - external_host=self.hostname, - use_tls=self.tls_available, - ) - - self.s3_requirer = S3Requirer(self, Tempo.s3_relation_name, Tempo.s3_bucket_name) - - self.nginx = Nginx( - self, - cluster_provider=self.tempo_cluster, - server_name=self.hostname, + self.tempo = Tempo(requested_receivers=self._requested_receivers) + # set the open ports for this unit + self.unit.set_ports(*self.tempo.all_ports.values()) + 
self.coordinator = Coordinator( + charm=self, + roles_config=TempoRolesConfig(), + s3_bucket_name=Tempo.s3_bucket_name, + external_url=self._external_url, + worker_metrics_port=8080, + endpoints={ + "certificates": "certificates", + "cluster": "tempo-cluster", + "grafana-dashboards": "grafana-dashboard", + "logging": "logging", + "metrics": "metrics-endpoint", + "s3": "s3", + "tracing": "self-tracing", + }, + nginx_config=NginxConfig(server_name=self.hostname).config, + workers_config=self.tempo.config, + tracing_receivers=self.requested_receivers_urls, ) - self.nginx_prometheus_exporter = NginxPrometheusExporter(self) # configure this tempo as a datasource in grafana self.grafana_source_provider = GrafanaSourceProvider( @@ -82,91 +74,41 @@ def __init__(self, *args): source_url=self._external_http_server_url, refresh_event=[ # refresh the source url when TLS config might be changing - self.on[self.cert_handler.certificates_relation_name].relation_changed, + self.on[self.coordinator.cert_handler.certificates_relation_name].relation_changed, # or when ingress changes self.ingress.on.ready, ], ) - # # Patch the juju-created Kubernetes service to contain the right ports - self.unit.set_ports(*self.tempo.all_ports.values()) - - # Provide ability for Tempo to be scraped by Prometheus using prometheus_scrape - self._scraping = MetricsEndpointProvider( - self, - relation_name="metrics-endpoint", - jobs=[{"static_configs": [{"targets": [f"*:{tempo.tempo_http_server_port}"]}]}], - ) - self._grafana_dashboards = GrafanaDashboardProvider( - self, relation_name="grafana-dashboard" - ) self.tracing = TracingEndpointProvider(self, external_url=self._external_url) - self._inconsistencies = self.coordinator.get_deployment_inconsistencies( - has_s3=self.s3_ready - ) - self._is_consistent = not self._inconsistencies - # We always listen to collect-status - self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) - - if not self._is_consistent: - logger.error( - f"Inconsistent deployment. {self.unit.name} will be shutting down. " - "This likely means you need to add an s3 integration. " - "This charm will be unresponsive and refuse to handle any event until " - "the situation is resolved by the cloud admin, to avoid data loss." - ) - return # refuse to handle any other event as we can't possibly know what to do. + # refuse to handle any other event as we can't possibly know what to do. + if not self.coordinator.can_handle_events: + # logging will be handled by `self.coordinator` for each of the above circumstances. 
+ return # lifecycle self.framework.observe(self.on.leader_elected, self._on_leader_elected) - self.framework.observe(self.on.update_status, self._on_update_status) - self.framework.observe(self.on.config_changed, self._on_config_changed) self.framework.observe(self.on.list_receivers_action, self._on_list_receivers_action) - # nginx - self.framework.observe(self.on.nginx_pebble_ready, self._on_nginx_pebble_ready) - self.framework.observe( - self.on.nginx_prometheus_exporter_pebble_ready, - self._on_nginx_prometheus_exporter_pebble_ready, - ) - # ingress ingress = self.on["ingress"] self.framework.observe(ingress.relation_created, self._on_ingress_relation_created) self.framework.observe(ingress.relation_joined, self._on_ingress_relation_joined) self.framework.observe(self.ingress.on.ready, self._on_ingress_ready) - # s3 - self.framework.observe( - self.s3_requirer.on.credentials_changed, self._on_s3_credentials_changed - ) - self.framework.observe(self.s3_requirer.on.credentials_gone, self._on_s3_credentials_gone) - # tracing self.framework.observe(self.tracing.on.request, self._on_tracing_request) self.framework.observe(self.tracing.on.broken, self._on_tracing_broken) - self.framework.observe(self.on.peers_relation_created, self._on_peers_relation_created) - self.framework.observe(self.on.peers_relation_changed, self._on_peers_relation_changed) # tls - self.framework.observe(self.cert_handler.on.cert_changed, self._on_cert_handler_changed) - - # cluster - self.framework.observe(self.tempo_cluster.on.changed, self._on_tempo_cluster_changed) - - for evt in self.on.events().values(): - self.framework.observe(evt, self._on_event) # type: ignore + self.framework.observe( + self.coordinator.cert_handler.on.cert_changed, self._on_cert_handler_changed + ) ###################### # UTILITY PROPERTIES # ###################### - - @property - def is_clustered(self) -> bool: - """Check whether this Tempo is a coordinator and has worker nodes connected to it.""" - return self.tempo_cluster.has_workers - @property def hostname(self) -> str: """Unit's hostname.""" @@ -195,77 +137,25 @@ def _external_url(self) -> str: @property def _internal_url(self) -> str: - scheme = "https" if self.tls_available else "http" + """Returns workload's FQDN.""" + scheme = "http" + if self.are_certificates_on_disk: + scheme = "https" + return f"{scheme}://{self.hostname}" @property - def tls_available(self) -> bool: - """Return True if tls is enabled and the necessary certs are found.""" + def are_certificates_on_disk(self) -> bool: + """Return True if the certificates files are on disk.""" + nginx_container = self.unit.get_container("nginx") + return ( - self.cert_handler.enabled - and (self.cert_handler.server_cert is not None) - and (self.cert_handler.private_key is not None) - and (self.cert_handler.ca_cert is not None) + nginx_container.can_connect() + and nginx_container.exists(CERT_PATH) + and nginx_container.exists(KEY_PATH) + and nginx_container.exists(CA_CERT_PATH) ) - @property - def _s3_config(self) -> dict: - s3_config = self.s3_requirer.get_s3_connection_info() - if ( - s3_config - and "bucket" in s3_config - and "endpoint" in s3_config - and "access-key" in s3_config - and "secret-key" in s3_config - ): - return s3_config - raise S3NotFoundError("s3 integration inactive") - - @property - def s3_ready(self) -> bool: - """Check whether s3 is configured.""" - try: - return bool(self._s3_config) - except S3NotFoundError: - return False - - @property - def peer_addresses(self) -> List[str]: - peers = self._peers - 
relation = self.model.get_relation("peers") - # get unit addresses for all the other units from a databag - if peers and relation: - addresses = [relation.data[unit].get("local-ip") for unit in peers] - addresses = list(filter(None, addresses)) - else: - addresses = [] - - # add own address - if self._local_ip: - addresses.append(self._local_ip) - - return addresses - - @property - def _local_ip(self) -> Optional[str]: - try: - binding = self.model.get_binding("peers") - if not binding: - logger.error( - "unable to get local IP at this time: " - "peers binding not active yet. It could be that the charm " - "is still being set up..." - ) - return None - return str(binding.network.bind_address) - except (ops.ModelError, KeyError) as e: - logger.debug("failed to obtain local ip from peers binding", exc_info=True) - logger.error( - f"unable to get local IP at this time: failed with {type(e)}; " - f"see debug log for more info" - ) - return None - @property def enabled_receivers(self) -> Set[str]: """Extra receivers enabled through config""" @@ -288,40 +178,24 @@ def _on_tracing_broken(self, _): """Update tracing relations' databags once one relation is removed.""" self._update_tracing_relations() - def _on_cert_handler_changed(self, _): - if self.tls_available: - logger.debug("enabling TLS") - self.nginx.configure_tls( - server_cert=self.cert_handler.server_cert, # type: ignore - ca_cert=self.cert_handler.ca_cert, # type: ignore - private_key=self.cert_handler.private_key, # type: ignore - ) - else: - logger.debug("disabling TLS") - self.nginx.delete_certificates() + def _on_cert_handler_changed(self, e: ops.RelationChangedEvent): # tls readiness change means config change. # sync scheme change with traefik and related consumers self._configure_ingress() - # sync the server cert with the charm container. + # sync the server CA cert with the charm container. # technically, because of charm tracing, this will be called first thing on each event - self._update_server_cert() + self._update_server_ca_cert() # update relations to reflect the new certificate self._update_tracing_relations() - # notify the cluster - self._update_tempo_cluster() - def _on_tracing_request(self, e: RequestEvent): """Handle a remote requesting a tracing endpoint.""" logger.debug(f"received tracing request from {e.relation.app}: {e.requested_receivers}") self._update_tracing_relations() - def _on_tempo_cluster_changed(self, _: RelationEvent): - self._update_tempo_cluster() - def _on_ingress_relation_created(self, _: RelationEvent): self._configure_ingress() @@ -332,29 +206,6 @@ def _on_leader_elected(self, _: ops.LeaderElectedEvent): # as traefik_route goes through app data, we need to take lead of traefik_route if our leader dies. 
self._configure_ingress() - def _on_s3_credentials_changed(self, _): - self._on_s3_changed() - - def _on_s3_credentials_gone(self, _): - self._on_s3_changed() - - def _on_s3_changed(self): - self._update_tempo_cluster() - - def _on_peers_relation_created(self, event: ops.RelationCreatedEvent): - if self._local_ip: - event.relation.data[self.unit]["local-ip"] = self._local_ip - - def _on_peers_relation_changed(self, _): - self._update_tempo_cluster() - - def _on_config_changed(self, _): - # check if certificate files haven't disappeared and recreate them if needed - self._update_tempo_cluster() - - def _on_update_status(self, _): - """Update the status of the application.""" - def _on_ingress_ready(self, _event): # whenever there's a change in ingress, we need to update all tracing relations self._update_tracing_relations() @@ -366,53 +217,9 @@ def _on_ingress_revoked(self, _event): def _on_list_receivers_action(self, event: ops.ActionEvent): res = {} for receiver in self._requested_receivers(): - res[receiver.replace("_", "-")] = ( - f"{self.ingress.external_host or self.tempo.url}:{self.tempo.receiver_ports[receiver]}" - ) + res[receiver.replace("_", "-")] = self.get_receiver_url(receiver) event.set_results(res) - # keep this event handler at the bottom - def _on_collect_unit_status(self, e: CollectStatusEvent): - # todo add [nginx.workload] statuses - - if not self.tempo.is_ready: - e.add_status(WaitingStatus("[workload.tempo] Tempo API not ready just yet...")) - - # TODO: should we set these statuses on the leader only, or on all units? - if issues := self._inconsistencies: - for issue in issues: - e.add_status(BlockedStatus("[consistency.issues]" + issue)) - e.add_status(BlockedStatus("[consistency] Unit *disabled*.")) - else: - if self.is_clustered: - # no issues: tempo is consistent - if not self.coordinator.is_recommended: - e.add_status(ActiveStatus("[coordinator] degraded")) - else: - e.add_status(ActiveStatus()) - else: - e.add_status(ActiveStatus()) - - def _on_nginx_pebble_ready(self, _) -> None: - self.nginx.configure_pebble_layer() - - def _on_nginx_prometheus_exporter_pebble_ready(self, _) -> None: - self.nginx_prometheus_exporter.configure_pebble_layer() - - def _on_event(self, event) -> None: - """A set of common configuration actions that should happen on every event.""" - if isinstance(event, CollectStatusEvent): - return - # plan layers - self.nginx.configure_pebble_layer() - self.nginx_prometheus_exporter.configure_pebble_layer() - # configure ingress - self._configure_ingress() - # update cluster relations - self._update_tempo_cluster() - # update tracing relations - self._update_tracing_relations() - ################### # UTILITY METHODS # ################### @@ -429,7 +236,7 @@ def _configure_ingress(self) -> None: self._update_tracing_relations() # notify the cluster - self._update_tempo_cluster() + self.coordinator.update_cluster() def _update_tracing_relations(self) -> None: tracing_relations = self.model.relations["tracing"] @@ -443,10 +250,10 @@ def _update_tracing_relations(self) -> None: # publish requested protocols to all relations if self.unit.is_leader(): self.tracing.publish_receivers( - [(p, self.tempo.get_receiver_url(p, self.ingress)) for p in requested_receivers] + [(p, self.get_receiver_url(p)) for p in requested_receivers] ) - self._update_tempo_cluster() + self.coordinator.update_cluster() def _requested_receivers(self) -> Tuple[ReceiverProtocol, ...]: """List what receivers we should activate, based on the active tracing relations and 
config-enabled extra receivers.""" @@ -459,95 +266,36 @@ def _requested_receivers(self) -> Tuple[ReceiverProtocol, ...]: requested_receivers = requested_protocols.intersection(set(self.tempo.receiver_ports)) return tuple(requested_receivers) - def server_cert(self) -> str: + def server_ca_cert(self) -> str: """For charm tracing.""" - self._update_server_cert() - return self.tempo.server_cert_path - - def _update_server_cert(self) -> None: - """Server certificate for charm tracing tls, if tls is enabled.""" - server_cert = Path(self.tempo.server_cert_path) - if self.tls_available: - if not server_cert.exists(): - server_cert.parent.mkdir(parents=True, exist_ok=True) - if self.cert_handler.server_cert: - server_cert.write_text(self.cert_handler.server_cert) + self._update_server_ca_cert() + return self.tempo.tls_ca_path + + def _update_server_ca_cert(self) -> None: + """Server CA certificate for charm tracing tls, if tls is enabled.""" + server_ca_cert = Path(self.tempo.tls_ca_path) + if self.coordinator.tls_available: + if self.coordinator.cert_handler.ca_cert: + server_ca_cert.parent.mkdir(parents=True, exist_ok=True) + server_ca_cert.write_text(self.coordinator.cert_handler.ca_cert) else: # tls unavailable: delete local cert - server_cert.unlink(missing_ok=True) + server_ca_cert.unlink(missing_ok=True) def tempo_otlp_http_endpoint(self) -> Optional[str]: """Endpoint at which the charm tracing information will be forwarded.""" # the charm container and the tempo workload container have apparently the same # IP, so we can talk to tempo at localhost. - if self.tempo.is_ready: + if self.coordinator and self.coordinator.tracing.is_ready(): + return self.coordinator.tracing.get_endpoint("otlp_http") + # In absence of another Tempo instance, we don't want to lose this instance's charm traces + elif self.is_workload_ready(): return f"{self._internal_url}:{self.tempo.receiver_ports['otlp_http']}" - return None - - @property - def _peers(self) -> Optional[Set[ops.model.Unit]]: - relation = self.model.get_relation("peers") - if not relation: - return None - - # self is not included in relation.units - return relation.units - - @property - def loki_endpoints_by_unit(self) -> Dict[str, str]: - """Loki endpoints from relation data in the format needed for Pebble log forwarding. - - Returns: - A dictionary of remote units and the respective Loki endpoint. 
- { - "loki/0": "http://loki:3100/loki/api/v1/push", - "another-loki/0": "http://another-loki:3100/loki/api/v1/push", - } - """ - endpoints: Dict = {} - relations: List[Relation] = self.model.relations.get("logging-consumer", []) - - for relation in relations: - for unit in relation.units: - if "endpoint" not in relation.data[unit]: - continue - endpoint = relation.data[unit]["endpoint"] - deserialized_endpoint = json.loads(endpoint) - url = deserialized_endpoint["url"] - endpoints[unit.name] = url - - return endpoints - - def _update_tempo_cluster(self) -> None: - """Build the config and publish everything to the application databag.""" - if not self._is_consistent: - logger.error("skipped tempo cluster update: inconsistent state") - return - - if not self.unit.is_leader(): - return - - kwargs = {} - - if self.tls_available: - # we share the certs in plaintext as they're not sensitive information - kwargs["ca_cert"] = self.cert_handler.ca_cert - kwargs["server_cert"] = self.cert_handler.server_cert - kwargs["privkey_secret_id"] = self.tempo_cluster.publish_privkey(VAULT_SECRET_LABEL) - - # On every function call, we always publish everything to the databag; however, if there - # are no changes, Juju will notice there's no delta and do nothing - self.tempo_cluster.publish_data( - tempo_config=self.tempo.generate_config( - self._requested_receivers(), - self._s3_config, - self.tempo_cluster.gather_addresses_by_role(), - self.tempo_cluster.gather_addresses(), - ), - loki_endpoints=self.loki_endpoints_by_unit, - # TODO tempo receiver for charm tracing - **kwargs, - ) + def requested_receivers_urls(self) -> Dict[str, str]: + """Endpoints to which the workload (and the worker charm) can push traces to.""" + return { + receiver: self.get_receiver_url(receiver) for receiver in self._requested_receivers() + } @property def _static_ingress_config(self) -> dict: @@ -571,7 +319,11 @@ def _ingress_config(self) -> dict: # TODO better matcher "rule": "ClientIP(`0.0.0.0/0`)", } - if sanitized_protocol.endswith("grpc") and not self.tls_available: + if ( + protocol == "tempo_grpc" + or receiver_protocol_to_transport_protocol.get(cast(ReceiverProtocol, protocol)) + == TransportProtocolType.grpc + ) and not self.coordinator.tls_available: # to send traces to unsecured GRPC endpoints, we need h2c # see https://doc.traefik.io/traefik/v2.0/user-guides/grpc/#with-http-h2c http_services[ @@ -590,6 +342,49 @@ def _ingress_config(self) -> dict: }, } + def get_receiver_url(self, protocol: ReceiverProtocol): + """Return the receiver endpoint URL based on the protocol. + + if ingress is used, return endpoint provided by the ingress instead. 
+ """ + protocol_type = receiver_protocol_to_transport_protocol.get(protocol) + # ingress.is_ready returns True even when traefik hasn't sent any data yet + has_ingress = ( + self.ingress.is_ready() and self.ingress.external_host and self.ingress.scheme + ) + receiver_port = self.tempo.receiver_ports[protocol] + + if has_ingress: + url = ( + self.ingress.external_host + if protocol_type == TransportProtocolType.grpc + else f"{self.ingress.scheme}://{self.ingress.external_host}" + ) + else: + url = ( + self.coordinator.hostname + if protocol_type == TransportProtocolType.grpc + else self.coordinator._internal_url + ) + + return f"{url}:{receiver_port}" + + def is_workload_ready(self): + """Whether the tempo built-in readiness check reports 'ready'.""" + if self.coordinator.tls_available: + tls, s = f" --cacert {self.tempo.tls_ca_path}", "s" + else: + tls = s = "" + + # cert is for fqdn/ingress, not for IP + cmd = f"curl{tls} http{s}://{self.coordinator.hostname}:{self.tempo.tempo_http_server_port}/ready" + + try: + out = getoutput(cmd).split("\n")[-1] + except (CalledProcessError, IndexError): + return False + return out == "ready" + if __name__ == "__main__": # pragma: nocover main(TempoCoordinatorCharm) diff --git a/src/coordinator.py b/src/coordinator.py deleted file mode 100644 index b7bcea9..0000000 --- a/src/coordinator.py +++ /dev/null @@ -1,85 +0,0 @@ -import logging -from collections import Counter -from typing import Dict, List, Optional, Set - -from tempo_cluster import TempoClusterProvider, TempoRole - -logger = logging.getLogger(__name__) - -MINIMAL_DEPLOYMENT = { - TempoRole.querier: 1, - TempoRole.query_frontend: 1, - TempoRole.ingester: 1, - TempoRole.distributor: 1, - TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, -} -"""The minimal set of roles that need to be allocated for the -deployment to be considered consistent (otherwise we set blocked).""" - -# TODO: find out what the actual recommended deployment is -RECOMMENDED_DEPLOYMENT = Counter( - { - TempoRole.querier: 1, - TempoRole.query_frontend: 1, - TempoRole.ingester: 3, - TempoRole.distributor: 1, - TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, - } -) -"""The set of roles that need to be allocated for the -deployment to be considered robust according to the official -recommendations/guidelines.""" - - -class TempoCoordinator: - """Tempo coordinator.""" - - def __init__(self, cluster_provider: TempoClusterProvider): - self._cluster_provider = cluster_provider - self._roles: Dict[TempoRole, int] = self._cluster_provider.gather_roles() - - # Whether the roles list makes up a coherent mimir deployment. - self.is_coherent = set(self._roles.keys()).issuperset(MINIMAL_DEPLOYMENT) - self.missing_roles: Set[TempoRole] = set(MINIMAL_DEPLOYMENT).difference(self._roles.keys()) - # If the coordinator is incoherent, return the roles that are missing for it to become so. - - def _is_recommended(): - for role, min_n in RECOMMENDED_DEPLOYMENT.items(): - if self._roles.get(role, 0) < min_n: - return False - return True - - self.is_recommended: bool = _is_recommended() - # Whether the present roles are a superset of the minimal deployment. - # I.E. If all required roles are assigned, and each role has the recommended amount of units. - # python>=3.11 would support roles >= RECOMMENDED_DEPLOYMENT - - def get_deployment_inconsistencies(self, has_s3: bool) -> List[str]: - """Determine whether the deployment as a whole is consistent. - - Return a list of failed consistency checks. 
- """ - return self._get_deployment_inconsistencies( - has_s3=has_s3, - coherent=self.is_coherent, - missing_roles=self.missing_roles, - ) - - @staticmethod - def _get_deployment_inconsistencies( - has_s3: bool, - coherent: bool, - missing_roles: Optional[Set[TempoRole]] = None, - ) -> List[str]: - """Determine whether the deployment as a whole is consistent. - - Return a list of failed consistency checks. - """ - failures = [] - if not has_s3: - failures.append("Tempo has no s3 integration.") - if not coherent: - failures.append(f"Incoherent coordinator: missing roles: {missing_roles}.") - return failures diff --git a/src/nginx.py b/src/nginx_config.py similarity index 61% rename from src/nginx.py rename to src/nginx_config.py index b706f9a..3172fdd 100644 --- a/src/nginx.py +++ b/src/nginx_config.py @@ -3,57 +3,37 @@ """Nginx workload.""" import logging -from typing import Any, Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, cast import crossplane -from ops import CharmBase -from ops.pebble import Layer, PathError, ProtocolError +from charms.tempo_k8s.v2.tracing import ( + ReceiverProtocol, + TransportProtocolType, + receiver_protocol_to_transport_protocol, +) +from cosl.coordinated_workers.coordinator import Coordinator +from cosl.coordinated_workers.nginx import CERT_PATH, KEY_PATH from tempo import Tempo -from tempo_cluster import TempoClusterProvider, TempoRole +from tempo_config import TempoRole logger = logging.getLogger(__name__) -NGINX_DIR = "/etc/nginx" -NGINX_CONFIG = f"{NGINX_DIR}/nginx.conf" -KEY_PATH = f"{NGINX_DIR}/certs/server.key" -CERT_PATH = f"{NGINX_DIR}/certs/server.cert" -CA_CERT_PATH = f"{NGINX_DIR}/certs/ca.cert" - - -class Nginx: +class NginxConfig: """Helper class to manage the nginx workload.""" - config_path = NGINX_CONFIG - - def __init__(self, charm: CharmBase, cluster_provider: TempoClusterProvider, server_name: str): - self._charm = charm - self.cluster_provider = cluster_provider + def __init__(self, server_name: str): self.server_name = server_name - self._container = self._charm.unit.get_container("nginx") - - def configure_pebble_layer(self) -> None: - """Configure pebble layer.""" - new_config: str = self.config() - should_restart: bool = self._has_config_changed(new_config) - if self._container.can_connect(): - self._container.push(self.config_path, new_config, make_dirs=True) # type: ignore - self._container.add_layer("nginx", self.layer, combine=True) - self._container.autostart() - if should_restart: - logger.info("new nginx config: reloading the service") - self.reload() - - def config(self) -> str: + def config(self, coordinator: Coordinator) -> str: """Build and return the Nginx configuration.""" - full_config = self._prepare_config() + full_config = self._prepare_config(coordinator) return crossplane.build(full_config) - def _prepare_config(self) -> List[dict]: + def _prepare_config(self, coordinator: Coordinator) -> List[dict]: log_level = "error" - addresses_by_role = self.cluster_provider.gather_addresses_by_role() + addresses_by_role = coordinator.cluster.gather_addresses_by_role() # build the complete configuration full_config = [ {"directive": "worker_processes", "args": ["5"]}, @@ -102,52 +82,14 @@ def _prepare_config(self) -> List[dict]: }, {"directive": "proxy_read_timeout", "args": ["300"]}, # server block - *self._servers(addresses_by_role), + *self._build_servers_config( + addresses_by_role, coordinator.nginx.are_certificates_on_disk + ), ], }, ] return full_config - def _has_config_changed(self, 
new_config: str) -> bool: - """Return True if the passed config differs from the one on disk.""" - if not self._container.can_connect(): - logger.debug("Could not connect to Nginx container") - return False - - try: - current_config = self._container.pull(self.config_path).read() - except (ProtocolError, PathError) as e: - logger.warning( - "Could not check the current nginx configuration due to " - "a failure in retrieving the file: %s", - e, - ) - return False - - return current_config != new_config - - def reload(self) -> None: - """Reload the nginx config without restarting the service.""" - self._container.exec(["nginx", "-s", "reload"]) - - @property - def layer(self) -> Layer: - """Return the Pebble layer for Nginx.""" - return Layer( - { - "summary": "nginx layer", - "description": "pebble config layer for Nginx", - "services": { - "nginx": { - "override": "replace", - "summary": "nginx", - "command": "nginx -g 'daemon off;'", - "startup": "enabled", - } - }, - } - ) - def _log_verbose(self, verbose: bool = True) -> List[Dict[str, Any]]: if verbose: return [{"directive": "access_log", "args": ["/dev/stderr", "main"]}] @@ -167,28 +109,6 @@ def _upstreams(self, addresses_by_role: Dict[str, Set[str]]) -> List[Dict[str, A addresses_mapped_to_upstreams = {} nginx_upstreams = [] addresses_mapped_to_upstreams = addresses_by_role.copy() - if TempoRole.all in addresses_by_role.keys(): - # for all, we add addresses to existing upstreams for distributor / query_frontend or create the set - if TempoRole.distributor in addresses_mapped_to_upstreams: - addresses_mapped_to_upstreams[TempoRole.distributor] = ( - addresses_mapped_to_upstreams[TempoRole.distributor].union( - addresses_by_role[TempoRole.all] - ) - ) - else: - addresses_mapped_to_upstreams[TempoRole.distributor] = addresses_by_role[ - TempoRole.all - ] - if TempoRole.query_frontend in addresses_mapped_to_upstreams: - addresses_mapped_to_upstreams[TempoRole.query_frontend] = ( - addresses_mapped_to_upstreams[TempoRole.query_frontend].union( - addresses_by_role[TempoRole.all] - ) - ) - else: - addresses_mapped_to_upstreams[TempoRole.query_frontend] = addresses_by_role[ - TempoRole.all - ] if TempoRole.distributor in addresses_mapped_to_upstreams.keys(): nginx_upstreams.extend( self._distributor_upstreams(addresses_mapped_to_upstreams[TempoRole.distributor]) @@ -276,23 +196,34 @@ def _listen_args(self, port: int, ipv6: bool, ssl: bool, http2: bool) -> List[st args.append("http2") return args - def _servers(self, addresses_by_role: Dict[str, Set[str]]) -> List[Dict[str, Any]]: + def _build_servers_config( + self, addresses_by_role: Dict[str, Set[str]], tls: bool = False + ) -> List[Dict[str, Any]]: servers = [] roles = addresses_by_role.keys() - - if TempoRole.distributor.value in roles or TempoRole.all.value in roles: + # generate a server config for receiver protocols (9411, 4317, 4318, 14268, 14250) + if TempoRole.distributor.value in roles: for protocol, port in Tempo.receiver_ports.items(): - servers.append(self._server(port, protocol.replace("_", "-"), "grpc" in protocol)) - if TempoRole.query_frontend.value in roles or TempoRole.all.value in roles: + servers.append( + self._build_server_config( + port, protocol.replace("_", "-"), self._is_protocol_grpc(protocol), tls + ) + ) + # generate a server config for the Tempo server protocols (3200, 9096) + if TempoRole.query_frontend.value in roles: for protocol, port in Tempo.server_ports.items(): - servers.append(self._server(port, protocol.replace("_", "-"), "grpc" in protocol)) + 
servers.append( + self._build_server_config( + port, protocol.replace("_", "-"), self._is_protocol_grpc(protocol), tls + ) + ) return servers - def _server(self, port: int, upstream: str, grpc: bool = False) -> Dict[str, Any]: + def _build_server_config( + self, port: int, upstream: str, grpc: bool = False, tls: bool = False + ) -> Dict[str, Any]: auth_enabled = False - tls = self.tls_ready - if tls: return { "directive": "server", @@ -329,25 +260,14 @@ def _server(self, port: int, upstream: str, grpc: bool = False) -> Dict[str, Any ], } - @property - def tls_ready(self) -> bool: - """Whether cert, key, and ca paths are found on disk and Nginx is ready to use tls.""" - if not self._container.can_connect(): - return False - return all( - self._container.exists(tls_path) for tls_path in (KEY_PATH, CERT_PATH, CA_CERT_PATH) - ) - - def configure_tls(self, private_key: str, server_cert: str, ca_cert: str) -> None: - """Save the certificates file to disk and run update-ca-certificates.""" - if self._container.can_connect(): - self._container.push(KEY_PATH, private_key, make_dirs=True) - self._container.push(CERT_PATH, server_cert, make_dirs=True) - self._container.push(CA_CERT_PATH, ca_cert, make_dirs=True) - - def delete_certificates(self) -> None: - """Delete the certificate files from disk and run update-ca-certificates.""" - if self._container.can_connect(): - self._container.remove_path(CERT_PATH, recursive=True) - self._container.remove_path(KEY_PATH, recursive=True) - self._container.remove_path(CA_CERT_PATH, recursive=True) + def _is_protocol_grpc(self, protocol: str) -> bool: + """ + Return True if the given protocol is gRPC + """ + if ( + protocol == "tempo_grpc" + or receiver_protocol_to_transport_protocol.get(cast(ReceiverProtocol, protocol)) + == TransportProtocolType.grpc + ): + return True + return False diff --git a/src/nginx_prometheus_exporter.py b/src/nginx_prometheus_exporter.py deleted file mode 100644 index 7ce7b2c..0000000 --- a/src/nginx_prometheus_exporter.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2024 Canonical -# See LICENSE file for licensing details. 
-"""Nginx Prometheus exporter workload.""" - -import logging - -from ops import CharmBase -from ops.pebble import Layer - -logger = logging.getLogger(__name__) - -NGINX_PROMETHEUS_EXPORTER_PORT = "9113" - - -class NginxPrometheusExporter: - """Helper class to manage the nginx prometheus exporter workload.""" - - def __init__(self, charm: CharmBase) -> None: - self._charm = charm - self._container = self._charm.unit.get_container("nginx-prometheus-exporter") - - def configure_pebble_layer(self) -> None: - """Configure pebble layer.""" - self._container.add_layer("nginx-prometheus-exporter", self.layer, combine=True) - self._container.autostart() - - @property - def layer(self) -> Layer: - """Return the Pebble layer for Nginx Prometheus exporter.""" - scheme = "https" if self._charm.tls_available else "http" # type: ignore - return Layer( - { - "summary": "nginx prometheus exporter layer", - "description": "pebble config layer for Nginx Prometheus exporter", - "services": { - "nginx": { - "override": "replace", - "summary": "nginx prometheus exporter", - "command": f"nginx-prometheus-exporter --no-nginx.ssl-verify --web.listen-address=:{NGINX_PROMETHEUS_EXPORTER_PORT} --nginx.scrape-uri={scheme}://127.0.0.1:3200/status", - "startup": "enabled", - } - }, - } - ) diff --git a/src/prometheus_alert_rules/alerts.yaml b/src/prometheus_alert_rules/alerts.yaml deleted file mode 100644 index 2c21d50..0000000 --- a/src/prometheus_alert_rules/alerts.yaml +++ /dev/null @@ -1,187 +0,0 @@ -groups: -- name: tempo_alerts - rules: - - alert: "TempoBlockListRisingQuickly" - expr: | - avg by (job, instance)(tempodb_blocklist_length) / avg by (job, instance)(tempodb_blocklist_length offset 7d) > 1.4 - for: "15m" - labels: - severity: "critical" - annotations: - summary: "Tempo block list rising quickly (instance {{ $labels.instance }})" - description: "The {{ $labels.job }} is experiencing a 40% rise in tempo blocklist length over the last 7 days. Consider scaling compactors." - - alert: TempoCompactionsFailing - expr: sum by (job, instance)(increase(tempodb_compaction_errors_total{}[1h])) > 2 and sum by (job, instance)(increase(tempodb_compaction_errors_total{}[5m])) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo compactions failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 compactions failures in the past hour." - - alert: TempoCompactorUnhealthy - expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="compactor"}) > 0 - for: 15m - labels: - severity: critical - annotations: - summary: Tempo unhealthy compactor(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy compactor(s)." - - alert: TempoCompactorsTooManyOutstandingBlocks - expr: sum by (tenant) (tempodb_compaction_outstanding_blocks) / ignoring(tenant) group_left count(tempo_build_info) > 100 - for: "6h" - labels: - severity: warning - annotations: - summary: Tempo too many outstanding compaction blocks (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having too many outstanding compaction blocks for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors." 
- - alert: TempoDiscardedSpans - expr: 100 * sum by (instance,job)(rate(tempo_discarded_spans_total[5m])) / sum by (instance,job)(rate(tempo_distributor_spans_received_total[5m])) > 5 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo spans insertion failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% discard of spans." - - alert: TempoDistributorPushLatency - expr: histogram_quantile(0.99, sum by(le, job, instance) (rate(tempo_distributor_push_duration_seconds_bucket[5m]))) > 3 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo distributor push latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} distributor push is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoDistributorUnhealthy - expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 - for: 15m - labels: - severity: critical - annotations: - summary: Tempo unhealthy distributor(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy distributor(s)." - - alert: TempoFailedIngestingRequests - expr: sum by (job,instance)(increase (tempo_ingester_traces_created_total[5m])) / sum by (instance,job)(rate(tempo_request_duration_seconds_count{route='/tempopb.Pusher/PushBytesV2'}[5m])) == 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo pushing traces to ingester failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing failure in distributors pushing traces to the ingesters." - - alert: TempoFrontendClients - expr: tempo_query_frontend_connected_clients == 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo frontend connected clients (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} has no frontend connected clients." - - alert: TempoFrontendQueueLatency - expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_query_frontend_queue_duration_seconds_bucket[15m]))) > 2 - for: "15m" - labels: - severity: warning - annotations: - summary: Tempo frontend queue latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} frontend queue is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoIngesterFlushLatency - expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_ingester_flush_duration_seconds_bucket[5m]))) > 5 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo ingester flush latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} ingester flush is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoIngesterFlushesFailing - expr: sum by (instance,job)(increase(tempo_ingester_flush_failed_retries_total[1h])) > 2 and sum by(instance,job)(increase(tempo_ingester_flush_failed_retries_total[5m])) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo ingester flush retries failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 flush retries failures in the past hour." 
- - alert: TempoIngesterFlushesUnhealthy - expr: sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[1h])) > 2 and sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[5m])) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Tempo ingester flush failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 ingester flush failures in the past hour." - - alert: TempoIngestersUnhealthy - expr: max by (instance,job)(tempo_ring_members{state="Unhealthy", name="ingester"}) > 0 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo unhealthy ingester(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy ingester(s)." - - alert: TempoKVRequestErrors - expr: 100 * sum(rate(tempo_kv_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,instance,job) / sum(rate(tempo_kv_request_duration_seconds_count[5m])) by (route,instance,job) > 10 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo kv store request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} KV store requests is experiencing {{ printf \"%.2f\" $value }}% error rate." - - alert: TempoTargetMissing - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed." - - alert: TempoNoTenantIndexBuilders - expr: sum by (tenant,job,instance) (tempodb_blocklist_tenant_index_builder) == 0 and max by (tenant,job,instance)(tempodb_blocklist_length) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo tenant index builder failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having no tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale." - - alert: TempoRequestErrors - expr: 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,job,instance) / sum(rate(tempo_request_duration_seconds_count[5m])) by (route,job,instance) > 10 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% request error rate." - - alert: TempoRequestLatency - expr: histogram_quantile(0.99, sum by(le, route,job,instance)(rate(tempo_request_duration_seconds_bucket[5m]))) > 5 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoRetentionsFailing - expr: sum by (job,instance)(increase(tempodb_retention_errors_total[1h])) > 2 and sum by (job,instance)(increase(tempodb_retention_errors_total[5m])) > 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo retentions failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 retention failures in the past hour." 
- - alert: TempoTCPConnectionsLimit - expr: 100 * tempo_tcp_connections / tempo_tcp_connections_limit >= 80 and tempo_tcp_connections_limit > 0 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo reaching max number of tcp connections (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is reaching {{ printf \"%.2f\" $value }}% of max tcp {{ $labels.protocol }} connections." - - alert: TempoTenantIndexTooOld - expr: max by(tenant,instance,job) (tempodb_blocklist_tenant_index_age_seconds) > 600 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo tenant old index (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing a tenant {{ $labels.tenant }} with a too old index age of 600 seconds." - - alert: TempoUserConfigurableOverridesReloadFailing - expr: sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0 - labels: - severity: critical - annotations: - summary: Tempo user config override reload failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 5 user-configurable override reload failures in the past hour." diff --git a/src/prometheus_alert_rules/tempo_workers/alerts.yaml b/src/prometheus_alert_rules/workers/alerts.yaml similarity index 82% rename from src/prometheus_alert_rules/tempo_workers/alerts.yaml rename to src/prometheus_alert_rules/workers/alerts.yaml index 65109a2..cbe3095 100644 --- a/src/prometheus_alert_rules/tempo_workers/alerts.yaml +++ b/src/prometheus_alert_rules/workers/alerts.yaml @@ -235,6 +235,190 @@ groups: labels: component: ingester severity: warning + - alert: "TempoBlockListRisingQuickly" + expr: | + avg by (job, instance)(tempodb_blocklist_length) / avg by (job, instance)(tempodb_blocklist_length offset 7d) > 1.4 + for: "15m" + labels: + severity: "critical" + annotations: + summary: "Tempo block list rising quickly (instance {{ $labels.instance }})" + description: "The {{ $labels.job }} is experiencing a 40% rise in tempo blocklist length over the last 7 days. Consider scaling compactors." + - alert: TempoCompactionsFailing + expr: sum by (job, instance)(increase(tempodb_compaction_errors_total{}[1h])) > 2 and sum by (job, instance)(increase(tempodb_compaction_errors_total{}[5m])) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo compactions failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 compactions failures in the past hour." + - alert: TempoCompactorUnhealthy + expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="compactor"}) > 0 + for: 15m + labels: + severity: critical + annotations: + summary: Tempo unhealthy compactor(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy compactor(s)." + - alert: TempoCompactorsTooManyOutstandingBlocks + expr: sum by (tenant) (tempodb_compaction_outstanding_blocks) / ignoring(tenant) group_left count(tempo_build_info) > 100 + for: "6h" + labels: + severity: warning + annotations: + summary: Tempo too many outstanding compaction blocks (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having too many outstanding compaction blocks for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors." 
+ - alert: TempoDiscardedSpans + expr: 100 * sum by (instance,job)(rate(tempo_discarded_spans_total[5m])) / sum by (instance,job)(rate(tempo_distributor_spans_received_total[5m])) > 5 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo spans insertion failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% discard of spans." + - alert: TempoDistributorPushLatency + expr: histogram_quantile(0.99, sum by(le, job, instance) (rate(tempo_distributor_push_duration_seconds_bucket[5m]))) > 3 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo distributor push latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} distributor push is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoDistributorUnhealthy + expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 + for: 15m + labels: + severity: critical + annotations: + summary: Tempo unhealthy distributor(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy distributor(s)." + - alert: TempoFailedIngestingRequests + expr: sum by (job,instance)(increase (tempo_ingester_traces_created_total[5m])) / sum by (instance,job)(rate(tempo_request_duration_seconds_count{route='/tempopb.Pusher/PushBytesV2'}[5m])) == 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo pushing traces to ingester failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing failure in distributors pushing traces to the ingesters." + - alert: TempoFrontendClients + expr: tempo_query_frontend_connected_clients == 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo frontend connected clients (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} has no frontend connected clients." + - alert: TempoFrontendQueueLatency + expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_query_frontend_queue_duration_seconds_bucket[15m]))) > 2 + for: "15m" + labels: + severity: warning + annotations: + summary: Tempo frontend queue latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} frontend queue is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoIngesterFlushLatency + expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_ingester_flush_duration_seconds_bucket[5m]))) > 5 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo ingester flush latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} ingester flush is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoIngesterFlushesFailing + expr: sum by (instance,job)(increase(tempo_ingester_flush_failed_retries_total[1h])) > 2 and sum by(instance,job)(increase(tempo_ingester_flush_failed_retries_total[5m])) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo ingester flush retries failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 flush retries failures in the past hour." 
+ - alert: TempoIngesterFlushesUnhealthy + expr: sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[1h])) > 2 and sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[5m])) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: Tempo ingester flush failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 ingester flush failures in the past hour." + - alert: TempoIngestersUnhealthy + expr: max by (instance,job)(tempo_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo unhealthy ingester(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy ingester(s)." + - alert: TempoKVRequestErrors + expr: 100 * sum(rate(tempo_kv_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,instance,job) / sum(rate(tempo_kv_request_duration_seconds_count[5m])) by (route,instance,job) > 10 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo kv store request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} KV store requests is experiencing {{ printf \"%.2f\" $value }}% error rate." + - alert: TempoTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed." + - alert: TempoNoTenantIndexBuilders + expr: sum by (tenant,job,instance) (tempodb_blocklist_tenant_index_builder) == 0 and max by (tenant,job,instance)(tempodb_blocklist_length) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo tenant index builder failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having no tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale." + - alert: TempoRequestErrors + expr: 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,job,instance) / sum(rate(tempo_request_duration_seconds_count[5m])) by (route,job,instance) > 10 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% request error rate." + - alert: TempoRequestLatency + expr: histogram_quantile(0.99, sum by(le, route,job,instance)(rate(tempo_request_duration_seconds_bucket[5m]))) > 5 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoRetentionsFailing + expr: sum by (job,instance)(increase(tempodb_retention_errors_total[1h])) > 2 and sum by (job,instance)(increase(tempodb_retention_errors_total[5m])) > 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo retentions failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 retention failures in the past hour." 
+ - alert: TempoTCPConnectionsLimit + expr: 100 * tempo_tcp_connections / tempo_tcp_connections_limit >= 80 and tempo_tcp_connections_limit > 0 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo reaching max number of tcp connections (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is reaching {{ printf \"%.2f\" $value }}% of max tcp {{ $labels.protocol }} connections." + - alert: TempoTenantIndexTooOld + expr: max by(tenant,instance,job) (tempodb_blocklist_tenant_index_age_seconds) > 600 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo tenant old index (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing a tenant {{ $labels.tenant }} with a too old index age of 600 seconds." + - alert: TempoUserConfigurableOverridesReloadFailing + expr: sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0 + labels: + severity: critical + annotations: + summary: Tempo user config override reload failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 5 user-configurable override reload failures in the past hour." - name: tempo_instance_limits_alerts rules: - alert: TempoIngesterReachingSeriesLimit diff --git a/src/prometheus_alert_rules/tempo_workers/rules.yaml b/src/prometheus_alert_rules/workers/rules.yaml similarity index 100% rename from src/prometheus_alert_rules/tempo_workers/rules.yaml rename to src/prometheus_alert_rules/workers/rules.yaml diff --git a/src/tempo.py b/src/tempo.py index fd17ab2..fabc139 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -4,18 +4,13 @@ """Tempo workload configuration and client.""" import logging -import socket -from subprocess import CalledProcessError, getoutput -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Callable, Dict, Optional, Sequence, Set, Tuple -from charms.tempo_k8s.v2.tracing import ( - ReceiverProtocol, - receiver_protocol_to_transport_protocol, -) -from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer +import yaml +from charms.tempo_k8s.v2.tracing import ReceiverProtocol +from cosl.coordinated_workers.coordinator import Coordinator import tempo_config -from tempo_cluster import TempoRole logger = logging.getLogger(__name__) @@ -23,21 +18,13 @@ class Tempo: """Class representing the Tempo client workload configuration.""" - config_path = "/etc/tempo/tempo.yaml" - - # cert path on charm container - server_cert_path = "/usr/local/share/ca-certificates/ca.crt" - # cert paths on tempo container - tls_cert_path = "/etc/tempo/tls/server.crt" - tls_key_path = "/etc/tempo/tls/server.key" + tls_cert_path = "/etc/worker/server.cert" + tls_key_path = "/etc/worker/private.key" tls_ca_path = "/usr/local/share/ca-certificates/ca.crt" wal_path = "/etc/tempo/tempo_wal" - log_path = "/var/log/tempo.log" - tempo_ready_notice_key = "canonical.com/tempo/workload-ready" - s3_relation_name = "s3" s3_bucket_name = "tempo" memberlist_port = 7946 @@ -47,31 +34,24 @@ class Tempo: "tempo_grpc": 9096, # default grpc listen port is 9095, but that conflicts with promtail. } + # ports defined are the default ports specified in + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver + # for each of the below receivers. 
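+ # For example, a requirer asking for 'otlp_http' would be handed an endpoint of the form
+ # <scheme>://<hostname>:4318/v1/traces, while a grpc-based receiver such as 'jaeger_grpc'
+ # is reachable at <hostname>:14250 (cfr. the protocols_endpoints mapping in
+ # tests/integration/helpers.py); the hostname/scheme placeholders here are illustrative.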
receiver_ports: Dict[str, int] = { "zipkin": 9411, "otlp_grpc": 4317, "otlp_http": 4318, "jaeger_thrift_http": 14268, - # todo if necessary add support for: - # "kafka": 42, - # "jaeger_grpc": 14250, - # "opencensus": 43, - # "jaeger_thrift_compact": 44, - # "jaeger_thrift_binary": 45, + "jaeger_grpc": 14250, } all_ports = {**server_ports, **receiver_ports} def __init__( self, - external_host: Optional[str] = None, - use_tls: bool = False, + requested_receivers: Callable[[], "Tuple[ReceiverProtocol, ...]"], ): - # ports source: https://github.com/grafana/tempo/blob/main/example/docker-compose/local/docker-compose.yaml - - # fqdn, if an ingress is not available, else the ingress address. - self._external_hostname = external_host or socket.getfqdn() - self.use_tls = use_tls + self._receivers_getter = requested_receivers @property def tempo_http_server_port(self) -> int: @@ -83,91 +63,29 @@ def tempo_grpc_server_port(self) -> int: """Return the receiver port for the built-in tempo_http protocol.""" return self.server_ports["tempo_grpc"] - def get_external_ports(self, service_name_prefix: str) -> List[Tuple[str, int, int]]: - """List of service names and port mappings for the kubernetes service patch. - - Includes the tempo server as well as the receiver ports. - """ - # todo allow remapping ports? - all_ports = {**self.server_ports} - return [ - ( - (f"{service_name_prefix}-{service_name}").replace("_", "-"), - all_ports[service_name], - all_ports[service_name], - ) - for service_name in all_ports - ] - - @property - def url(self) -> str: - """Base url at which the tempo server is locally reachable over http.""" - scheme = "https" if self.use_tls else "http" - return f"{scheme}://{self._external_hostname}" - - def get_receiver_url(self, protocol: ReceiverProtocol, ingress: TraefikRouteRequirer): - """Return the receiver endpoint URL based on the protocol. - - if ingress is used, return endpoint provided by the ingress instead. - """ - protocol_type = receiver_protocol_to_transport_protocol.get(protocol) - # ingress.is_ready returns True even when traefik hasn't sent any data yet - has_ingress = ingress.is_ready() and ingress.external_host and ingress.scheme - receiver_port = self.receiver_ports[protocol] - - if has_ingress: - url = ( - ingress.external_host - if protocol_type == "grpc" - else f"{ingress.scheme}://{ingress.external_host}" - ) - else: - url = self._external_hostname if protocol_type == "grpc" else self.url - - return f"{url}:{receiver_port}" - - def _build_server_config(self): - server_config = tempo_config.Server( - http_listen_port=self.tempo_http_server_port, - # we need to specify a grpc server port even if we're not using the grpc server, - # otherwise it will default to 9595 and make promtail bork - grpc_listen_port=self.tempo_grpc_server_port, - ) - - if self.use_tls: - server_tls_config = tempo_config.TLS( - cert_file=str(self.tls_cert_path), - key_file=str(self.tls_key_path), - client_ca_file=str(self.tls_ca_path), - ) - server_config.http_tls_config = server_tls_config - server_config.grpc_tls_config = server_tls_config - - return server_config - - def generate_config( + def config( self, - receivers: Sequence[ReceiverProtocol], - s3_config: dict, - roles_addresses: Dict[str, Set[str]], - peers: Optional[Set[str]] = None, - ) -> Dict[str, Any]: + coordinator: Coordinator, + ) -> str: """Generate the Tempo configuration. Only activate the provided receivers. 
""" - config = tempo_config.Tempo( + + config = tempo_config.TempoConfig( auth_enabled=False, - server=self._build_server_config(), - distributor=self._build_distributor_config(receivers), + server=self._build_server_config(coordinator.tls_available), + distributor=self._build_distributor_config( + self._receivers_getter(), coordinator.tls_available + ), ingester=self._build_ingester_config(), - memberlist=self._build_memberlist_config(peers), + memberlist=self._build_memberlist_config(coordinator.cluster.gather_addresses()), compactor=self._build_compactor_config(), - querier=self._build_querier_config(roles_addresses), - storage=self._build_storage_config(s3_config), + querier=self._build_querier_config(coordinator.cluster.gather_addresses_by_role()), + storage=self._build_storage_config(coordinator._s3_config), ) - if self.use_tls: + if coordinator.tls_available: # cfr: # https://grafana.com/docs/tempo/latest/configuration/network/tls/#client-configuration tls_config = { @@ -176,7 +94,7 @@ def generate_config( "tls_key_path": self.tls_key_path, "tls_ca_path": self.tls_ca_path, # try with fqdn? - "tls_server_name": self._external_hostname, + "tls_server_name": coordinator.hostname, } config.ingester_client = tempo_config.Client( grpc_client_config=tempo_config.ClientTLS(**tls_config) @@ -189,7 +107,26 @@ def generate_config( ) config.memberlist = config.memberlist.model_copy(update=tls_config) - return config.model_dump(mode="json", exclude_none=True) + return yaml.dump(config.model_dump(mode="json", by_alias=True, exclude_none=True)) + + def _build_server_config(self, use_tls=False): + server_config = tempo_config.Server( + http_listen_port=self.tempo_http_server_port, + # we need to specify a grpc server port even if we're not using the grpc server, + # otherwise it will default to 9595 and make promtail bork + grpc_listen_port=self.tempo_grpc_server_port, + ) + + if use_tls: + server_tls_config = tempo_config.TLS( + cert_file=str(self.tls_cert_path), + key_file=str(self.tls_key_path), + client_ca_file=str(self.tls_ca_path), + ) + server_config.http_tls_config = server_tls_config + server_config.grpc_tls_config = server_tls_config + + return server_config def _build_storage_config(self, s3_config: dict): storage_config = tempo_config.TraceStorage( @@ -201,39 +138,23 @@ def _build_storage_config(self, s3_config: dict): queue_depth=20000, ), backend="s3", - s3=tempo_config.S3( - bucket=s3_config["bucket"], - access_key=s3_config["access-key"], - endpoint=s3_config["endpoint"], - secret_key=s3_config["secret-key"], - ), + s3=tempo_config.S3(**s3_config), # starting from Tempo 2.4, we need to use at least parquet v3 to have search capabilities (Grafana support) # https://grafana.com/docs/tempo/latest/release-notes/v2-4/#vparquet3-is-now-the-default-block-format block=tempo_config.Block(version="vParquet3"), ) return tempo_config.Storage(trace=storage_config) - def is_ready(self): - """Whether the tempo built-in readiness check reports 'ready'.""" - if self.use_tls: - tls, s = f" --cacert {self.server_cert_path}", "s" - else: - tls = s = "" - - # cert is for fqdn/ingress, not for IP - cmd = f"curl{tls} http{s}://{self._external_hostname}:{self.tempo_http_server_port}/ready" - - try: - out = getoutput(cmd).split("\n")[-1] - except (CalledProcessError, IndexError): - return False - return out == "ready" - def _build_querier_config(self, roles_addresses: Dict[str, Set[str]]): """Build querier config""" - addr = "localhost" - if TempoRole.query_frontend in roles_addresses.keys(): - addr = 
roles_addresses[TempoRole.query_frontend].pop() + # if distributor and query-frontend have the same address, then the mode of operation is 'all'. + query_frontend_addresses = roles_addresses.get(tempo_config.TempoRole.query_frontend) + distributor_addresses = roles_addresses.get(tempo_config.TempoRole.distributor) + + if not query_frontend_addresses or query_frontend_addresses == distributor_addresses: + addr = "localhost" + else: + addr = query_frontend_addresses.pop() return tempo_config.Querier( frontend_worker=tempo_config.FrontendWorker( @@ -274,7 +195,9 @@ def _build_ingester_config(self): max_block_duration="30m", ) - def _build_distributor_config(self, receivers: Sequence[ReceiverProtocol]): # noqa: C901 + def _build_distributor_config( + self, receivers: Sequence[ReceiverProtocol], use_tls=False + ): # noqa: C901 """Build distributor config""" # receivers: the receivers we have to enable because the requirers we're related to # intend to use them. It already includes receivers that are always enabled @@ -284,7 +207,7 @@ def _build_distributor_config(self, receivers: Sequence[ReceiverProtocol]): # n if not receivers_set: logger.warning("No receivers set. Tempo will be up but not functional.") - if self.use_tls: + if use_tls: receiver_config = { "tls": { "ca_file": str(self.tls_ca_path), @@ -299,8 +222,6 @@ def _build_distributor_config(self, receivers: Sequence[ReceiverProtocol]): # n if "zipkin" in receivers_set: config["zipkin"] = receiver_config - if "opencensus" in receivers_set: - config["opencensus"] = receiver_config otlp_config = {} if "otlp_http" in receivers_set: diff --git a/src/tempo_cluster.py b/src/tempo_cluster.py deleted file mode 100644 index bb474c1..0000000 --- a/src/tempo_cluster.py +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical -# See LICENSE file for licensing details. - -"""This module contains an endpoint wrapper class for the requirer side of the ``tempo-cluster`` relation. - -As this relation is cluster-internal and not intended for third-party charms to interact with -`tempo-coordinator-k8s`, its only user will be the tempo-worker-k8s charm. As such, -it does not live in a charm lib as most other relation endpoint wrappers do. -""" -import collections -import json -import logging -from enum import Enum, unique -from typing import Any, Dict, List, MutableMapping, Optional, Set - -import ops -import pydantic - -# The only reason we need the tracing lib is this enum. Not super nice. -from charms.tempo_k8s.v2.tracing import ReceiverProtocol -from ops import EventSource, Object, ObjectEvents -from pydantic import BaseModel, ConfigDict - -log = logging.getLogger("tempo_cluster") - -DEFAULT_ENDPOINT_NAME = "tempo-cluster" -BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} - - -# TODO: inherit enum.StrEnum when jammy is no longer supported. -# https://docs.python.org/3/library/enum.html#enum.StrEnum -@unique -class TempoRole(str, Enum): - """Tempo component role names. - - References: - arch: - -> https://grafana.com/docs/tempo/latest/operations/architecture/ - config: - -> https://grafana.com/docs/tempo/latest/configuration/#server - """ - - # scalable-single-binary is a bit too long to type - all = "all" # default, meta-role. gets remapped to scalable-single-binary by the worker. 
- - querier = "querier" - query_frontend = "query-frontend" - ingester = "ingester" - distributor = "distributor" - compactor = "compactor" - metrics_generator = "metrics-generator" - - @property - def all_nonmeta(self): - return ( - TempoRole.querier, - TempoRole.query_frontend, - TempoRole.ingester, - TempoRole.distributor, - TempoRole.compactor, - TempoRole.metrics_generator, - ) - - -class ConfigReceivedEvent(ops.EventBase): - """Event emitted when the "tempo-cluster" provider has shared a new tempo config.""" - - config: Dict[str, Any] - """The tempo config.""" - - def __init__(self, handle: ops.framework.Handle, config: Dict[str, Any]): - super().__init__(handle) - self.config = config - - def snapshot(self) -> Dict[str, Any]: - """Used by the framework to serialize the event to disk. - - Not meant to be called by charm code. - """ - return {"config": json.dumps(self.config)} - - def restore(self, snapshot: Dict[str, Any]): - """Used by the framework to deserialize the event from disk. - - Not meant to be called by charm code. - """ - self.relation = json.loads(snapshot["config"]) # noqa - - -class TempoClusterError(Exception): - """Base class for exceptions raised by this module.""" - - -class DataValidationError(TempoClusterError): - """Raised when relation databag validation fails.""" - - -class DatabagAccessPermissionError(TempoClusterError): - """Raised when a follower attempts to write leader settings.""" - - -class _JujuTopologyModel(pydantic.BaseModel): - """_JujuTopologyModel.""" - - model: str - model_uuid: str - application: str - charm_name: str - unit: str - - -# DatabagModel implementation from traefik.v1.ingress charm lib. -PYDANTIC_IS_V1 = int(pydantic.version.VERSION.split(".")[0]) < 2 -if PYDANTIC_IS_V1: - - class DatabagModel(BaseModel): # type: ignore - """Base databag model.""" - - class Config: - """Pydantic config.""" - - allow_population_by_field_name = True - """Allow instantiating this class by field name (instead of forcing alias).""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {f.alias for f in cls.__fields__.values()} # type: ignore - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - log.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.parse_raw(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - log.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - for key, value in self.dict(by_alias=True, exclude_defaults=True).items(): # type: ignore - databag[key] = json.dumps(value) - - return databag - -else: - from pydantic import ConfigDict - - class DatabagModel(BaseModel): - """Base databag model.""" - - model_config = ConfigDict( - # tolerate additional keys in databag - extra="ignore", - # Allow instantiating this class by field name (instead of forcing alias). 
- populate_by_name=True, - ) # type: ignore - """Pydantic config.""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {(f.alias or n) for n, f in cls.model_fields.items()} # type: ignore - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - log.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.model_validate_json(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - log.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - dct = self.model_dump(mode="json", by_alias=True, exclude_defaults=True) # type: ignore - databag.update({k: json.dumps(v) for k, v in dct.items()}) - return databag - - -class TempoClusterRequirerAppData(DatabagModel): - """TempoClusterRequirerAppData.""" - - role: TempoRole - - -class TempoClusterRequirerUnitData(DatabagModel): - """TempoClusterRequirerUnitData.""" - - juju_topology: _JujuTopologyModel - address: str - - -class TempoClusterProviderAppData(DatabagModel): - """TempoClusterProviderAppData.""" - - tempo_config: Dict[str, Any] - loki_endpoints: Optional[Dict[str, str]] = None - ca_cert: Optional[str] = None - server_cert: Optional[str] = None - privkey_secret_id: Optional[str] = None - tempo_receiver: Optional[Dict[ReceiverProtocol, str]] = None - - -class TempoClusterChangedEvent(ops.EventBase): - """Event emitted when any "tempo-cluster" relation event fires.""" - - -class TempoClusterProviderEvents(ObjectEvents): - """Events emitted by the TempoClusterProvider "tempo-cluster" endpoint wrapper.""" - - changed = EventSource(TempoClusterChangedEvent) - - -class TempoClusterProvider(Object): - """``tempo-cluster`` provider endpoint wrapper.""" - - on = TempoClusterProviderEvents() # type: ignore - - def __init__( - self, - charm: ops.CharmBase, - key: Optional[str] = None, - endpoint: str = DEFAULT_ENDPOINT_NAME, - ): - super().__init__(charm, key or endpoint) - self._charm = charm - - # filter out common unhappy relation states - self._relations: List[ops.Relation] = [ - rel for rel in self.model.relations[endpoint] if (rel.app and rel.data) - ] - - # we coalesce all tempo-cluster-relation-* events into a single cluster-changed API. - # the coordinator uses a common exit hook reconciler, that's why. 
- self.framework.observe( - self._charm.on[endpoint].relation_joined, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_changed, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_departed, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_broken, self._on_tempo_cluster_changed - ) - - def _on_tempo_cluster_changed(self, _): - self.on.changed.emit() - - def publish_privkey(self, label: str) -> str: - """Grant the secret containing the privkey to all relations, and return the secret ID.""" - secret = self.model.get_secret(label=label) - for relation in self._relations: - secret.grant(relation) - # can't return secret.id because secret was obtained by label, and so - # we don't have an ID unless we fetch it - return secret.get_info().id - - def publish_data( - self, - tempo_config: Dict[str, Any], - tempo_receiver: Optional[Dict[ReceiverProtocol, Any]] = None, - ca_cert: Optional[str] = None, - server_cert: Optional[str] = None, - privkey_secret_id: Optional[str] = None, - loki_endpoints: Optional[Dict[str, str]] = None, - ) -> None: - """Publish the tempo config to all related tempo worker clusters.""" - for relation in self._relations: - if relation: - local_app_databag = TempoClusterProviderAppData( - tempo_config=tempo_config, - loki_endpoints=loki_endpoints, - tempo_receiver=tempo_receiver, - ca_cert=ca_cert, - server_cert=server_cert, - privkey_secret_id=privkey_secret_id, - ) - local_app_databag.dump(relation.data[self.model.app]) - - @property - def has_workers(self) -> bool: - """Return whether this tempo coordinator has any connected workers.""" - # we use the presence of relations instead of addresses, because we want this - # check to fail early - return bool(self._relations) - - def gather_addresses_by_role(self) -> Dict[str, Set[str]]: - """Go through the worker's unit databags to collect all the addresses published by the units, by role.""" - data = collections.defaultdict(set) - for relation in self._relations: - - if not relation.app: - log.debug(f"skipped {relation} as .app is None") - continue - - try: - worker_app_data = TempoClusterRequirerAppData.load(relation.data[relation.app]) - except DataValidationError as e: - log.info(f"invalid databag contents: {e}") - continue - - for worker_unit in relation.units: - try: - worker_data = TempoClusterRequirerUnitData.load(relation.data[worker_unit]) - unit_address = worker_data.address - data[worker_app_data.role].add(unit_address) - except DataValidationError as e: - log.info(f"invalid databag contents: {e}") - continue - - return data - - def gather_addresses(self) -> Set[str]: - """Go through the worker's unit databags to collect all the addresses published by the units.""" - data = set() - addresses_by_role = self.gather_addresses_by_role() - for role, address_set in addresses_by_role.items(): - data.update(address_set) - - return data - - def gather_roles(self) -> Dict[TempoRole, int]: - """Go through the worker's app databags and sum the available application roles.""" - data = collections.Counter() - for relation in self._relations: - if relation.app: - remote_app_databag = relation.data[relation.app] - try: - worker_role: TempoRole = TempoClusterRequirerAppData.load( - remote_app_databag - ).role - except DataValidationError as e: - log.debug(f"invalid databag contents: {e}") - continue - - # the number of units with each role is the number of remote units - role_n = 
len(relation.units) # exclude this unit - if worker_role is TempoRole.all: - for role in [r for r in TempoRole if r is not TempoRole.all]: - data[role] += role_n - continue - - data[worker_role] += role_n - - dct = dict(data) - # exclude all roles from the count, if any slipped through - if TempoRole.all in data: - del data[TempoRole.all] - return dct diff --git a/src/tempo_config.py b/src/tempo_config.py index 1c2a186..4ac225b 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -2,22 +2,94 @@ # See LICENSE file for licensing details. """Helper module for interacting with the Tempo configuration.""" - import enum import logging import re +from enum import Enum, unique from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterable, List, Mapping, Optional from urllib.parse import urlparse -from pydantic import BaseModel, ConfigDict, field_validator, model_validator - -S3_RELATION_NAME = "s3" -BUCKET_NAME = "tempo" +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator logger = logging.getLogger(__name__) +# TODO: inherit enum.StrEnum when jammy is no longer supported. +# https://docs.python.org/3/library/enum.html#enum.StrEnum +@unique +class TempoRole(str, Enum): + """Tempo component role names. + + References: + arch: + -> https://grafana.com/docs/tempo/latest/operations/architecture/ + config: + -> https://grafana.com/docs/tempo/latest/configuration/#server + """ + + # scalable-single-binary is a bit too long to type + all = "all" # default, meta-role. gets remapped to scalable-single-binary by the worker. + + querier = "querier" + query_frontend = "query-frontend" + ingester = "ingester" + distributor = "distributor" + compactor = "compactor" + metrics_generator = "metrics-generator" + + @staticmethod + def all_nonmeta(): + return { + TempoRole.querier, + TempoRole.query_frontend, + TempoRole.ingester, + TempoRole.distributor, + TempoRole.compactor, + } + + +META_ROLES = { + "all": set(TempoRole.all_nonmeta()), +} +"""Tempo component meta-role names.""" + +MINIMAL_DEPLOYMENT = { + TempoRole.querier: 1, + TempoRole.query_frontend: 1, + TempoRole.ingester: 1, + TempoRole.distributor: 1, + TempoRole.compactor: 1, +} +"""The minimal set of roles that need to be allocated for the +deployment to be considered consistent (otherwise we set blocked).""" + +RECOMMENDED_DEPLOYMENT = { + TempoRole.querier.value: 1, + TempoRole.query_frontend.value: 1, + TempoRole.ingester.value: 3, + TempoRole.distributor.value: 1, + TempoRole.compactor.value: 1, + TempoRole.metrics_generator.value: 1, +} + +""" +The set of roles that need to be allocated for the +deployment to be considered robust according to Grafana Tempo's +Helm chart configurations. 
+https://github.com/grafana/helm-charts/blob/main/charts/tempo-distributed/ +""" + + +class TempoRolesConfig: + """Define the configuration for Tempo roles.""" + + roles: Iterable[str] = {role for role in TempoRole} + meta_roles: Mapping[str, Iterable[str]] = META_ROLES + minimal_deployment: Iterable[str] = MINIMAL_DEPLOYMENT + recommended_deployment: Dict[str, int] = RECOMMENDED_DEPLOYMENT + + class ClientAuthTypeEnum(str, enum.Enum): """Client auth types.""" @@ -162,10 +234,15 @@ class Wal(BaseModel): class S3(BaseModel): """S3 config schema.""" - bucket: str - access_key: str + model_config = ConfigDict(populate_by_name=True) + """Pydantic config.""" + + # Use aliases to override keys in `coordinator::_s3_config` + # to align with upstream Tempo's configuration keys: `bucket`, `access_key`, `secret_key`. + bucket_name: str = Field(alias="bucket") + access_key_id: str = Field(alias="access_key") endpoint: str - secret_key: str + secret_access_key: str = Field(alias="secret_key") insecure: bool = False @model_validator(mode="before") # pyright: ignore @@ -208,7 +285,7 @@ class Storage(BaseModel): trace: TraceStorage -class Tempo(BaseModel): +class TempoConfig(BaseModel): """Tempo config schema.""" auth_enabled: bool diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index fd3d087..d1df125 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,13 +1,22 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. +import json import logging import os +import random import shutil +import tempfile from pathlib import Path from pytest import fixture from pytest_operator.plugin import OpsTest +from tests.integration.helpers import get_relation_data + +APP_NAME = "tempo" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" + logger = logging.getLogger(__name__) @@ -63,3 +72,24 @@ def copy_charm_libs_into_tester_grpc_charm(ops_test): # cleanup: remove all libs for path in copies: Path(path).unlink() + + +@fixture(scope="function") +def server_cert(ops_test: OpsTest): + data = get_relation_data( + requirer_endpoint=f"{APP_NAME}/0:certificates", + provider_endpoint=f"{SSC_APP_NAME}/0:certificates", + model=ops_test.model.name, + ) + cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] + + with tempfile.NamedTemporaryFile() as f: + p = Path(f.name) + p.write_text(cert) + yield p + + +@fixture(scope="function") +def nonce(): + """Generate an integer nonce for easier trace querying.""" + return str(random.random())[2:] diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 887c3e0..fd83122 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1,13 +1,39 @@ +import json import logging +import os +import shlex import subprocess +import tempfile from dataclasses import dataclass -from typing import Dict +from pathlib import Path +from typing import Dict, Literal +import requests import yaml +from juju.application import Application +from juju.unit import Unit +from minio import Minio from pytest_operator.plugin import OpsTest +from tenacity import retry, stop_after_attempt, wait_exponential + +from tempo import Tempo _JUJU_DATA_CACHE = {} _JUJU_KEYS = ("egress-subnets", "ingress-address", "private-address") +ACCESS_KEY = "accesskey" +SECRET_KEY = "secretkey" +MINIO = "minio" +BUCKET_NAME = "tempo" +S3_INTEGRATOR = "s3-integrator" +WORKER_NAME = "tempo-worker" +APP_NAME = "tempo" +protocols_endpoints = { + "jaeger_thrift_http": 
"https://{}:14268/api/traces?format=jaeger.thrift", + "zipkin": "https://{}:9411/v1/traces", + "jaeger_grpc": "{}:14250", + "otlp_http": "https://{}:4318/v1/traces", + "otlp_grpc": "{}:4317", +} logger = logging.getLogger(__name__) @@ -187,3 +213,134 @@ async def run_command(model_name: str, app_name: str, unit_num: int, command: li logger.error(e.stdout.decode()) raise e return res.stdout + + +def present_facade( + interface: str, + app_data: Dict = None, + unit_data: Dict = None, + role: Literal["provide", "require"] = "provide", + model: str = None, + app: str = "facade", +): + """Set up the facade charm to present this data over the interface ``interface``.""" + data = { + "endpoint": f"{role}-{interface}", + } + if app_data: + data["app_data"] = json.dumps(app_data) + if unit_data: + data["unit_data"] = json.dumps(unit_data) + + with tempfile.NamedTemporaryFile(dir=os.getcwd()) as f: + fpath = Path(f.name) + fpath.write_text(yaml.safe_dump(data)) + + _model = f" --model {model}" if model else "" + + subprocess.run(shlex.split(f"juju run {app}/0{_model} update --params {fpath.absolute()}")) + + +async def get_unit_address(ops_test: OpsTest, app_name, unit_no): + status = await ops_test.model.get_status() + app = status["applications"][app_name] + if app is None: + assert False, f"no app exists with name {app_name}" + unit = app["units"].get(f"{app_name}/{unit_no}") + if unit is None: + assert False, f"no unit exists in app {app_name} with index {unit_no}" + return unit["address"] + + +async def deploy_and_configure_minio(ops_test: OpsTest): + config = { + "access-key": ACCESS_KEY, + "secret-key": SECRET_KEY, + } + await ops_test.model.deploy(MINIO, channel="edge", trust=True, config=config) + await ops_test.model.wait_for_idle(apps=[MINIO], status="active", timeout=2000) + minio_addr = await get_unit_address(ops_test, MINIO, "0") + + mc_client = Minio( + f"{minio_addr}:9000", + access_key="accesskey", + secret_key="secretkey", + secure=False, + ) + + # create tempo bucket + found = mc_client.bucket_exists(BUCKET_NAME) + if not found: + mc_client.make_bucket(BUCKET_NAME) + + # configure s3-integrator + s3_integrator_app: Application = ops_test.model.applications[S3_INTEGRATOR] + s3_integrator_leader: Unit = s3_integrator_app.units[0] + + await s3_integrator_app.set_config( + { + "endpoint": f"minio-0.minio-endpoints.{ops_test.model.name}.svc.cluster.local:9000", + "bucket": BUCKET_NAME, + } + ) + + action = await s3_integrator_leader.run_action("sync-s3-credentials", **config) + action_result = await action.wait() + assert action_result.status == "completed" + + +async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): + await ops_test.model.deploy("tempo-worker-k8s", application_name=WORKER_NAME, channel="edge") + await ops_test.model.deploy(S3_INTEGRATOR, channel="edge") + + await ops_test.model.integrate(tempo_app + ":s3", S3_INTEGRATOR + ":s3-credentials") + await ops_test.model.integrate(tempo_app + ":tempo-cluster", WORKER_NAME + ":tempo-cluster") + + await deploy_and_configure_minio(ops_test) + + await ops_test.model.wait_for_idle( + apps=[tempo_app, WORKER_NAME, S3_INTEGRATOR], + status="active", + timeout=1000, + idle_period=30, + ) + + +def get_traces(tempo_host: str, service_name="tracegen-otlp_http", tls=True): + url = f"{'https' if tls else 'http'}://{tempo_host}:3200/api/search?tags=service.name={service_name}" + req = requests.get( + url, + verify=False, + ) + assert req.status_code == 200 + traces = json.loads(req.text)["traces"] + return traces + + 
+@retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def get_traces_patiently(tempo_host, service_name="tracegen-otlp_http", tls=True): + traces = get_traces(tempo_host, service_name=service_name, tls=tls) + assert len(traces) > 0 + return traces + + +async def emit_trace( + endpoint, ops_test: OpsTest, nonce, proto: str = "otlp_http", verbose=0, use_cert=False +): + """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" + cmd = ( + f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " + f"TRACEGEN_ENDPOINT={endpoint} " + f"TRACEGEN_VERBOSE={verbose} " + f"TRACEGEN_PROTOCOL={proto} " + f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " + f"TRACEGEN_NONCE={nonce} " + "python3 tracegen.py" + ) + return subprocess.getoutput(cmd) + + +async def get_application_ip(ops_test: OpsTest, app_name: str): + status = await ops_test.model.get_status() + app = status["applications"][app_name] + return app.public_address diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 896f385..9f5a221 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -1,138 +1,109 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# import random -# import subprocess -# import tempfile -# from pathlib import Path - -# import pytest -# import requests -# import yaml -# from pytest_operator.plugin import OpsTest -# from tenacity import retry, stop_after_attempt, wait_exponential - -# from tempo import Tempo -# from tests.integration.helpers import get_relation_data - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = "tempo" -# SSC = "self-signed-certificates" -# SSC_APP_NAME = "ssc" -# TRAEFIK = "traefik-k8s" -# TRAEFIK_APP_NAME = "trfk" -# TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" - -# logger = logging.getLogger(__name__) - - -# @pytest.fixture(scope="function") -# def nonce(): -# """Generate an integer nonce for easier trace querying.""" -# return str(random.random())[2:] - - -# @pytest.fixture(scope="function") -# def server_cert(ops_test: OpsTest): -# data = get_relation_data( -# requirer_endpoint=f"{APP_NAME}/0:certificates", -# provider_endpoint=f"{SSC_APP_NAME}/0:certificates", -# model=ops_test.model.name, -# ) -# cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - -# with tempfile.NamedTemporaryFile() as f: -# p = Path(f.name) -# p.write_text(cert) -# yield p - - -# def get_traces(tempo_host: str, nonce, service_name="tracegen"): -# req = requests.get( -# "https://" + tempo_host + ":3200/api/search", -# params={"service.name": service_name, "nonce": nonce}, -# verify=False, -# ) -# assert req.status_code == 200 -# return json.loads(req.text)["traces"] - - -# @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -# async def get_traces_patiently(tempo_host, nonce): -# assert get_traces(tempo_host, nonce=nonce) - - -# async def get_tempo_host(ops_test: OpsTest): -# status = await ops_test.model.get_status() -# app = status["applications"][TRAEFIK_APP_NAME] -# return app.public_address - - -# async def emit_trace( -# endpoint, ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False -# ): -# """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" -# cmd = ( -# f"juju ssh -m 
{ops_test.model_name} {APP_NAME}/0 " -# f"TRACEGEN_ENDPOINT={endpoint} " -# f"TRACEGEN_VERBOSE={verbose} " -# f"TRACEGEN_PROTOCOL={proto} " -# f"TRACEGEN_CERT={Tempo.server_cert_path if use_cert else ''} " -# f"TRACEGEN_NONCE={nonce} " -# "python3 tracegen.py" -# ) - -# return subprocess.getoutput(cmd) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# tempo_charm = await ops_test.build_charm(".") -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, application_name=APP_NAME), -# ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), -# ops_test.model.deploy(TRAEFIK, application_name=TRAEFIK_APP_NAME, channel="edge"), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_push_tracegen_script_and_deps(ops_test: OpsTest): -# await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") -# await ops_test.juju( -# "ssh", -# f"{APP_NAME}/0", -# "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") -# await ops_test.model.integrate( -# SSC_APP_NAME + ":certificates", TRAEFIK_APP_NAME + ":certificates" -# ) -# await ops_test.model.integrate(APP_NAME + ":ingress", TRAEFIK_APP_NAME + ":traefik-route") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], -# status="active", -# timeout=1000, -# ) - - +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import ( + deploy_cluster, + emit_trace, + get_traces_patiently, + protocols_endpoints, +) +from juju.application import Application +from pytest_operator.plugin import OpsTest + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +WORKER_NAME = "tempo-worker" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" +TRAEFIK = "traefik-k8s" +TRAEFIK_APP_NAME = "trfk" +TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" + +logger = logging.getLogger(__name__) + + +async def get_ingress_proxied_hostname(ops_test: OpsTest): + status = await ops_test.model.get_status() + app = status["applications"][TRAEFIK_APP_NAME] + status_msg = app["status"]["info"] + + # hacky way to get ingress hostname + if "Serving at" not in status_msg: + assert False, f"Ingressed hostname is not present in {TRAEFIK_APP_NAME} status message." 
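+ # e.g. "Serving at 10.64.140.43" -> "10.64.140.43" (address is illustrative)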
+ return status_msg.replace("Serving at", "").strip() + + +async def get_tempo_ingressed_endpoint(hostname, protocol): + protocol_endpoint = protocols_endpoints.get(protocol) + if protocol_endpoint is None: + assert False, f"Invalid {protocol}" + return protocol_endpoint.format(hostname) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = await ops_test.build_charm(".") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), + ops_test.model.deploy(TRAEFIK, application_name=TRAEFIK_APP_NAME, channel="edge"), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], + status="active", + raise_on_blocked=True, + timeout=10000, + raise_on_error=False, + ), + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_push_tracegen_script_and_deps(ops_test: OpsTest): + await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") + await ops_test.juju( + "ssh", + f"{APP_NAME}/0", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http" + + " opentelemetry-exporter-zipkin opentelemetry-exporter-jaeger", + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") + await ops_test.model.integrate( + SSC_APP_NAME + ":certificates", TRAEFIK_APP_NAME + ":certificates" + ) + await ops_test.model.integrate(APP_NAME + ":ingress", TRAEFIK_APP_NAME + ":traefik-route") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME, WORKER_NAME], + status="active", + timeout=1000, + # make idle period 1 minute, as Tempo workload might not be up yet + idle_period=60, + ) + + +# TODO: Uncomment and fix once below issue is fixed +# Currently, traefik, through `traefik_route`, accepts traffic through http although TLS is enabled. 
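+# (The commented-out check below, test_verify_ingressed_trace_http_upgrades_to_tls, depends on that fix.)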
# @pytest.mark.abort_on_fail # async def test_verify_ingressed_trace_http_upgrades_to_tls(ops_test: OpsTest, nonce): # tempo_host = await get_tempo_host(ops_test) @@ -145,34 +116,45 @@ # assert get_traces_patiently(tempo_host, nonce=nonce) -# @pytest.mark.abort_on_fail -# async def test_verify_ingressed_trace_http_tls(ops_test: OpsTest, nonce, server_cert): -# tempo_host = await get_tempo_host(ops_test) -# await emit_trace( -# f"https://{tempo_host}:4318/v1/traces", nonce=nonce, ops_test=ops_test, use_cert=True -# ) -# # THEN we can verify it's been ingested -# assert get_traces_patiently(tempo_host, nonce=nonce) - - -# @pytest.mark.abort_on_fail -# async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): -# tempo_host = await get_tempo_host(ops_test) -# await emit_trace( -# f"{tempo_host}:4317", nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True -# ) -# # THEN we can verify it's been ingested -# assert get_traces_patiently(tempo_host, nonce=nonce) - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# await ops_test.juju( -# "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# ) +@pytest.mark.abort_on_fail +@pytest.mark.parametrize("protocol", list(protocols_endpoints.keys())) +async def test_verify_traces_force_enabled_protocols_tls(ops_test: OpsTest, nonce, protocol): + + tempo_app: Application = ops_test.model.applications[APP_NAME] + + # enable each protocol receiver + # otlp_http should be enabled by default + if protocol != "otlp_http": + await tempo_app.set_config( + { + f"always_enable_{protocol}": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + tempo_host = await get_ingress_proxied_hostname(ops_test) + tempo_endpoint = await get_tempo_ingressed_endpoint(tempo_host, protocol=protocol) + # WHEN we emit a trace secured with TLS + + await emit_trace( + tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto=protocol, use_cert=True + ) + # THEN we can verify it's been ingested + await get_traces_patiently(tempo_host, service_name=f"tracegen-{protocol}") + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + await ops_test.juju( + "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 2010ace..b87b69b 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,166 +1,167 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# from pathlib import Path - -# import pytest -# import yaml -# from pytest_operator.plugin import OpsTest - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = METADATA["name"] -# TESTER_METADATA = yaml.safe_load(Path("./tests/integration/tester/metadata.yaml").read_text()) -# TESTER_APP_NAME = TESTER_METADATA["name"] -# TESTER_GRPC_METADATA = yaml.safe_load( -# Path("./tests/integration/tester-grpc/metadata.yaml").read_text() -# 
) -# TESTER_GRPC_APP_NAME = TESTER_GRPC_METADATA["name"] - -# logger = logging.getLogger(__name__) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# # Given a fresh build of the charm -# # When deploying it together with testers -# # Then applications should eventually be created -# tempo_charm = await ops_test.build_charm(".") -# tester_charm = await ops_test.build_charm("./tests/integration/tester/") -# tester_grpc_charm = await ops_test.build_charm("./tests/integration/tester-grpc/") -# resources_tester = {"workload": TESTER_METADATA["resources"]["workload"]["upstream-source"]} -# resources_tester_grpc = { -# "workload": TESTER_GRPC_METADATA["resources"]["workload"]["upstream-source"] -# } - -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, application_name=APP_NAME), -# ops_test.model.deploy( -# tester_charm, -# resources=resources_tester, -# application_name=TESTER_APP_NAME, -# num_units=3, -# ), -# ops_test.model.deploy( -# tester_grpc_charm, -# resources=resources_tester_grpc, -# application_name=TESTER_GRPC_APP_NAME, -# num_units=3, -# ), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# # for tester, depending on the result of race with tempo it's either waiting or active -# ops_test.model.wait_for_idle( -# apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False -# ), -# ops_test.model.wait_for_idle( -# apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False -# ), -# ) - -# assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active" - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# # given a deployed charm -# # when relating it together with the tester -# # then relation should appear -# await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") -# await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, TESTER_APP_NAME, TESTER_GRPC_APP_NAME], -# status="active", -# timeout=1000, -# ) - - -# async def test_verify_traces_http(ops_test: OpsTest): -# # given a relation between charms -# # when traces endpoint is queried -# # then it should contain traces from tester charm -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# logger.info(app.public_address) -# endpoint = app.public_address + ":3200/api/search" -# cmd = [ -# "curl", -# endpoint, -# ] -# rc, stdout, stderr = await ops_test.run(*cmd) -# logger.info("%s: %s", endpoint, (rc, stdout, stderr)) -# assert rc == 0, ( -# f"curl exited with rc={rc} for {endpoint}; " -# f"non-zero return code means curl encountered a >= 400 HTTP code; " -# f"cmd={cmd}" -# ) -# traces = json.loads(stdout)["traces"] - -# found = False -# for trace in traces: -# if trace["rootServiceName"] == APP_NAME and trace["rootTraceName"] == "charm exec": -# found = True - -# assert found, f"There's no trace of charm exec traces in tempo. 
{json.dumps(traces, indent=2)}" - - -# async def test_verify_traces_grpc(ops_test: OpsTest): -# # the tester-grpc charm emits a single grpc trace in its common exit hook -# # we verify it's there -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# logger.info(app.public_address) -# endpoint = app.public_address + ":3200/api/search" -# cmd = [ -# "curl", -# endpoint, -# ] -# rc, stdout, stderr = await ops_test.run(*cmd) -# logger.info("%s: %s", endpoint, (rc, stdout, stderr)) -# assert rc == 0, ( -# f"curl exited with rc={rc} for {endpoint}; " -# f"non-zero return code means curl encountered a >= 400 HTTP code; " -# f"cmd={cmd}" -# ) -# traces = json.loads(stdout)["traces"] - -# found = False -# for trace in traces: -# if trace["rootServiceName"] == "TempoTesterGrpcCharm": -# found = True - -# assert ( -# found -# ), f"There's no trace of generated grpc traces in tempo. {json.dumps(traces, indent=2)}" - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# # given related charms -# # when relation is removed -# # then both charms should become active again -# await ops_test.juju("remove-relation", APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") -# await ops_test.juju( -# "remove-relation", APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# # for tester, depending on the result of race with tempo it's either waiting or active -# ops_test.model.wait_for_idle(apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000), -# ops_test.model.wait_for_idle( -# apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000 -# ), -# ) +import asyncio +import json +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster +from pytest_operator.plugin import OpsTest + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +TESTER_METADATA = yaml.safe_load(Path("./tests/integration/tester/metadata.yaml").read_text()) +TESTER_APP_NAME = TESTER_METADATA["name"] +TESTER_GRPC_METADATA = yaml.safe_load( + Path("./tests/integration/tester-grpc/metadata.yaml").read_text() +) +TESTER_GRPC_APP_NAME = TESTER_GRPC_METADATA["name"] + +logger = logging.getLogger(__name__) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + # Given a fresh build of the charm + # When deploying it together with testers + # Then applications should eventually be created + tempo_charm = await ops_test.build_charm(".") + tester_charm = await ops_test.build_charm("./tests/integration/tester/") + tester_grpc_charm = await ops_test.build_charm("./tests/integration/tester-grpc/") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + resources_tester = {"workload": TESTER_METADATA["resources"]["workload"]["upstream-source"]} + resources_tester_grpc = { + "workload": TESTER_GRPC_METADATA["resources"]["workload"]["upstream-source"] + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy( + tester_charm, + resources=resources_tester, + application_name=TESTER_APP_NAME, + num_units=3, + ), + ops_test.model.deploy( + 
tester_grpc_charm, + resources=resources_tester_grpc, + application_name=TESTER_GRPC_APP_NAME, + num_units=3, + ), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + # for tester, depending on the result of race with tempo it's either waiting or active + ops_test.model.wait_for_idle( + apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False + ), + ops_test.model.wait_for_idle( + apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False + ), + ) + + assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active" + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + # given a deployed charm + # when relating it together with the tester + # then relation should appear + await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") + await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, TESTER_APP_NAME, TESTER_GRPC_APP_NAME], + status="active", + timeout=1000, + ) + + +async def test_verify_traces_http(ops_test: OpsTest): + # given a relation between charms + # when traces endpoint is queried + # then it should contain traces from tester charm + status = await ops_test.model.get_status() + app = status["applications"][APP_NAME] + endpoint = app.public_address + ":3200/api/search" + cmd = [ + "curl", + endpoint, + ] + rc, stdout, stderr = await ops_test.run(*cmd) + logger.info("%s: %s", endpoint, (rc, stdout, stderr)) + assert rc == 0, ( + f"curl exited with rc={rc} for {endpoint}; " + f"non-zero return code means curl encountered a >= 400 HTTP code; " + f"cmd={cmd}" + ) + traces = json.loads(stdout)["traces"] + + found = False + for trace in traces: + if trace["rootServiceName"] == "TempoTesterCharm": + found = True + + assert found, f"There's no trace of charm exec traces in tempo. {json.dumps(traces, indent=2)}" + + +async def test_verify_traces_grpc(ops_test: OpsTest): + # the tester-grpc charm emits a single grpc trace in its common exit hook + # we verify it's there + status = await ops_test.model.get_status() + app = status["applications"][APP_NAME] + logger.info(app.public_address) + endpoint = app.public_address + ":3200/api/search" + cmd = [ + "curl", + endpoint, + ] + rc, stdout, stderr = await ops_test.run(*cmd) + logger.info("%s: %s", endpoint, (rc, stdout, stderr)) + assert rc == 0, ( + f"curl exited with rc={rc} for {endpoint}; " + f"non-zero return code means curl encountered a >= 400 HTTP code; " + f"cmd={cmd}" + ) + traces = json.loads(stdout)["traces"] + + found = False + for trace in traces: + if trace["rootServiceName"] == "TempoTesterGrpcCharm": + found = True + + assert ( + found + ), f"There's no trace of generated grpc traces in tempo. 
{json.dumps(traces, indent=2)}" + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + # given related charms + # when relation is removed + # then both charms should become active again + await ops_test.juju("remove-relation", APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") + await ops_test.juju( + "remove-relation", APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + # for tester, depending on the result of race with tempo it's either waiting or active + ops_test.model.wait_for_idle(apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000), + ops_test.model.wait_for_idle( + apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/test_scaling_monolithic.py b/tests/integration/test_scaling_monolithic.py index ed76714..f7fd344 100644 --- a/tests/integration/test_scaling_monolithic.py +++ b/tests/integration/test_scaling_monolithic.py @@ -1,22 +1,16 @@ -import json import logging -import os -import shlex -import tempfile from pathlib import Path -from subprocess import run -from typing import Dict, Literal import pytest import yaml +from helpers import deploy_cluster from juju.application import Application from pytest_operator.plugin import OpsTest METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -APP_NAME = "tempo-coordinator" -FACADE = "facade" +APP_NAME = "tempo" +S3_INTEGRATOR = "s3-integrator" TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" -FACADE_MOCKS_PATH = "/var/lib/juju/agents/unit-facade-0/charm/mocks" logger = logging.getLogger(__name__) @@ -25,6 +19,7 @@ @pytest.mark.abort_on_fail async def test_deploy_tempo(ops_test: OpsTest): tempo_charm = await ops_test.build_charm(".") + resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], "nginx-prometheus-exporter-image": METADATA["resources"][ @@ -54,85 +49,15 @@ async def test_scale_tempo_up_without_s3_blocks(ops_test: OpsTest): ) -def present_facade( - interface: str, - app_data: Dict = None, - unit_data: Dict = None, - role: Literal["provide", "require"] = "provide", - model: str = None, - app: str = "facade", -): - """Set up the facade charm to present this data over the interface ``interface``.""" - data = { - "endpoint": f"{role}-{interface}", - } - if app_data: - data["app_data"] = json.dumps(app_data) - if unit_data: - data["unit_data"] = json.dumps(unit_data) - - with tempfile.NamedTemporaryFile(dir=os.getcwd()) as f: - fpath = Path(f.name) - fpath.write_text(yaml.safe_dump(data)) - - _model = f" --model {model}" if model else "" - - run(shlex.split(f"juju run {app}/0{_model} update --params {fpath.absolute()}")) - - @pytest.mark.setup @pytest.mark.abort_on_fail -async def test_tempo_active_when_deploy_s3_and_workers_facade(ops_test: OpsTest): - await ops_test.model.deploy(FACADE, channel="edge") - await ops_test.model.wait_for_idle( - apps=[FACADE], raise_on_blocked=True, status="active", timeout=2000 - ) - - await ops_test.model.integrate(APP_NAME + ":s3", FACADE + ":provide-s3") - await ops_test.model.integrate(APP_NAME + ":tempo-cluster", FACADE + ":require-tempo_cluster") - - present_facade( - "s3", - model=ops_test.model_name, - app_data={ - "access-key": "key", - "bucket": "tempo", - "endpoint": "http://1.2.3.4:9000", - "secret-key": "soverysecret", - }, - ) - - present_facade( - "tempo_cluster", - 
model=ops_test.model_name, - app_data={ - "role": '"all"', - }, - unit_data={ - "juju_topology": json.dumps({"model": ops_test.model_name, "unit": FACADE + "/0"}), - "address": FACADE + ".cluster.local.svc", - }, - role="require", - ) - - await ops_test.model.wait_for_idle( - apps=[FACADE], - raise_on_blocked=True, - status="active", - timeout=2000, - ) - - await ops_test.model.wait_for_idle( - apps=[APP_NAME], - raise_on_blocked=True, - status="active", - timeout=10000, - ) +async def test_tempo_active_when_deploy_s3_and_workers(ops_test: OpsTest): + await deploy_cluster(ops_test) @pytest.mark.teardown async def test_tempo_blocks_if_s3_goes_away(ops_test: OpsTest): - app: Application = ops_test.model.applications[FACADE] + app: Application = ops_test.model.applications[S3_INTEGRATOR] await app.destroy(destroy_storage=True) await ops_test.model.wait_for_idle( apps=[APP_NAME], diff --git a/tests/integration/test_self_tracing.py b/tests/integration/test_self_tracing.py new file mode 100644 index 0000000..157cf24 --- /dev/null +++ b/tests/integration/test_self_tracing.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# Copyright 2024 Ubuntu +# See LICENSE file for licensing details. + +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster, get_application_ip, get_traces_patiently +from pytest_operator.plugin import OpsTest + +logger = logging.getLogger(__name__) + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" + + +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = await ops_test.build_charm(".") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test, APP_NAME) + + await asyncio.gather( + ops_test.model.wait_for_idle(status="active", raise_on_blocked=True, timeout=1000) + ) + + +@pytest.mark.abort_on_fail +async def test_verify_trace_http_self(ops_test: OpsTest): + # adjust update-status interval to generate a charm tracing span faster + await ops_test.model.set_config({"update-status-hook-interval": "5s"}) + + # Verify traces from `tempo` are ingested into self Tempo + assert await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), + service_name="tempo-charm", + tls=False, + ) + + # adjust back to the default interval time + await ops_test.model.set_config({"update-status-hook-interval": "5m"}) diff --git a/tests/integration/test_self_tracing_remote.py b/tests/integration/test_self_tracing_remote.py new file mode 100644 index 0000000..9a889b5 --- /dev/null +++ b/tests/integration/test_self_tracing_remote.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# Copyright 2024 Ubuntu +# See LICENSE file for licensing details. 
+ +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster, get_application_ip, get_traces_patiently +from pytest_operator.plugin import OpsTest + +logger = logging.getLogger(__name__) + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +APP_REMOTE_NAME = "tempo-source" + + +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = await ops_test.build_charm(".") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_REMOTE_NAME), + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test, APP_NAME) + + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ) + ) + + +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":tracing", APP_REMOTE_NAME + ":self-tracing") + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + +@pytest.mark.abort_on_fail +async def test_verify_trace_http(ops_test: OpsTest): + # adjust update-status interval to generate a charm tracing span faster + await ops_test.model.set_config({"update-status-hook-interval": "5s"}) + + # Verify traces from `tempo-source` are ingested into remote tempo instance + assert await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), + service_name="tempo-source-charm", + tls=False, + ) + + # adjust back to the default interval time + await ops_test.model.set_config({"update-status-hook-interval": "5m"}) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 5ea977f..014e836 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -1,171 +1,139 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# import random -# import tempfile -# from pathlib import Path -# from subprocess import getoutput - -# import pytest -# import requests -# import yaml -# from pytest_operator.plugin import OpsTest -# from tenacity import retry, stop_after_attempt, wait_exponential - -# from tempo import Tempo -# from tests.integration.helpers import get_relation_data - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = "tempo" -# SSC = "self-signed-certificates" -# SSC_APP_NAME = "ssc" -# TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" -# logger = logging.getLogger(__name__) - - -# @pytest.fixture(scope="function") -# def nonce(): -# """Generate an integer nonce for easier trace querying.""" -# return str(random.random())[2:] - - -# def get_traces(tempo_host: str, nonce): -# url = "https://" + tempo_host + ":3200/api/search" -# req = requests.get( -# url, -# params={"q": f'{{ .nonce = "{nonce}" }}'}, -# # it would fail to verify as the cert was issued for fqdn, not IP. 
-# verify=False, -# ) -# assert req.status_code == 200 -# return json.loads(req.text)["traces"] - - -# @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -# async def get_traces_patiently(ops_test, nonce): -# assert get_traces(await get_tempo_ip(ops_test), nonce=nonce) - - -# async def get_tempo_ip(ops_test: OpsTest): -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# return app.public_address - - -# async def get_tempo_internal_host(ops_test: OpsTest): -# return f"https://{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" - - -# @pytest.fixture(scope="function") -# def server_cert(ops_test: OpsTest): -# data = get_relation_data( -# requirer_endpoint=f"{APP_NAME}/0:certificates", -# provider_endpoint=f"{SSC_APP_NAME}/0:certificates", -# model=ops_test.model.name, -# ) -# cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - -# with tempfile.NamedTemporaryFile() as f: -# p = Path(f.name) -# p.write_text(cert) -# yield p - - -# async def emit_trace(ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False): -# """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" -# hostname = await get_tempo_internal_host(ops_test) -# cmd = ( -# f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " -# f"TRACEGEN_ENDPOINT={hostname}:4318/v1/traces " -# f"TRACEGEN_VERBOSE={verbose} " -# f"TRACEGEN_PROTOCOL={proto} " -# f"TRACEGEN_CERT={Tempo.server_cert_path if use_cert else ''} " -# f"TRACEGEN_NONCE={nonce} " -# "python3 tracegen.py" -# ) - -# return getoutput(cmd) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# tempo_charm = await ops_test.build_charm(".") -# resources = { -# "tempo-image": METADATA["resources"]["tempo-image"]["upstream-source"], -# } -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), -# ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME], -# status="active", -# timeout=1000, -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_push_tracegen_script_and_deps(ops_test: OpsTest): -# await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") -# await ops_test.juju( -# "ssh", -# f"{APP_NAME}/0", -# "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", -# ) - - -# async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): -# # IF tempo is related to SSC -# # WHEN we emit an http trace, **unsecured** -# await emit_trace(ops_test, nonce=nonce) # this should fail -# # THEN we can verify it's not been ingested -# tempo_ip = await get_tempo_ip(ops_test) -# traces = get_traces(tempo_ip, nonce=nonce) -# assert not traces - - -# async def test_verify_trace_http_tls(ops_test: OpsTest, nonce, server_cert): -# # WHEN we emit a trace secured with TLS -# await emit_trace(ops_test, nonce=nonce, use_cert=True) -# # THEN we can verify 
it's eventually ingested -# await get_traces_patiently(ops_test, nonce) - - -# @pytest.mark.xfail # expected to fail because in this context the grpc receiver is not enabled -# async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): -# # WHEN we emit a trace secured with TLS -# await emit_trace(ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) -# # THEN we can verify it's been ingested -# await get_traces_patiently(ops_test, nonce) - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# await ops_test.juju( -# "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# ) +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import ( + deploy_cluster, + emit_trace, + get_application_ip, + get_traces, + get_traces_patiently, + protocols_endpoints, +) +from juju.application import Application +from pytest_operator.plugin import OpsTest + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" +TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" + + +logger = logging.getLogger(__name__) + + +async def get_tempo_traces_internal_endpoint(ops_test: OpsTest, protocol): + hostname = f"{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" + protocol_endpoint = protocols_endpoints.get(protocol) + if protocol_endpoint is None: + assert False, f"Invalid {protocol}" + return protocol_endpoint.format(hostname) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = await ops_test.build_charm(".") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME], + status="active", + raise_on_blocked=True, + timeout=10000, + raise_on_error=False, + ), + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME], + status="active", + timeout=1000, + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_push_tracegen_script_and_deps(ops_test: OpsTest): + await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") + await ops_test.juju( + "ssh", + f"{APP_NAME}/0", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http" + + " opentelemetry-exporter-zipkin opentelemetry-exporter-jaeger", + ) + + +async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): + # IF tempo is related to SSC + # WHEN we emit an http trace, **unsecured** + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, protocol="otlp_http") + await emit_trace(tempo_endpoint, ops_test, 
nonce=nonce) # this should fail + # THEN we can verify it's not been ingested + traces = get_traces(await get_application_ip(ops_test, APP_NAME)) + assert len(traces) == 0 + + +@pytest.mark.abort_on_fail +@pytest.mark.parametrize("protocol", list(protocols_endpoints.keys())) +async def test_verify_traces_force_enabled_protocols_tls(ops_test: OpsTest, nonce, protocol): + + tempo_app: Application = ops_test.model.applications[APP_NAME] + + # enable each protocol receiver + # otlp_http should be enabled by default + if protocol != "otlp_http": + await tempo_app.set_config( + { + f"always_enable_{protocol}": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, protocol=protocol) + # WHEN we emit a trace secured with TLS + await emit_trace( + tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto=protocol, use_cert=True + ) + # THEN we can verify it's been ingested + await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), service_name=f"tracegen-{protocol}" + ) + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + await ops_test.juju( + "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/tester/src/charm.py b/tests/integration/tester/src/charm.py index 72feedd..a5059a0 100755 --- a/tests/integration/tester/src/charm.py +++ b/tests/integration/tester/src/charm.py @@ -40,7 +40,7 @@ def __init__(self, *args): self.tracing = TracingEndpointRequirer( self, relation_name="tracing", protocols=["otlp_http", "otlp_grpc"] ) - self.tempo_otlp_http_endpoint = charm_tracing_config(self.tracing, None) + self.tempo_otlp_http_endpoint, _ = charm_tracing_config(self.tracing, None) # Core lifecycle events self.framework.observe(self.on.config_changed, self._update) diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index 9fe9ebb..84c387e 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -1,16 +1,20 @@ -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from scenario import Container, Context, Relation from charm import TempoCoordinatorCharm -from tempo_cluster import TempoClusterRequirerAppData, TempoRole + + +@pytest.fixture() +def coordinator(): + return MagicMock() @pytest.fixture def tempo_charm(): with patch("lightkube.core.client.GenericSyncClient"): - with patch("charm.TempoCoordinatorCharm._update_server_cert"): + with patch("charm.TempoCoordinatorCharm.are_certificates_on_disk", False): yield TempoCoordinatorCharm @@ -42,7 +46,7 @@ def s3(s3_config): def all_worker(): return Relation( "tempo-cluster", - remote_app_data=TempoClusterRequirerAppData(role=TempoRole.all).dump(), + remote_app_data={"role": '"all"'}, ) diff --git a/tests/scenario/helpers.py b/tests/scenario/helpers.py index fc3b79a..445ca83 100644 --- a/tests/scenario/helpers.py +++ b/tests/scenario/helpers.py @@ -1,8 +1,7 @@ import scenario - -from tempo_cluster import TempoClusterProviderAppData +from cosl.coordinated_workers.interface import ClusterProviderAppData def get_tempo_config(state: scenario.State): cluster_relation = state.get_relations("tempo-cluster")[0] # there's only one - return 
TempoClusterProviderAppData.load(cluster_relation.local_app_data).tempo_config + return ClusterProviderAppData.load(cluster_relation.local_app_data).worker_config diff --git a/tests/scenario/test_charm_statuses.py b/tests/scenario/test_charm_statuses.py index 5c16e6b..b623a74 100644 --- a/tests/scenario/test_charm_statuses.py +++ b/tests/scenario/test_charm_statuses.py @@ -3,8 +3,6 @@ import ops from scenario import PeerRelation, State -from tempo import Tempo - def test_monolithic_status_no_s3_no_workers(context): state_out = context.run("start", State(unit_status=ops.ActiveStatus(), leader=True)) @@ -48,9 +46,14 @@ def test_scaled_status_with_s3_and_workers( assert state_out.unit_status.name == "active" -@patch.object(Tempo, "is_ready", new=True) +@patch("charm.TempoCoordinatorCharm.is_workload_ready", return_value=True) def test_happy_status( - context, s3, all_worker, nginx_container, nginx_prometheus_exporter_container + workload_ready_mock, + context, + s3, + all_worker, + nginx_container, + nginx_prometheus_exporter_container, ): state_out = context.run( "start", diff --git a/tests/scenario/test_config.py b/tests/scenario/test_config.py index 3532fb5..63052f1 100644 --- a/tests/scenario/test_config.py +++ b/tests/scenario/test_config.py @@ -1,7 +1,8 @@ +import json + from scenario import State from charm import TempoCoordinatorCharm -from tempo_cluster import TempoClusterRequirerUnitData def test_memberlist_multiple_members( @@ -10,18 +11,18 @@ def test_memberlist_multiple_members( workers_no = 3 all_worker = all_worker.replace( remote_units_data={ - worker_idx: TempoClusterRequirerUnitData( - **{ - "address": f"worker-{worker_idx}.test.svc.cluster.local:7946", - "juju_topology": { + worker_idx: { + "address": json.dumps(f"worker-{worker_idx}.test.svc.cluster.local:7946"), + "juju_topology": json.dumps( + { "model": "test", "unit": f"worker/{worker_idx}", "model_uuid": "1", "application": "worker", "charm_name": "TempoWorker", - }, - } - ).dump() + } + ), + } for worker_idx in range(workers_no) }, ) @@ -32,7 +33,7 @@ def test_memberlist_multiple_members( ) with context.manager(all_worker.changed_event, state) as mgr: charm: TempoCoordinatorCharm = mgr.charm - assert charm.tempo_cluster.gather_addresses() == set( + assert charm.coordinator.cluster.gather_addresses() == set( [ "worker-0.test.svc.cluster.local:7946", "worker-1.test.svc.cluster.local:7946", diff --git a/tests/scenario/test_enabled_receivers.py b/tests/scenario/test_enabled_receivers.py index c77d8b0..eb63cad 100644 --- a/tests/scenario/test_enabled_receivers.py +++ b/tests/scenario/test_enabled_receivers.py @@ -53,7 +53,7 @@ def test_receivers_with_relations( action_out = context.run_action("list-receivers", state) assert action_out.results == { "otlp-http": f"http://{socket.getfqdn()}:4318", - "otlp-grpc": f"http://{socket.getfqdn()}:4317", + "otlp-grpc": f"{socket.getfqdn()}:4317", } @@ -93,5 +93,5 @@ def test_receivers_with_relations_and_config( assert action_out.results == { "otlp-http": f"http://{socket.getfqdn()}:4318", "zipkin": f"http://{socket.getfqdn()}:9411", - "otlp-grpc": f"http://{socket.getfqdn()}:4317", + "otlp-grpc": f"{socket.getfqdn()}:4317", } diff --git a/tests/scenario/test_ingressed_tracing.py b/tests/scenario/test_ingressed_tracing.py index ec9c850..e2f6f73 100644 --- a/tests/scenario/test_ingressed_tracing.py +++ b/tests/scenario/test_ingressed_tracing.py @@ -5,8 +5,6 @@ from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled from scenario import Relation, State -from tempo import 
Tempo - @pytest.fixture def base_state(nginx_container, nginx_prometheus_exporter_container): @@ -38,7 +36,7 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w ingress = Relation("ingress", remote_app_data={"external_host": "1.2.3.4", "scheme": "http"}) state = base_state.replace(relations=[ingress, s3, all_worker]) - with patch.object(Tempo, "is_ready", lambda _: False): + with patch("charm.TempoCoordinatorCharm.is_workload_ready", lambda _: False): out = context.run(ingress.joined_event, state) charm_name = "tempo-coordinator-k8s" @@ -76,6 +74,11 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w "rule": "ClientIP(`0.0.0.0/0`)", "service": f"juju-{state.model.name}-{charm_name}-service-tempo-grpc", }, + f"juju-{state.model.name}-{charm_name}-jaeger-grpc": { + "entryPoints": ["jaeger-grpc"], + "rule": "ClientIP(`0.0.0.0/0`)", + "service": f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc", + }, }, "services": { f"juju-{state.model.name}-{charm_name}-service-jaeger-thrift-http": { @@ -96,6 +99,9 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w f"juju-{state.model.name}-{charm_name}-service-tempo-grpc": { "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:9096"}]} }, + f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc": { + "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:14250"}]} + }, }, }, } diff --git a/tests/scenario/test_nginx.py b/tests/scenario/test_nginx.py index 29811a0..3c3786d 100644 --- a/tests/scenario/test_nginx.py +++ b/tests/scenario/test_nginx.py @@ -1,44 +1,23 @@ import logging from typing import List -from unittest.mock import MagicMock import pytest -from nginx import Nginx +from nginx_config import NginxConfig from tempo import Tempo -from tempo_cluster import TempoClusterProvider logger = logging.getLogger(__name__) -@pytest.fixture -def tempo_cluster_provider(): - cluster_mock = MagicMock() - return TempoClusterProvider(cluster_mock) - - -def test_nginx_config_is_list_before_crossplane(context, nginx_container, tempo_cluster_provider): - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) - - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - - prepared_config = nginx._prepare_config() +def test_nginx_config_is_list_before_crossplane(context, nginx_container, coordinator): + nginx = NginxConfig("localhost") + prepared_config = nginx._prepare_config(coordinator) assert isinstance(prepared_config, List) -def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, tempo_cluster_provider): - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) - - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - logger.info(nginx._prepare_config()) - - prepared_config = nginx.config() +def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, coordinator): + nginx = NginxConfig("localhost") + prepared_config = nginx.config(coordinator) assert isinstance(prepared_config, str) @@ -46,10 +25,7 @@ def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, tempo_cl "addresses", ( {}, - {"all": {"1.2.3.4"}}, - {"all": {"1.2.3.4", "1.2.3.5"}}, { - "all": {"1.2.3.4"}, "distributor": {"1.2.3.5"}, "ingester": {"1.2.3.6"}, "querier": {"1.2.4.7"}, @@ -83,29 +59,19 @@ def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, 
tempo_cl }, ), ) -def test_nginx_config_is_parsed_with_workers( - context, nginx_container, tempo_cluster_provider, addresses -): - tempo_cluster_provider.gather_addresses_by_role = MagicMock(return_value=addresses) +def test_nginx_config_is_parsed_with_workers(context, nginx_container, coordinator, addresses): + coordinator.cluster.gather_addresses_by_role.return_value = addresses - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) + nginx = NginxConfig("localhost") - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - - prepared_config = nginx.config() + prepared_config = nginx.config(coordinator) assert isinstance(prepared_config, str) @pytest.mark.parametrize( "addresses", ( - {"all": {"1.2.3.4"}}, - {"all": {"1.2.3.4", "1.2.3.5"}}, { - "all": {"1.2.3.4"}, "distributor": {"1.2.3.5"}, "ingester": {"1.2.3.6"}, "querier": {"1.2.4.7"}, @@ -124,23 +90,16 @@ def test_nginx_config_is_parsed_with_workers( ), ) def test_nginx_config_contains_upstreams_and_proxy_pass( - context, nginx_container, tempo_cluster_provider, addresses + context, nginx_container, coordinator, addresses ): - tempo_cluster_provider.gather_addresses_by_role = MagicMock(return_value=addresses) - - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) + coordinator.cluster.gather_addresses_by_role.return_value = addresses - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") + nginx = NginxConfig("localhost") - prepared_config = nginx.config() + prepared_config = nginx.config(coordinator) for role, addresses in addresses.items(): for address in addresses: - if role == "all": - _assert_config_per_role(Tempo.all_ports, address, prepared_config) if role == "distributor": _assert_config_per_role(Tempo.receiver_ports, address, prepared_config) if role == "query-frontend": diff --git a/tests/scenario/test_tempo_clustered.py b/tests/scenario/test_tempo_clustered.py index b7eb75c..1e264f5 100644 --- a/tests/scenario/test_tempo_clustered.py +++ b/tests/scenario/test_tempo_clustered.py @@ -1,37 +1,54 @@ import datetime +import json from unittest.mock import MagicMock, patch import pytest import scenario from charms.tempo_k8s.v2.tracing import TracingRequirerAppData from charms.tls_certificates_interface.v3.tls_certificates import ProviderCertificate +from cosl.coordinated_workers.interface import ClusterProvider, ClusterProviderAppData from scenario import Relation, State from charm import TempoCoordinatorCharm from tempo import Tempo -from tempo_cluster import TempoClusterProviderAppData from tests.scenario.helpers import get_tempo_config +@pytest.fixture(scope="function") +def coordinator_with_initial_config(): + new_coordinator_mock = MagicMock() + new_coordinator_mock.return_value.tls_available = False + new_coordinator_mock.return_value.hostname = "tempo-test-0.test.cluster.svc.local" + new_coordinator_mock.return_value._s3_config = { + "access_key_id": "key", + "bucket_name": "tempo", + "endpoint": "http://1.2.3.4:9000", + "secret_access_key": "soverysecret", + } + new_coordinator_mock.return_value.cluster.gather_addresses.return_value = {"localhost"} + new_coordinator_mock.return_value.cluster.gather_addresses_by_role.return_value = { + "query-frontend": {"localhost"}, + "distributor": {"localhost"}, + } + + return new_coordinator_mock + + @pytest.fixture -def all_worker_with_initial_config(all_worker: Relation, s3_config): - 
container = MagicMock() - container.can_connect = lambda: True - # prevent tls_ready from reporting True - container.exists = lambda path: ( - False if path in [Tempo.tls_cert_path, Tempo.tls_key_path, Tempo.tls_ca_path] else True - ) - initial_config = Tempo(container).generate_config( - ["otlp_http"], s3_config, {"all": "localhost"} +def all_worker_with_initial_config(all_worker: Relation, coordinator_with_initial_config): + + initial_config = Tempo(lambda: ("otlp_http",)).config( + coordinator_with_initial_config.return_value ) - new_local_app_data = TempoClusterProviderAppData( - tempo_config=initial_config, - loki_endpoints={}, - ca_cert="foo cert", - server_cert="bar cert", - privkey_secret_id="super secret", - tempo_receiver={"otlp_http": "https://foo.com/fake_receiver"}, - ).dump() + + new_local_app_data = { + "worker_config": json.dumps(initial_config), + "ca_cert": json.dumps("foo cert"), + "server_cert": json.dumps("bar cert"), + "privkey_secret_id": json.dumps("super secret"), + "tracing_receivers": json.dumps({"otlp_http": "https://foo.com/fake_receiver"}), + } + return all_worker.replace(local_app_data=new_local_app_data) @@ -79,16 +96,16 @@ def state_with_certs( def test_certs_ready(context, state_with_certs): with context.manager("update-status", state_with_certs) as mgr: charm: TempoCoordinatorCharm = mgr.charm - assert charm.cert_handler.server_cert == MOCK_SERVER_CERT - assert charm.cert_handler.ca_cert == MOCK_CA_CERT - assert charm.cert_handler.private_key + assert charm.coordinator.cert_handler.server_cert == MOCK_SERVER_CERT + assert charm.coordinator.cert_handler.ca_cert == MOCK_CA_CERT + assert charm.coordinator.cert_handler.private_key def test_cluster_relation(context, state_with_certs, all_worker): clustered_state = state_with_certs.replace(relations=state_with_certs.relations + [all_worker]) state_out = context.run(all_worker.joined_event, clustered_state) cluster_out = state_out.get_relations(all_worker.endpoint)[0] - local_app_data = TempoClusterProviderAppData.load(cluster_out.local_app_data) + local_app_data = ClusterProviderAppData.load(cluster_out.local_app_data) assert local_app_data.ca_cert == MOCK_CA_CERT assert local_app_data.server_cert == MOCK_SERVER_CERT @@ -97,22 +114,26 @@ def test_cluster_relation(context, state_with_certs, all_worker): # certhandler's vault uses revision 0 to store an uninitialized-vault marker assert secret.contents[1]["private-key"] - assert local_app_data.tempo_config + assert local_app_data.worker_config +@patch.object(ClusterProvider, "gather_addresses") @pytest.mark.parametrize("requested_protocol", ("otlp_grpc", "zipkin")) def test_tempo_restart_on_ingress_v2_changed( + mock_cluster, + coordinator_with_initial_config, context, - tmp_path, requested_protocol, s3, - s3_config, all_worker_with_initial_config, nginx_container, nginx_prometheus_exporter_container, ): + mock_cluster.return_value = {"localhost"} + # GIVEN # the remote end requests an otlp_grpc endpoint + tracing = Relation( "tracing", remote_app_data=TracingRequirerAppData(receivers=[requested_protocol]).dump(), @@ -125,12 +146,13 @@ def test_tempo_restart_on_ingress_v2_changed( relations=[tracing, s3, all_worker_with_initial_config], containers=[nginx_container, nginx_prometheus_exporter_container], ) + state_out = context.run(tracing.changed_event, state) # THEN # Tempo pushes a new config to the all_worker new_config = get_tempo_config(state_out) - expected_config = Tempo().generate_config( - ["otlp_http", requested_protocol], s3_config, {"all": "localhost"} 
+ expected_config = Tempo(lambda: ["otlp_http", requested_protocol]).config( + coordinator_with_initial_config.return_value ) assert new_config == expected_config diff --git a/tests/scenario/test_tls.py b/tests/scenario/test_tls.py index 8021733..777a0ee 100644 --- a/tests/scenario/test_tls.py +++ b/tests/scenario/test_tls.py @@ -4,10 +4,9 @@ import pytest from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled from charms.tempo_k8s.v2.tracing import TracingProviderAppData, TracingRequirerAppData +from cosl.coordinated_workers.coordinator import Coordinator from scenario import Relation, Secret, State -from charm import TempoCoordinatorCharm - @pytest.fixture def base_state(nginx_container, nginx_prometheus_exporter_container): @@ -38,9 +37,7 @@ def update_relations_tls_and_verify( tracing, ): state = base_state.replace(relations=relations) - with charm_tracing_disabled(), patch.object( - TempoCoordinatorCharm, "tls_available", local_has_tls - ): + with charm_tracing_disabled(), patch.object(Coordinator, "tls_available", local_has_tls): out = context.run(tracing.changed_event, state) tracing_provider_app_data = TracingProviderAppData.load( out.get_relations(tracing.endpoint)[0].local_app_data diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9a74dac..1da8294 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -30,6 +30,7 @@ def test_entrypoints_are_generated_with_sanitized_names(self): "otlp-grpc": {"address": ":4317"}, "otlp-http": {"address": ":4318"}, "jaeger-thrift-http": {"address": ":14268"}, + "jaeger-grpc": {"address": ":14250"}, } } self.assertEqual(self.harness.charm._static_ingress_config, expected_entrypoints) diff --git a/tests/unit/test_coherence.py b/tests/unit/test_coherence.py index cf85915..ad0fca5 100644 --- a/tests/unit/test_coherence.py +++ b/tests/unit/test_coherence.py @@ -1,22 +1,17 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest as pytest +from cosl.coordinated_workers.coordinator import Coordinator -from coordinator import ( +from tempo_config import ( MINIMAL_DEPLOYMENT, RECOMMENDED_DEPLOYMENT, - TempoCoordinator, TempoRole, + TempoRolesConfig, ) -def _to_endpoint_name(role: TempoRole): - return role.value.replace("_", "-") - - -ALL_TEMPO_RELATION_NAMES = list(map(_to_endpoint_name, TempoRole)) - - +@patch("cosl.coordinated_workers.coordinator.Coordinator.__init__", return_value=None) @pytest.mark.parametrize( "roles, expected", ( @@ -27,13 +22,19 @@ def _to_endpoint_name(role: TempoRole): (RECOMMENDED_DEPLOYMENT, True), ), ) -def test_coherent(roles, expected): - mock = MagicMock() - mock.gather_roles = MagicMock(return_value=roles) - mc = TempoCoordinator(mock) +def test_coherent(mock_coordinator, roles, expected): + + mc = Coordinator(None, None, "", "", 0, None, None, None) + cluster_mock = MagicMock() + cluster_mock.gather_roles = MagicMock(return_value=roles) + mc.cluster = cluster_mock + mc._is_coherent = None + mc.roles_config = TempoRolesConfig() + assert mc.is_coherent is expected +@patch("cosl.coordinated_workers.coordinator.Coordinator.__init__", return_value=None) @pytest.mark.parametrize( "roles, expected", ( @@ -44,8 +45,12 @@ def test_coherent(roles, expected): (RECOMMENDED_DEPLOYMENT, True), ), ) -def test_recommended(roles, expected): - mock = MagicMock() - mock.gather_roles = MagicMock(return_value=roles) - mc = TempoCoordinator(mock) +def test_recommended(mock_coordinator, roles, expected): + mc = Coordinator(None, None, "", "", 0, None, 
None, None) + cluster_mock = MagicMock() + cluster_mock.gather_roles = MagicMock(return_value=roles) + mc.cluster = cluster_mock + mc._is_recommended = None + mc.roles_config = TempoRolesConfig() + assert mc.is_recommended is expected diff --git a/tests/unit/test_tempo.py b/tests/unit/test_tempo.py index ed0fbd2..27ca813 100644 --- a/tests/unit/test_tempo.py +++ b/tests/unit/test_tempo.py @@ -52,8 +52,8 @@ "thrift_http": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, } @@ -61,8 +61,8 @@ "zipkin": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, "otlp": { @@ -70,8 +70,8 @@ "http": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, } @@ -82,10 +82,7 @@ ), ) def test_tempo_distributor_config(protocols, use_tls, expected_config): - assert ( - Tempo(None, use_tls=use_tls)._build_distributor_config(protocols).receivers - == expected_config - ) + assert Tempo(None)._build_distributor_config(protocols, use_tls).receivers == expected_config @pytest.mark.parametrize( @@ -108,4 +105,4 @@ def test_tempo_distributor_config(protocols, use_tls, expected_config): ), ) def test_tempo_memberlist_config(peers, expected_config): - assert Tempo()._build_memberlist_config(peers) == expected_config + assert Tempo(None)._build_memberlist_config(peers) == expected_config diff --git a/tox.ini b/tox.ini index 5a36eed..309e723 100644 --- a/tox.ini +++ b/tox.ini @@ -89,8 +89,12 @@ deps = pytest-operator requests -r{toxinidir}/requirements.txt + minio + tenacity==8.2.3 # tracegen opentelemetry-exporter-otlp-proto-grpc + opentelemetry-exporter-zipkin + opentelemetry-exporter-jaeger commands = pytest -v --tb native --log-cli-level=INFO {[vars]tst_path}integration -s {posargs}
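Note: the integration tests in this changeset import deploy_cluster, emit_trace, get_application_ip, get_traces, get_traces_patiently and protocols_endpoints from a shared helpers module that is not part of these hunks. As a rough, non-authoritative sketch of what those helpers might look like — the endpoint templates, retry settings and exact signatures below are assumptions inferred from how the tests call them, not the actual module:

import json

import requests
from pytest_operator.plugin import OpsTest
from tenacity import retry, stop_after_attempt, wait_exponential

# Receiver protocol -> endpoint template, formatted with the tempo hostname or IP.
# The ports match the receiver ports used elsewhere in this changeset; the URL
# paths are assumptions.
protocols_endpoints = {
    "otlp_http": "https://{}:4318/v1/traces",
    "otlp_grpc": "{}:4317",
    "zipkin": "https://{}:9411/v1/traces",
    "jaeger_thrift_http": "https://{}:14268/api/traces?format=jaeger.thrift",
    "jaeger_grpc": "{}:14250",
}


async def get_application_ip(ops_test: OpsTest, app_name: str) -> str:
    # public address of the application as reported by the model status
    status = await ops_test.model.get_status()
    return status["applications"][app_name].public_address


def get_traces(tempo_host: str, service_name: str = None, tls: bool = True):
    # query Tempo's search API; the server cert is issued for the fqdn rather
    # than the IP, so certificate verification is skipped here
    scheme = "https" if tls else "http"
    resp = requests.get(f"{scheme}://{tempo_host}:3200/api/search", verify=False)
    assert resp.status_code == 200
    traces = json.loads(resp.text).get("traces", [])
    if service_name:
        traces = [t for t in traces if t.get("rootServiceName") == service_name]
    return traces


@retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10))
async def get_traces_patiently(tempo_host: str, service_name: str = None, tls: bool = True):
    # ingestion is asynchronous, so retry until the expected trace shows up
    traces = get_traces(tempo_host, service_name=service_name, tls=tls)
    assert traces
    return traces

deploy_cluster and emit_trace are likewise assumed to live in the same helpers module, wrapping the s3-integrator/worker deployment and the tracegen.py invocation respectively; they are omitted from this sketch.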