From bd6d33c7cf56bbac66d997ead094645e65cb0c5a Mon Sep 17 00:00:00 2001 From: michael Date: Wed, 17 Jul 2024 19:26:32 +0300 Subject: [PATCH 01/22] implement generic coord --- src/charm.py | 452 +++++++------------------ src/coordinator.py | 85 ----- src/{nginx.py => nginx_config.py} | 152 ++------- src/nginx_prometheus_exporter.py | 44 --- src/tempo.py | 176 +++------- src/tempo_cluster.py | 391 --------------------- src/tempo_config.py | 84 ++++- tests/scenario/conftest.py | 2 +- tests/scenario/helpers.py | 1 - tests/scenario/test_config.py | 2 +- tests/scenario/test_nginx.py | 4 +- tests/scenario/test_tempo_clustered.py | 2 +- tests/unit/test_coherence.py | 1 - 13 files changed, 282 insertions(+), 1114 deletions(-) delete mode 100644 src/coordinator.py rename src/{nginx.py => nginx_config.py} (61%) delete mode 100644 src/nginx_prometheus_exporter.py delete mode 100644 src/tempo_cluster.py diff --git a/src/charm.py b/src/charm.py index 276f4f1..5947939 100755 --- a/src/charm.py +++ b/src/charm.py @@ -3,45 +3,36 @@ # See LICENSE file for licensing details. """Charmed Operator for Tempo; a lightweight object storage based tracing backend.""" -import json import logging import socket from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple, get_args +from subprocess import CalledProcessError, getoutput +from typing import Dict, Optional, Set, Tuple, get_args import ops -from charms.data_platform_libs.v0.s3 import S3Requirer -from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider -from charms.observability_libs.v1.cert_handler import VAULT_SECRET_LABEL, CertHandler -from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from charms.tempo_k8s.v1.charm_tracing import trace_charm from charms.tempo_k8s.v2.tracing import ( ReceiverProtocol, RequestEvent, TracingEndpointProvider, + receiver_protocol_to_transport_protocol, ) from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer -from ops.charm import CharmBase, CollectStatusEvent, RelationEvent +from cosl.coordinated_workers.coordinator import Coordinator +from ops.charm import CharmBase, RelationEvent from ops.main import main -from ops.model import ActiveStatus, BlockedStatus, Relation, WaitingStatus -from coordinator import TempoCoordinator -from nginx import Nginx -from nginx_prometheus_exporter import NginxPrometheusExporter +from nginx_config import NginxConfig from tempo import Tempo -from tempo_cluster import TempoClusterProvider +from tempo_config import TempoRolesConfig logger = logging.getLogger(__name__) -class S3NotFoundError(Exception): - """Raised when the s3 integration is not present or not ready.""" - - @trace_charm( tracing_endpoint="tempo_otlp_http_endpoint", - server_cert="server_cert", + server_cert="server_ca_cert", extra_types=(Tempo, TracingEndpointProvider), ) class TempoCoordinatorCharm(CharmBase): @@ -51,29 +42,27 @@ def __init__(self, *args): super().__init__(*args) self.ingress = TraefikRouteRequirer(self, self.model.get_relation("ingress"), "ingress") # type: ignore - self.tempo_cluster = TempoClusterProvider(self) - self.coordinator = TempoCoordinator(self.tempo_cluster) - - # keep this above Tempo instantiation, as we need it in self.tls_enabled - self.cert_handler = CertHandler( - self, - key="tempo-server-cert", - sans=[self.hostname], - ) - - self.tempo = tempo = Tempo( - external_host=self.hostname, - use_tls=self.tls_available, - ) - - self.s3_requirer = 
S3Requirer(self, Tempo.s3_relation_name, Tempo.s3_bucket_name) - - self.nginx = Nginx( - self, - cluster_provider=self.tempo_cluster, - server_name=self.hostname, + self.tempo = Tempo(requested_receivers=self._requested_receivers) + # set the open ports for this unit + self.unit.set_ports(*self.tempo.all_ports.values()) + self.coordinator = Coordinator( + charm=self, + roles_config=TempoRolesConfig(), + s3_bucket_name=Tempo.s3_bucket_name, + external_url=self._external_url, + worker_metrics_port="8080", + endpoints={ + "certificates": "certificates", + "cluster": "tempo-cluster", + "grafana-dashboards": "grafana-dashboard", + "logging": "logging", + "metrics": "metrics-endpoint", + "s3": "s3", + }, + nginx_config=NginxConfig(server_name=self.hostname).config, + workers_config=self.tempo.config, + tracing_receivers=self.requested_receivers_urls, ) - self.nginx_prometheus_exporter = NginxPrometheusExporter(self) # configure this tempo as a datasource in grafana self.grafana_source_provider = GrafanaSourceProvider( @@ -82,91 +71,47 @@ def __init__(self, *args): source_url=self._external_http_server_url, refresh_event=[ # refresh the source url when TLS config might be changing - self.on[self.cert_handler.certificates_relation_name].relation_changed, + self.on[self.coordinator.cert_handler.certificates_relation_name].relation_changed, # or when ingress changes self.ingress.on.ready, ], ) - # # Patch the juju-created Kubernetes service to contain the right ports - self.unit.set_ports(*self.tempo.all_ports.values()) - - # Provide ability for Tempo to be scraped by Prometheus using prometheus_scrape - self._scraping = MetricsEndpointProvider( - self, - relation_name="metrics-endpoint", - jobs=[{"static_configs": [{"targets": [f"*:{tempo.tempo_http_server_port}"]}]}], - ) - self._grafana_dashboards = GrafanaDashboardProvider( - self, relation_name="grafana-dashboard" - ) self.tracing = TracingEndpointProvider(self, external_url=self._external_url) - self._inconsistencies = self.coordinator.get_deployment_inconsistencies( - has_s3=self.s3_ready - ) - self._is_consistent = not self._inconsistencies # We always listen to collect-status self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) - if not self._is_consistent: - logger.error( - f"Inconsistent deployment. {self.unit.name} will be shutting down. " - "This likely means you need to add an s3 integration. " - "This charm will be unresponsive and refuse to handle any event until " - "the situation is resolved by the cloud admin, to avoid data loss." - ) - return # refuse to handle any other event as we can't possibly know what to do. + # refuse to handle any other event as we can't possibly know what to do. 
+ if ( + not self.coordinator.cluster.has_workers + or not self.coordinator.is_coherent + or not self.coordinator.s3_ready + ): + return # lifecycle self.framework.observe(self.on.leader_elected, self._on_leader_elected) - self.framework.observe(self.on.update_status, self._on_update_status) - self.framework.observe(self.on.config_changed, self._on_config_changed) self.framework.observe(self.on.list_receivers_action, self._on_list_receivers_action) - # nginx - self.framework.observe(self.on.nginx_pebble_ready, self._on_nginx_pebble_ready) - self.framework.observe( - self.on.nginx_prometheus_exporter_pebble_ready, - self._on_nginx_prometheus_exporter_pebble_ready, - ) - # ingress ingress = self.on["ingress"] self.framework.observe(ingress.relation_created, self._on_ingress_relation_created) self.framework.observe(ingress.relation_joined, self._on_ingress_relation_joined) self.framework.observe(self.ingress.on.ready, self._on_ingress_ready) - # s3 - self.framework.observe( - self.s3_requirer.on.credentials_changed, self._on_s3_credentials_changed - ) - self.framework.observe(self.s3_requirer.on.credentials_gone, self._on_s3_credentials_gone) - # tracing self.framework.observe(self.tracing.on.request, self._on_tracing_request) self.framework.observe(self.tracing.on.broken, self._on_tracing_broken) - self.framework.observe(self.on.peers_relation_created, self._on_peers_relation_created) - self.framework.observe(self.on.peers_relation_changed, self._on_peers_relation_changed) # tls - self.framework.observe(self.cert_handler.on.cert_changed, self._on_cert_handler_changed) - - # cluster - self.framework.observe(self.tempo_cluster.on.changed, self._on_tempo_cluster_changed) - - for evt in self.on.events().values(): - self.framework.observe(evt, self._on_event) # type: ignore + self.framework.observe( + self.coordinator.cert_handler.on.cert_changed, self._on_cert_handler_changed + ) ###################### # UTILITY PROPERTIES # ###################### - - @property - def is_clustered(self) -> bool: - """Check whether this Tempo is a coordinator and has worker nodes connected to it.""" - return self.tempo_cluster.has_workers - @property def hostname(self) -> str: """Unit's hostname.""" @@ -195,77 +140,12 @@ def _external_url(self) -> str: @property def _internal_url(self) -> str: - scheme = "https" if self.tls_available else "http" + """Returns workload's FQDN.""" + scheme = "http" + if hasattr(self, "coordinator") and self.coordinator.nginx.are_certificates_on_disk: + scheme = "https" return f"{scheme}://{self.hostname}" - @property - def tls_available(self) -> bool: - """Return True if tls is enabled and the necessary certs are found.""" - return ( - self.cert_handler.enabled - and (self.cert_handler.server_cert is not None) - and (self.cert_handler.private_key is not None) - and (self.cert_handler.ca_cert is not None) - ) - - @property - def _s3_config(self) -> dict: - s3_config = self.s3_requirer.get_s3_connection_info() - if ( - s3_config - and "bucket" in s3_config - and "endpoint" in s3_config - and "access-key" in s3_config - and "secret-key" in s3_config - ): - return s3_config - raise S3NotFoundError("s3 integration inactive") - - @property - def s3_ready(self) -> bool: - """Check whether s3 is configured.""" - try: - return bool(self._s3_config) - except S3NotFoundError: - return False - - @property - def peer_addresses(self) -> List[str]: - peers = self._peers - relation = self.model.get_relation("peers") - # get unit addresses for all the other units from a databag - if peers and 
relation: - addresses = [relation.data[unit].get("local-ip") for unit in peers] - addresses = list(filter(None, addresses)) - else: - addresses = [] - - # add own address - if self._local_ip: - addresses.append(self._local_ip) - - return addresses - - @property - def _local_ip(self) -> Optional[str]: - try: - binding = self.model.get_binding("peers") - if not binding: - logger.error( - "unable to get local IP at this time: " - "peers binding not active yet. It could be that the charm " - "is still being set up..." - ) - return None - return str(binding.network.bind_address) - except (ops.ModelError, KeyError) as e: - logger.debug("failed to obtain local ip from peers binding", exc_info=True) - logger.error( - f"unable to get local IP at this time: failed with {type(e)}; " - f"see debug log for more info" - ) - return None - @property def enabled_receivers(self) -> Set[str]: """Extra receivers enabled through config""" @@ -288,39 +168,26 @@ def _on_tracing_broken(self, _): """Update tracing relations' databags once one relation is removed.""" self._update_tracing_relations() - def _on_cert_handler_changed(self, _): - if self.tls_available: - logger.debug("enabling TLS") - self.nginx.configure_tls( - server_cert=self.cert_handler.server_cert, # type: ignore - ca_cert=self.cert_handler.ca_cert, # type: ignore - private_key=self.cert_handler.private_key, # type: ignore - ) - else: - logger.debug("disabling TLS") - self.nginx.delete_certificates() + def _on_cert_handler_changed(self, e: ops.RelationChangedEvent): + + self.coordinator._on_cert_handler_changed(e) # tls readiness change means config change. # sync scheme change with traefik and related consumers self._configure_ingress() - # sync the server cert with the charm container. + # sync the server CA cert with the charm container. # technically, because of charm tracing, this will be called first thing on each event - self._update_server_cert() + self._update_server_ca_cert() # update relations to reflect the new certificate self._update_tracing_relations() - # notify the cluster - self._update_tempo_cluster() - def _on_tracing_request(self, e: RequestEvent): """Handle a remote requesting a tracing endpoint.""" logger.debug(f"received tracing request from {e.relation.app}: {e.requested_receivers}") self._update_tracing_relations() - def _on_tempo_cluster_changed(self, _: RelationEvent): - self._update_tempo_cluster() def _on_ingress_relation_created(self, _: RelationEvent): self._configure_ingress() @@ -332,29 +199,6 @@ def _on_leader_elected(self, _: ops.LeaderElectedEvent): # as traefik_route goes through app data, we need to take lead of traefik_route if our leader dies. 
self._configure_ingress() - def _on_s3_credentials_changed(self, _): - self._on_s3_changed() - - def _on_s3_credentials_gone(self, _): - self._on_s3_changed() - - def _on_s3_changed(self): - self._update_tempo_cluster() - - def _on_peers_relation_created(self, event: ops.RelationCreatedEvent): - if self._local_ip: - event.relation.data[self.unit]["local-ip"] = self._local_ip - - def _on_peers_relation_changed(self, _): - self._update_tempo_cluster() - - def _on_config_changed(self, _): - # check if certificate files haven't disappeared and recreate them if needed - self._update_tempo_cluster() - - def _on_update_status(self, _): - """Update the status of the application.""" - def _on_ingress_ready(self, _event): # whenever there's a change in ingress, we need to update all tracing relations self._update_tracing_relations() @@ -366,52 +210,16 @@ def _on_ingress_revoked(self, _event): def _on_list_receivers_action(self, event: ops.ActionEvent): res = {} for receiver in self._requested_receivers(): - res[receiver.replace("_", "-")] = ( - f"{self.ingress.external_host or self.tempo.url}:{self.tempo.receiver_ports[receiver]}" - ) + res[receiver.replace("_", "-")] = self.get_receiver_url(receiver) event.set_results(res) # keep this event handler at the bottom - def _on_collect_unit_status(self, e: CollectStatusEvent): - # todo add [nginx.workload] statuses - - if not self.tempo.is_ready: - e.add_status(WaitingStatus("[workload.tempo] Tempo API not ready just yet...")) - - # TODO: should we set these statuses on the leader only, or on all units? - if issues := self._inconsistencies: - for issue in issues: - e.add_status(BlockedStatus("[consistency.issues]" + issue)) - e.add_status(BlockedStatus("[consistency] Unit *disabled*.")) - else: - if self.is_clustered: - # no issues: tempo is consistent - if not self.coordinator.is_recommended: - e.add_status(ActiveStatus("[coordinator] degraded")) - else: - e.add_status(ActiveStatus()) - else: - e.add_status(ActiveStatus()) - - def _on_nginx_pebble_ready(self, _) -> None: - self.nginx.configure_pebble_layer() - - def _on_nginx_prometheus_exporter_pebble_ready(self, _) -> None: - self.nginx_prometheus_exporter.configure_pebble_layer() - - def _on_event(self, event) -> None: - """A set of common configuration actions that should happen on every event.""" - if isinstance(event, CollectStatusEvent): - return - # plan layers - self.nginx.configure_pebble_layer() - self.nginx_prometheus_exporter.configure_pebble_layer() - # configure ingress - self._configure_ingress() - # update cluster relations - self._update_tempo_cluster() - # update tracing relations - self._update_tracing_relations() + def _on_collect_unit_status(self, e: ops.CollectStatusEvent): + self.coordinator._on_collect_unit_status(e) + # add Tempo charm custom blocking conditions + # TODO: avoid waiting for update-status event + # if not self.is_workload_ready(): + # e.add_status(ops.WaitingStatus("[workload.tempo] Tempo API not ready just yet...")) ################### # UTILITY METHODS # @@ -429,7 +237,7 @@ def _configure_ingress(self) -> None: self._update_tracing_relations() # notify the cluster - self._update_tempo_cluster() + self.coordinator.update_cluster() def _update_tracing_relations(self) -> None: tracing_relations = self.model.relations["tracing"] @@ -443,10 +251,10 @@ def _update_tracing_relations(self) -> None: # publish requested protocols to all relations if self.unit.is_leader(): self.tracing.publish_receivers( - [(p, self.tempo.get_receiver_url(p, self.ingress)) for p in 
requested_receivers] + [(p, self.get_receiver_url(p)) for p in requested_receivers] ) - self._update_tempo_cluster() + self.coordinator.update_cluster() def _requested_receivers(self) -> Tuple[ReceiverProtocol, ...]: """List what receivers we should activate, based on the active tracing relations and config-enabled extra receivers.""" @@ -459,95 +267,34 @@ def _requested_receivers(self) -> Tuple[ReceiverProtocol, ...]: requested_receivers = requested_protocols.intersection(set(self.tempo.receiver_ports)) return tuple(requested_receivers) - def server_cert(self) -> str: + def server_ca_cert(self) -> str: """For charm tracing.""" - self._update_server_cert() - return self.tempo.server_cert_path - - def _update_server_cert(self) -> None: - """Server certificate for charm tracing tls, if tls is enabled.""" - server_cert = Path(self.tempo.server_cert_path) - if self.tls_available: - if not server_cert.exists(): - server_cert.parent.mkdir(parents=True, exist_ok=True) - if self.cert_handler.server_cert: - server_cert.write_text(self.cert_handler.server_cert) + self._update_server_ca_cert() + return self.tempo.tls_ca_path + + def _update_server_ca_cert(self) -> None: + """Server CA certificate for charm tracing tls, if tls is enabled.""" + server_ca_cert = Path(self.tempo.tls_ca_path) + if self.coordinator.tls_available: + if not server_ca_cert.exists(): + server_ca_cert.parent.mkdir(parents=True, exist_ok=True) + if self.coordinator.cert_handler.ca_cert: + server_ca_cert.write_text(self.coordinator.cert_handler.ca_cert) else: # tls unavailable: delete local cert - server_cert.unlink(missing_ok=True) + server_ca_cert.unlink(missing_ok=True) def tempo_otlp_http_endpoint(self) -> Optional[str]: """Endpoint at which the charm tracing information will be forwarded.""" # the charm container and the tempo workload container have apparently the same # IP, so we can talk to tempo at localhost. - if self.tempo.is_ready: + if self.is_workload_ready(): return f"{self._internal_url}:{self.tempo.receiver_ports['otlp_http']}" - return None - - @property - def _peers(self) -> Optional[Set[ops.model.Unit]]: - relation = self.model.get_relation("peers") - if not relation: - return None - - # self is not included in relation.units - return relation.units - - @property - def loki_endpoints_by_unit(self) -> Dict[str, str]: - """Loki endpoints from relation data in the format needed for Pebble log forwarding. - - Returns: - A dictionary of remote units and the respective Loki endpoint. 
- { - "loki/0": "http://loki:3100/loki/api/v1/push", - "another-loki/0": "http://another-loki:3100/loki/api/v1/push", - } - """ - endpoints: Dict = {} - relations: List[Relation] = self.model.relations.get("logging-consumer", []) - - for relation in relations: - for unit in relation.units: - if "endpoint" not in relation.data[unit]: - continue - endpoint = relation.data[unit]["endpoint"] - deserialized_endpoint = json.loads(endpoint) - url = deserialized_endpoint["url"] - endpoints[unit.name] = url - - return endpoints - - def _update_tempo_cluster(self) -> None: - """Build the config and publish everything to the application databag.""" - if not self._is_consistent: - logger.error("skipped tempo cluster update: inconsistent state") - return - - if not self.unit.is_leader(): - return - - kwargs = {} - - if self.tls_available: - # we share the certs in plaintext as they're not sensitive information - kwargs["ca_cert"] = self.cert_handler.ca_cert - kwargs["server_cert"] = self.cert_handler.server_cert - kwargs["privkey_secret_id"] = self.tempo_cluster.publish_privkey(VAULT_SECRET_LABEL) - - # On every function call, we always publish everything to the databag; however, if there - # are no changes, Juju will notice there's no delta and do nothing - self.tempo_cluster.publish_data( - tempo_config=self.tempo.generate_config( - self._requested_receivers(), - self._s3_config, - self.tempo_cluster.gather_addresses_by_role(), - self.tempo_cluster.gather_addresses(), - ), - loki_endpoints=self.loki_endpoints_by_unit, - # TODO tempo receiver for charm tracing - **kwargs, - ) + def requested_receivers_urls(self) -> Dict[str, str]: + """Endpoints to which the workload (and the worker charm) can push traces to.""" + return { + receiver: self.get_receiver_url(receiver) for receiver in self._requested_receivers() + } @property def _static_ingress_config(self) -> dict: @@ -571,7 +318,7 @@ def _ingress_config(self) -> dict: # TODO better matcher "rule": "ClientIP(`0.0.0.0/0`)", } - if sanitized_protocol.endswith("grpc") and not self.tls_available: + if sanitized_protocol.endswith("grpc") and not self.coordinator.tls_available: # to send traces to unsecured GRPC endpoints, we need h2c # see https://doc.traefik.io/traefik/v2.0/user-guides/grpc/#with-http-h2c http_services[ @@ -590,6 +337,49 @@ def _ingress_config(self) -> dict: }, } + def get_receiver_url(self, protocol: ReceiverProtocol): + """Return the receiver endpoint URL based on the protocol. + + if ingress is used, return endpoint provided by the ingress instead. 
+ """ + protocol_type = receiver_protocol_to_transport_protocol.get(protocol) + # ingress.is_ready returns True even when traefik hasn't sent any data yet + has_ingress = ( + self.ingress.is_ready() and self.ingress.external_host and self.ingress.scheme + ) + receiver_port = self.tempo.receiver_ports[protocol] + + if has_ingress: + url = ( + self.ingress.external_host + if protocol_type == "grpc" + else f"{self.ingress.scheme}://{self.ingress.external_host}" + ) + else: + url = ( + self.coordinator.hostname + if protocol_type == "grpc" + else self.coordinator._internal_url + ) + + return f"{url}:{receiver_port}" + + def is_workload_ready(self): + """Whether the tempo built-in readiness check reports 'ready'.""" + if self.coordinator.tls_available: + tls, s = f" --cacert {self.tempo.tls_ca_path}", "s" + else: + tls = s = "" + + # cert is for fqdn/ingress, not for IP + cmd = f"curl{tls} http{s}://{self.coordinator.hostname}:{self.tempo.tempo_http_server_port}/ready" + + try: + out = getoutput(cmd).split("\n")[-1] + except (CalledProcessError, IndexError): + return False + return out == "ready" + if __name__ == "__main__": # pragma: nocover main(TempoCoordinatorCharm) diff --git a/src/coordinator.py b/src/coordinator.py deleted file mode 100644 index b7bcea9..0000000 --- a/src/coordinator.py +++ /dev/null @@ -1,85 +0,0 @@ -import logging -from collections import Counter -from typing import Dict, List, Optional, Set - -from tempo_cluster import TempoClusterProvider, TempoRole - -logger = logging.getLogger(__name__) - -MINIMAL_DEPLOYMENT = { - TempoRole.querier: 1, - TempoRole.query_frontend: 1, - TempoRole.ingester: 1, - TempoRole.distributor: 1, - TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, -} -"""The minimal set of roles that need to be allocated for the -deployment to be considered consistent (otherwise we set blocked).""" - -# TODO: find out what the actual recommended deployment is -RECOMMENDED_DEPLOYMENT = Counter( - { - TempoRole.querier: 1, - TempoRole.query_frontend: 1, - TempoRole.ingester: 3, - TempoRole.distributor: 1, - TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, - } -) -"""The set of roles that need to be allocated for the -deployment to be considered robust according to the official -recommendations/guidelines.""" - - -class TempoCoordinator: - """Tempo coordinator.""" - - def __init__(self, cluster_provider: TempoClusterProvider): - self._cluster_provider = cluster_provider - self._roles: Dict[TempoRole, int] = self._cluster_provider.gather_roles() - - # Whether the roles list makes up a coherent mimir deployment. - self.is_coherent = set(self._roles.keys()).issuperset(MINIMAL_DEPLOYMENT) - self.missing_roles: Set[TempoRole] = set(MINIMAL_DEPLOYMENT).difference(self._roles.keys()) - # If the coordinator is incoherent, return the roles that are missing for it to become so. - - def _is_recommended(): - for role, min_n in RECOMMENDED_DEPLOYMENT.items(): - if self._roles.get(role, 0) < min_n: - return False - return True - - self.is_recommended: bool = _is_recommended() - # Whether the present roles are a superset of the minimal deployment. - # I.E. If all required roles are assigned, and each role has the recommended amount of units. - # python>=3.11 would support roles >= RECOMMENDED_DEPLOYMENT - - def get_deployment_inconsistencies(self, has_s3: bool) -> List[str]: - """Determine whether the deployment as a whole is consistent. - - Return a list of failed consistency checks. 
- """ - return self._get_deployment_inconsistencies( - has_s3=has_s3, - coherent=self.is_coherent, - missing_roles=self.missing_roles, - ) - - @staticmethod - def _get_deployment_inconsistencies( - has_s3: bool, - coherent: bool, - missing_roles: Optional[Set[TempoRole]] = None, - ) -> List[str]: - """Determine whether the deployment as a whole is consistent. - - Return a list of failed consistency checks. - """ - failures = [] - if not has_s3: - failures.append("Tempo has no s3 integration.") - if not coherent: - failures.append(f"Incoherent coordinator: missing roles: {missing_roles}.") - return failures diff --git a/src/nginx.py b/src/nginx_config.py similarity index 61% rename from src/nginx.py rename to src/nginx_config.py index b706f9a..6f14d50 100644 --- a/src/nginx.py +++ b/src/nginx_config.py @@ -6,54 +6,29 @@ from typing import Any, Dict, List, Optional, Set import crossplane -from ops import CharmBase -from ops.pebble import Layer, PathError, ProtocolError +from cosl.coordinated_workers.coordinator import Coordinator +from cosl.coordinated_workers.nginx import CERT_PATH, KEY_PATH from tempo import Tempo -from tempo_cluster import TempoClusterProvider, TempoRole +from tempo_config import TempoRole logger = logging.getLogger(__name__) -NGINX_DIR = "/etc/nginx" -NGINX_CONFIG = f"{NGINX_DIR}/nginx.conf" -KEY_PATH = f"{NGINX_DIR}/certs/server.key" -CERT_PATH = f"{NGINX_DIR}/certs/server.cert" -CA_CERT_PATH = f"{NGINX_DIR}/certs/ca.cert" - - -class Nginx: +class NginxConfig: """Helper class to manage the nginx workload.""" - config_path = NGINX_CONFIG - - def __init__(self, charm: CharmBase, cluster_provider: TempoClusterProvider, server_name: str): - self._charm = charm - self.cluster_provider = cluster_provider + def __init__(self, server_name: str): self.server_name = server_name - self._container = self._charm.unit.get_container("nginx") - - def configure_pebble_layer(self) -> None: - """Configure pebble layer.""" - new_config: str = self.config() - should_restart: bool = self._has_config_changed(new_config) - if self._container.can_connect(): - self._container.push(self.config_path, new_config, make_dirs=True) # type: ignore - self._container.add_layer("nginx", self.layer, combine=True) - self._container.autostart() - if should_restart: - logger.info("new nginx config: reloading the service") - self.reload() - - def config(self) -> str: + def config(self, coordinator: Coordinator) -> str: """Build and return the Nginx configuration.""" - full_config = self._prepare_config() + full_config = self._prepare_config(coordinator) return crossplane.build(full_config) - def _prepare_config(self) -> List[dict]: + def _prepare_config(self, coordinator: Coordinator) -> List[dict]: log_level = "error" - addresses_by_role = self.cluster_provider.gather_addresses_by_role() + addresses_by_role = coordinator.cluster.gather_addresses_by_role() # build the complete configuration full_config = [ {"directive": "worker_processes", "args": ["5"]}, @@ -102,52 +77,12 @@ def _prepare_config(self) -> List[dict]: }, {"directive": "proxy_read_timeout", "args": ["300"]}, # server block - *self._servers(addresses_by_role), + *self._servers(addresses_by_role, coordinator.nginx.are_certificates_on_disk), ], }, ] return full_config - def _has_config_changed(self, new_config: str) -> bool: - """Return True if the passed config differs from the one on disk.""" - if not self._container.can_connect(): - logger.debug("Could not connect to Nginx container") - return False - - try: - current_config = 
self._container.pull(self.config_path).read() - except (ProtocolError, PathError) as e: - logger.warning( - "Could not check the current nginx configuration due to " - "a failure in retrieving the file: %s", - e, - ) - return False - - return current_config != new_config - - def reload(self) -> None: - """Reload the nginx config without restarting the service.""" - self._container.exec(["nginx", "-s", "reload"]) - - @property - def layer(self) -> Layer: - """Return the Pebble layer for Nginx.""" - return Layer( - { - "summary": "nginx layer", - "description": "pebble config layer for Nginx", - "services": { - "nginx": { - "override": "replace", - "summary": "nginx", - "command": "nginx -g 'daemon off;'", - "startup": "enabled", - } - }, - } - ) - def _log_verbose(self, verbose: bool = True) -> List[Dict[str, Any]]: if verbose: return [{"directive": "access_log", "args": ["/dev/stderr", "main"]}] @@ -167,28 +102,6 @@ def _upstreams(self, addresses_by_role: Dict[str, Set[str]]) -> List[Dict[str, A addresses_mapped_to_upstreams = {} nginx_upstreams = [] addresses_mapped_to_upstreams = addresses_by_role.copy() - if TempoRole.all in addresses_by_role.keys(): - # for all, we add addresses to existing upstreams for distributor / query_frontend or create the set - if TempoRole.distributor in addresses_mapped_to_upstreams: - addresses_mapped_to_upstreams[TempoRole.distributor] = ( - addresses_mapped_to_upstreams[TempoRole.distributor].union( - addresses_by_role[TempoRole.all] - ) - ) - else: - addresses_mapped_to_upstreams[TempoRole.distributor] = addresses_by_role[ - TempoRole.all - ] - if TempoRole.query_frontend in addresses_mapped_to_upstreams: - addresses_mapped_to_upstreams[TempoRole.query_frontend] = ( - addresses_mapped_to_upstreams[TempoRole.query_frontend].union( - addresses_by_role[TempoRole.all] - ) - ) - else: - addresses_mapped_to_upstreams[TempoRole.query_frontend] = addresses_by_role[ - TempoRole.all - ] if TempoRole.distributor in addresses_mapped_to_upstreams.keys(): nginx_upstreams.extend( self._distributor_upstreams(addresses_mapped_to_upstreams[TempoRole.distributor]) @@ -276,23 +189,29 @@ def _listen_args(self, port: int, ipv6: bool, ssl: bool, http2: bool) -> List[st args.append("http2") return args - def _servers(self, addresses_by_role: Dict[str, Set[str]]) -> List[Dict[str, Any]]: + def _servers( + self, addresses_by_role: Dict[str, Set[str]], tls: bool = False + ) -> List[Dict[str, Any]]: servers = [] roles = addresses_by_role.keys() - if TempoRole.distributor.value in roles or TempoRole.all.value in roles: + if TempoRole.distributor.value in roles: for protocol, port in Tempo.receiver_ports.items(): - servers.append(self._server(port, protocol.replace("_", "-"), "grpc" in protocol)) - if TempoRole.query_frontend.value in roles or TempoRole.all.value in roles: + servers.append( + self._server(port, protocol.replace("_", "-"), "grpc" in protocol, tls) + ) + if TempoRole.query_frontend.value in roles: for protocol, port in Tempo.server_ports.items(): - servers.append(self._server(port, protocol.replace("_", "-"), "grpc" in protocol)) + servers.append( + self._server(port, protocol.replace("_", "-"), "grpc" in protocol, tls) + ) return servers - def _server(self, port: int, upstream: str, grpc: bool = False) -> Dict[str, Any]: + def _server( + self, port: int, upstream: str, grpc: bool = False, tls: bool = False + ) -> Dict[str, Any]: auth_enabled = False - tls = self.tls_ready - if tls: return { "directive": "server", @@ -328,26 +247,3 @@ def _server(self, port: int, 
upstream: str, grpc: bool = False) -> Dict[str, Any *self._locations(upstream, grpc, tls), ], } - - @property - def tls_ready(self) -> bool: - """Whether cert, key, and ca paths are found on disk and Nginx is ready to use tls.""" - if not self._container.can_connect(): - return False - return all( - self._container.exists(tls_path) for tls_path in (KEY_PATH, CERT_PATH, CA_CERT_PATH) - ) - - def configure_tls(self, private_key: str, server_cert: str, ca_cert: str) -> None: - """Save the certificates file to disk and run update-ca-certificates.""" - if self._container.can_connect(): - self._container.push(KEY_PATH, private_key, make_dirs=True) - self._container.push(CERT_PATH, server_cert, make_dirs=True) - self._container.push(CA_CERT_PATH, ca_cert, make_dirs=True) - - def delete_certificates(self) -> None: - """Delete the certificate files from disk and run update-ca-certificates.""" - if self._container.can_connect(): - self._container.remove_path(CERT_PATH, recursive=True) - self._container.remove_path(KEY_PATH, recursive=True) - self._container.remove_path(CA_CERT_PATH, recursive=True) diff --git a/src/nginx_prometheus_exporter.py b/src/nginx_prometheus_exporter.py deleted file mode 100644 index 7ce7b2c..0000000 --- a/src/nginx_prometheus_exporter.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2024 Canonical -# See LICENSE file for licensing details. -"""Nginx Prometheus exporter workload.""" - -import logging - -from ops import CharmBase -from ops.pebble import Layer - -logger = logging.getLogger(__name__) - -NGINX_PROMETHEUS_EXPORTER_PORT = "9113" - - -class NginxPrometheusExporter: - """Helper class to manage the nginx prometheus exporter workload.""" - - def __init__(self, charm: CharmBase) -> None: - self._charm = charm - self._container = self._charm.unit.get_container("nginx-prometheus-exporter") - - def configure_pebble_layer(self) -> None: - """Configure pebble layer.""" - self._container.add_layer("nginx-prometheus-exporter", self.layer, combine=True) - self._container.autostart() - - @property - def layer(self) -> Layer: - """Return the Pebble layer for Nginx Prometheus exporter.""" - scheme = "https" if self._charm.tls_available else "http" # type: ignore - return Layer( - { - "summary": "nginx prometheus exporter layer", - "description": "pebble config layer for Nginx Prometheus exporter", - "services": { - "nginx": { - "override": "replace", - "summary": "nginx prometheus exporter", - "command": f"nginx-prometheus-exporter --no-nginx.ssl-verify --web.listen-address=:{NGINX_PROMETHEUS_EXPORTER_PORT} --nginx.scrape-uri={scheme}://127.0.0.1:3200/status", - "startup": "enabled", - } - }, - } - ) diff --git a/src/tempo.py b/src/tempo.py index fd17ab2..e1b64cc 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -4,18 +4,13 @@ """Tempo workload configuration and client.""" import logging -import socket -from subprocess import CalledProcessError, getoutput -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Callable, Dict, Optional, Sequence, Set, Tuple -from charms.tempo_k8s.v2.tracing import ( - ReceiverProtocol, - receiver_protocol_to_transport_protocol, -) -from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer +import yaml +from charms.tempo_k8s.v2.tracing import ReceiverProtocol +from cosl.coordinated_workers.coordinator import Coordinator import tempo_config -from tempo_cluster import TempoRole logger = logging.getLogger(__name__) @@ -23,21 +18,13 @@ class Tempo: """Class representing the Tempo client workload 
configuration.""" - config_path = "/etc/tempo/tempo.yaml" - - # cert path on charm container - server_cert_path = "/usr/local/share/ca-certificates/ca.crt" - # cert paths on tempo container - tls_cert_path = "/etc/tempo/tls/server.crt" - tls_key_path = "/etc/tempo/tls/server.key" + tls_cert_path = "/etc/worker/server.cert" + tls_key_path = "/etc/worker/private.key" tls_ca_path = "/usr/local/share/ca-certificates/ca.crt" wal_path = "/etc/tempo/tempo_wal" - log_path = "/var/log/tempo.log" - tempo_ready_notice_key = "canonical.com/tempo/workload-ready" - s3_relation_name = "s3" s3_bucket_name = "tempo" memberlist_port = 7946 @@ -47,6 +34,7 @@ class Tempo: "tempo_grpc": 9096, # default grpc listen port is 9095, but that conflicts with promtail. } + # ports source: https://github.com/grafana/tempo/blob/main/example/docker-compose/local/docker-compose.yaml receiver_ports: Dict[str, int] = { "zipkin": 9411, "otlp_grpc": 4317, @@ -64,14 +52,9 @@ class Tempo: def __init__( self, - external_host: Optional[str] = None, - use_tls: bool = False, + requested_receivers: Callable[[], "Tuple[ReceiverProtocol, ...]"], ): - # ports source: https://github.com/grafana/tempo/blob/main/example/docker-compose/local/docker-compose.yaml - - # fqdn, if an ingress is not available, else the ingress address. - self._external_hostname = external_host or socket.getfqdn() - self.use_tls = use_tls + self._receivers_getter = requested_receivers @property def tempo_http_server_port(self) -> int: @@ -83,91 +66,28 @@ def tempo_grpc_server_port(self) -> int: """Return the receiver port for the built-in tempo_http protocol.""" return self.server_ports["tempo_grpc"] - def get_external_ports(self, service_name_prefix: str) -> List[Tuple[str, int, int]]: - """List of service names and port mappings for the kubernetes service patch. - - Includes the tempo server as well as the receiver ports. - """ - # todo allow remapping ports? - all_ports = {**self.server_ports} - return [ - ( - (f"{service_name_prefix}-{service_name}").replace("_", "-"), - all_ports[service_name], - all_ports[service_name], - ) - for service_name in all_ports - ] - - @property - def url(self) -> str: - """Base url at which the tempo server is locally reachable over http.""" - scheme = "https" if self.use_tls else "http" - return f"{scheme}://{self._external_hostname}" - - def get_receiver_url(self, protocol: ReceiverProtocol, ingress: TraefikRouteRequirer): - """Return the receiver endpoint URL based on the protocol. - - if ingress is used, return endpoint provided by the ingress instead. 
- """ - protocol_type = receiver_protocol_to_transport_protocol.get(protocol) - # ingress.is_ready returns True even when traefik hasn't sent any data yet - has_ingress = ingress.is_ready() and ingress.external_host and ingress.scheme - receiver_port = self.receiver_ports[protocol] - - if has_ingress: - url = ( - ingress.external_host - if protocol_type == "grpc" - else f"{ingress.scheme}://{ingress.external_host}" - ) - else: - url = self._external_hostname if protocol_type == "grpc" else self.url - - return f"{url}:{receiver_port}" - - def _build_server_config(self): - server_config = tempo_config.Server( - http_listen_port=self.tempo_http_server_port, - # we need to specify a grpc server port even if we're not using the grpc server, - # otherwise it will default to 9595 and make promtail bork - grpc_listen_port=self.tempo_grpc_server_port, - ) - - if self.use_tls: - server_tls_config = tempo_config.TLS( - cert_file=str(self.tls_cert_path), - key_file=str(self.tls_key_path), - client_ca_file=str(self.tls_ca_path), - ) - server_config.http_tls_config = server_tls_config - server_config.grpc_tls_config = server_tls_config - - return server_config - - def generate_config( + def config( self, - receivers: Sequence[ReceiverProtocol], - s3_config: dict, - roles_addresses: Dict[str, Set[str]], - peers: Optional[Set[str]] = None, - ) -> Dict[str, Any]: + coordinator: Coordinator, + ) -> str: """Generate the Tempo configuration. Only activate the provided receivers. """ config = tempo_config.Tempo( auth_enabled=False, - server=self._build_server_config(), - distributor=self._build_distributor_config(receivers), + server=self._build_server_config(coordinator.tls_available), + distributor=self._build_distributor_config( + self._receivers_getter(), coordinator.tls_available + ), ingester=self._build_ingester_config(), - memberlist=self._build_memberlist_config(peers), + memberlist=self._build_memberlist_config(coordinator.cluster.gather_addresses()), compactor=self._build_compactor_config(), - querier=self._build_querier_config(roles_addresses), - storage=self._build_storage_config(s3_config), + querier=self._build_querier_config(coordinator.cluster.gather_addresses_by_role()), + storage=self._build_storage_config(coordinator._s3_config), ) - if self.use_tls: + if coordinator.tls_available: # cfr: # https://grafana.com/docs/tempo/latest/configuration/network/tls/#client-configuration tls_config = { @@ -176,7 +96,7 @@ def generate_config( "tls_key_path": self.tls_key_path, "tls_ca_path": self.tls_ca_path, # try with fqdn? 
- "tls_server_name": self._external_hostname, + "tls_server_name": coordinator.hostname, } config.ingester_client = tempo_config.Client( grpc_client_config=tempo_config.ClientTLS(**tls_config) @@ -189,7 +109,26 @@ def generate_config( ) config.memberlist = config.memberlist.model_copy(update=tls_config) - return config.model_dump(mode="json", exclude_none=True) + return yaml.dump(config.model_dump(mode="json", exclude_none=True)) + + def _build_server_config(self, use_tls=False): + server_config = tempo_config.Server( + http_listen_port=self.tempo_http_server_port, + # we need to specify a grpc server port even if we're not using the grpc server, + # otherwise it will default to 9595 and make promtail bork + grpc_listen_port=self.tempo_grpc_server_port, + ) + + if use_tls: + server_tls_config = tempo_config.TLS( + cert_file=str(self.tls_cert_path), + key_file=str(self.tls_key_path), + client_ca_file=str(self.tls_ca_path), + ) + server_config.http_tls_config = server_tls_config + server_config.grpc_tls_config = server_tls_config + + return server_config def _build_storage_config(self, s3_config: dict): storage_config = tempo_config.TraceStorage( @@ -213,27 +152,16 @@ def _build_storage_config(self, s3_config: dict): ) return tempo_config.Storage(trace=storage_config) - def is_ready(self): - """Whether the tempo built-in readiness check reports 'ready'.""" - if self.use_tls: - tls, s = f" --cacert {self.server_cert_path}", "s" - else: - tls = s = "" - - # cert is for fqdn/ingress, not for IP - cmd = f"curl{tls} http{s}://{self._external_hostname}:{self.tempo_http_server_port}/ready" - - try: - out = getoutput(cmd).split("\n")[-1] - except (CalledProcessError, IndexError): - return False - return out == "ready" - def _build_querier_config(self, roles_addresses: Dict[str, Set[str]]): """Build querier config""" - addr = "localhost" - if TempoRole.query_frontend in roles_addresses.keys(): - addr = roles_addresses[TempoRole.query_frontend].pop() + # if distributor and query-frontend have the same address, then the mode of operation is 'all'. + if roles_addresses.get(tempo_config.TempoRole.query_frontend) == roles_addresses.get( + tempo_config.TempoRole.distributor + or (not roles_addresses.get(tempo_config.TempoRole.query_frontend)) + ): + addr = "localhost" + else: + addr = roles_addresses.get(tempo_config.TempoRole.query_frontend).pop() return tempo_config.Querier( frontend_worker=tempo_config.FrontendWorker( @@ -274,7 +202,9 @@ def _build_ingester_config(self): max_block_duration="30m", ) - def _build_distributor_config(self, receivers: Sequence[ReceiverProtocol]): # noqa: C901 + def _build_distributor_config( + self, receivers: Sequence[ReceiverProtocol], use_tls=False + ): # noqa: C901 """Build distributor config""" # receivers: the receivers we have to enable because the requirers we're related to # intend to use them. It already includes receivers that are always enabled @@ -284,7 +214,7 @@ def _build_distributor_config(self, receivers: Sequence[ReceiverProtocol]): # n if not receivers_set: logger.warning("No receivers set. Tempo will be up but not functional.") - if self.use_tls: + if use_tls: receiver_config = { "tls": { "ca_file": str(self.tls_ca_path), diff --git a/src/tempo_cluster.py b/src/tempo_cluster.py deleted file mode 100644 index bb474c1..0000000 --- a/src/tempo_cluster.py +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical -# See LICENSE file for licensing details. 
- -"""This module contains an endpoint wrapper class for the requirer side of the ``tempo-cluster`` relation. - -As this relation is cluster-internal and not intended for third-party charms to interact with -`tempo-coordinator-k8s`, its only user will be the tempo-worker-k8s charm. As such, -it does not live in a charm lib as most other relation endpoint wrappers do. -""" -import collections -import json -import logging -from enum import Enum, unique -from typing import Any, Dict, List, MutableMapping, Optional, Set - -import ops -import pydantic - -# The only reason we need the tracing lib is this enum. Not super nice. -from charms.tempo_k8s.v2.tracing import ReceiverProtocol -from ops import EventSource, Object, ObjectEvents -from pydantic import BaseModel, ConfigDict - -log = logging.getLogger("tempo_cluster") - -DEFAULT_ENDPOINT_NAME = "tempo-cluster" -BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} - - -# TODO: inherit enum.StrEnum when jammy is no longer supported. -# https://docs.python.org/3/library/enum.html#enum.StrEnum -@unique -class TempoRole(str, Enum): - """Tempo component role names. - - References: - arch: - -> https://grafana.com/docs/tempo/latest/operations/architecture/ - config: - -> https://grafana.com/docs/tempo/latest/configuration/#server - """ - - # scalable-single-binary is a bit too long to type - all = "all" # default, meta-role. gets remapped to scalable-single-binary by the worker. - - querier = "querier" - query_frontend = "query-frontend" - ingester = "ingester" - distributor = "distributor" - compactor = "compactor" - metrics_generator = "metrics-generator" - - @property - def all_nonmeta(self): - return ( - TempoRole.querier, - TempoRole.query_frontend, - TempoRole.ingester, - TempoRole.distributor, - TempoRole.compactor, - TempoRole.metrics_generator, - ) - - -class ConfigReceivedEvent(ops.EventBase): - """Event emitted when the "tempo-cluster" provider has shared a new tempo config.""" - - config: Dict[str, Any] - """The tempo config.""" - - def __init__(self, handle: ops.framework.Handle, config: Dict[str, Any]): - super().__init__(handle) - self.config = config - - def snapshot(self) -> Dict[str, Any]: - """Used by the framework to serialize the event to disk. - - Not meant to be called by charm code. - """ - return {"config": json.dumps(self.config)} - - def restore(self, snapshot: Dict[str, Any]): - """Used by the framework to deserialize the event from disk. - - Not meant to be called by charm code. - """ - self.relation = json.loads(snapshot["config"]) # noqa - - -class TempoClusterError(Exception): - """Base class for exceptions raised by this module.""" - - -class DataValidationError(TempoClusterError): - """Raised when relation databag validation fails.""" - - -class DatabagAccessPermissionError(TempoClusterError): - """Raised when a follower attempts to write leader settings.""" - - -class _JujuTopologyModel(pydantic.BaseModel): - """_JujuTopologyModel.""" - - model: str - model_uuid: str - application: str - charm_name: str - unit: str - - -# DatabagModel implementation from traefik.v1.ingress charm lib. 
-PYDANTIC_IS_V1 = int(pydantic.version.VERSION.split(".")[0]) < 2 -if PYDANTIC_IS_V1: - - class DatabagModel(BaseModel): # type: ignore - """Base databag model.""" - - class Config: - """Pydantic config.""" - - allow_population_by_field_name = True - """Allow instantiating this class by field name (instead of forcing alias).""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {f.alias for f in cls.__fields__.values()} # type: ignore - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - log.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.parse_raw(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - log.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - for key, value in self.dict(by_alias=True, exclude_defaults=True).items(): # type: ignore - databag[key] = json.dumps(value) - - return databag - -else: - from pydantic import ConfigDict - - class DatabagModel(BaseModel): - """Base databag model.""" - - model_config = ConfigDict( - # tolerate additional keys in databag - extra="ignore", - # Allow instantiating this class by field name (instead of forcing alias). - populate_by_name=True, - ) # type: ignore - """Pydantic config.""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {(f.alias or n) for n, f in cls.model_fields.items()} # type: ignore - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - log.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.model_validate_json(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - log.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. 
- """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - dct = self.model_dump(mode="json", by_alias=True, exclude_defaults=True) # type: ignore - databag.update({k: json.dumps(v) for k, v in dct.items()}) - return databag - - -class TempoClusterRequirerAppData(DatabagModel): - """TempoClusterRequirerAppData.""" - - role: TempoRole - - -class TempoClusterRequirerUnitData(DatabagModel): - """TempoClusterRequirerUnitData.""" - - juju_topology: _JujuTopologyModel - address: str - - -class TempoClusterProviderAppData(DatabagModel): - """TempoClusterProviderAppData.""" - - tempo_config: Dict[str, Any] - loki_endpoints: Optional[Dict[str, str]] = None - ca_cert: Optional[str] = None - server_cert: Optional[str] = None - privkey_secret_id: Optional[str] = None - tempo_receiver: Optional[Dict[ReceiverProtocol, str]] = None - - -class TempoClusterChangedEvent(ops.EventBase): - """Event emitted when any "tempo-cluster" relation event fires.""" - - -class TempoClusterProviderEvents(ObjectEvents): - """Events emitted by the TempoClusterProvider "tempo-cluster" endpoint wrapper.""" - - changed = EventSource(TempoClusterChangedEvent) - - -class TempoClusterProvider(Object): - """``tempo-cluster`` provider endpoint wrapper.""" - - on = TempoClusterProviderEvents() # type: ignore - - def __init__( - self, - charm: ops.CharmBase, - key: Optional[str] = None, - endpoint: str = DEFAULT_ENDPOINT_NAME, - ): - super().__init__(charm, key or endpoint) - self._charm = charm - - # filter out common unhappy relation states - self._relations: List[ops.Relation] = [ - rel for rel in self.model.relations[endpoint] if (rel.app and rel.data) - ] - - # we coalesce all tempo-cluster-relation-* events into a single cluster-changed API. - # the coordinator uses a common exit hook reconciler, that's why. 
- self.framework.observe( - self._charm.on[endpoint].relation_joined, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_changed, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_departed, self._on_tempo_cluster_changed - ) - self.framework.observe( - self._charm.on[endpoint].relation_broken, self._on_tempo_cluster_changed - ) - - def _on_tempo_cluster_changed(self, _): - self.on.changed.emit() - - def publish_privkey(self, label: str) -> str: - """Grant the secret containing the privkey to all relations, and return the secret ID.""" - secret = self.model.get_secret(label=label) - for relation in self._relations: - secret.grant(relation) - # can't return secret.id because secret was obtained by label, and so - # we don't have an ID unless we fetch it - return secret.get_info().id - - def publish_data( - self, - tempo_config: Dict[str, Any], - tempo_receiver: Optional[Dict[ReceiverProtocol, Any]] = None, - ca_cert: Optional[str] = None, - server_cert: Optional[str] = None, - privkey_secret_id: Optional[str] = None, - loki_endpoints: Optional[Dict[str, str]] = None, - ) -> None: - """Publish the tempo config to all related tempo worker clusters.""" - for relation in self._relations: - if relation: - local_app_databag = TempoClusterProviderAppData( - tempo_config=tempo_config, - loki_endpoints=loki_endpoints, - tempo_receiver=tempo_receiver, - ca_cert=ca_cert, - server_cert=server_cert, - privkey_secret_id=privkey_secret_id, - ) - local_app_databag.dump(relation.data[self.model.app]) - - @property - def has_workers(self) -> bool: - """Return whether this tempo coordinator has any connected workers.""" - # we use the presence of relations instead of addresses, because we want this - # check to fail early - return bool(self._relations) - - def gather_addresses_by_role(self) -> Dict[str, Set[str]]: - """Go through the worker's unit databags to collect all the addresses published by the units, by role.""" - data = collections.defaultdict(set) - for relation in self._relations: - - if not relation.app: - log.debug(f"skipped {relation} as .app is None") - continue - - try: - worker_app_data = TempoClusterRequirerAppData.load(relation.data[relation.app]) - except DataValidationError as e: - log.info(f"invalid databag contents: {e}") - continue - - for worker_unit in relation.units: - try: - worker_data = TempoClusterRequirerUnitData.load(relation.data[worker_unit]) - unit_address = worker_data.address - data[worker_app_data.role].add(unit_address) - except DataValidationError as e: - log.info(f"invalid databag contents: {e}") - continue - - return data - - def gather_addresses(self) -> Set[str]: - """Go through the worker's unit databags to collect all the addresses published by the units.""" - data = set() - addresses_by_role = self.gather_addresses_by_role() - for role, address_set in addresses_by_role.items(): - data.update(address_set) - - return data - - def gather_roles(self) -> Dict[TempoRole, int]: - """Go through the worker's app databags and sum the available application roles.""" - data = collections.Counter() - for relation in self._relations: - if relation.app: - remote_app_databag = relation.data[relation.app] - try: - worker_role: TempoRole = TempoClusterRequirerAppData.load( - remote_app_databag - ).role - except DataValidationError as e: - log.debug(f"invalid databag contents: {e}") - continue - - # the number of units with each role is the number of remote units - role_n = 
len(relation.units) # exclude this unit - if worker_role is TempoRole.all: - for role in [r for r in TempoRole if r is not TempoRole.all]: - data[role] += role_n - continue - - data[worker_role] += role_n - - dct = dict(data) - # exclude all roles from the count, if any slipped through - if TempoRole.all in data: - del data[TempoRole.all] - return dct diff --git a/src/tempo_config.py b/src/tempo_config.py index 1c2a186..c487dac 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -2,22 +2,96 @@ # See LICENSE file for licensing details. """Helper module for interacting with the Tempo configuration.""" - import enum import logging import re +from enum import Enum, unique from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterable, List, Mapping, Optional from urllib.parse import urlparse from pydantic import BaseModel, ConfigDict, field_validator, model_validator -S3_RELATION_NAME = "s3" -BUCKET_NAME = "tempo" - logger = logging.getLogger(__name__) +# TODO: inherit enum.StrEnum when jammy is no longer supported. +# https://docs.python.org/3/library/enum.html#enum.StrEnum +@unique +class TempoRole(str, Enum): + """Tempo component role names. + + References: + arch: + -> https://grafana.com/docs/tempo/latest/operations/architecture/ + config: + -> https://grafana.com/docs/tempo/latest/configuration/#server + """ + + # scalable-single-binary is a bit too long to type + all = "all" # default, meta-role. gets remapped to scalable-single-binary by the worker. + + querier = "querier" + query_frontend = "query-frontend" + ingester = "ingester" + distributor = "distributor" + compactor = "compactor" + metrics_generator = "metrics-generator" + + @staticmethod + def all_nonmeta(): + return { + TempoRole.querier, + TempoRole.query_frontend, + TempoRole.ingester, + TempoRole.distributor, + TempoRole.compactor, + TempoRole.metrics_generator, + } + + +META_ROLES = { + "all": set(TempoRole.all_nonmeta()), +} +"""Tempo component meta-role names.""" + +MINIMAL_DEPLOYMENT = { + TempoRole.querier: 1, + TempoRole.query_frontend: 1, + TempoRole.ingester: 1, + TempoRole.distributor: 1, + TempoRole.compactor: 1, + TempoRole.metrics_generator: 1, +} +"""The minimal set of roles that need to be allocated for the +deployment to be considered consistent (otherwise we set blocked).""" + +RECOMMENDED_DEPLOYMENT = { + TempoRole.querier: 1, + TempoRole.query_frontend: 1, + TempoRole.ingester: 3, + TempoRole.distributor: 1, + TempoRole.compactor: 1, + TempoRole.metrics_generator: 1, +} + +""" +The set of roles that need to be allocated for the +deployment to be considered robust according to Grafana Tempo's +Helm chart configurations. 
+https://github.com/grafana/helm-charts/blob/main/charts/tempo-distributed/ +""" + + +class TempoRolesConfig: + """Define the configuration for Tempo roles.""" + + roles: Iterable[str] = {role for role in TempoRole} + meta_roles: Mapping[str, Iterable[str]] = META_ROLES + minimal_deployment: Iterable[str] = MINIMAL_DEPLOYMENT + recommended_deployment = RECOMMENDED_DEPLOYMENT + + class ClientAuthTypeEnum(str, enum.Enum): """Client auth types.""" diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index 9fe9ebb..d5826b9 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -2,9 +2,9 @@ import pytest from scenario import Container, Context, Relation +from tempo_cluster import TempoClusterRequirerAppData, TempoRole from charm import TempoCoordinatorCharm -from tempo_cluster import TempoClusterRequirerAppData, TempoRole @pytest.fixture diff --git a/tests/scenario/helpers.py b/tests/scenario/helpers.py index fc3b79a..dbe8452 100644 --- a/tests/scenario/helpers.py +++ b/tests/scenario/helpers.py @@ -1,5 +1,4 @@ import scenario - from tempo_cluster import TempoClusterProviderAppData diff --git a/tests/scenario/test_config.py b/tests/scenario/test_config.py index 3532fb5..9c8e95a 100644 --- a/tests/scenario/test_config.py +++ b/tests/scenario/test_config.py @@ -1,7 +1,7 @@ from scenario import State +from tempo_cluster import TempoClusterRequirerUnitData from charm import TempoCoordinatorCharm -from tempo_cluster import TempoClusterRequirerUnitData def test_memberlist_multiple_members( diff --git a/tests/scenario/test_nginx.py b/tests/scenario/test_nginx.py index 29811a0..a629774 100644 --- a/tests/scenario/test_nginx.py +++ b/tests/scenario/test_nginx.py @@ -3,11 +3,11 @@ from unittest.mock import MagicMock import pytest - from nginx import Nginx -from tempo import Tempo from tempo_cluster import TempoClusterProvider +from tempo import Tempo + logger = logging.getLogger(__name__) diff --git a/tests/scenario/test_tempo_clustered.py b/tests/scenario/test_tempo_clustered.py index b7eb75c..27d7988 100644 --- a/tests/scenario/test_tempo_clustered.py +++ b/tests/scenario/test_tempo_clustered.py @@ -6,10 +6,10 @@ from charms.tempo_k8s.v2.tracing import TracingRequirerAppData from charms.tls_certificates_interface.v3.tls_certificates import ProviderCertificate from scenario import Relation, State +from tempo_cluster import TempoClusterProviderAppData from charm import TempoCoordinatorCharm from tempo import Tempo -from tempo_cluster import TempoClusterProviderAppData from tests.scenario.helpers import get_tempo_config diff --git a/tests/unit/test_coherence.py b/tests/unit/test_coherence.py index cf85915..c3d9ab0 100644 --- a/tests/unit/test_coherence.py +++ b/tests/unit/test_coherence.py @@ -1,7 +1,6 @@ from unittest.mock import MagicMock import pytest as pytest - from coordinator import ( MINIMAL_DEPLOYMENT, RECOMMENDED_DEPLOYMENT, From 0b0784cc24d5d3015fe33867187b685e37262740 Mon Sep 17 00:00:00 2001 From: michael Date: Fri, 19 Jul 2024 04:23:37 +0300 Subject: [PATCH 02/22] fix tests --- .gitignore | 3 +- src/charm.py | 2 +- src/prometheus_alert_rules/alerts.yaml | 187 --------- .../{tempo_workers => workers}/alerts.yaml | 184 +++++++++ .../{tempo_workers => workers}/rules.yaml | 0 src/tempo.py | 6 +- tests/integration/helpers.py | 122 +++++- tests/integration/test_ingressed_tls.py | 360 +++++++++-------- tests/integration/test_integration.py | 334 ++++++++-------- tests/integration/test_scaling_monolithic.py | 92 +---- tests/integration/test_tls.py | 371 
++++++++++-------- tests/integration/tester/src/charm.py | 2 +- tests/scenario/conftest.py | 13 +- tests/scenario/helpers.py | 4 +- tests/scenario/test_charm_statuses.py | 11 +- tests/scenario/test_config.py | 19 +- tests/scenario/test_enabled_receivers.py | 4 +- tests/scenario/test_ingressed_tracing.py | 4 +- tests/scenario/test_nginx.py | 71 +--- tests/scenario/test_tempo_clustered.py | 71 ++-- tests/scenario/test_tls.py | 7 +- tests/unit/test_coherence.py | 42 +- tests/unit/test_tempo.py | 19 +- 23 files changed, 1007 insertions(+), 921 deletions(-) delete mode 100644 src/prometheus_alert_rules/alerts.yaml rename src/prometheus_alert_rules/{tempo_workers => workers}/alerts.yaml (82%) rename src/prometheus_alert_rules/{tempo_workers => workers}/rules.yaml (100%) diff --git a/.gitignore b/.gitignore index ea4fe3d..9ccb50d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__/ .idea .vscode/ *.egg-info/ -cos-tool-* \ No newline at end of file +cos-tool-* +src/prometheus_alert_rules/consolidated_rules/** \ No newline at end of file diff --git a/src/charm.py b/src/charm.py index 5947939..55b41b4 100755 --- a/src/charm.py +++ b/src/charm.py @@ -144,6 +144,7 @@ def _internal_url(self) -> str: scheme = "http" if hasattr(self, "coordinator") and self.coordinator.nginx.are_certificates_on_disk: scheme = "https" + return f"{scheme}://{self.hostname}" @property @@ -188,7 +189,6 @@ def _on_tracing_request(self, e: RequestEvent): logger.debug(f"received tracing request from {e.relation.app}: {e.requested_receivers}") self._update_tracing_relations() - def _on_ingress_relation_created(self, _: RelationEvent): self._configure_ingress() diff --git a/src/prometheus_alert_rules/alerts.yaml b/src/prometheus_alert_rules/alerts.yaml deleted file mode 100644 index 2c21d50..0000000 --- a/src/prometheus_alert_rules/alerts.yaml +++ /dev/null @@ -1,187 +0,0 @@ -groups: -- name: tempo_alerts - rules: - - alert: "TempoBlockListRisingQuickly" - expr: | - avg by (job, instance)(tempodb_blocklist_length) / avg by (job, instance)(tempodb_blocklist_length offset 7d) > 1.4 - for: "15m" - labels: - severity: "critical" - annotations: - summary: "Tempo block list rising quickly (instance {{ $labels.instance }})" - description: "The {{ $labels.job }} is experiencing a 40% rise in tempo blocklist length over the last 7 days. Consider scaling compactors." - - alert: TempoCompactionsFailing - expr: sum by (job, instance)(increase(tempodb_compaction_errors_total{}[1h])) > 2 and sum by (job, instance)(increase(tempodb_compaction_errors_total{}[5m])) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo compactions failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 compactions failures in the past hour." - - alert: TempoCompactorUnhealthy - expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="compactor"}) > 0 - for: 15m - labels: - severity: critical - annotations: - summary: Tempo unhealthy compactor(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy compactor(s)." 
- - alert: TempoCompactorsTooManyOutstandingBlocks - expr: sum by (tenant) (tempodb_compaction_outstanding_blocks) / ignoring(tenant) group_left count(tempo_build_info) > 100 - for: "6h" - labels: - severity: warning - annotations: - summary: Tempo too many outstanding compaction blocks (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having too many outstanding compaction blocks for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors." - - alert: TempoDiscardedSpans - expr: 100 * sum by (instance,job)(rate(tempo_discarded_spans_total[5m])) / sum by (instance,job)(rate(tempo_distributor_spans_received_total[5m])) > 5 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo spans insertion failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% discard of spans." - - alert: TempoDistributorPushLatency - expr: histogram_quantile(0.99, sum by(le, job, instance) (rate(tempo_distributor_push_duration_seconds_bucket[5m]))) > 3 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo distributor push latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} distributor push is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoDistributorUnhealthy - expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 - for: 15m - labels: - severity: critical - annotations: - summary: Tempo unhealthy distributor(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy distributor(s)." - - alert: TempoFailedIngestingRequests - expr: sum by (job,instance)(increase (tempo_ingester_traces_created_total[5m])) / sum by (instance,job)(rate(tempo_request_duration_seconds_count{route='/tempopb.Pusher/PushBytesV2'}[5m])) == 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo pushing traces to ingester failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing failure in distributors pushing traces to the ingesters." - - alert: TempoFrontendClients - expr: tempo_query_frontend_connected_clients == 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo frontend connected clients (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} has no frontend connected clients." - - alert: TempoFrontendQueueLatency - expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_query_frontend_queue_duration_seconds_bucket[15m]))) > 2 - for: "15m" - labels: - severity: warning - annotations: - summary: Tempo frontend queue latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} frontend queue is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - alert: TempoIngesterFlushLatency - expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_ingester_flush_duration_seconds_bucket[5m]))) > 5 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo ingester flush latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} ingester flush is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." 
- - alert: TempoIngesterFlushesFailing - expr: sum by (instance,job)(increase(tempo_ingester_flush_failed_retries_total[1h])) > 2 and sum by(instance,job)(increase(tempo_ingester_flush_failed_retries_total[5m])) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo ingester flush retries failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 flush retries failures in the past hour." - - alert: TempoIngesterFlushesUnhealthy - expr: sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[1h])) > 2 and sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[5m])) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Tempo ingester flush failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 ingester flush failures in the past hour." - - alert: TempoIngestersUnhealthy - expr: max by (instance,job)(tempo_ring_members{state="Unhealthy", name="ingester"}) > 0 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo unhealthy ingester(s) (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy ingester(s)." - - alert: TempoKVRequestErrors - expr: 100 * sum(rate(tempo_kv_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,instance,job) / sum(rate(tempo_kv_request_duration_seconds_count[5m])) by (route,instance,job) > 10 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo kv store request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} KV store requests is experiencing {{ printf \"%.2f\" $value }}% error rate." - - alert: TempoTargetMissing - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed." - - alert: TempoNoTenantIndexBuilders - expr: sum by (tenant,job,instance) (tempodb_blocklist_tenant_index_builder) == 0 and max by (tenant,job,instance)(tempodb_blocklist_length) > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo tenant index builder failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is having no tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale." - - alert: TempoRequestErrors - expr: 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,job,instance) / sum(rate(tempo_request_duration_seconds_count[5m])) by (route,job,instance) > 10 - for: "15m" - labels: - severity: critical - annotations: - summary: Tempo request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% request error rate." - - alert: TempoRequestLatency - expr: histogram_quantile(0.99, sum by(le, route,job,instance)(rate(tempo_request_duration_seconds_bucket[5m]))) > 5 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." 
- - alert: TempoRetentionsFailing - expr: sum by (job,instance)(increase(tempodb_retention_errors_total[1h])) > 2 and sum by (job,instance)(increase(tempodb_retention_errors_total[5m])) > 0 - for: "5m" - labels: - severity: critical - annotations: - summary: Tempo retentions failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 2 retention failures in the past hour." - - alert: TempoTCPConnectionsLimit - expr: 100 * tempo_tcp_connections / tempo_tcp_connections_limit >= 80 and tempo_tcp_connections_limit > 0 - for: "5m" - labels: - severity: warning - annotations: - summary: Tempo reaching max number of tcp connections (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is reaching {{ printf \"%.2f\" $value }}% of max tcp {{ $labels.protocol }} connections." - - alert: TempoTenantIndexTooOld - expr: max by(tenant,instance,job) (tempodb_blocklist_tenant_index_age_seconds) > 600 - for: 5m - labels: - severity: critical - annotations: - summary: Tempo tenant old index (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing a tenant {{ $labels.tenant }} with a too old index age of 600 seconds." - - alert: TempoUserConfigurableOverridesReloadFailing - expr: sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0 - labels: - severity: critical - annotations: - summary: Tempo user config override reload failing (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing more than 5 user-configurable override reload failures in the past hour." diff --git a/src/prometheus_alert_rules/tempo_workers/alerts.yaml b/src/prometheus_alert_rules/workers/alerts.yaml similarity index 82% rename from src/prometheus_alert_rules/tempo_workers/alerts.yaml rename to src/prometheus_alert_rules/workers/alerts.yaml index 65109a2..cbe3095 100644 --- a/src/prometheus_alert_rules/tempo_workers/alerts.yaml +++ b/src/prometheus_alert_rules/workers/alerts.yaml @@ -235,6 +235,190 @@ groups: labels: component: ingester severity: warning + - alert: "TempoBlockListRisingQuickly" + expr: | + avg by (job, instance)(tempodb_blocklist_length) / avg by (job, instance)(tempodb_blocklist_length offset 7d) > 1.4 + for: "15m" + labels: + severity: "critical" + annotations: + summary: "Tempo block list rising quickly (instance {{ $labels.instance }})" + description: "The {{ $labels.job }} is experiencing a 40% rise in tempo blocklist length over the last 7 days. Consider scaling compactors." + - alert: TempoCompactionsFailing + expr: sum by (job, instance)(increase(tempodb_compaction_errors_total{}[1h])) > 2 and sum by (job, instance)(increase(tempodb_compaction_errors_total{}[5m])) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo compactions failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 compactions failures in the past hour." + - alert: TempoCompactorUnhealthy + expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="compactor"}) > 0 + for: 15m + labels: + severity: critical + annotations: + summary: Tempo unhealthy compactor(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy compactor(s)." 
+ - alert: TempoCompactorsTooManyOutstandingBlocks + expr: sum by (tenant) (tempodb_compaction_outstanding_blocks) / ignoring(tenant) group_left count(tempo_build_info) > 100 + for: "6h" + labels: + severity: warning + annotations: + summary: Tempo too many outstanding compaction blocks (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having too many outstanding compaction blocks for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors." + - alert: TempoDiscardedSpans + expr: 100 * sum by (instance,job)(rate(tempo_discarded_spans_total[5m])) / sum by (instance,job)(rate(tempo_distributor_spans_received_total[5m])) > 5 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo spans insertion failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% discard of spans." + - alert: TempoDistributorPushLatency + expr: histogram_quantile(0.99, sum by(le, job, instance) (rate(tempo_distributor_push_duration_seconds_bucket[5m]))) > 3 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo distributor push latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} distributor push is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoDistributorUnhealthy + expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 + for: 15m + labels: + severity: critical + annotations: + summary: Tempo unhealthy distributor(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy distributor(s)." + - alert: TempoFailedIngestingRequests + expr: sum by (job,instance)(increase (tempo_ingester_traces_created_total[5m])) / sum by (instance,job)(rate(tempo_request_duration_seconds_count{route='/tempopb.Pusher/PushBytesV2'}[5m])) == 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo pushing traces to ingester failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing failure in distributors pushing traces to the ingesters." + - alert: TempoFrontendClients + expr: tempo_query_frontend_connected_clients == 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo frontend connected clients (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} has no frontend connected clients." + - alert: TempoFrontendQueueLatency + expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_query_frontend_queue_duration_seconds_bucket[15m]))) > 2 + for: "15m" + labels: + severity: warning + annotations: + summary: Tempo frontend queue latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} frontend queue is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + - alert: TempoIngesterFlushLatency + expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_ingester_flush_duration_seconds_bucket[5m]))) > 5 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo ingester flush latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} ingester flush is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." 
+ - alert: TempoIngesterFlushesFailing + expr: sum by (instance,job)(increase(tempo_ingester_flush_failed_retries_total[1h])) > 2 and sum by(instance,job)(increase(tempo_ingester_flush_failed_retries_total[5m])) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo ingester flush retries failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 flush retries failures in the past hour." + - alert: TempoIngesterFlushesUnhealthy + expr: sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[1h])) > 2 and sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[5m])) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: Tempo ingester flush failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 ingester flush failures in the past hour." + - alert: TempoIngestersUnhealthy + expr: max by (instance,job)(tempo_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo unhealthy ingester(s) (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy ingester(s)." + - alert: TempoKVRequestErrors + expr: 100 * sum(rate(tempo_kv_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,instance,job) / sum(rate(tempo_kv_request_duration_seconds_count[5m])) by (route,instance,job) > 10 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo kv store request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} KV store requests is experiencing {{ printf \"%.2f\" $value }}% error rate." + - alert: TempoTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed." + - alert: TempoNoTenantIndexBuilders + expr: sum by (tenant,job,instance) (tempodb_blocklist_tenant_index_builder) == 0 and max by (tenant,job,instance)(tempodb_blocklist_length) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo tenant index builder failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is having no tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale." + - alert: TempoRequestErrors + expr: 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,job,instance) / sum(rate(tempo_request_duration_seconds_count[5m])) by (route,job,instance) > 10 + for: "15m" + labels: + severity: critical + annotations: + summary: Tempo request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% request error rate." + - alert: TempoRequestLatency + expr: histogram_quantile(0.99, sum by(le, route,job,instance)(rate(tempo_request_duration_seconds_bucket[5m]))) > 5 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." 
+ - alert: TempoRetentionsFailing + expr: sum by (job,instance)(increase(tempodb_retention_errors_total[1h])) > 2 and sum by (job,instance)(increase(tempodb_retention_errors_total[5m])) > 0 + for: "5m" + labels: + severity: critical + annotations: + summary: Tempo retentions failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 2 retention failures in the past hour." + - alert: TempoTCPConnectionsLimit + expr: 100 * tempo_tcp_connections / tempo_tcp_connections_limit >= 80 and tempo_tcp_connections_limit > 0 + for: "5m" + labels: + severity: warning + annotations: + summary: Tempo reaching max number of tcp connections (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is reaching {{ printf \"%.2f\" $value }}% of max tcp {{ $labels.protocol }} connections." + - alert: TempoTenantIndexTooOld + expr: max by(tenant,instance,job) (tempodb_blocklist_tenant_index_age_seconds) > 600 + for: 5m + labels: + severity: critical + annotations: + summary: Tempo tenant old index (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing a tenant {{ $labels.tenant }} with a too old index age of 600 seconds." + - alert: TempoUserConfigurableOverridesReloadFailing + expr: sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0 + labels: + severity: critical + annotations: + summary: Tempo user config override reload failing (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing more than 5 user-configurable override reload failures in the past hour." - name: tempo_instance_limits_alerts rules: - alert: TempoIngesterReachingSeriesLimit diff --git a/src/prometheus_alert_rules/tempo_workers/rules.yaml b/src/prometheus_alert_rules/workers/rules.yaml similarity index 100% rename from src/prometheus_alert_rules/tempo_workers/rules.yaml rename to src/prometheus_alert_rules/workers/rules.yaml diff --git a/src/tempo.py b/src/tempo.py index e1b64cc..04902e6 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -74,6 +74,7 @@ def config( Only activate the provided receivers. """ + config = tempo_config.Tempo( auth_enabled=False, server=self._build_server_config(coordinator.tls_available), @@ -157,11 +158,10 @@ def _build_querier_config(self, roles_addresses: Dict[str, Set[str]]): # if distributor and query-frontend have the same address, then the mode of operation is 'all'. 
if roles_addresses.get(tempo_config.TempoRole.query_frontend) == roles_addresses.get( tempo_config.TempoRole.distributor - or (not roles_addresses.get(tempo_config.TempoRole.query_frontend)) - ): + ) or not roles_addresses.get(tempo_config.TempoRole.query_frontend): addr = "localhost" else: - addr = roles_addresses.get(tempo_config.TempoRole.query_frontend).pop() + addr = roles_addresses.get(tempo_config.TempoRole.query_frontend, set()).pop() return tempo_config.Querier( frontend_worker=tempo_config.FrontendWorker( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 887c3e0..059ee5a 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1,13 +1,28 @@ +import json import logging +import os +import shlex import subprocess +import tempfile from dataclasses import dataclass -from typing import Dict +from pathlib import Path +from typing import Dict, Literal import yaml +from juju.application import Application +from juju.unit import Unit +from minio import Minio from pytest_operator.plugin import OpsTest _JUJU_DATA_CACHE = {} _JUJU_KEYS = ("egress-subnets", "ingress-address", "private-address") +ACCESS_KEY = "accesskey" +SECRET_KEY = "secretkey" +MINIO = "minio" +BUCKET_NAME = "tempo" +S3_INTEGRATOR = "s3-integrator" +WORKER_NAME = "tempo-worker" +APP_NAME = "tempo" logger = logging.getLogger(__name__) @@ -187,3 +202,108 @@ async def run_command(model_name: str, app_name: str, unit_num: int, command: li logger.error(e.stdout.decode()) raise e return res.stdout + + +def present_facade( + interface: str, + app_data: Dict = None, + unit_data: Dict = None, + role: Literal["provide", "require"] = "provide", + model: str = None, + app: str = "facade", +): + """Set up the facade charm to present this data over the interface ``interface``.""" + data = { + "endpoint": f"{role}-{interface}", + } + if app_data: + data["app_data"] = json.dumps(app_data) + if unit_data: + data["unit_data"] = json.dumps(unit_data) + + with tempfile.NamedTemporaryFile(dir=os.getcwd()) as f: + fpath = Path(f.name) + fpath.write_text(yaml.safe_dump(data)) + + _model = f" --model {model}" if model else "" + + subprocess.run(shlex.split(f"juju run {app}/0{_model} update --params {fpath.absolute()}")) + + +async def get_unit_address(ops_test: OpsTest, app_name, unit_no): + status = await ops_test.model.get_status() + app = status["applications"][app_name] + if app is None: + assert False, f"no app exists with name {app_name}" + unit = app["units"].get(f"{app_name}/{unit_no}") + if unit is None: + assert False, f"no unit exists in app {app_name} with index {unit_no}" + return unit["address"] + + +async def deploy_and_configure_minio(ops_test: OpsTest): + config = { + "access-key": ACCESS_KEY, + "secret-key": SECRET_KEY, + } + await ops_test.model.deploy(MINIO, channel="edge", trust=True, config=config) + await ops_test.model.wait_for_idle(apps=[MINIO], status="active", timeout=2000) + minio_addr = await get_unit_address(ops_test, MINIO, "0") + + mc_client = Minio( + f"{minio_addr}:9000", + access_key="accesskey", + secret_key="secretkey", + secure=False, + ) + + # create tempo bucket + found = mc_client.bucket_exists(BUCKET_NAME) + if not found: + mc_client.make_bucket(BUCKET_NAME) + + # configure s3-integrator + s3_integrator_app: Application = ops_test.model.applications[S3_INTEGRATOR] + s3_integrator_leader: Unit = s3_integrator_app.units[0] + + await s3_integrator_app.set_config( + { + "endpoint": f"minio-0.minio-endpoints.{ops_test.model.name}.svc.cluster.local:9000", + 
"bucket": BUCKET_NAME, + } + ) + + action = await s3_integrator_leader.run_action("sync-s3-credentials", **config) + action_result = await action.wait() + assert action_result.status == "completed" + + +async def deploy_cluster(ops_test: OpsTest): + # await ops_test.model.deploy(FACADE, channel="edge") + # await ops_test.model.wait_for_idle( + # apps=[FACADE], raise_on_blocked=True, status="active", timeout=2000 + # ) + + # TODO: deploy from latest edge + tempo_worker_charm = "/home/michael/Work/tempo-worker-k8s-operator/charm" + + resources = { + "tempo-image": "docker.io/ubuntu/tempo:2-22.04", + } + + await ops_test.model.deploy( + tempo_worker_charm, resources=resources, application_name=WORKER_NAME + ) + await ops_test.model.deploy(S3_INTEGRATOR, channel="edge") + + await ops_test.model.integrate(APP_NAME + ":s3", S3_INTEGRATOR + ":s3-credentials") + await ops_test.model.integrate(APP_NAME + ":tempo-cluster", WORKER_NAME + ":tempo-cluster") + + await deploy_and_configure_minio(ops_test) + + await ops_test.model.wait_for_idle( + apps=[APP_NAME, WORKER_NAME, S3_INTEGRATOR], + status="active", + timeout=1000, + idle_period=30, + ) diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 896f385..10b8275 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -1,138 +1,151 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# import random -# import subprocess -# import tempfile -# from pathlib import Path - -# import pytest -# import requests -# import yaml -# from pytest_operator.plugin import OpsTest -# from tenacity import retry, stop_after_attempt, wait_exponential - -# from tempo import Tempo -# from tests.integration.helpers import get_relation_data - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = "tempo" -# SSC = "self-signed-certificates" -# SSC_APP_NAME = "ssc" -# TRAEFIK = "traefik-k8s" -# TRAEFIK_APP_NAME = "trfk" -# TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" - -# logger = logging.getLogger(__name__) - - -# @pytest.fixture(scope="function") -# def nonce(): -# """Generate an integer nonce for easier trace querying.""" -# return str(random.random())[2:] - - -# @pytest.fixture(scope="function") -# def server_cert(ops_test: OpsTest): -# data = get_relation_data( -# requirer_endpoint=f"{APP_NAME}/0:certificates", -# provider_endpoint=f"{SSC_APP_NAME}/0:certificates", -# model=ops_test.model.name, -# ) -# cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - -# with tempfile.NamedTemporaryFile() as f: -# p = Path(f.name) -# p.write_text(cert) -# yield p - - -# def get_traces(tempo_host: str, nonce, service_name="tracegen"): -# req = requests.get( -# "https://" + tempo_host + ":3200/api/search", -# params={"service.name": service_name, "nonce": nonce}, -# verify=False, -# ) -# assert req.status_code == 200 -# return json.loads(req.text)["traces"] - - -# @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -# async def get_traces_patiently(tempo_host, nonce): -# assert get_traces(tempo_host, nonce=nonce) - - -# async def get_tempo_host(ops_test: OpsTest): -# status = await ops_test.model.get_status() -# app = status["applications"][TRAEFIK_APP_NAME] -# return app.public_address - - -# async def emit_trace( -# endpoint, ops_test: OpsTest, nonce, proto: str = 
"http", verbose=0, use_cert=False -# ): -# """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" -# cmd = ( -# f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " -# f"TRACEGEN_ENDPOINT={endpoint} " -# f"TRACEGEN_VERBOSE={verbose} " -# f"TRACEGEN_PROTOCOL={proto} " -# f"TRACEGEN_CERT={Tempo.server_cert_path if use_cert else ''} " -# f"TRACEGEN_NONCE={nonce} " -# "python3 tracegen.py" -# ) - -# return subprocess.getoutput(cmd) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# tempo_charm = await ops_test.build_charm(".") -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, application_name=APP_NAME), -# ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), -# ops_test.model.deploy(TRAEFIK, application_name=TRAEFIK_APP_NAME, channel="edge"), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_push_tracegen_script_and_deps(ops_test: OpsTest): -# await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") -# await ops_test.juju( -# "ssh", -# f"{APP_NAME}/0", -# "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") -# await ops_test.model.integrate( -# SSC_APP_NAME + ":certificates", TRAEFIK_APP_NAME + ":certificates" -# ) -# await ops_test.model.integrate(APP_NAME + ":ingress", TRAEFIK_APP_NAME + ":traefik-route") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], -# status="active", -# timeout=1000, -# ) - - +import asyncio +import json +import logging +import random +import subprocess +import tempfile +from pathlib import Path + +import pytest +import requests +import yaml +from helpers import deploy_cluster +from juju.application import Application +from pytest_operator.plugin import OpsTest +from tenacity import retry, stop_after_attempt, wait_exponential + +from tempo import Tempo +from tests.integration.helpers import get_relation_data + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" +TRAEFIK = "traefik-k8s" +TRAEFIK_APP_NAME = "trfk" +TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="function") +def nonce(): + """Generate an integer nonce for easier trace querying.""" + return str(random.random())[2:] + + +@pytest.fixture(scope="function") +def server_cert(ops_test: OpsTest): + data = get_relation_data( + requirer_endpoint=f"{APP_NAME}/0:certificates", + provider_endpoint=f"{SSC_APP_NAME}/0:certificates", + model=ops_test.model.name, + ) + cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] + + with tempfile.NamedTemporaryFile() as f: + p = Path(f.name) + p.write_text(cert) + yield p + + +def get_traces(tempo_host: str, nonce, service_name="tracegen"): + req = requests.get( + "https://" + tempo_host + ":3200/api/search", + params={"service.name": service_name, "nonce": nonce}, + verify=False, + ) + assert req.status_code == 200 + return 
json.loads(req.text)["traces"] + + +@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def get_traces_patiently(tempo_host, nonce): + assert get_traces(tempo_host, nonce=nonce) + + +async def get_tempo_host(ops_test: OpsTest): + status = await ops_test.model.get_status() + app = status["applications"][TRAEFIK_APP_NAME] + return app.public_address + + +async def emit_trace( + endpoint, ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False +): + """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" + cmd = ( + f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " + f"TRACEGEN_ENDPOINT={endpoint} " + f"TRACEGEN_VERBOSE={verbose} " + f"TRACEGEN_PROTOCOL={proto} " + f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " + f"TRACEGEN_NONCE={nonce} " + "python3 tracegen.py" + ) + + return subprocess.getoutput(cmd) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + # tempo_charm = await ops_test.build_charm(".") + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), + ops_test.model.deploy(TRAEFIK, application_name=TRAEFIK_APP_NAME, channel="edge"), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], + status="active", + raise_on_blocked=True, + timeout=10000, + raise_on_error=False, + ), + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_push_tracegen_script_and_deps(ops_test: OpsTest): + await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") + await ops_test.juju( + "ssh", + f"{APP_NAME}/0", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") + await ops_test.model.integrate( + SSC_APP_NAME + ":certificates", TRAEFIK_APP_NAME + ":certificates" + ) + await ops_test.model.integrate(APP_NAME + ":ingress", TRAEFIK_APP_NAME + ":traefik-route") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], + status="active", + timeout=1000, + ) + + +# TODO: Uncomment and fix once below issue is fixed +# Currently, traefik, through `traefik_route`, accepts traffic through http although TLS is enabled. 
# @pytest.mark.abort_on_fail # async def test_verify_ingressed_trace_http_upgrades_to_tls(ops_test: OpsTest, nonce): # tempo_host = await get_tempo_host(ops_test) @@ -145,34 +158,49 @@ # assert get_traces_patiently(tempo_host, nonce=nonce) -# @pytest.mark.abort_on_fail -# async def test_verify_ingressed_trace_http_tls(ops_test: OpsTest, nonce, server_cert): -# tempo_host = await get_tempo_host(ops_test) -# await emit_trace( -# f"https://{tempo_host}:4318/v1/traces", nonce=nonce, ops_test=ops_test, use_cert=True -# ) -# # THEN we can verify it's been ingested -# assert get_traces_patiently(tempo_host, nonce=nonce) - - -# @pytest.mark.abort_on_fail -# async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): -# tempo_host = await get_tempo_host(ops_test) -# await emit_trace( -# f"{tempo_host}:4317", nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True -# ) -# # THEN we can verify it's been ingested -# assert get_traces_patiently(tempo_host, nonce=nonce) - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# await ops_test.juju( -# "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# ) +@pytest.mark.abort_on_fail +async def test_verify_ingressed_trace_http_tls(ops_test: OpsTest, nonce, server_cert): + tempo_host = await get_tempo_host(ops_test) + + await emit_trace( + f"https://{tempo_host}:4318/v1/traces", nonce=nonce, ops_test=ops_test, use_cert=True + ) + # THEN we can verify it's been ingested + assert get_traces_patiently(tempo_host, nonce=nonce) + + +@pytest.mark.abort_on_fail +async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): + # enable otlp grpc receiver + tempo_app: Application = ops_test.model.applications[APP_NAME] + await tempo_app.set_config( + { + "always_enable_otlp_grpc": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + tempo_host = await get_tempo_host(ops_test) + + await emit_trace( + f"{tempo_host}:4317", nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True + ) + # THEN we can verify it's been ingested + assert get_traces_patiently(tempo_host, nonce=nonce) + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + await ops_test.juju( + "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 2010ace..eec0039 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,166 +1,168 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# from pathlib import Path - -# import pytest -# import yaml -# from pytest_operator.plugin import OpsTest - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = METADATA["name"] -# TESTER_METADATA = yaml.safe_load(Path("./tests/integration/tester/metadata.yaml").read_text()) -# TESTER_APP_NAME = TESTER_METADATA["name"] -# TESTER_GRPC_METADATA = yaml.safe_load( -# 
Path("./tests/integration/tester-grpc/metadata.yaml").read_text() -# ) -# TESTER_GRPC_APP_NAME = TESTER_GRPC_METADATA["name"] - -# logger = logging.getLogger(__name__) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# # Given a fresh build of the charm -# # When deploying it together with testers -# # Then applications should eventually be created -# tempo_charm = await ops_test.build_charm(".") -# tester_charm = await ops_test.build_charm("./tests/integration/tester/") -# tester_grpc_charm = await ops_test.build_charm("./tests/integration/tester-grpc/") -# resources_tester = {"workload": TESTER_METADATA["resources"]["workload"]["upstream-source"]} -# resources_tester_grpc = { -# "workload": TESTER_GRPC_METADATA["resources"]["workload"]["upstream-source"] -# } - -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, application_name=APP_NAME), -# ops_test.model.deploy( -# tester_charm, -# resources=resources_tester, -# application_name=TESTER_APP_NAME, -# num_units=3, -# ), -# ops_test.model.deploy( -# tester_grpc_charm, -# resources=resources_tester_grpc, -# application_name=TESTER_GRPC_APP_NAME, -# num_units=3, -# ), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# # for tester, depending on the result of race with tempo it's either waiting or active -# ops_test.model.wait_for_idle( -# apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False -# ), -# ops_test.model.wait_for_idle( -# apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False -# ), -# ) - -# assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active" - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# # given a deployed charm -# # when relating it together with the tester -# # then relation should appear -# await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") -# await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, TESTER_APP_NAME, TESTER_GRPC_APP_NAME], -# status="active", -# timeout=1000, -# ) - - -# async def test_verify_traces_http(ops_test: OpsTest): -# # given a relation between charms -# # when traces endpoint is queried -# # then it should contain traces from tester charm -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# logger.info(app.public_address) -# endpoint = app.public_address + ":3200/api/search" -# cmd = [ -# "curl", -# endpoint, -# ] -# rc, stdout, stderr = await ops_test.run(*cmd) -# logger.info("%s: %s", endpoint, (rc, stdout, stderr)) -# assert rc == 0, ( -# f"curl exited with rc={rc} for {endpoint}; " -# f"non-zero return code means curl encountered a >= 400 HTTP code; " -# f"cmd={cmd}" -# ) -# traces = json.loads(stdout)["traces"] - -# found = False -# for trace in traces: -# if trace["rootServiceName"] == APP_NAME and trace["rootTraceName"] == "charm exec": -# found = True - -# assert found, f"There's no trace of charm exec traces in tempo. 
{json.dumps(traces, indent=2)}" - - -# async def test_verify_traces_grpc(ops_test: OpsTest): -# # the tester-grpc charm emits a single grpc trace in its common exit hook -# # we verify it's there -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# logger.info(app.public_address) -# endpoint = app.public_address + ":3200/api/search" -# cmd = [ -# "curl", -# endpoint, -# ] -# rc, stdout, stderr = await ops_test.run(*cmd) -# logger.info("%s: %s", endpoint, (rc, stdout, stderr)) -# assert rc == 0, ( -# f"curl exited with rc={rc} for {endpoint}; " -# f"non-zero return code means curl encountered a >= 400 HTTP code; " -# f"cmd={cmd}" -# ) -# traces = json.loads(stdout)["traces"] - -# found = False -# for trace in traces: -# if trace["rootServiceName"] == "TempoTesterGrpcCharm": -# found = True - -# assert ( -# found -# ), f"There's no trace of generated grpc traces in tempo. {json.dumps(traces, indent=2)}" - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# # given related charms -# # when relation is removed -# # then both charms should become active again -# await ops_test.juju("remove-relation", APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") -# await ops_test.juju( -# "remove-relation", APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# # for tester, depending on the result of race with tempo it's either waiting or active -# ops_test.model.wait_for_idle(apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000), -# ops_test.model.wait_for_idle( -# apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000 -# ), -# ) +import asyncio +import json +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster +from pytest_operator.plugin import OpsTest + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +TESTER_METADATA = yaml.safe_load(Path("./tests/integration/tester/metadata.yaml").read_text()) +TESTER_APP_NAME = TESTER_METADATA["name"] +TESTER_GRPC_METADATA = yaml.safe_load( + Path("./tests/integration/tester-grpc/metadata.yaml").read_text() +) +TESTER_GRPC_APP_NAME = TESTER_GRPC_METADATA["name"] + +logger = logging.getLogger(__name__) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + # Given a fresh build of the charm + # When deploying it together with testers + # Then applications should eventually be created + # tempo_charm = await ops_test.build_charm(".") + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tester_charm = await ops_test.build_charm("./tests/integration/tester/") + tester_grpc_charm = await ops_test.build_charm("./tests/integration/tester-grpc/") + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + resources_tester = {"workload": TESTER_METADATA["resources"]["workload"]["upstream-source"]} + resources_tester_grpc = { + "workload": TESTER_GRPC_METADATA["resources"]["workload"]["upstream-source"] + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy( + tester_charm, + resources=resources_tester, + 
application_name=TESTER_APP_NAME, + num_units=3, + ), + ops_test.model.deploy( + tester_grpc_charm, + resources=resources_tester_grpc, + application_name=TESTER_GRPC_APP_NAME, + num_units=3, + ), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + # for tester, depending on the result of race with tempo it's either waiting or active + ops_test.model.wait_for_idle( + apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False + ), + ops_test.model.wait_for_idle( + apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000, raise_on_error=False + ), + ) + + assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active" + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + # given a deployed charm + # when relating it together with the tester + # then relation should appear + await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") + await ops_test.model.add_relation(APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, TESTER_APP_NAME, TESTER_GRPC_APP_NAME], + status="active", + timeout=1000, + ) + + +async def test_verify_traces_http(ops_test: OpsTest): + # given a relation between charms + # when traces endpoint is queried + # then it should contain traces from tester charm + status = await ops_test.model.get_status() + app = status["applications"][APP_NAME] + endpoint = app.public_address + ":3200/api/search" + cmd = [ + "curl", + endpoint, + ] + rc, stdout, stderr = await ops_test.run(*cmd) + logger.info("%s: %s", endpoint, (rc, stdout, stderr)) + assert rc == 0, ( + f"curl exited with rc={rc} for {endpoint}; " + f"non-zero return code means curl encountered a >= 400 HTTP code; " + f"cmd={cmd}" + ) + traces = json.loads(stdout)["traces"] + + found = False + for trace in traces: + if trace["rootServiceName"] == "TempoTesterCharm": + found = True + + assert found, f"There's no trace of charm exec traces in tempo. {json.dumps(traces, indent=2)}" + + +async def test_verify_traces_grpc(ops_test: OpsTest): + # the tester-grpc charm emits a single grpc trace in its common exit hook + # we verify it's there + status = await ops_test.model.get_status() + app = status["applications"][APP_NAME] + logger.info(app.public_address) + endpoint = app.public_address + ":3200/api/search" + cmd = [ + "curl", + endpoint, + ] + rc, stdout, stderr = await ops_test.run(*cmd) + logger.info("%s: %s", endpoint, (rc, stdout, stderr)) + assert rc == 0, ( + f"curl exited with rc={rc} for {endpoint}; " + f"non-zero return code means curl encountered a >= 400 HTTP code; " + f"cmd={cmd}" + ) + traces = json.loads(stdout)["traces"] + + found = False + for trace in traces: + if trace["rootServiceName"] == "TempoTesterGrpcCharm": + found = True + + assert ( + found + ), f"There's no trace of generated grpc traces in tempo. 
{json.dumps(traces, indent=2)}" + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + # given related charms + # when relation is removed + # then both charms should become active again + await ops_test.juju("remove-relation", APP_NAME + ":tracing", TESTER_APP_NAME + ":tracing") + await ops_test.juju( + "remove-relation", APP_NAME + ":tracing", TESTER_GRPC_APP_NAME + ":tracing" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + # for tester, depending on the result of race with tempo it's either waiting or active + ops_test.model.wait_for_idle(apps=[TESTER_APP_NAME], raise_on_blocked=True, timeout=1000), + ops_test.model.wait_for_idle( + apps=[TESTER_GRPC_APP_NAME], raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/test_scaling_monolithic.py b/tests/integration/test_scaling_monolithic.py index ed76714..f7a247f 100644 --- a/tests/integration/test_scaling_monolithic.py +++ b/tests/integration/test_scaling_monolithic.py @@ -1,22 +1,16 @@ -import json import logging -import os -import shlex -import tempfile from pathlib import Path -from subprocess import run -from typing import Dict, Literal import pytest import yaml +from helpers import deploy_cluster from juju.application import Application from pytest_operator.plugin import OpsTest METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -APP_NAME = "tempo-coordinator" -FACADE = "facade" +APP_NAME = "tempo" +S3_INTEGRATOR = "s3-integrator" TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" -FACADE_MOCKS_PATH = "/var/lib/juju/agents/unit-facade-0/charm/mocks" logger = logging.getLogger(__name__) @@ -24,7 +18,9 @@ @pytest.mark.setup @pytest.mark.abort_on_fail async def test_deploy_tempo(ops_test: OpsTest): - tempo_charm = await ops_test.build_charm(".") + # tempo_charm = await ops_test.build_charm(".") + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], "nginx-prometheus-exporter-image": METADATA["resources"][ @@ -54,85 +50,15 @@ async def test_scale_tempo_up_without_s3_blocks(ops_test: OpsTest): ) -def present_facade( - interface: str, - app_data: Dict = None, - unit_data: Dict = None, - role: Literal["provide", "require"] = "provide", - model: str = None, - app: str = "facade", -): - """Set up the facade charm to present this data over the interface ``interface``.""" - data = { - "endpoint": f"{role}-{interface}", - } - if app_data: - data["app_data"] = json.dumps(app_data) - if unit_data: - data["unit_data"] = json.dumps(unit_data) - - with tempfile.NamedTemporaryFile(dir=os.getcwd()) as f: - fpath = Path(f.name) - fpath.write_text(yaml.safe_dump(data)) - - _model = f" --model {model}" if model else "" - - run(shlex.split(f"juju run {app}/0{_model} update --params {fpath.absolute()}")) - - @pytest.mark.setup @pytest.mark.abort_on_fail -async def test_tempo_active_when_deploy_s3_and_workers_facade(ops_test: OpsTest): - await ops_test.model.deploy(FACADE, channel="edge") - await ops_test.model.wait_for_idle( - apps=[FACADE], raise_on_blocked=True, status="active", timeout=2000 - ) - - await ops_test.model.integrate(APP_NAME + ":s3", FACADE + ":provide-s3") - await ops_test.model.integrate(APP_NAME + ":tempo-cluster", FACADE + ":require-tempo_cluster") - - present_facade( - "s3", - model=ops_test.model_name, - app_data={ - "access-key": "key", - 
"bucket": "tempo", - "endpoint": "http://1.2.3.4:9000", - "secret-key": "soverysecret", - }, - ) - - present_facade( - "tempo_cluster", - model=ops_test.model_name, - app_data={ - "role": '"all"', - }, - unit_data={ - "juju_topology": json.dumps({"model": ops_test.model_name, "unit": FACADE + "/0"}), - "address": FACADE + ".cluster.local.svc", - }, - role="require", - ) - - await ops_test.model.wait_for_idle( - apps=[FACADE], - raise_on_blocked=True, - status="active", - timeout=2000, - ) - - await ops_test.model.wait_for_idle( - apps=[APP_NAME], - raise_on_blocked=True, - status="active", - timeout=10000, - ) +async def test_tempo_active_when_deploy_s3_and_workers(ops_test: OpsTest): + await deploy_cluster(ops_test) @pytest.mark.teardown async def test_tempo_blocks_if_s3_goes_away(ops_test: OpsTest): - app: Application = ops_test.model.applications[FACADE] + app: Application = ops_test.model.applications[S3_INTEGRATOR] await app.destroy(destroy_storage=True) await ops_test.model.wait_for_idle( apps=[APP_NAME], diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 5ea977f..5781f56 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -1,171 +1,200 @@ -# TODO: uncomment and fix when the fully functional tempo cluster is ready (e.g: traces get ingested, can query for traces) -# import asyncio -# import json -# import logging -# import random -# import tempfile -# from pathlib import Path -# from subprocess import getoutput - -# import pytest -# import requests -# import yaml -# from pytest_operator.plugin import OpsTest -# from tenacity import retry, stop_after_attempt, wait_exponential - -# from tempo import Tempo -# from tests.integration.helpers import get_relation_data - -# METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) -# APP_NAME = "tempo" -# SSC = "self-signed-certificates" -# SSC_APP_NAME = "ssc" -# TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" -# logger = logging.getLogger(__name__) - - -# @pytest.fixture(scope="function") -# def nonce(): -# """Generate an integer nonce for easier trace querying.""" -# return str(random.random())[2:] - - -# def get_traces(tempo_host: str, nonce): -# url = "https://" + tempo_host + ":3200/api/search" -# req = requests.get( -# url, -# params={"q": f'{{ .nonce = "{nonce}" }}'}, -# # it would fail to verify as the cert was issued for fqdn, not IP. 
-# verify=False, -# ) -# assert req.status_code == 200 -# return json.loads(req.text)["traces"] - - -# @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -# async def get_traces_patiently(ops_test, nonce): -# assert get_traces(await get_tempo_ip(ops_test), nonce=nonce) - - -# async def get_tempo_ip(ops_test: OpsTest): -# status = await ops_test.model.get_status() -# app = status["applications"][APP_NAME] -# return app.public_address - - -# async def get_tempo_internal_host(ops_test: OpsTest): -# return f"https://{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" - - -# @pytest.fixture(scope="function") -# def server_cert(ops_test: OpsTest): -# data = get_relation_data( -# requirer_endpoint=f"{APP_NAME}/0:certificates", -# provider_endpoint=f"{SSC_APP_NAME}/0:certificates", -# model=ops_test.model.name, -# ) -# cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - -# with tempfile.NamedTemporaryFile() as f: -# p = Path(f.name) -# p.write_text(cert) -# yield p - - -# async def emit_trace(ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False): -# """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" -# hostname = await get_tempo_internal_host(ops_test) -# cmd = ( -# f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " -# f"TRACEGEN_ENDPOINT={hostname}:4318/v1/traces " -# f"TRACEGEN_VERBOSE={verbose} " -# f"TRACEGEN_PROTOCOL={proto} " -# f"TRACEGEN_CERT={Tempo.server_cert_path if use_cert else ''} " -# f"TRACEGEN_NONCE={nonce} " -# "python3 tracegen.py" -# ) - -# return getoutput(cmd) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_build_and_deploy(ops_test: OpsTest): -# tempo_charm = await ops_test.build_charm(".") -# resources = { -# "tempo-image": METADATA["resources"]["tempo-image"]["upstream-source"], -# } -# await asyncio.gather( -# ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), -# ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), -# ) - -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME], -# status="active", -# raise_on_blocked=True, -# timeout=10000, -# raise_on_error=False, -# ), -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_relate(ops_test: OpsTest): -# await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") -# await ops_test.model.wait_for_idle( -# apps=[APP_NAME, SSC_APP_NAME], -# status="active", -# timeout=1000, -# ) - - -# @pytest.mark.setup -# @pytest.mark.abort_on_fail -# async def test_push_tracegen_script_and_deps(ops_test: OpsTest): -# await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") -# await ops_test.juju( -# "ssh", -# f"{APP_NAME}/0", -# "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", -# ) - - -# async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): -# # IF tempo is related to SSC -# # WHEN we emit an http trace, **unsecured** -# await emit_trace(ops_test, nonce=nonce) # this should fail -# # THEN we can verify it's not been ingested -# tempo_ip = await get_tempo_ip(ops_test) -# traces = get_traces(tempo_ip, nonce=nonce) -# assert not traces - - -# async def test_verify_trace_http_tls(ops_test: OpsTest, nonce, server_cert): -# # WHEN we emit a trace secured with TLS -# await emit_trace(ops_test, nonce=nonce, use_cert=True) -# # THEN we can verify 
it's eventually ingested -# await get_traces_patiently(ops_test, nonce) - - -# @pytest.mark.xfail # expected to fail because in this context the grpc receiver is not enabled -# async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): -# # WHEN we emit a trace secured with TLS -# await emit_trace(ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) -# # THEN we can verify it's been ingested -# await get_traces_patiently(ops_test, nonce) - - -# @pytest.mark.teardown -# @pytest.mark.abort_on_fail -# async def test_remove_relation(ops_test: OpsTest): -# await ops_test.juju( -# "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" -# ) -# await asyncio.gather( -# ops_test.model.wait_for_idle( -# apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 -# ), -# ) +import asyncio +import json +import logging +import random +import tempfile +from pathlib import Path +from subprocess import getoutput + +import pytest +import requests +import yaml +from helpers import deploy_cluster +from juju.application import Application +from pytest_operator.plugin import OpsTest +from tenacity import retry, stop_after_attempt, wait_exponential + +from tempo import Tempo +from tests.integration.helpers import get_relation_data + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" +TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="function") +def nonce(): + """Generate an integer nonce for easier trace querying.""" + return str(random.random())[2:] + + +def get_traces(tempo_host: str, nonce): + url = "https://" + tempo_host + ":3200/api/search" + req = requests.get( + url, + params={"q": f'{{ .nonce = "{nonce}" }}'}, + # it would fail to verify as the cert was issued for fqdn, not IP. 
+ verify=False, + ) + assert req.status_code == 200 + return json.loads(req.text)["traces"] + + +@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def get_traces_patiently(ops_test, nonce): + assert get_traces(await get_tempo_ip(ops_test), nonce=nonce) + + +async def get_tempo_ip(ops_test: OpsTest): + status = await ops_test.model.get_status() + app = status["applications"][APP_NAME] + return app.public_address + + +async def get_tempo_internal_host(ops_test: OpsTest): + return f"{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" + + +@pytest.fixture(scope="function") +def server_cert(ops_test: OpsTest): + data = get_relation_data( + requirer_endpoint=f"{APP_NAME}/0:certificates", + provider_endpoint=f"{SSC_APP_NAME}/0:certificates", + model=ops_test.model.name, + ) + cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] + + with tempfile.NamedTemporaryFile() as f: + p = Path(f.name) + p.write_text(cert) + yield p + + +async def emit_trace(ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False): + """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" + hostname = await get_tempo_internal_host(ops_test) + port = "4317" + endpoint_postfix = "" + if proto == "http": + hostname = f"https://{hostname}" + port = "4318" + endpoint_postfix = "/v1/traces" + + cmd = ( + f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " + f"TRACEGEN_ENDPOINT={hostname}:{port}{endpoint_postfix} " + f"TRACEGEN_VERBOSE={verbose} " + f"TRACEGEN_PROTOCOL={proto} " + f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " + f"TRACEGEN_NONCE={nonce} " + "python3 tracegen.py" + ) + + return getoutput(cmd) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + # tempo_charm = await ops_test.build_charm(".") + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ops_test.model.deploy(SSC, application_name=SSC_APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test) + + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME], + status="active", + raise_on_blocked=True, + timeout=10000, + raise_on_error=False, + ), + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":certificates", SSC_APP_NAME + ":certificates") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, SSC_APP_NAME], + status="active", + timeout=1000, + ) + + +@pytest.mark.setup +@pytest.mark.abort_on_fail +async def test_push_tracegen_script_and_deps(ops_test: OpsTest): + await ops_test.juju("scp", TRACEGEN_SCRIPT_PATH, f"{APP_NAME}/0:tracegen.py") + await ops_test.juju( + "ssh", + f"{APP_NAME}/0", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", + ) + + +async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): + # IF tempo is related to SSC + # WHEN we emit an http trace, **unsecured** + await emit_trace(ops_test, nonce=nonce) # this should fail + # THEN we can verify it's not been ingested + tempo_ip = await 
get_tempo_ip(ops_test) + traces = get_traces(tempo_ip, nonce=nonce) + assert not traces + + +@pytest.mark.abort_on_fail +async def test_verify_trace_http_tls(ops_test: OpsTest, nonce, server_cert): + # WHEN we emit a trace secured with TLS + await emit_trace(ops_test, nonce=nonce, use_cert=True) + # THEN we can verify it's eventually ingested + await get_traces_patiently(ops_test, nonce) + + +@pytest.mark.abort_on_fail +async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): + # enable otlp grpc receiver + tempo_app: Application = ops_test.model.applications[APP_NAME] + await tempo_app.set_config( + { + "always_enable_otlp_grpc": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + # WHEN we emit a trace secured with TLS + await emit_trace(ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) + # THEN we can verify it's been ingested + await get_traces_patiently(ops_test, nonce) + + +@pytest.mark.teardown +@pytest.mark.abort_on_fail +async def test_remove_relation(ops_test: OpsTest): + await ops_test.juju( + "remove-relation", APP_NAME + ":certificates", SSC_APP_NAME + ":certificates" + ) + await asyncio.gather( + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ), + ) diff --git a/tests/integration/tester/src/charm.py b/tests/integration/tester/src/charm.py index 72feedd..a5059a0 100755 --- a/tests/integration/tester/src/charm.py +++ b/tests/integration/tester/src/charm.py @@ -40,7 +40,7 @@ def __init__(self, *args): self.tracing = TracingEndpointRequirer( self, relation_name="tracing", protocols=["otlp_http", "otlp_grpc"] ) - self.tempo_otlp_http_endpoint = charm_tracing_config(self.tracing, None) + self.tempo_otlp_http_endpoint, _ = charm_tracing_config(self.tracing, None) # Core lifecycle events self.framework.observe(self.on.config_changed, self._update) diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index d5826b9..f40f72d 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -1,16 +1,21 @@ -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from cosl.coordinated_workers.nginx import Nginx from scenario import Container, Context, Relation -from tempo_cluster import TempoClusterRequirerAppData, TempoRole from charm import TempoCoordinatorCharm +@pytest.fixture() +def coordinator(): + return MagicMock() + + @pytest.fixture def tempo_charm(): with patch("lightkube.core.client.GenericSyncClient"): - with patch("charm.TempoCoordinatorCharm._update_server_cert"): + with patch.object(Nginx, "are_certificates_on_disk", False): yield TempoCoordinatorCharm @@ -42,7 +47,7 @@ def s3(s3_config): def all_worker(): return Relation( "tempo-cluster", - remote_app_data=TempoClusterRequirerAppData(role=TempoRole.all).dump(), + remote_app_data={"role": '"all"'}, ) diff --git a/tests/scenario/helpers.py b/tests/scenario/helpers.py index dbe8452..445ca83 100644 --- a/tests/scenario/helpers.py +++ b/tests/scenario/helpers.py @@ -1,7 +1,7 @@ import scenario -from tempo_cluster import TempoClusterProviderAppData +from cosl.coordinated_workers.interface import ClusterProviderAppData def get_tempo_config(state: scenario.State): cluster_relation = state.get_relations("tempo-cluster")[0] # there's only one - return TempoClusterProviderAppData.load(cluster_relation.local_app_data).tempo_config + return ClusterProviderAppData.load(cluster_relation.local_app_data).worker_config diff 
--git a/tests/scenario/test_charm_statuses.py b/tests/scenario/test_charm_statuses.py index 5c16e6b..b623a74 100644 --- a/tests/scenario/test_charm_statuses.py +++ b/tests/scenario/test_charm_statuses.py @@ -3,8 +3,6 @@ import ops from scenario import PeerRelation, State -from tempo import Tempo - def test_monolithic_status_no_s3_no_workers(context): state_out = context.run("start", State(unit_status=ops.ActiveStatus(), leader=True)) @@ -48,9 +46,14 @@ def test_scaled_status_with_s3_and_workers( assert state_out.unit_status.name == "active" -@patch.object(Tempo, "is_ready", new=True) +@patch("charm.TempoCoordinatorCharm.is_workload_ready", return_value=True) def test_happy_status( - context, s3, all_worker, nginx_container, nginx_prometheus_exporter_container + workload_ready_mock, + context, + s3, + all_worker, + nginx_container, + nginx_prometheus_exporter_container, ): state_out = context.run( "start", diff --git a/tests/scenario/test_config.py b/tests/scenario/test_config.py index 9c8e95a..63052f1 100644 --- a/tests/scenario/test_config.py +++ b/tests/scenario/test_config.py @@ -1,5 +1,6 @@ +import json + from scenario import State -from tempo_cluster import TempoClusterRequirerUnitData from charm import TempoCoordinatorCharm @@ -10,18 +11,18 @@ def test_memberlist_multiple_members( workers_no = 3 all_worker = all_worker.replace( remote_units_data={ - worker_idx: TempoClusterRequirerUnitData( - **{ - "address": f"worker-{worker_idx}.test.svc.cluster.local:7946", - "juju_topology": { + worker_idx: { + "address": json.dumps(f"worker-{worker_idx}.test.svc.cluster.local:7946"), + "juju_topology": json.dumps( + { "model": "test", "unit": f"worker/{worker_idx}", "model_uuid": "1", "application": "worker", "charm_name": "TempoWorker", - }, - } - ).dump() + } + ), + } for worker_idx in range(workers_no) }, ) @@ -32,7 +33,7 @@ def test_memberlist_multiple_members( ) with context.manager(all_worker.changed_event, state) as mgr: charm: TempoCoordinatorCharm = mgr.charm - assert charm.tempo_cluster.gather_addresses() == set( + assert charm.coordinator.cluster.gather_addresses() == set( [ "worker-0.test.svc.cluster.local:7946", "worker-1.test.svc.cluster.local:7946", diff --git a/tests/scenario/test_enabled_receivers.py b/tests/scenario/test_enabled_receivers.py index c77d8b0..eb63cad 100644 --- a/tests/scenario/test_enabled_receivers.py +++ b/tests/scenario/test_enabled_receivers.py @@ -53,7 +53,7 @@ def test_receivers_with_relations( action_out = context.run_action("list-receivers", state) assert action_out.results == { "otlp-http": f"http://{socket.getfqdn()}:4318", - "otlp-grpc": f"http://{socket.getfqdn()}:4317", + "otlp-grpc": f"{socket.getfqdn()}:4317", } @@ -93,5 +93,5 @@ def test_receivers_with_relations_and_config( assert action_out.results == { "otlp-http": f"http://{socket.getfqdn()}:4318", "zipkin": f"http://{socket.getfqdn()}:9411", - "otlp-grpc": f"http://{socket.getfqdn()}:4317", + "otlp-grpc": f"{socket.getfqdn()}:4317", } diff --git a/tests/scenario/test_ingressed_tracing.py b/tests/scenario/test_ingressed_tracing.py index ec9c850..69a1916 100644 --- a/tests/scenario/test_ingressed_tracing.py +++ b/tests/scenario/test_ingressed_tracing.py @@ -5,8 +5,6 @@ from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled from scenario import Relation, State -from tempo import Tempo - @pytest.fixture def base_state(nginx_container, nginx_prometheus_exporter_container): @@ -38,7 +36,7 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w ingress = 
Relation("ingress", remote_app_data={"external_host": "1.2.3.4", "scheme": "http"}) state = base_state.replace(relations=[ingress, s3, all_worker]) - with patch.object(Tempo, "is_ready", lambda _: False): + with patch("charm.TempoCoordinatorCharm.is_workload_ready", lambda _: False): out = context.run(ingress.joined_event, state) charm_name = "tempo-coordinator-k8s" diff --git a/tests/scenario/test_nginx.py b/tests/scenario/test_nginx.py index a629774..3c3786d 100644 --- a/tests/scenario/test_nginx.py +++ b/tests/scenario/test_nginx.py @@ -1,44 +1,23 @@ import logging from typing import List -from unittest.mock import MagicMock import pytest -from nginx import Nginx -from tempo_cluster import TempoClusterProvider +from nginx_config import NginxConfig from tempo import Tempo logger = logging.getLogger(__name__) -@pytest.fixture -def tempo_cluster_provider(): - cluster_mock = MagicMock() - return TempoClusterProvider(cluster_mock) - - -def test_nginx_config_is_list_before_crossplane(context, nginx_container, tempo_cluster_provider): - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) - - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - - prepared_config = nginx._prepare_config() +def test_nginx_config_is_list_before_crossplane(context, nginx_container, coordinator): + nginx = NginxConfig("localhost") + prepared_config = nginx._prepare_config(coordinator) assert isinstance(prepared_config, List) -def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, tempo_cluster_provider): - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) - - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - logger.info(nginx._prepare_config()) - - prepared_config = nginx.config() +def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, coordinator): + nginx = NginxConfig("localhost") + prepared_config = nginx.config(coordinator) assert isinstance(prepared_config, str) @@ -46,10 +25,7 @@ def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, tempo_cl "addresses", ( {}, - {"all": {"1.2.3.4"}}, - {"all": {"1.2.3.4", "1.2.3.5"}}, { - "all": {"1.2.3.4"}, "distributor": {"1.2.3.5"}, "ingester": {"1.2.3.6"}, "querier": {"1.2.4.7"}, @@ -83,29 +59,19 @@ def test_nginx_config_is_parsed_by_crossplane(context, nginx_container, tempo_cl }, ), ) -def test_nginx_config_is_parsed_with_workers( - context, nginx_container, tempo_cluster_provider, addresses -): - tempo_cluster_provider.gather_addresses_by_role = MagicMock(return_value=addresses) +def test_nginx_config_is_parsed_with_workers(context, nginx_container, coordinator, addresses): + coordinator.cluster.gather_addresses_by_role.return_value = addresses - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) + nginx = NginxConfig("localhost") - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") - - prepared_config = nginx.config() + prepared_config = nginx.config(coordinator) assert isinstance(prepared_config, str) @pytest.mark.parametrize( "addresses", ( - {"all": {"1.2.3.4"}}, - {"all": {"1.2.3.4", "1.2.3.5"}}, { - "all": {"1.2.3.4"}, "distributor": {"1.2.3.5"}, "ingester": {"1.2.3.6"}, "querier": {"1.2.4.7"}, @@ -124,23 +90,16 @@ def test_nginx_config_is_parsed_with_workers( ), ) def 
test_nginx_config_contains_upstreams_and_proxy_pass( - context, nginx_container, tempo_cluster_provider, addresses + context, nginx_container, coordinator, addresses ): - tempo_cluster_provider.gather_addresses_by_role = MagicMock(return_value=addresses) - - unit = MagicMock() - unit.get_container = nginx_container - tempo_charm = MagicMock() - tempo_charm.unit = MagicMock(return_value=unit) + coordinator.cluster.gather_addresses_by_role.return_value = addresses - nginx = Nginx(tempo_charm, tempo_cluster_provider, "lolcathost") + nginx = NginxConfig("localhost") - prepared_config = nginx.config() + prepared_config = nginx.config(coordinator) for role, addresses in addresses.items(): for address in addresses: - if role == "all": - _assert_config_per_role(Tempo.all_ports, address, prepared_config) if role == "distributor": _assert_config_per_role(Tempo.receiver_ports, address, prepared_config) if role == "query-frontend": diff --git a/tests/scenario/test_tempo_clustered.py b/tests/scenario/test_tempo_clustered.py index 27d7988..9be1129 100644 --- a/tests/scenario/test_tempo_clustered.py +++ b/tests/scenario/test_tempo_clustered.py @@ -1,37 +1,49 @@ import datetime +import json from unittest.mock import MagicMock, patch import pytest import scenario from charms.tempo_k8s.v2.tracing import TracingRequirerAppData from charms.tls_certificates_interface.v3.tls_certificates import ProviderCertificate +from cosl.coordinated_workers.interface import ClusterProvider, ClusterProviderAppData from scenario import Relation, State -from tempo_cluster import TempoClusterProviderAppData from charm import TempoCoordinatorCharm from tempo import Tempo from tests.scenario.helpers import get_tempo_config +@pytest.fixture(scope="function") +def coordinator_with_initial_config(s3_config): + new_coordinator_mock = MagicMock() + new_coordinator_mock.return_value.tls_available = False + new_coordinator_mock.return_value.hostname = "tempo-test-0.test.cluster.svc.local" + new_coordinator_mock.return_value._s3_config = s3_config + new_coordinator_mock.return_value.cluster.gather_addresses.return_value = {"localhost"} + new_coordinator_mock.return_value.cluster.gather_addresses_by_role.return_value = { + "query-frontend": {"localhost"}, + "distributor": {"localhost"}, + } + + return new_coordinator_mock + + @pytest.fixture -def all_worker_with_initial_config(all_worker: Relation, s3_config): - container = MagicMock() - container.can_connect = lambda: True - # prevent tls_ready from reporting True - container.exists = lambda path: ( - False if path in [Tempo.tls_cert_path, Tempo.tls_key_path, Tempo.tls_ca_path] else True - ) - initial_config = Tempo(container).generate_config( - ["otlp_http"], s3_config, {"all": "localhost"} +def all_worker_with_initial_config(all_worker: Relation, coordinator_with_initial_config): + + initial_config = Tempo(lambda: ("otlp_http",)).config( + coordinator_with_initial_config.return_value ) - new_local_app_data = TempoClusterProviderAppData( - tempo_config=initial_config, - loki_endpoints={}, - ca_cert="foo cert", - server_cert="bar cert", - privkey_secret_id="super secret", - tempo_receiver={"otlp_http": "https://foo.com/fake_receiver"}, - ).dump() + + new_local_app_data = { + "worker_config": json.dumps(initial_config), + "ca_cert": json.dumps("foo cert"), + "server_cert": json.dumps("bar cert"), + "privkey_secret_id": json.dumps("super secret"), + "tracing_receivers": json.dumps({"otlp_http": "https://foo.com/fake_receiver"}), + } + return 
all_worker.replace(local_app_data=new_local_app_data) @@ -79,16 +91,16 @@ def state_with_certs( def test_certs_ready(context, state_with_certs): with context.manager("update-status", state_with_certs) as mgr: charm: TempoCoordinatorCharm = mgr.charm - assert charm.cert_handler.server_cert == MOCK_SERVER_CERT - assert charm.cert_handler.ca_cert == MOCK_CA_CERT - assert charm.cert_handler.private_key + assert charm.coordinator.cert_handler.server_cert == MOCK_SERVER_CERT + assert charm.coordinator.cert_handler.ca_cert == MOCK_CA_CERT + assert charm.coordinator.cert_handler.private_key def test_cluster_relation(context, state_with_certs, all_worker): clustered_state = state_with_certs.replace(relations=state_with_certs.relations + [all_worker]) state_out = context.run(all_worker.joined_event, clustered_state) cluster_out = state_out.get_relations(all_worker.endpoint)[0] - local_app_data = TempoClusterProviderAppData.load(cluster_out.local_app_data) + local_app_data = ClusterProviderAppData.load(cluster_out.local_app_data) assert local_app_data.ca_cert == MOCK_CA_CERT assert local_app_data.server_cert == MOCK_SERVER_CERT @@ -97,22 +109,26 @@ def test_cluster_relation(context, state_with_certs, all_worker): # certhandler's vault uses revision 0 to store an uninitialized-vault marker assert secret.contents[1]["private-key"] - assert local_app_data.tempo_config + assert local_app_data.worker_config +@patch.object(ClusterProvider, "gather_addresses") @pytest.mark.parametrize("requested_protocol", ("otlp_grpc", "zipkin")) def test_tempo_restart_on_ingress_v2_changed( + mock_cluster, + coordinator_with_initial_config, context, - tmp_path, requested_protocol, s3, - s3_config, all_worker_with_initial_config, nginx_container, nginx_prometheus_exporter_container, ): + mock_cluster.return_value = {"localhost"} + # GIVEN # the remote end requests an otlp_grpc endpoint + tracing = Relation( "tracing", remote_app_data=TracingRequirerAppData(receivers=[requested_protocol]).dump(), @@ -125,12 +141,13 @@ def test_tempo_restart_on_ingress_v2_changed( relations=[tracing, s3, all_worker_with_initial_config], containers=[nginx_container, nginx_prometheus_exporter_container], ) + state_out = context.run(tracing.changed_event, state) # THEN # Tempo pushes a new config to the all_worker new_config = get_tempo_config(state_out) - expected_config = Tempo().generate_config( - ["otlp_http", requested_protocol], s3_config, {"all": "localhost"} + expected_config = Tempo(lambda: ["otlp_http", requested_protocol]).config( + coordinator_with_initial_config.return_value ) assert new_config == expected_config diff --git a/tests/scenario/test_tls.py b/tests/scenario/test_tls.py index 8021733..777a0ee 100644 --- a/tests/scenario/test_tls.py +++ b/tests/scenario/test_tls.py @@ -4,10 +4,9 @@ import pytest from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled from charms.tempo_k8s.v2.tracing import TracingProviderAppData, TracingRequirerAppData +from cosl.coordinated_workers.coordinator import Coordinator from scenario import Relation, Secret, State -from charm import TempoCoordinatorCharm - @pytest.fixture def base_state(nginx_container, nginx_prometheus_exporter_container): @@ -38,9 +37,7 @@ def update_relations_tls_and_verify( tracing, ): state = base_state.replace(relations=relations) - with charm_tracing_disabled(), patch.object( - TempoCoordinatorCharm, "tls_available", local_has_tls - ): + with charm_tracing_disabled(), patch.object(Coordinator, "tls_available", local_has_tls): out = 
context.run(tracing.changed_event, state) tracing_provider_app_data = TracingProviderAppData.load( out.get_relations(tracing.endpoint)[0].local_app_data diff --git a/tests/unit/test_coherence.py b/tests/unit/test_coherence.py index c3d9ab0..ad0fca5 100644 --- a/tests/unit/test_coherence.py +++ b/tests/unit/test_coherence.py @@ -1,21 +1,17 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest as pytest -from coordinator import ( +from cosl.coordinated_workers.coordinator import Coordinator + +from tempo_config import ( MINIMAL_DEPLOYMENT, RECOMMENDED_DEPLOYMENT, - TempoCoordinator, TempoRole, + TempoRolesConfig, ) -def _to_endpoint_name(role: TempoRole): - return role.value.replace("_", "-") - - -ALL_TEMPO_RELATION_NAMES = list(map(_to_endpoint_name, TempoRole)) - - +@patch("cosl.coordinated_workers.coordinator.Coordinator.__init__", return_value=None) @pytest.mark.parametrize( "roles, expected", ( @@ -26,13 +22,19 @@ def _to_endpoint_name(role: TempoRole): (RECOMMENDED_DEPLOYMENT, True), ), ) -def test_coherent(roles, expected): - mock = MagicMock() - mock.gather_roles = MagicMock(return_value=roles) - mc = TempoCoordinator(mock) +def test_coherent(mock_coordinator, roles, expected): + + mc = Coordinator(None, None, "", "", 0, None, None, None) + cluster_mock = MagicMock() + cluster_mock.gather_roles = MagicMock(return_value=roles) + mc.cluster = cluster_mock + mc._is_coherent = None + mc.roles_config = TempoRolesConfig() + assert mc.is_coherent is expected +@patch("cosl.coordinated_workers.coordinator.Coordinator.__init__", return_value=None) @pytest.mark.parametrize( "roles, expected", ( @@ -43,8 +45,12 @@ def test_coherent(roles, expected): (RECOMMENDED_DEPLOYMENT, True), ), ) -def test_recommended(roles, expected): - mock = MagicMock() - mock.gather_roles = MagicMock(return_value=roles) - mc = TempoCoordinator(mock) +def test_recommended(mock_coordinator, roles, expected): + mc = Coordinator(None, None, "", "", 0, None, None, None) + cluster_mock = MagicMock() + cluster_mock.gather_roles = MagicMock(return_value=roles) + mc.cluster = cluster_mock + mc._is_recommended = None + mc.roles_config = TempoRolesConfig() + assert mc.is_recommended is expected diff --git a/tests/unit/test_tempo.py b/tests/unit/test_tempo.py index ed0fbd2..27ca813 100644 --- a/tests/unit/test_tempo.py +++ b/tests/unit/test_tempo.py @@ -52,8 +52,8 @@ "thrift_http": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, } @@ -61,8 +61,8 @@ "zipkin": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, "otlp": { @@ -70,8 +70,8 @@ "http": { "tls": { "ca_file": "/usr/local/share/ca-certificates/ca.crt", - "cert_file": "/etc/tempo/tls/server.crt", - "key_file": "/etc/tempo/tls/server.key", + "cert_file": "/etc/worker/server.cert", + "key_file": "/etc/worker/private.key", } }, } @@ -82,10 +82,7 @@ ), ) def test_tempo_distributor_config(protocols, use_tls, expected_config): - assert ( - Tempo(None, use_tls=use_tls)._build_distributor_config(protocols).receivers - == expected_config - ) + assert Tempo(None)._build_distributor_config(protocols, use_tls).receivers == expected_config @pytest.mark.parametrize( @@ 
-108,4 +105,4 @@ def test_tempo_distributor_config(protocols, use_tls, expected_config): ), ) def test_tempo_memberlist_config(peers, expected_config): - assert Tempo()._build_memberlist_config(peers) == expected_config + assert Tempo(None)._build_memberlist_config(peers) == expected_config From 89512e82737a1c84e0e163bbf1e88c72a1eafa0a Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Jul 2024 12:44:25 +0300 Subject: [PATCH 03/22] add self-tracing --- charmcraft.yaml | 5 + scripts/tracegen.py | 6 +- src/charm.py | 6 +- src/tempo.py | 9 +- src/tempo_config.py | 11 +- tests/integration/conftest.py | 30 ++++++ tests/integration/helpers.py | 53 ++++++++- tests/integration/test_ingressed_tls.py | 84 +++------------ tests/integration/test_self_tracing.py | 55 ++++++++++ tests/integration/test_self_tracing_remote.py | 66 ++++++++++++ tests/integration/test_tls.py | 101 ++++-------------- tests/scenario/test_tempo_clustered.py | 9 +- 12 files changed, 264 insertions(+), 171 deletions(-) create mode 100644 tests/integration/test_self_tracing.py create mode 100644 tests/integration/test_self_tracing_remote.py diff --git a/charmcraft.yaml b/charmcraft.yaml index fdcbc28..104cea2 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -67,6 +67,11 @@ provides: requires: + self-tracing: + interface: tracing + description: | + Integration to enable Tempo to send its own traces to itself or to another Tempo instance. + limit: 1 s3: interface: s3 limit: 1 diff --git a/scripts/tracegen.py b/scripts/tracegen.py index 3505be4..c5fcd5f 100644 --- a/scripts/tracegen.py +++ b/scripts/tracegen.py @@ -36,12 +36,12 @@ def emit_trace( return (emit_trace(endpoint, log_trace_to_console, cert, "grpc", nonce=nonce) and emit_trace(endpoint, log_trace_to_console, cert, "http", nonce=nonce)) - return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce) + return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce, protocol = protocol) -def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None): +def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: Literal["grpc", "http"] = "http"): resource = Resource.create(attributes={ - "service.name": "tracegen", + "service.name": f"tracegen{protocol}", "nonce": str(nonce) } ) diff --git a/src/charm.py b/src/charm.py index 55b41b4..85f53a5 100755 --- a/src/charm.py +++ b/src/charm.py @@ -58,6 +58,7 @@ def __init__(self, *args): "logging": "logging", "metrics": "metrics-endpoint", "s3": "s3", + "tracing": "self-tracing", }, nginx_config=NginxConfig(server_name=self.hostname).config, workers_config=self.tempo.config, @@ -287,7 +288,10 @@ def tempo_otlp_http_endpoint(self) -> Optional[str]: """Endpoint at which the charm tracing information will be forwarded.""" # the charm container and the tempo workload container have apparently the same # IP, so we can talk to tempo at localhost. 
-        if self.is_workload_ready():
+        if hasattr(self, "coordinator") and self.coordinator.tracing.is_ready():
+            return self.coordinator.tracing.get_endpoint("otlp_http")
+        # In the absence of another Tempo instance, we don't want to lose this instance's charm traces
+        elif self.is_workload_ready():
             return f"{self._internal_url}:{self.tempo.receiver_ports['otlp_http']}"
 
     def requested_receivers_urls(self) -> Dict[str, str]:
diff --git a/src/tempo.py b/src/tempo.py
index 04902e6..93b08fb 100644
--- a/src/tempo.py
+++ b/src/tempo.py
@@ -110,7 +110,7 @@ def config(
         )
         config.memberlist = config.memberlist.model_copy(update=tls_config)
 
-        return yaml.dump(config.model_dump(mode="json", exclude_none=True))
+        return yaml.dump(config.model_dump(mode="json", by_alias=True, exclude_none=True))
 
     def _build_server_config(self, use_tls=False):
         server_config = tempo_config.Server(
@@ -141,12 +141,7 @@ def _build_storage_config(self, s3_config: dict):
                 queue_depth=20000,
             ),
             backend="s3",
-            s3=tempo_config.S3(
-                bucket=s3_config["bucket"],
-                access_key=s3_config["access-key"],
-                endpoint=s3_config["endpoint"],
-                secret_key=s3_config["secret-key"],
-            ),
+            s3=tempo_config.S3(**s3_config),
             # starting from Tempo 2.4, we need to use at least parquet v3 to have search capabilities (Grafana support)
             # https://grafana.com/docs/tempo/latest/release-notes/v2-4/#vparquet3-is-now-the-default-block-format
             block=tempo_config.Block(version="vParquet3"),
diff --git a/src/tempo_config.py b/src/tempo_config.py
index c487dac..c54cbba 100644
--- a/src/tempo_config.py
+++ b/src/tempo_config.py
@@ -10,7 +10,7 @@
 from typing import Any, Dict, Iterable, List, Mapping, Optional
 from urllib.parse import urlparse
 
-from pydantic import BaseModel, ConfigDict, field_validator, model_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 logger = logging.getLogger(__name__)
 
@@ -236,10 +236,13 @@ class Wal(BaseModel):
 class S3(BaseModel):
     """S3 config schema."""
 
-    bucket: str
-    access_key: str
+    model_config = ConfigDict(populate_by_name=True)
+    """Pydantic config."""
+
+    bucket_name: str = Field(alias="bucket")
+    access_key_id: str = Field(alias="access_key")
     endpoint: str
-    secret_key: str
+    secret_access_key: str = Field(alias="secret_key")
     insecure: bool = False
 
     @model_validator(mode="before")  # pyright: ignore
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index fd3d087..d1df125 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1,13 +1,22 @@
 # Copyright 2021 Canonical Ltd.
 # See LICENSE file for licensing details.
+import json import logging import os +import random import shutil +import tempfile from pathlib import Path from pytest import fixture from pytest_operator.plugin import OpsTest +from tests.integration.helpers import get_relation_data + +APP_NAME = "tempo" +SSC = "self-signed-certificates" +SSC_APP_NAME = "ssc" + logger = logging.getLogger(__name__) @@ -63,3 +72,24 @@ def copy_charm_libs_into_tester_grpc_charm(ops_test): # cleanup: remove all libs for path in copies: Path(path).unlink() + + +@fixture(scope="function") +def server_cert(ops_test: OpsTest): + data = get_relation_data( + requirer_endpoint=f"{APP_NAME}/0:certificates", + provider_endpoint=f"{SSC_APP_NAME}/0:certificates", + model=ops_test.model.name, + ) + cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] + + with tempfile.NamedTemporaryFile() as f: + p = Path(f.name) + p.write_text(cert) + yield p + + +@fixture(scope="function") +def nonce(): + """Generate an integer nonce for easier trace querying.""" + return str(random.random())[2:] diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 059ee5a..4f8542e 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -8,11 +8,15 @@ from pathlib import Path from typing import Dict, Literal +import requests import yaml from juju.application import Application from juju.unit import Unit from minio import Minio from pytest_operator.plugin import OpsTest +from tenacity import retry, stop_after_attempt, wait_exponential + +from tempo import Tempo _JUJU_DATA_CACHE = {} _JUJU_KEYS = ("egress-subnets", "ingress-address", "private-address") @@ -278,7 +282,7 @@ async def deploy_and_configure_minio(ops_test: OpsTest): assert action_result.status == "completed" -async def deploy_cluster(ops_test: OpsTest): +async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): # await ops_test.model.deploy(FACADE, channel="edge") # await ops_test.model.wait_for_idle( # apps=[FACADE], raise_on_blocked=True, status="active", timeout=2000 @@ -296,14 +300,55 @@ async def deploy_cluster(ops_test: OpsTest): ) await ops_test.model.deploy(S3_INTEGRATOR, channel="edge") - await ops_test.model.integrate(APP_NAME + ":s3", S3_INTEGRATOR + ":s3-credentials") - await ops_test.model.integrate(APP_NAME + ":tempo-cluster", WORKER_NAME + ":tempo-cluster") + await ops_test.model.integrate(tempo_app + ":s3", S3_INTEGRATOR + ":s3-credentials") + await ops_test.model.integrate(tempo_app + ":tempo-cluster", WORKER_NAME + ":tempo-cluster") await deploy_and_configure_minio(ops_test) await ops_test.model.wait_for_idle( - apps=[APP_NAME, WORKER_NAME, S3_INTEGRATOR], + apps=[tempo_app, WORKER_NAME, S3_INTEGRATOR], status="active", timeout=1000, idle_period=30, ) + + +def get_traces(tempo_host: str, service_name="tracegenhttp", tls=True): + url = f"{'https' if tls else 'http'}://{tempo_host}:3200/api/search?tags=service.name={service_name}" + req = requests.get( + url, + verify=False, + ) + assert req.status_code == 200 + traces = json.loads(req.text)["traces"] + return traces + + +@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def get_traces_patiently(tempo_host, service_name="tracegenhttp", tls=True): + traces = get_traces(tempo_host, service_name=service_name, tls=tls) + assert len(traces) > 0 + return traces + + +async def emit_trace( + endpoint, ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False +): + """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS 
issues.""" + cmd = ( + f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " + f"TRACEGEN_ENDPOINT={endpoint} " + f"TRACEGEN_VERBOSE={verbose} " + f"TRACEGEN_PROTOCOL={proto} " + f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " + f"TRACEGEN_NONCE={nonce} " + "python3 tracegen.py" + ) + + return subprocess.getoutput(cmd) + + +async def get_application_ip(ops_test: OpsTest, app_name: str): + status = await ops_test.model.get_status() + app = status["applications"][app_name] + return app.public_address diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 10b8275..6d46d0c 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -1,24 +1,16 @@ import asyncio -import json import logging -import random -import subprocess -import tempfile from pathlib import Path import pytest -import requests import yaml -from helpers import deploy_cluster +from helpers import deploy_cluster, emit_trace, get_traces_patiently from juju.application import Application from pytest_operator.plugin import OpsTest -from tenacity import retry, stop_after_attempt, wait_exponential - -from tempo import Tempo -from tests.integration.helpers import get_relation_data METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) APP_NAME = "tempo" +WORKER_NAME = "tempo-worker" SSC = "self-signed-certificates" SSC_APP_NAME = "ssc" TRAEFIK = "traefik-k8s" @@ -28,63 +20,15 @@ logger = logging.getLogger(__name__) -@pytest.fixture(scope="function") -def nonce(): - """Generate an integer nonce for easier trace querying.""" - return str(random.random())[2:] - - -@pytest.fixture(scope="function") -def server_cert(ops_test: OpsTest): - data = get_relation_data( - requirer_endpoint=f"{APP_NAME}/0:certificates", - provider_endpoint=f"{SSC_APP_NAME}/0:certificates", - model=ops_test.model.name, - ) - cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - - with tempfile.NamedTemporaryFile() as f: - p = Path(f.name) - p.write_text(cert) - yield p - - -def get_traces(tempo_host: str, nonce, service_name="tracegen"): - req = requests.get( - "https://" + tempo_host + ":3200/api/search", - params={"service.name": service_name, "nonce": nonce}, - verify=False, - ) - assert req.status_code == 200 - return json.loads(req.text)["traces"] - - -@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -async def get_traces_patiently(tempo_host, nonce): - assert get_traces(tempo_host, nonce=nonce) - - -async def get_tempo_host(ops_test: OpsTest): +async def get_ingress_proxied_endpoint(ops_test: OpsTest): status = await ops_test.model.get_status() app = status["applications"][TRAEFIK_APP_NAME] - return app.public_address - - -async def emit_trace( - endpoint, ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False -): - """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" - cmd = ( - f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " - f"TRACEGEN_ENDPOINT={endpoint} " - f"TRACEGEN_VERBOSE={verbose} " - f"TRACEGEN_PROTOCOL={proto} " - f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " - f"TRACEGEN_NONCE={nonce} " - "python3 tracegen.py" - ) + status_msg = app["status"]["info"] - return subprocess.getoutput(cmd) + # hacky way to get ingress hostname + if "Serving at" not in status_msg: + assert False, f"Ingressed hostname is not present in {TRAEFIK_APP_NAME} status message." 
+ return status_msg.replace("Serving at", "").strip() @pytest.mark.setup @@ -138,9 +82,11 @@ async def test_relate(ops_test: OpsTest): ) await ops_test.model.integrate(APP_NAME + ":ingress", TRAEFIK_APP_NAME + ":traefik-route") await ops_test.model.wait_for_idle( - apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME], + apps=[APP_NAME, SSC_APP_NAME, TRAEFIK_APP_NAME, WORKER_NAME], status="active", timeout=1000, + # make idle period 1 minute, as Tempo workload might not be up yet + idle_period=60, ) @@ -160,13 +106,13 @@ async def test_relate(ops_test: OpsTest): @pytest.mark.abort_on_fail async def test_verify_ingressed_trace_http_tls(ops_test: OpsTest, nonce, server_cert): - tempo_host = await get_tempo_host(ops_test) + tempo_host = await get_ingress_proxied_endpoint(ops_test) await emit_trace( f"https://{tempo_host}:4318/v1/traces", nonce=nonce, ops_test=ops_test, use_cert=True ) # THEN we can verify it's been ingested - assert get_traces_patiently(tempo_host, nonce=nonce) + assert await get_traces_patiently(tempo_host) @pytest.mark.abort_on_fail @@ -184,13 +130,13 @@ async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server timeout=1000, ) - tempo_host = await get_tempo_host(ops_test) + tempo_host = await get_ingress_proxied_endpoint(ops_test) await emit_trace( f"{tempo_host}:4317", nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True ) # THEN we can verify it's been ingested - assert get_traces_patiently(tempo_host, nonce=nonce) + assert await get_traces_patiently(tempo_host, service_name="grpc") @pytest.mark.teardown diff --git a/tests/integration/test_self_tracing.py b/tests/integration/test_self_tracing.py new file mode 100644 index 0000000..01badd9 --- /dev/null +++ b/tests/integration/test_self_tracing.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# Copyright 2024 Ubuntu +# See LICENSE file for licensing details. 
+ +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster, get_application_ip, get_traces_patiently +from pytest_operator.plugin import OpsTest + +logger = logging.getLogger(__name__) + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" + + +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test, APP_NAME) + + await asyncio.gather( + ops_test.model.wait_for_idle(status="active", raise_on_blocked=True, timeout=1000) + ) + + +@pytest.mark.abort_on_fail +async def test_verify_trace_http_self(ops_test: OpsTest): + # adjust update-status interval to generate a charm tracing span faster + await ops_test.model.set_config({"update-status-hook-interval": "5s"}) + + # Verify traces from `tempo` are ingested into self Tempo + assert await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), + service_name="tempo-charm", + tls=False, + ) + + # adjust back to the default interval time + await ops_test.model.set_config({"update-status-hook-interval": "5m"}) diff --git a/tests/integration/test_self_tracing_remote.py b/tests/integration/test_self_tracing_remote.py new file mode 100644 index 0000000..0cfcd60 --- /dev/null +++ b/tests/integration/test_self_tracing_remote.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# Copyright 2024 Ubuntu +# See LICENSE file for licensing details. 
+ +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import deploy_cluster, get_application_ip, get_traces_patiently +from pytest_operator.plugin import OpsTest + +logger = logging.getLogger(__name__) + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "tempo" +APP_REMOTE_NAME = "tempo-source" + + +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest): + tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + resources = { + "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], + "nginx-prometheus-exporter-image": METADATA["resources"][ + "nginx-prometheus-exporter-image" + ]["upstream-source"], + } + + await asyncio.gather( + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_REMOTE_NAME), + ops_test.model.deploy(tempo_charm, resources=resources, application_name=APP_NAME), + ) + + # deploy cluster + await deploy_cluster(ops_test, APP_NAME) + + await asyncio.gather( + ops_test.model.wait_for_idle(status="active", raise_on_blocked=True, timeout=1000) + ) + + +@pytest.mark.abort_on_fail +async def test_relate(ops_test: OpsTest): + await ops_test.model.integrate(APP_NAME + ":tracing", APP_REMOTE_NAME + ":self-tracing") + await ops_test.model.wait_for_idle( + status="active", + timeout=1000, + ) + + +@pytest.mark.abort_on_fail +async def test_verify_trace_http(ops_test: OpsTest): + # adjust update-status interval to generate a charm tracing span faster + await ops_test.model.set_config({"update-status-hook-interval": "5s"}) + + # Verify traces from `tempo-source` are ingested into remote tempo instance + assert await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), + service_name="tempo-source-charm", + tls=False, + ) + + # adjust back to the default interval time + await ops_test.model.set_config({"update-status-hook-interval": "5m"}) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 5781f56..34a8655 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -1,21 +1,18 @@ import asyncio -import json import logging -import random -import tempfile from pathlib import Path -from subprocess import getoutput import pytest -import requests import yaml -from helpers import deploy_cluster +from helpers import ( + deploy_cluster, + emit_trace, + get_application_ip, + get_traces, + get_traces_patiently, +) from juju.application import Application from pytest_operator.plugin import OpsTest -from tenacity import retry, stop_after_attempt, wait_exponential - -from tempo import Tempo -from tests.integration.helpers import get_relation_data METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) APP_NAME = "tempo" @@ -25,75 +22,15 @@ logger = logging.getLogger(__name__) -@pytest.fixture(scope="function") -def nonce(): - """Generate an integer nonce for easier trace querying.""" - return str(random.random())[2:] - - -def get_traces(tempo_host: str, nonce): - url = "https://" + tempo_host + ":3200/api/search" - req = requests.get( - url, - params={"q": f'{{ .nonce = "{nonce}" }}'}, - # it would fail to verify as the cert was issued for fqdn, not IP. 
- verify=False, - ) - assert req.status_code == 200 - return json.loads(req.text)["traces"] - - -@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10)) -async def get_traces_patiently(ops_test, nonce): - assert get_traces(await get_tempo_ip(ops_test), nonce=nonce) - - -async def get_tempo_ip(ops_test: OpsTest): - status = await ops_test.model.get_status() - app = status["applications"][APP_NAME] - return app.public_address - - -async def get_tempo_internal_host(ops_test: OpsTest): - return f"{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" - - -@pytest.fixture(scope="function") -def server_cert(ops_test: OpsTest): - data = get_relation_data( - requirer_endpoint=f"{APP_NAME}/0:certificates", - provider_endpoint=f"{SSC_APP_NAME}/0:certificates", - model=ops_test.model.name, - ) - cert = json.loads(data.provider.application_data["certificates"])[0]["certificate"] - - with tempfile.NamedTemporaryFile() as f: - p = Path(f.name) - p.write_text(cert) - yield p - - -async def emit_trace(ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False): - """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" - hostname = await get_tempo_internal_host(ops_test) +async def get_tempo_traces_internal_endpoint(ops_test: OpsTest, proto="http"): + hostname = f"{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" port = "4317" endpoint_postfix = "" if proto == "http": hostname = f"https://{hostname}" port = "4318" endpoint_postfix = "/v1/traces" - - cmd = ( - f"juju ssh -m {ops_test.model_name} {APP_NAME}/0 " - f"TRACEGEN_ENDPOINT={hostname}:{port}{endpoint_postfix} " - f"TRACEGEN_VERBOSE={verbose} " - f"TRACEGEN_PROTOCOL={proto} " - f"TRACEGEN_CERT={Tempo.tls_ca_path if use_cert else ''} " - f"TRACEGEN_NONCE={nonce} " - "python3 tracegen.py" - ) - - return getoutput(cmd) + return f"{hostname}:{port}{endpoint_postfix}" @pytest.mark.setup @@ -151,19 +88,20 @@ async def test_push_tracegen_script_and_deps(ops_test: OpsTest): async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): # IF tempo is related to SSC # WHEN we emit an http trace, **unsecured** - await emit_trace(ops_test, nonce=nonce) # this should fail + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test) + await emit_trace(tempo_endpoint, ops_test, nonce=nonce) # this should fail # THEN we can verify it's not been ingested - tempo_ip = await get_tempo_ip(ops_test) - traces = get_traces(tempo_ip, nonce=nonce) - assert not traces + traces = get_traces(await get_application_ip(ops_test, APP_NAME)) + assert len(traces) == 0 @pytest.mark.abort_on_fail async def test_verify_trace_http_tls(ops_test: OpsTest, nonce, server_cert): # WHEN we emit a trace secured with TLS - await emit_trace(ops_test, nonce=nonce, use_cert=True) + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test) + await emit_trace(tempo_endpoint, ops_test, nonce=nonce, use_cert=True) # THEN we can verify it's eventually ingested - await get_traces_patiently(ops_test, nonce) + await get_traces_patiently(await get_application_ip(ops_test, APP_NAME)) @pytest.mark.abort_on_fail @@ -181,10 +119,11 @@ async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): timeout=1000, ) + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, proto="grpc") # WHEN we emit a trace secured with TLS - await emit_trace(ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) + await 
emit_trace(tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) # THEN we can verify it's been ingested - await get_traces_patiently(ops_test, nonce) + await get_traces_patiently(await get_application_ip(ops_test, APP_NAME), service_name="grpc") @pytest.mark.teardown diff --git a/tests/scenario/test_tempo_clustered.py b/tests/scenario/test_tempo_clustered.py index 9be1129..1e264f5 100644 --- a/tests/scenario/test_tempo_clustered.py +++ b/tests/scenario/test_tempo_clustered.py @@ -15,11 +15,16 @@ @pytest.fixture(scope="function") -def coordinator_with_initial_config(s3_config): +def coordinator_with_initial_config(): new_coordinator_mock = MagicMock() new_coordinator_mock.return_value.tls_available = False new_coordinator_mock.return_value.hostname = "tempo-test-0.test.cluster.svc.local" - new_coordinator_mock.return_value._s3_config = s3_config + new_coordinator_mock.return_value._s3_config = { + "access_key_id": "key", + "bucket_name": "tempo", + "endpoint": "http://1.2.3.4:9000", + "secret_access_key": "soverysecret", + } new_coordinator_mock.return_value.cluster.gather_addresses.return_value = {"localhost"} new_coordinator_mock.return_value.cluster.gather_addresses_by_role.return_value = { "query-frontend": {"localhost"}, From dad257a3201b0d84390881e0523d1e2d417dddf9 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Jul 2024 16:38:00 +0300 Subject: [PATCH 04/22] address comments --- charmcraft.yaml | 2 +- src/charm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index 104cea2..809c523 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -70,7 +70,7 @@ requires: self-tracing: interface: tracing description: | - Integration to enable Tempo to send its own traces to itself or to another Tempo instance. + Integration to enable Tempo to send its own traces to another Tempo instance. 
limit: 1 s3: interface: s3 diff --git a/src/charm.py b/src/charm.py index 85f53a5..e8a9e9c 100755 --- a/src/charm.py +++ b/src/charm.py @@ -277,10 +277,9 @@ def _update_server_ca_cert(self) -> None: """Server CA certificate for charm tracing tls, if tls is enabled.""" server_ca_cert = Path(self.tempo.tls_ca_path) if self.coordinator.tls_available: - if not server_ca_cert.exists(): + if self.coordinator.cert_handler.ca_cert: server_ca_cert.parent.mkdir(parents=True, exist_ok=True) - if self.coordinator.cert_handler.ca_cert: - server_ca_cert.write_text(self.coordinator.cert_handler.ca_cert) + server_ca_cert.write_text(self.coordinator.cert_handler.ca_cert) else: # tls unavailable: delete local cert server_ca_cert.unlink(missing_ok=True) From 6f65396cc4b63ec8d9ce763940e420c2da3615a4 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Jul 2024 16:40:17 +0300 Subject: [PATCH 05/22] remove metrics-generator from minimal deployment --- src/tempo_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tempo_config.py b/src/tempo_config.py index c54cbba..1a6454e 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -61,7 +61,6 @@ def all_nonmeta(): TempoRole.ingester: 1, TempoRole.distributor: 1, TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, } """The minimal set of roles that need to be allocated for the deployment to be considered consistent (otherwise we set blocked).""" From caeb90fbc8ffcb77eca77a4b2d1c3934b8407428 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Jul 2024 19:21:31 +0300 Subject: [PATCH 06/22] add latest cosl --- requirements.txt | 2 +- src/charm.py | 2 +- src/tempo_config.py | 15 +++++++-------- tests/integration/helpers.py | 2 +- tests/integration/test_ingressed_tls.py | 3 +-- tests/integration/test_integration.py | 3 +-- tests/integration/test_scaling_monolithic.py | 3 +-- tests/integration/test_self_tracing.py | 2 +- tests/integration/test_self_tracing_remote.py | 5 +++-- tests/integration/test_tls.py | 3 +-- tox.ini | 2 ++ 11 files changed, 20 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3834f32..6fa4fec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ cryptography # lib/charms/tempo_k8s/v1/tracing.py pydantic>=2 # lib/charms/prometheus_k8s/v0/prometheus_scrape.py -cosl +cosl>=0.0.14 diff --git a/src/charm.py b/src/charm.py index e8a9e9c..754bfac 100755 --- a/src/charm.py +++ b/src/charm.py @@ -50,7 +50,7 @@ def __init__(self, *args): roles_config=TempoRolesConfig(), s3_bucket_name=Tempo.s3_bucket_name, external_url=self._external_url, - worker_metrics_port="8080", + worker_metrics_port=8080, endpoints={ "certificates": "certificates", "cluster": "tempo-cluster", diff --git a/src/tempo_config.py b/src/tempo_config.py index 1a6454e..20ef054 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -46,7 +46,6 @@ def all_nonmeta(): TempoRole.ingester, TempoRole.distributor, TempoRole.compactor, - TempoRole.metrics_generator, } @@ -66,12 +65,12 @@ def all_nonmeta(): deployment to be considered consistent (otherwise we set blocked).""" RECOMMENDED_DEPLOYMENT = { - TempoRole.querier: 1, - TempoRole.query_frontend: 1, - TempoRole.ingester: 3, - TempoRole.distributor: 1, - TempoRole.compactor: 1, - TempoRole.metrics_generator: 1, + TempoRole.querier.value: 1, + TempoRole.query_frontend.value: 1, + TempoRole.ingester.value: 3, + TempoRole.distributor.value: 1, + TempoRole.compactor.value: 1, + TempoRole.metrics_generator.value: 1, } """ @@ -88,7 +87,7 @@ class TempoRolesConfig: roles: 
Iterable[str] = {role for role in TempoRole} meta_roles: Mapping[str, Iterable[str]] = META_ROLES minimal_deployment: Iterable[str] = MINIMAL_DEPLOYMENT - recommended_deployment = RECOMMENDED_DEPLOYMENT + recommended_deployment: Dict[str, int] = RECOMMENDED_DEPLOYMENT class ClientAuthTypeEnum(str, enum.Enum): diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 4f8542e..3e19cbc 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -324,7 +324,7 @@ def get_traces(tempo_host: str, service_name="tracegenhttp", tls=True): return traces -@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=10)) +@retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10)) async def get_traces_patiently(tempo_host, service_name="tracegenhttp", tls=True): traces = get_traces(tempo_host, service_name=service_name, tls=tls) assert len(traces) > 0 diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 6d46d0c..afdcec2 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -34,8 +34,7 @@ async def get_ingress_proxied_endpoint(ops_test: OpsTest): @pytest.mark.setup @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): - # tempo_charm = await ops_test.build_charm(".") - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], "nginx-prometheus-exporter-image": METADATA["resources"][ diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index eec0039..b87b69b 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -26,8 +26,7 @@ async def test_build_and_deploy(ops_test: OpsTest): # Given a fresh build of the charm # When deploying it together with testers # Then applications should eventually be created - # tempo_charm = await ops_test.build_charm(".") - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") tester_charm = await ops_test.build_charm("./tests/integration/tester/") tester_grpc_charm = await ops_test.build_charm("./tests/integration/tester-grpc/") resources = { diff --git a/tests/integration/test_scaling_monolithic.py b/tests/integration/test_scaling_monolithic.py index f7a247f..f7fd344 100644 --- a/tests/integration/test_scaling_monolithic.py +++ b/tests/integration/test_scaling_monolithic.py @@ -18,8 +18,7 @@ @pytest.mark.setup @pytest.mark.abort_on_fail async def test_deploy_tempo(ops_test: OpsTest): - # tempo_charm = await ops_test.build_charm(".") - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], diff --git a/tests/integration/test_self_tracing.py b/tests/integration/test_self_tracing.py index 01badd9..157cf24 100644 --- a/tests/integration/test_self_tracing.py +++ b/tests/integration/test_self_tracing.py @@ -19,7 +19,7 @@ @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], 
"nginx-prometheus-exporter-image": METADATA["resources"][ diff --git a/tests/integration/test_self_tracing_remote.py b/tests/integration/test_self_tracing_remote.py index 0cfcd60..7b337dd 100644 --- a/tests/integration/test_self_tracing_remote.py +++ b/tests/integration/test_self_tracing_remote.py @@ -20,7 +20,7 @@ @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], "nginx-prometheus-exporter-image": METADATA["resources"][ @@ -37,7 +37,7 @@ async def test_build_and_deploy(ops_test: OpsTest): await deploy_cluster(ops_test, APP_NAME) await asyncio.gather( - ops_test.model.wait_for_idle(status="active", raise_on_blocked=True, timeout=1000) + ops_test.model.wait_for_idle( apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000) ) @@ -45,6 +45,7 @@ async def test_build_and_deploy(ops_test: OpsTest): async def test_relate(ops_test: OpsTest): await ops_test.model.integrate(APP_NAME + ":tracing", APP_REMOTE_NAME + ":self-tracing") await ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", timeout=1000, ) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 34a8655..a1e5c31 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -36,8 +36,7 @@ async def get_tempo_traces_internal_endpoint(ops_test: OpsTest, proto="http"): @pytest.mark.setup @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): - # tempo_charm = await ops_test.build_charm(".") - tempo_charm = "/home/michael/Work/tempo-coordinator-k8s-operator/charm" + tempo_charm = await ops_test.build_charm(".") resources = { "nginx-image": METADATA["resources"]["nginx-image"]["upstream-source"], "nginx-prometheus-exporter-image": METADATA["resources"][ diff --git a/tox.ini b/tox.ini index 5a36eed..957eb0e 100644 --- a/tox.ini +++ b/tox.ini @@ -91,6 +91,8 @@ deps = -r{toxinidir}/requirements.txt # tracegen opentelemetry-exporter-otlp-proto-grpc + minio + tenacity==8.2.3 commands = pytest -v --tb native --log-cli-level=INFO {[vars]tst_path}integration -s {posargs} From e68a9c832c44df02cd253b62884b6f3d5b801658 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Jul 2024 19:22:45 +0300 Subject: [PATCH 07/22] fix lint --- tests/integration/test_self_tracing_remote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_self_tracing_remote.py b/tests/integration/test_self_tracing_remote.py index 7b337dd..9a889b5 100644 --- a/tests/integration/test_self_tracing_remote.py +++ b/tests/integration/test_self_tracing_remote.py @@ -37,7 +37,9 @@ async def test_build_and_deploy(ops_test: OpsTest): await deploy_cluster(ops_test, APP_NAME) await asyncio.gather( - ops_test.model.wait_for_idle( apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000) + ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + ) ) From a171fce722aa8e0010380df9f67a994f916b3f30 Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 23 Jul 2024 00:04:27 +0300 Subject: [PATCH 08/22] cosl lib increment --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6fa4fec..bfcdc8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ cryptography # 
lib/charms/tempo_k8s/v1/tracing.py pydantic>=2 # lib/charms/prometheus_k8s/v0/prometheus_scrape.py -cosl>=0.0.14 +cosl>=0.0.15 From bc9e8c92991be05e47a8dcb77accc1c6595e07dd Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 23 Jul 2024 11:44:39 +0300 Subject: [PATCH 09/22] unpin version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bfcdc8d..3834f32 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ cryptography # lib/charms/tempo_k8s/v1/tracing.py pydantic>=2 # lib/charms/prometheus_k8s/v0/prometheus_scrape.py -cosl>=0.0.15 +cosl From f7c55373d1cf3c2c72b3722d0e23536af43da92a Mon Sep 17 00:00:00 2001 From: michael Date: Thu, 25 Jul 2024 14:54:14 +0300 Subject: [PATCH 10/22] comments address --- src/charm.py | 7 +++---- src/tempo.py | 9 +++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/charm.py b/src/charm.py index 754bfac..2227962 100755 --- a/src/charm.py +++ b/src/charm.py @@ -89,6 +89,7 @@ def __init__(self, *args): or not self.coordinator.is_coherent or not self.coordinator.s3_ready ): + # logging will be handled by `self.coordinator` for each of the above circumstances. return # lifecycle @@ -172,8 +173,6 @@ def _on_tracing_broken(self, _): def _on_cert_handler_changed(self, e: ops.RelationChangedEvent): - self.coordinator._on_cert_handler_changed(e) - # tls readiness change means config change. # sync scheme change with traefik and related consumers self._configure_ingress() @@ -216,7 +215,7 @@ def _on_list_receivers_action(self, event: ops.ActionEvent): # keep this event handler at the bottom def _on_collect_unit_status(self, e: ops.CollectStatusEvent): - self.coordinator._on_collect_unit_status(e) + pass # add Tempo charm custom blocking conditions # TODO: avoid waiting for update-status event # if not self.is_workload_ready(): @@ -287,7 +286,7 @@ def tempo_otlp_http_endpoint(self) -> Optional[str]: """Endpoint at which the charm tracing information will be forwarded.""" # the charm container and the tempo workload container have apparently the same # IP, so we can talk to tempo at localhost. - if hasattr(self, "coordinator") and self.coordinator.tracing.is_ready(): + if self.coordinator and self.coordinator.tracing.is_ready(): return self.coordinator.tracing.get_endpoint("otlp_http") # In absence of another Tempo instance, we don't want to lose this instance's charm traces elif self.is_workload_ready(): diff --git a/src/tempo.py b/src/tempo.py index 93b08fb..4b87682 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -151,12 +151,13 @@ def _build_storage_config(self, s3_config: dict): def _build_querier_config(self, roles_addresses: Dict[str, Set[str]]): """Build querier config""" # if distributor and query-frontend have the same address, then the mode of operation is 'all'. 
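For context, a rough sketch of what the two branches below amount to, using the Querier/FrontendWorker models referenced in this hunk. Hedged assumptions: the FrontendWorker field name follows upstream Tempo's frontend_worker.frontend_address key (it is not shown in this hunk), and the sample addresses are hypothetical.

import tempo_config  # the charm's src/tempo_config.py models

# sketch only: colocated query-frontend/distributor ("all" mode) vs. split roles
colocated = tempo_config.Querier(
    frontend_worker=tempo_config.FrontendWorker(frontend_address="localhost")
)
split = tempo_config.Querier(
    # hypothetical address of the query-frontend worker unit
    frontend_worker=tempo_config.FrontendWorker(frontend_address="10.1.2.3")
)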
- if roles_addresses.get(tempo_config.TempoRole.query_frontend) == roles_addresses.get( - tempo_config.TempoRole.distributor - ) or not roles_addresses.get(tempo_config.TempoRole.query_frontend): + query_frontend_addresses = roles_addresses.get(tempo_config.TempoRole.query_frontend) + distributor_addresses = roles_addresses.get(tempo_config.TempoRole.distributor) + + if not query_frontend_addresses or query_frontend_addresses == distributor_addresses: addr = "localhost" else: - addr = roles_addresses.get(tempo_config.TempoRole.query_frontend, set()).pop() + addr = query_frontend_addresses.pop() return tempo_config.Querier( frontend_worker=tempo_config.FrontendWorker( From 4d0452bc7844fb6c42a4531035b878893abfae1b Mon Sep 17 00:00:00 2001 From: michael Date: Thu, 25 Jul 2024 15:22:37 +0300 Subject: [PATCH 11/22] add comment --- src/tempo_config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tempo_config.py b/src/tempo_config.py index 20ef054..1adbed9 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -237,6 +237,8 @@ class S3(BaseModel): model_config = ConfigDict(populate_by_name=True) """Pydantic config.""" + # Use aliases to override keys in `coordinator::_s3_config` + # to align with upstream Tempo's configuration keys: `bucket`, `access_key`, `secret_key`. bucket_name: str = Field(alias="bucket") access_key_id: str = Field(alias="access_key") endpoint: str From 519bdd17b62900852b81f182c1df8708a46a9a7e Mon Sep 17 00:00:00 2001 From: michael Date: Fri, 26 Jul 2024 13:19:45 +0300 Subject: [PATCH 12/22] fix names --- scripts/tracegen.py | 2 +- tests/integration/helpers.py | 4 ++-- tests/integration/test_ingressed_tls.py | 2 +- tests/integration/test_tls.py | 4 +++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/tracegen.py b/scripts/tracegen.py index c5fcd5f..76ffb4d 100644 --- a/scripts/tracegen.py +++ b/scripts/tracegen.py @@ -41,7 +41,7 @@ def emit_trace( def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: Literal["grpc", "http"] = "http"): resource = Resource.create(attributes={ - "service.name": f"tracegen{protocol}", + "service.name": f"tracegen-{protocol}", "nonce": str(nonce) } ) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3e19cbc..5b8ffac 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -313,7 +313,7 @@ async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): ) -def get_traces(tempo_host: str, service_name="tracegenhttp", tls=True): +def get_traces(tempo_host: str, service_name="tracegen-http", tls=True): url = f"{'https' if tls else 'http'}://{tempo_host}:3200/api/search?tags=service.name={service_name}" req = requests.get( url, @@ -325,7 +325,7 @@ def get_traces(tempo_host: str, service_name="tracegenhttp", tls=True): @retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10)) -async def get_traces_patiently(tempo_host, service_name="tracegenhttp", tls=True): +async def get_traces_patiently(tempo_host, service_name="tracegen-http", tls=True): traces = get_traces(tempo_host, service_name=service_name, tls=tls) assert len(traces) > 0 return traces diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index afdcec2..3741f04 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -135,7 +135,7 @@ async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server f"{tempo_host}:4317", 
nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True ) # THEN we can verify it's been ingested - assert await get_traces_patiently(tempo_host, service_name="grpc") + assert await get_traces_patiently(tempo_host, service_name="tracegen-grpc") @pytest.mark.teardown diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index a1e5c31..8fb3e27 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -122,7 +122,9 @@ async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): # WHEN we emit a trace secured with TLS await emit_trace(tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) # THEN we can verify it's been ingested - await get_traces_patiently(await get_application_ip(ops_test, APP_NAME), service_name="grpc") + await get_traces_patiently( + await get_application_ip(ops_test, APP_NAME), service_name="tracegen-grpc" + ) @pytest.mark.teardown From a405fd96b6ad6c4a88aa7b4cb3e6d1515b2a2435 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 11:39:04 +0300 Subject: [PATCH 13/22] address comments #2 --- src/charm.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/charm.py b/src/charm.py index 2227962..ff1e7c1 100755 --- a/src/charm.py +++ b/src/charm.py @@ -84,11 +84,7 @@ def __init__(self, *args): self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) # refuse to handle any other event as we can't possibly know what to do. - if ( - not self.coordinator.cluster.has_workers - or not self.coordinator.is_coherent - or not self.coordinator.s3_ready - ): + if not self.coordinator.can_handle_events: # logging will be handled by `self.coordinator` for each of the above circumstances. return @@ -144,11 +140,23 @@ def _external_url(self) -> str: def _internal_url(self) -> str: """Returns workload's FQDN.""" scheme = "http" - if hasattr(self, "coordinator") and self.coordinator.nginx.are_certificates_on_disk: + if self.are_certificates_on_disk: scheme = "https" return f"{scheme}://{self.hostname}" + @property + def are_certificates_on_disk(self) -> bool: + """Return True if the certificates files are on disk.""" + nginx_container = self.unit.get_container("nginx") + + return ( + nginx_container.can_connect() + and nginx_container.exists(CERT_PATH) + and nginx_container.exists(KEY_PATH) + and nginx_container.exists(CA_CERT_PATH) + ) + @property def enabled_receivers(self) -> Set[str]: """Extra receivers enabled through config""" From 844e1d0bfdfa0134a18872077954bdfa112fd700 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 11:42:48 +0300 Subject: [PATCH 14/22] fix lint --- src/charm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/charm.py b/src/charm.py index ff1e7c1..00d9344 100755 --- a/src/charm.py +++ b/src/charm.py @@ -20,6 +20,7 @@ ) from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer from cosl.coordinated_workers.coordinator import Coordinator +from cosl.coordinated_workers.nginx import CA_CERT_PATH, CERT_PATH, KEY_PATH from ops.charm import CharmBase, RelationEvent from ops.main import main From 9cc4939dc9e821b7b1e5bd445b233cf2e8c9a877 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 14:15:01 +0300 Subject: [PATCH 15/22] nginx comments --- src/charm.py | 13 +------------ src/nginx_config.py | 38 +++++++++++++++++++++++++++++++------- src/tempo.py | 2 +- src/tempo_config.py | 2 +- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/charm.py 
b/src/charm.py index 00d9344..3ba2bec 100755 --- a/src/charm.py +++ b/src/charm.py @@ -34,7 +34,7 @@ @trace_charm( tracing_endpoint="tempo_otlp_http_endpoint", server_cert="server_ca_cert", - extra_types=(Tempo, TracingEndpointProvider), + extra_types=(Tempo, TracingEndpointProvider, Coordinator, TempoRolesConfig), ) class TempoCoordinatorCharm(CharmBase): """Charmed Operator for Tempo; a distributed tracing backend.""" @@ -81,9 +81,6 @@ def __init__(self, *args): self.tracing = TracingEndpointProvider(self, external_url=self._external_url) - # We always listen to collect-status - self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) - # refuse to handle any other event as we can't possibly know what to do. if not self.coordinator.can_handle_events: # logging will be handled by `self.coordinator` for each of the above circumstances. @@ -222,14 +219,6 @@ def _on_list_receivers_action(self, event: ops.ActionEvent): res[receiver.replace("_", "-")] = self.get_receiver_url(receiver) event.set_results(res) - # keep this event handler at the bottom - def _on_collect_unit_status(self, e: ops.CollectStatusEvent): - pass - # add Tempo charm custom blocking conditions - # TODO: avoid waiting for update-status event - # if not self.is_workload_ready(): - # e.add_status(ops.WaitingStatus("[workload.tempo] Tempo API not ready just yet...")) - ################### # UTILITY METHODS # ################### diff --git a/src/nginx_config.py b/src/nginx_config.py index 6f14d50..3172fdd 100644 --- a/src/nginx_config.py +++ b/src/nginx_config.py @@ -3,9 +3,14 @@ """Nginx workload.""" import logging -from typing import Any, Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, cast import crossplane +from charms.tempo_k8s.v2.tracing import ( + ReceiverProtocol, + TransportProtocolType, + receiver_protocol_to_transport_protocol, +) from cosl.coordinated_workers.coordinator import Coordinator from cosl.coordinated_workers.nginx import CERT_PATH, KEY_PATH @@ -77,7 +82,9 @@ def _prepare_config(self, coordinator: Coordinator) -> List[dict]: }, {"directive": "proxy_read_timeout", "args": ["300"]}, # server block - *self._servers(addresses_by_role, coordinator.nginx.are_certificates_on_disk), + *self._build_servers_config( + addresses_by_role, coordinator.nginx.are_certificates_on_disk + ), ], }, ] @@ -189,25 +196,30 @@ def _listen_args(self, port: int, ipv6: bool, ssl: bool, http2: bool) -> List[st args.append("http2") return args - def _servers( + def _build_servers_config( self, addresses_by_role: Dict[str, Set[str]], tls: bool = False ) -> List[Dict[str, Any]]: servers = [] roles = addresses_by_role.keys() - + # generate a server config for receiver protocols (9411, 4317, 4318, 14268, 14250) if TempoRole.distributor.value in roles: for protocol, port in Tempo.receiver_ports.items(): servers.append( - self._server(port, protocol.replace("_", "-"), "grpc" in protocol, tls) + self._build_server_config( + port, protocol.replace("_", "-"), self._is_protocol_grpc(protocol), tls + ) ) + # generate a server config for the Tempo server protocols (3200, 9096) if TempoRole.query_frontend.value in roles: for protocol, port in Tempo.server_ports.items(): servers.append( - self._server(port, protocol.replace("_", "-"), "grpc" in protocol, tls) + self._build_server_config( + port, protocol.replace("_", "-"), self._is_protocol_grpc(protocol), tls + ) ) return servers - def _server( + def _build_server_config( self, port: int, upstream: str, grpc: bool = False, tls: bool = False ) 
-> Dict[str, Any]: auth_enabled = False @@ -247,3 +259,15 @@ def _server( *self._locations(upstream, grpc, tls), ], } + + def _is_protocol_grpc(self, protocol: str) -> bool: + """ + Return True if the given protocol is gRPC + """ + if ( + protocol == "tempo_grpc" + or receiver_protocol_to_transport_protocol.get(cast(ReceiverProtocol, protocol)) + == TransportProtocolType.grpc + ): + return True + return False diff --git a/src/tempo.py b/src/tempo.py index 4b87682..36c5e67 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -75,7 +75,7 @@ def config( Only activate the provided receivers. """ - config = tempo_config.Tempo( + config = tempo_config.TempoConfig( auth_enabled=False, server=self._build_server_config(coordinator.tls_available), distributor=self._build_distributor_config( diff --git a/src/tempo_config.py b/src/tempo_config.py index 1adbed9..4ac225b 100644 --- a/src/tempo_config.py +++ b/src/tempo_config.py @@ -285,7 +285,7 @@ class Storage(BaseModel): trace: TraceStorage -class Tempo(BaseModel): +class TempoConfig(BaseModel): """Tempo config schema.""" auth_enabled: bool From 13f94e5b592a9189b1137213b873c909e219d137 Mon Sep 17 00:00:00 2001 From: michael Date: Fri, 26 Jul 2024 12:29:52 +0300 Subject: [PATCH 16/22] fix list --- charmcraft.yaml | 12 ++++++++++++ lib/charms/tempo_k8s/v2/tracing.py | 19 +++++++++---------- src/charm.py | 13 +++++++++---- src/tempo.py | 12 +++++------- tests/scenario/test_ingressed_tracing.py | 16 ++++++++++++++++ tests/unit/test_charm.py | 2 ++ 6 files changed, 53 insertions(+), 21 deletions(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index 809c523..bee82b0 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -141,3 +141,15 @@ config: description: Force-enable the receiver for the 'otlp_http' protocol in Tempo, even if there is no integration currently requesting it. type: boolean default: false + always_enable_opencensus: + description: Force-enable the receiver for the 'opencensus' protocol in Tempo, even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_jaeger_thrift_http: + description: Force-enable the receiver for the 'jaeger_thrift_http' protocol in Tempo, even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_jaeger_grpc: + description: Force-enable the receiver for the 'jaeger_grpc' protocol in Tempo, even if there is no integration currently requesting it. 
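These always_enable_* options are what the charm's enabled_receivers property ("extra receivers enabled through config") picks up. A minimal sketch of that derivation, assuming nothing beyond the option naming scheme defined here; the helper name is hypothetical and not part of this patch.

# hypothetical helper: collect protocols force-enabled via `always_enable_*` options
def force_enabled_receivers(config: dict) -> set:
    return {
        key.removeprefix("always_enable_")
        for key, value in config.items()
        if key.startswith("always_enable_") and value
    }

# e.g. force_enabled_receivers({"always_enable_jaeger_grpc": True}) -> {"jaeger_grpc"}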
+ type: boolean + default: false diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index 8b9fb4f..e047325 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -107,7 +107,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 8 PYDEPS = ["pydantic"] @@ -116,14 +116,14 @@ def __init__(self, *args): DEFAULT_RELATION_NAME = "tracing" RELATION_INTERFACE_NAME = "tracing" +# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 ReceiverProtocol = Literal[ "zipkin", - "kafka", - "opencensus", - "tempo_http", - "tempo_grpc", "otlp_grpc", "otlp_http", + "opencensus", + "jaeger_grpc", + "jaeger_thrift_http", ] RawReceiver = Tuple[ReceiverProtocol, str] @@ -141,14 +141,13 @@ class TransportProtocolType(str, enum.Enum): grpc = "grpc" -receiver_protocol_to_transport_protocol = { +receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { "zipkin": TransportProtocolType.http, - "kafka": TransportProtocolType.http, - "opencensus": TransportProtocolType.http, - "tempo_http": TransportProtocolType.http, - "tempo_grpc": TransportProtocolType.grpc, "otlp_grpc": TransportProtocolType.grpc, "otlp_http": TransportProtocolType.http, + "opencensus": TransportProtocolType.grpc, + "jaeger_thrift_http": TransportProtocolType.http, + "jaeger_grpc": TransportProtocolType.grpc, } """A mapping between telemetry protocols and their corresponding transport protocol. """ diff --git a/src/charm.py b/src/charm.py index 3ba2bec..67c9cd0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,7 +7,7 @@ import socket from pathlib import Path from subprocess import CalledProcessError, getoutput -from typing import Dict, Optional, Set, Tuple, get_args +from typing import Dict, Optional, Set, Tuple, cast, get_args import ops from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider @@ -16,6 +16,7 @@ ReceiverProtocol, RequestEvent, TracingEndpointProvider, + TransportProtocolType, receiver_protocol_to_transport_protocol, ) from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer @@ -318,7 +319,11 @@ def _ingress_config(self) -> dict: # TODO better matcher "rule": "ClientIP(`0.0.0.0/0`)", } - if sanitized_protocol.endswith("grpc") and not self.coordinator.tls_available: + if ( + protocol == "tempo_grpc" + or receiver_protocol_to_transport_protocol.get(cast(ReceiverProtocol, protocol)) + == TransportProtocolType.grpc + ) and not self.coordinator.tls_available: # to send traces to unsecured GRPC endpoints, we need h2c # see https://doc.traefik.io/traefik/v2.0/user-guides/grpc/#with-http-h2c http_services[ @@ -352,13 +357,13 @@ def get_receiver_url(self, protocol: ReceiverProtocol): if has_ingress: url = ( self.ingress.external_host - if protocol_type == "grpc" + if protocol_type == TransportProtocolType.grpc else f"{self.ingress.scheme}://{self.ingress.external_host}" ) else: url = ( self.coordinator.hostname - if protocol_type == "grpc" + if protocol_type == TransportProtocolType.grpc else self.coordinator._internal_url ) diff --git a/src/tempo.py b/src/tempo.py index 36c5e67..7bf2b3b 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -34,18 +34,16 @@ class Tempo: "tempo_grpc": 9096, # default grpc listen port is 9095, but that conflicts with promtail. 
} - # ports source: https://github.com/grafana/tempo/blob/main/example/docker-compose/local/docker-compose.yaml + # ports defined are the default ports specified in + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver + # for each of the below receivers. receiver_ports: Dict[str, int] = { "zipkin": 9411, "otlp_grpc": 4317, "otlp_http": 4318, "jaeger_thrift_http": 14268, - # todo if necessary add support for: - # "kafka": 42, - # "jaeger_grpc": 14250, - # "opencensus": 43, - # "jaeger_thrift_compact": 44, - # "jaeger_thrift_binary": 45, + "jaeger_grpc": 14250, + "opencensus": 55678, } all_ports = {**server_ports, **receiver_ports} diff --git a/tests/scenario/test_ingressed_tracing.py b/tests/scenario/test_ingressed_tracing.py index 69a1916..43e0e54 100644 --- a/tests/scenario/test_ingressed_tracing.py +++ b/tests/scenario/test_ingressed_tracing.py @@ -74,6 +74,16 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w "rule": "ClientIP(`0.0.0.0/0`)", "service": f"juju-{state.model.name}-{charm_name}-service-tempo-grpc", }, + f"juju-{state.model.name}-{charm_name}-opencensus": { + "entryPoints": ["opencensus"], + "rule": "ClientIP(`0.0.0.0/0`)", + "service": f"juju-{state.model.name}-{charm_name}-service-opencensus", + }, + f"juju-{state.model.name}-{charm_name}-jaeger-grpc": { + "entryPoints": ["jaeger-grpc"], + "rule": "ClientIP(`0.0.0.0/0`)", + "service": f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc", + }, }, "services": { f"juju-{state.model.name}-{charm_name}-service-jaeger-thrift-http": { @@ -94,6 +104,12 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w f"juju-{state.model.name}-{charm_name}-service-tempo-grpc": { "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:9096"}]} }, + f"juju-{state.model.name}-{charm_name}-service-opencensus": { + "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:55678"}]} + }, + f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc": { + "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:14250"}]} + }, }, }, } diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9a74dac..1c2bcc4 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -30,6 +30,8 @@ def test_entrypoints_are_generated_with_sanitized_names(self): "otlp-grpc": {"address": ":4317"}, "otlp-http": {"address": ":4318"}, "jaeger-thrift-http": {"address": ":14268"}, + "jaeger-grpc": {"address": ":14250"}, + "opencensus": {"address": ":55678"}, } } self.assertEqual(self.harness.charm._static_ingress_config, expected_entrypoints) From 8da72c3b0e4069a91cba7620361203bfa2805e04 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 11:26:49 +0300 Subject: [PATCH 17/22] add tests for protocols --- lib/charms/tempo_k8s/v2/tracing.py | 2 - scripts/tracegen.py | 55 +++++++++++++++--- tests/integration/helpers.py | 11 ++-- tests/integration/test_ingressed_tls.py | 4 +- tests/integration/test_tls.py | 72 +++++++++++++----------- tests/scenario/test_ingressed_tracing.py | 2 +- tox.ini | 6 +- 7 files changed, 99 insertions(+), 53 deletions(-) diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index e047325..dfb2336 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -121,7 +121,6 @@ def __init__(self, *args): "zipkin", "otlp_grpc", "otlp_http", - "opencensus", "jaeger_grpc", "jaeger_thrift_http", ] @@ -145,7 +144,6 @@ class TransportProtocolType(str, enum.Enum): "zipkin": 
TransportProtocolType.http, "otlp_grpc": TransportProtocolType.grpc, "otlp_http": TransportProtocolType.http, - "opencensus": TransportProtocolType.grpc, "jaeger_thrift_http": TransportProtocolType.http, "jaeger_grpc": TransportProtocolType.grpc, } diff --git a/scripts/tracegen.py b/scripts/tracegen.py index 76ffb4d..08d38aa 100644 --- a/scripts/tracegen.py +++ b/scripts/tracegen.py @@ -1,11 +1,15 @@ import os import time from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, get_args +import requests from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter +from opentelemetry.exporter.zipkin.json import ZipkinExporter +from opentelemetry.exporter.jaeger.thrift import JaegerExporter as JaegerThriftHttpExporter +from opentelemetry.exporter.jaeger.proto.grpc import JaegerExporter as JaegerGRPCExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import ( @@ -13,33 +17,68 @@ ConsoleSpanExporter, ) +protocols = Literal[ + "zipkin", + "otlp_grpc", + "otlp_http", + "jaeger_grpc", + "jaeger_thrift_http", +] + def emit_trace( endpoint: str, log_trace_to_console: bool = False, cert: Path = None, - protocol: Literal["grpc", "http", "ALL"] = "grpc", + protocol: protocols = "otlp_http", nonce: Any = None ): os.environ['OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE'] = str(Path(cert).absolute()) if cert else "" + os.environ['OTEL_EXPORTER_JAEGER_CERTIFICATE'] = str(Path(cert).absolute()) if cert else "" + # jaeger thrift http exporter does not expose a parameter to set path for CA verification + os.environ['SSL_CERT_FILE'] = str(Path(cert).absolute()) if cert else "" + os.environ["REQUESTS_CA_BUNDLE"] = str(Path(cert).absolute()) if cert else "" - if protocol == "grpc": + # ip:4317 + if protocol == "otlp_grpc": span_exporter = GRPCExporter( endpoint=endpoint, insecure=not cert, ) - elif protocol == "http": + # scheme://ip:4318/v1/traces + elif protocol == "otlp_http": span_exporter = HTTPExporter( endpoint=endpoint, ) + # scheme://ip:9411/v1/traces + elif protocol == "zipkin": + # zipkin does not expose an arg to pass certificate + session = requests.Session() + if cert: + session.verify = cert + span_exporter = ZipkinExporter( + endpoint=endpoint, + session=session, + ) + # scheme://ip:14268/api/traces?format=jaeger.thrift + elif protocol == "jaeger_thrift_http": + span_exporter = JaegerThriftHttpExporter( + collector_endpoint=endpoint, + ) + # ip:14250 + elif protocol == "jaeger_grpc": + span_exporter = JaegerGRPCExporter( + collector_endpoint = endpoint, + insecure=not cert, + ) else: # ALL - return (emit_trace(endpoint, log_trace_to_console, cert, "grpc", nonce=nonce) and - emit_trace(endpoint, log_trace_to_console, cert, "http", nonce=nonce)) - + for proto in get_args(protocols): + emit_trace(endpoint, log_trace_to_console, cert, proto, nonce=nonce) + return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce, protocol = protocol) -def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: Literal["grpc", "http"] = "http"): +def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: protocols = "otlp_http"): resource = Resource.create(attributes={ "service.name": f"tracegen-{protocol}", "nonce": 
str(nonce) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 5b8ffac..a8d1ff0 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -289,7 +289,9 @@ async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): # ) # TODO: deploy from latest edge - tempo_worker_charm = "/home/michael/Work/tempo-worker-k8s-operator/charm" + tempo_worker_charm = ( + "/home/michael/Work/tempo-worker-k8s-operator/tempo-worker-k8s_ubuntu-22.04-amd64.charm" + ) resources = { "tempo-image": "docker.io/ubuntu/tempo:2-22.04", @@ -313,7 +315,7 @@ async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): ) -def get_traces(tempo_host: str, service_name="tracegen-http", tls=True): +def get_traces(tempo_host: str, service_name="tracegen-otlp_http", tls=True): url = f"{'https' if tls else 'http'}://{tempo_host}:3200/api/search?tags=service.name={service_name}" req = requests.get( url, @@ -325,14 +327,14 @@ def get_traces(tempo_host: str, service_name="tracegen-http", tls=True): @retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10)) -async def get_traces_patiently(tempo_host, service_name="tracegen-http", tls=True): +async def get_traces_patiently(tempo_host, service_name="tracegen-otlp_http", tls=True): traces = get_traces(tempo_host, service_name=service_name, tls=tls) assert len(traces) > 0 return traces async def emit_trace( - endpoint, ops_test: OpsTest, nonce, proto: str = "http", verbose=0, use_cert=False + endpoint, ops_test: OpsTest, nonce, proto: str = "otlp_http", verbose=0, use_cert=False ): """Use juju ssh to run tracegen from the tempo charm; to avoid any DNS issues.""" cmd = ( @@ -344,7 +346,6 @@ async def emit_trace( f"TRACEGEN_NONCE={nonce} " "python3 tracegen.py" ) - return subprocess.getoutput(cmd) diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 3741f04..49c9a0b 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -132,10 +132,10 @@ async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server tempo_host = await get_ingress_proxied_endpoint(ops_test) await emit_trace( - f"{tempo_host}:4317", nonce=nonce, proto="grpc", ops_test=ops_test, use_cert=True + f"{tempo_host}:4317", nonce=nonce, proto="otlp_grpc", ops_test=ops_test, use_cert=True ) # THEN we can verify it's been ingested - assert await get_traces_patiently(tempo_host, service_name="tracegen-grpc") + assert await get_traces_patiently(tempo_host, service_name="tracegen-otlp_grpc") @pytest.mark.teardown diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 8fb3e27..3719c6b 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -19,18 +19,23 @@ SSC = "self-signed-certificates" SSC_APP_NAME = "ssc" TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" +protocols_endpoints = { + "jaeger_thrift_http": "https://{}:14268/api/traces?format=jaeger.thrift", + "zipkin": "https://{}:9411/v1/traces", + "jaeger_grpc": "{}:14250", + "otlp_http": "https://{}:4318/v1/traces", + "otlp_grpc": "{}:4317", +} + logger = logging.getLogger(__name__) -async def get_tempo_traces_internal_endpoint(ops_test: OpsTest, proto="http"): +async def get_tempo_traces_internal_endpoint(ops_test: OpsTest, protocol): hostname = f"{APP_NAME}-0.{APP_NAME}-endpoints.{ops_test.model.name}.svc.cluster.local" - port = "4317" - endpoint_postfix = "" - if proto == "http": - hostname = f"https://{hostname}" - port = "4318" 
- endpoint_postfix = "/v1/traces" - return f"{hostname}:{port}{endpoint_postfix}" + protocol_endpoint = protocols_endpoints.get(protocol) + if protocol_endpoint is None: + assert False, f"Invalid {protocol}" + return protocol_endpoint.format(hostname) @pytest.mark.setup @@ -80,14 +85,15 @@ async def test_push_tracegen_script_and_deps(ops_test: OpsTest): await ops_test.juju( "ssh", f"{APP_NAME}/0", - "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http" + + " opentelemetry-exporter-zipkin opentelemetry-exporter-jaeger", ) async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, nonce): # IF tempo is related to SSC # WHEN we emit an http trace, **unsecured** - tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test) + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, protocol="otlp_http") await emit_trace(tempo_endpoint, ops_test, nonce=nonce) # this should fail # THEN we can verify it's not been ingested traces = get_traces(await get_application_ip(ops_test, APP_NAME)) @@ -95,35 +101,35 @@ async def test_verify_trace_http_no_tls_fails(ops_test: OpsTest, server_cert, no @pytest.mark.abort_on_fail -async def test_verify_trace_http_tls(ops_test: OpsTest, nonce, server_cert): - # WHEN we emit a trace secured with TLS - tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test) - await emit_trace(tempo_endpoint, ops_test, nonce=nonce, use_cert=True) - # THEN we can verify it's eventually ingested - await get_traces_patiently(await get_application_ip(ops_test, APP_NAME)) - +@pytest.mark.parametrize("protocol", list(protocols_endpoints.keys())) +async def test_verify_traces_force_enabled_protocols_tls(ops_test: OpsTest, nonce, protocol): -@pytest.mark.abort_on_fail -async def test_verify_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): - # enable otlp grpc receiver tempo_app: Application = ops_test.model.applications[APP_NAME] - await tempo_app.set_config( - { - "always_enable_otlp_grpc": "True", - } - ) - await ops_test.model.wait_for_idle( - apps=[APP_NAME], - status="active", - timeout=1000, - ) - tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, proto="grpc") + # enable each protocol receiver + # for protocol in protocols_endpoints: + + # otlp_http should be enabled by default + if protocol != "otlp_http": + await tempo_app.set_config( + { + f"always_enable_{protocol}": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + tempo_endpoint = await get_tempo_traces_internal_endpoint(ops_test, protocol=protocol) # WHEN we emit a trace secured with TLS - await emit_trace(tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto="grpc", use_cert=True) + await emit_trace( + tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto=protocol, use_cert=True + ) # THEN we can verify it's been ingested await get_traces_patiently( - await get_application_ip(ops_test, APP_NAME), service_name="tracegen-grpc" + await get_application_ip(ops_test, APP_NAME), service_name=f"tracegen-{protocol}" ) diff --git a/tests/scenario/test_ingressed_tracing.py b/tests/scenario/test_ingressed_tracing.py index 43e0e54..b84bfce 100644 --- a/tests/scenario/test_ingressed_tracing.py +++ b/tests/scenario/test_ingressed_tracing.py @@ -105,7 +105,7 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w 
"loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:9096"}]} }, f"juju-{state.model.name}-{charm_name}-service-opencensus": { - "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:55678"}]} + "loadBalancer": {"servers": [{"url": "http://1.2.3.4:55678"}]} }, f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc": { "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:14250"}]} diff --git a/tox.ini b/tox.ini index 957eb0e..309e723 100644 --- a/tox.ini +++ b/tox.ini @@ -89,10 +89,12 @@ deps = pytest-operator requests -r{toxinidir}/requirements.txt - # tracegen - opentelemetry-exporter-otlp-proto-grpc minio tenacity==8.2.3 + # tracegen + opentelemetry-exporter-otlp-proto-grpc + opentelemetry-exporter-zipkin + opentelemetry-exporter-jaeger commands = pytest -v --tb native --log-cli-level=INFO {[vars]tst_path}integration -s {posargs} From f4f4d5bfd19bf90c42f6f79130ec1b0953018fc6 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 11:28:33 +0300 Subject: [PATCH 18/22] remove opencensus --- charmcraft.yaml | 4 ---- src/tempo.py | 3 --- tests/scenario/test_ingressed_tracing.py | 8 -------- tests/unit/test_charm.py | 1 - 4 files changed, 16 deletions(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index bee82b0..fd98faa 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -141,10 +141,6 @@ config: description: Force-enable the receiver for the 'otlp_http' protocol in Tempo, even if there is no integration currently requesting it. type: boolean default: false - always_enable_opencensus: - description: Force-enable the receiver for the 'opencensus' protocol in Tempo, even if there is no integration currently requesting it. - type: boolean - default: false always_enable_jaeger_thrift_http: description: Force-enable the receiver for the 'jaeger_thrift_http' protocol in Tempo, even if there is no integration currently requesting it. 
type: boolean diff --git a/src/tempo.py b/src/tempo.py index 7bf2b3b..fabc139 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -43,7 +43,6 @@ class Tempo: "otlp_http": 4318, "jaeger_thrift_http": 14268, "jaeger_grpc": 14250, - "opencensus": 55678, } all_ports = {**server_ports, **receiver_ports} @@ -223,8 +222,6 @@ def _build_distributor_config( if "zipkin" in receivers_set: config["zipkin"] = receiver_config - if "opencensus" in receivers_set: - config["opencensus"] = receiver_config otlp_config = {} if "otlp_http" in receivers_set: diff --git a/tests/scenario/test_ingressed_tracing.py b/tests/scenario/test_ingressed_tracing.py index b84bfce..e2f6f73 100644 --- a/tests/scenario/test_ingressed_tracing.py +++ b/tests/scenario/test_ingressed_tracing.py @@ -74,11 +74,6 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w "rule": "ClientIP(`0.0.0.0/0`)", "service": f"juju-{state.model.name}-{charm_name}-service-tempo-grpc", }, - f"juju-{state.model.name}-{charm_name}-opencensus": { - "entryPoints": ["opencensus"], - "rule": "ClientIP(`0.0.0.0/0`)", - "service": f"juju-{state.model.name}-{charm_name}-service-opencensus", - }, f"juju-{state.model.name}-{charm_name}-jaeger-grpc": { "entryPoints": ["jaeger-grpc"], "rule": "ClientIP(`0.0.0.0/0`)", @@ -104,9 +99,6 @@ def test_ingress_relation_set_with_dynamic_config(context, base_state, s3, all_w f"juju-{state.model.name}-{charm_name}-service-tempo-grpc": { "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:9096"}]} }, - f"juju-{state.model.name}-{charm_name}-service-opencensus": { - "loadBalancer": {"servers": [{"url": "http://1.2.3.4:55678"}]} - }, f"juju-{state.model.name}-{charm_name}-service-jaeger-grpc": { "loadBalancer": {"servers": [{"url": "h2c://1.2.3.4:14250"}]} }, diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 1c2bcc4..1da8294 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -31,7 +31,6 @@ def test_entrypoints_are_generated_with_sanitized_names(self): "otlp-http": {"address": ":4318"}, "jaeger-thrift-http": {"address": ":14268"}, "jaeger-grpc": {"address": ":14250"}, - "opencensus": {"address": ":55678"}, } } self.assertEqual(self.harness.charm._static_ingress_config, expected_entrypoints) From 14e9abdb527b319eb43c8b0da555e6dd1fa2f8e1 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 13:52:27 +0300 Subject: [PATCH 19/22] refactor tracegen --- scripts/tracegen.py | 56 +++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/scripts/tracegen.py b/scripts/tracegen.py index 08d38aa..75e5e67 100644 --- a/scripts/tracegen.py +++ b/scripts/tracegen.py @@ -17,7 +17,7 @@ ConsoleSpanExporter, ) -protocols = Literal[ +ReceiverProtocol = Literal[ "zipkin", "otlp_grpc", "otlp_http", @@ -25,29 +25,24 @@ "jaeger_thrift_http", ] - -def emit_trace( - endpoint: str, - log_trace_to_console: bool = False, - cert: Path = None, - protocol: protocols = "otlp_http", - nonce: Any = None -): - os.environ['OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE'] = str(Path(cert).absolute()) if cert else "" - os.environ['OTEL_EXPORTER_JAEGER_CERTIFICATE'] = str(Path(cert).absolute()) if cert else "" +def set_envvars(cert: Path = None): + ca_cert_path = str(Path(cert).absolute()) if cert else "" + os.environ['OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE'] = ca_cert_path + os.environ['OTEL_EXPORTER_JAEGER_CERTIFICATE'] = ca_cert_path # jaeger thrift http exporter does not expose a parameter to set path for CA verification - os.environ['SSL_CERT_FILE'] 
= str(Path(cert).absolute()) if cert else "" - os.environ["REQUESTS_CA_BUNDLE"] = str(Path(cert).absolute()) if cert else "" + os.environ['SSL_CERT_FILE'] = ca_cert_path + os.environ["REQUESTS_CA_BUNDLE"] = ca_cert_path +def initialize_exporter(protocol: str, endpoint: str, cert: Path = None): # ip:4317 if protocol == "otlp_grpc": - span_exporter = GRPCExporter( + return GRPCExporter( endpoint=endpoint, insecure=not cert, ) # scheme://ip:4318/v1/traces elif protocol == "otlp_http": - span_exporter = HTTPExporter( + return HTTPExporter( endpoint=endpoint, ) # scheme://ip:9411/v1/traces @@ -56,29 +51,46 @@ def emit_trace( session = requests.Session() if cert: session.verify = cert - span_exporter = ZipkinExporter( + return ZipkinExporter( endpoint=endpoint, session=session, ) # scheme://ip:14268/api/traces?format=jaeger.thrift elif protocol == "jaeger_thrift_http": - span_exporter = JaegerThriftHttpExporter( + return JaegerThriftHttpExporter( collector_endpoint=endpoint, ) # ip:14250 elif protocol == "jaeger_grpc": - span_exporter = JaegerGRPCExporter( + return JaegerGRPCExporter( collector_endpoint = endpoint, insecure=not cert, ) - else: # ALL - for proto in get_args(protocols): + else: + raise ValueError(f"Unsupported protocol: {protocol}") + +def emit_trace( + endpoint: str, + log_trace_to_console: bool = False, + cert: Path = None, + protocol: ReceiverProtocol = "otlp_http", + nonce: Any = None +): + if protocol == "ALL": + for proto in get_args(protocol): emit_trace(endpoint, log_trace_to_console, cert, proto, nonce=nonce) + else: + set_envvars(cert) + span_exporter = initialize_exporter(protocol, endpoint, cert) + return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce, protocol = protocol) + + + + - return _export_trace(span_exporter, log_trace_to_console=log_trace_to_console, nonce=nonce, protocol = protocol) -def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: protocols = "otlp_http"): +def _export_trace(span_exporter, log_trace_to_console: bool = False, nonce: Any = None, protocol: ReceiverProtocol = "otlp_http"): resource = Resource.create(attributes={ "service.name": f"tracegen-{protocol}", "nonce": str(nonce) From e754f8b8f34dabc203a3402a2da0cc899507fc77 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 29 Jul 2024 14:35:40 +0300 Subject: [PATCH 20/22] fix scenario --- tests/scenario/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index f40f72d..84c387e 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -1,7 +1,6 @@ from unittest.mock import MagicMock, patch import pytest -from cosl.coordinated_workers.nginx import Nginx from scenario import Container, Context, Relation from charm import TempoCoordinatorCharm @@ -15,7 +14,7 @@ def coordinator(): @pytest.fixture def tempo_charm(): with patch("lightkube.core.client.GenericSyncClient"): - with patch.object(Nginx, "are_certificates_on_disk", False): + with patch("charm.TempoCoordinatorCharm.are_certificates_on_disk", False): yield TempoCoordinatorCharm From 3c389ecb1e907296f3055700a3c6f98d297c9fb5 Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 30 Jul 2024 17:49:12 +0300 Subject: [PATCH 21/22] fix path --- tests/integration/helpers.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index a8d1ff0..edc37b5 100644 --- 
a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -283,23 +283,7 @@ async def deploy_and_configure_minio(ops_test: OpsTest): async def deploy_cluster(ops_test: OpsTest, tempo_app=APP_NAME): - # await ops_test.model.deploy(FACADE, channel="edge") - # await ops_test.model.wait_for_idle( - # apps=[FACADE], raise_on_blocked=True, status="active", timeout=2000 - # ) - - # TODO: deploy from latest edge - tempo_worker_charm = ( - "/home/michael/Work/tempo-worker-k8s-operator/tempo-worker-k8s_ubuntu-22.04-amd64.charm" - ) - - resources = { - "tempo-image": "docker.io/ubuntu/tempo:2-22.04", - } - - await ops_test.model.deploy( - tempo_worker_charm, resources=resources, application_name=WORKER_NAME - ) + await ops_test.model.deploy("tempo-worker-k8s", application_name=WORKER_NAME, channel="edge") await ops_test.model.deploy(S3_INTEGRATOR, channel="edge") await ops_test.model.integrate(tempo_app + ":s3", S3_INTEGRATOR + ":s3-credentials") From c98827d2d2f0179c853f163da2fae5f3ada52c12 Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 30 Jul 2024 19:43:18 +0300 Subject: [PATCH 22/22] fix tests --- tests/integration/helpers.py | 7 +++ tests/integration/test_ingressed_tls.py | 65 ++++++++++++++----------- tests/integration/test_tls.py | 11 +---- 3 files changed, 46 insertions(+), 37 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index edc37b5..fd83122 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -27,6 +27,13 @@ S3_INTEGRATOR = "s3-integrator" WORKER_NAME = "tempo-worker" APP_NAME = "tempo" +protocols_endpoints = { + "jaeger_thrift_http": "https://{}:14268/api/traces?format=jaeger.thrift", + "zipkin": "https://{}:9411/v1/traces", + "jaeger_grpc": "{}:14250", + "otlp_http": "https://{}:4318/v1/traces", + "otlp_grpc": "{}:4317", +} logger = logging.getLogger(__name__) diff --git a/tests/integration/test_ingressed_tls.py b/tests/integration/test_ingressed_tls.py index 49c9a0b..9f5a221 100644 --- a/tests/integration/test_ingressed_tls.py +++ b/tests/integration/test_ingressed_tls.py @@ -4,7 +4,12 @@ import pytest import yaml -from helpers import deploy_cluster, emit_trace, get_traces_patiently +from helpers import ( + deploy_cluster, + emit_trace, + get_traces_patiently, + protocols_endpoints, +) from juju.application import Application from pytest_operator.plugin import OpsTest @@ -20,7 +25,7 @@ logger = logging.getLogger(__name__) -async def get_ingress_proxied_endpoint(ops_test: OpsTest): +async def get_ingress_proxied_hostname(ops_test: OpsTest): status = await ops_test.model.get_status() app = status["applications"][TRAEFIK_APP_NAME] status_msg = app["status"]["info"] @@ -31,6 +36,13 @@ async def get_ingress_proxied_endpoint(ops_test: OpsTest): return status_msg.replace("Serving at", "").strip() +async def get_tempo_ingressed_endpoint(hostname, protocol): + protocol_endpoint = protocols_endpoints.get(protocol) + if protocol_endpoint is None: + assert False, f"Invalid {protocol}" + return protocol_endpoint.format(hostname) + + @pytest.mark.setup @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): @@ -68,7 +80,8 @@ async def test_push_tracegen_script_and_deps(ops_test: OpsTest): await ops_test.juju( "ssh", f"{APP_NAME}/0", - "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http", + "python3 -m pip install opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http" + + " opentelemetry-exporter-zipkin 
opentelemetry-exporter-jaeger", ) @@ -104,38 +117,34 @@ async def test_relate(ops_test: OpsTest): @pytest.mark.abort_on_fail -async def test_verify_ingressed_trace_http_tls(ops_test: OpsTest, nonce, server_cert): - tempo_host = await get_ingress_proxied_endpoint(ops_test) - - await emit_trace( - f"https://{tempo_host}:4318/v1/traces", nonce=nonce, ops_test=ops_test, use_cert=True - ) - # THEN we can verify it's been ingested - assert await get_traces_patiently(tempo_host) +@pytest.mark.parametrize("protocol", list(protocols_endpoints.keys())) +async def test_verify_traces_force_enabled_protocols_tls(ops_test: OpsTest, nonce, protocol): - -@pytest.mark.abort_on_fail -async def test_verify_ingressed_traces_grpc_tls(ops_test: OpsTest, nonce, server_cert): - # enable otlp grpc receiver tempo_app: Application = ops_test.model.applications[APP_NAME] - await tempo_app.set_config( - { - "always_enable_otlp_grpc": "True", - } - ) - await ops_test.model.wait_for_idle( - apps=[APP_NAME], - status="active", - timeout=1000, - ) - tempo_host = await get_ingress_proxied_endpoint(ops_test) + # enable each protocol receiver + # otlp_http should be enabled by default + if protocol != "otlp_http": + await tempo_app.set_config( + { + f"always_enable_{protocol}": "True", + } + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], + status="active", + timeout=1000, + ) + + tempo_host = await get_ingress_proxied_hostname(ops_test) + tempo_endpoint = await get_tempo_ingressed_endpoint(tempo_host, protocol=protocol) + # WHEN we emit a trace secured with TLS await emit_trace( - f"{tempo_host}:4317", nonce=nonce, proto="otlp_grpc", ops_test=ops_test, use_cert=True + tempo_endpoint, ops_test, nonce=nonce, verbose=1, proto=protocol, use_cert=True ) # THEN we can verify it's been ingested - assert await get_traces_patiently(tempo_host, service_name="tracegen-otlp_grpc") + await get_traces_patiently(tempo_host, service_name=f"tracegen-{protocol}") @pytest.mark.teardown diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 3719c6b..014e836 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -10,6 +10,7 @@ get_application_ip, get_traces, get_traces_patiently, + protocols_endpoints, ) from juju.application import Application from pytest_operator.plugin import OpsTest @@ -19,13 +20,7 @@ SSC = "self-signed-certificates" SSC_APP_NAME = "ssc" TRACEGEN_SCRIPT_PATH = Path() / "scripts" / "tracegen.py" -protocols_endpoints = { - "jaeger_thrift_http": "https://{}:14268/api/traces?format=jaeger.thrift", - "zipkin": "https://{}:9411/v1/traces", - "jaeger_grpc": "{}:14250", - "otlp_http": "https://{}:4318/v1/traces", - "otlp_grpc": "{}:4317", -} + logger = logging.getLogger(__name__) @@ -107,8 +102,6 @@ async def test_verify_traces_force_enabled_protocols_tls(ops_test: OpsTest, nonc tempo_app: Application = ops_test.model.applications[APP_NAME] # enable each protocol receiver - # for protocol in protocols_endpoints: - # otlp_http should be enabled by default if protocol != "otlp_http": await tempo_app.set_config(