From fd2679a35647a61b2f3145991b7d569153849110 Mon Sep 17 00:00:00 2001 From: PietroPasotti Date: Thu, 12 Dec 2024 11:33:43 +0100 Subject: [PATCH] Datasource exchange (#86) * precommit * ds exchange implementation * interface tests passing * explicit CRI dep * unpinned CRI branch * git build dep * added grafana uid * removed ctx * fixed ds test * pr comments * pr comments * adjusted comment --- .pre-commit-config.yaml | 59 +++++++++++++ charmcraft.yaml | 42 +++++---- lib/charms/grafana_k8s/v0/grafana_source.py | 23 ++++- requirements.txt | 12 +-- src/charm.py | 53 ++++++++++- tests/interface/conftest.py | 29 ++++++- .../test_grafana_datasource_exchange.py | 13 +++ tests/scenario/conftest.py | 10 +++ tests/scenario/test_datasources.py | 87 +++++++++++++++++++ 9 files changed, 301 insertions(+), 27 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 tests/interface/test_grafana_datasource_exchange.py create mode 100644 tests/scenario/test_datasources.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d32099b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,59 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-ast + - id: check-builtin-literals + - id: check-docstring-first + - id: check-merge-conflict + - id: check-yaml + - id: check-toml + - id: debug-statements + - id: trailing-whitespace + - repo: local + hooks: + - id: fmt + name: fmt + entry: tox -e fmt -- + language: system + - repo: local + hooks: + - id: static-charm + name: static-charm + entry: tox -e static-charm -- + language: system + - repo: local + hooks: + - id: static-lib + name: static-lib + entry: tox -e static-lib -- + language: system + - repo: https://github.com/asottile/blacken-docs + rev: 1.13.0 + hooks: + - id: blacken-docs + additional_dependencies: [black==23.3] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v2.7.1" + hooks: + - id: prettier + additional_dependencies: + - prettier@2.7.1 + - "@prettier/plugin-xml@2.2" + args: ["--print-width=120", "--prose-wrap=always"] + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.33.0 + hooks: + - id: markdownlint + - repo: local + hooks: + - id: changelogs-rst + name: Changelog filenames + language: fail + entry: "changelog files must be named ####.(feature|bugfix|doc|removal|misc).rst" + # exclude: ^docs/changelog/(\d+\.(feature|bugfix|doc|removal|misc).rst|template.jinja2) + files: ^docs/changelog/ + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes diff --git a/charmcraft.yaml b/charmcraft.yaml index 44b535d..9a760d8 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -61,17 +61,16 @@ provides: description: | Integration to offer other charms the possibility to send traces to Tempo. - requires: self-charm-tracing: interface: tracing description: | - Integration to enable Tempo to send its own charm traces to another Tempo instance. + Integration to enable Tempo to send its own charm traces to another Tempo instance. limit: 1 self-workload-tracing: interface: tracing description: | - Integration to enable Tempo to send its own workload traces to another Tempo instance. + Integration to enable Tempo to send its own workload traces to another Tempo instance. limit: 1 s3: interface: s3 @@ -92,12 +91,16 @@ requires: interface: tls-certificates limit: 1 description: | - Certificate and key files for securing Tempo internal and external + Certificate and key files for securing Tempo internal and external communications with TLS. send-remote-write: interface: prometheus_remote_write description: | Prometheus-like remote write endpoints to push traces' metrics generated by the `metrics-generator` component. + receive-datasource: + interface: grafana_datasource_exchange + description: | + Integration to share with other COS components this charm's datasources, and receive theirs. storage: data: @@ -115,7 +118,6 @@ peers: description: | peer relation for internal coordination - bases: - build-on: - name: "ubuntu" @@ -126,9 +128,9 @@ bases: parts: charm: -# uncomment this if you add git+ dependencies in requirements.txt - # build-packages: - # - "git" + # uncomment this if you add git+ dependencies in requirements.txt: + #build-packages: + # - "git" charm-binary-python-packages: - "pydantic>=2" - "cryptography" @@ -139,28 +141,38 @@ config: options: retention-period: description: | - Maximum trace retention period, in hours. This will be used to configure the compactor to clean up trace data after this time. + Maximum trace retention period, in hours. This will be used to configure the compactor to clean up trace data after this time. Defaults to 720 hours, which is equivalent to 30 days. Per-stream retention limits are currently not supported. type: int default: 720 always_enable_zipkin: - description: Force-enable the receiver for the 'zipkin' protocol in Tempo, even if there is no integration currently requesting it. + description: | + Force-enable the receiver for the 'zipkin' protocol in Tempo, even if there is no integration currently + requesting it. type: boolean default: false always_enable_otlp_grpc: - description: Force-enable the receiver for the 'otlp_grpc' protocol in Tempo, even if there is no integration currently requesting it. + description: | + Force-enable the receiver for the 'otlp_grpc' protocol in Tempo, even if there is no integration currently + requesting it. type: boolean default: false always_enable_otlp_http: - description: Force-enable the receiver for the 'otlp_http' protocol in Tempo, even if there is no integration currently requesting it. + description: | + Force-enable the receiver for the 'otlp_http' protocol in Tempo, even if there is no integration currently + requesting it. type: boolean default: false always_enable_jaeger_thrift_http: - description: Force-enable the receiver for the 'jaeger_thrift_http' protocol in Tempo, even if there is no integration currently requesting it. + description: | + Force-enable the receiver for the 'jaeger_thrift_http' protocol in Tempo, even if there is no integration + currently requesting it. type: boolean default: false always_enable_jaeger_grpc: - description: Force-enable the receiver for the 'jaeger_grpc' protocol in Tempo, even if there is no integration currently requesting it. + description: | + Force-enable the receiver for the 'jaeger_grpc' protocol in Tempo, even if there is no integration currently + requesting it. type: boolean default: false cpu_limit: @@ -174,4 +186,4 @@ config: K8s memory resource limit, e.g. "1Gi". Default is unset (no limit). This value is used for the "limits" portion of the resource requirements. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: string \ No newline at end of file + type: string diff --git a/lib/charms/grafana_k8s/v0/grafana_source.py b/lib/charms/grafana_k8s/v0/grafana_source.py index f3492be..6e09c50 100644 --- a/lib/charms/grafana_k8s/v0/grafana_source.py +++ b/lib/charms/grafana_k8s/v0/grafana_source.py @@ -162,7 +162,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 22 +LIBPATCH = 24 logger = logging.getLogger(__name__) @@ -432,13 +432,22 @@ def update_source(self, source_url: Optional[str] = ""): def get_source_uids(self) -> Dict[str, Dict[str, str]]: """Get the datasource UID(s) assigned by the remote end(s) to this datasource. - Returns a mapping from remote application names to unit names to datasource uids. + Returns a mapping from remote application UIDs to unit names to datasource uids. """ uids = {} for rel in self._charm.model.relations.get(self._relation_name, []): if not rel: continue - uids[rel.app.name] = json.loads(rel.data[rel.app]["datasource_uids"]) + app_databag = rel.data[rel.app] + grafana_uid = app_databag.get("grafana_uid") + if not grafana_uid: + logger.warning( + "remote end is using an old grafana_datasource interface: " + "`grafana_uid` field not found." + ) + continue + + uids[grafana_uid] = json.loads(app_databag.get("datasource_uids", "{}")) return uids def _set_sources_from_event(self, event: RelationJoinedEvent) -> None: @@ -568,6 +577,14 @@ def _publish_source_uids(self, rel: Relation, uids: Dict[str, str]): Assumes only leader unit will call this method """ + unique_grafana_name = "juju_{}_{}_{}_{}".format( + self._charm.model.name, + self._charm.model.uuid, + self._charm.model.app.name, + self._charm.model.unit.name.split("/")[1], # type: ignore + ) + + rel.data[self._charm.app]["grafana_uid"] = unique_grafana_name rel.data[self._charm.app]["datasource_uids"] = json.dumps(uids) def _get_source_config(self, rel: Relation): diff --git a/requirements.txt b/requirements.txt index 0662053..c03c4cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -# pin importlib-metadata version else charmcraft pack will fail to resolve the dependencies for the pydeps-installed otlp library -importlib-metadata~=6.0.0 ops>=2.17 crossplane jsonschema==4.17.0 lightkube>=0.15.4 lightkube-models>=1.24.1.4 tenacity==8.2.3 +pydantic>=2 +cosl>=0.0.46 + # crossplane is a package from nginxinc to interact with the Nginx config crossplane @@ -16,7 +17,6 @@ opentelemetry-exporter-otlp-proto-http==1.21.0 # lib/charms/tls_certificates_interface/v2/tls_certificates.py jsonschema cryptography -# lib/charms/tempo_coordinator_k8s/v1/tracing.py -pydantic>=2 -# lib/charms/prometheus_k8s/v0/prometheus_scrape.py -cosl>=0.0.43 + +# pin importlib-metadata version else charmcraft pack will fail to resolve the dependencies for the pydeps-installed otlp library +importlib-metadata~=6.0.0 diff --git a/src/charm.py b/src/charm.py index c30ac8f..ba67c70 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,8 +24,9 @@ ) from charms.traefik_k8s.v0.traefik_route import TraefikRouteRequirer from cosl.coordinated_workers.coordinator import ClusterRolesConfig, Coordinator -from cosl.coordinated_workers.interface import DatabagModel from cosl.coordinated_workers.nginx import CA_CERT_PATH, CERT_PATH, KEY_PATH +from cosl.interfaces.datasource_exchange import DatasourceDict +from cosl.interfaces.utils import DatabagModel from ops import CollectStatusEvent from ops.charm import CharmBase @@ -108,6 +109,8 @@ def __init__(self, *args): "s3": "s3", "charm-tracing": "self-charm-tracing", "workload-tracing": "self-workload-tracing", + "send-datasource": None, + "receive-datasource": "receive-datasource", }, nginx_config=NginxConfig(server_name=self.hostname).config, workers_config=self.tempo.config, @@ -443,6 +446,53 @@ def remote_write_endpoints(self): """Return remote-write endpoints.""" return self._remote_write.endpoints + def _update_source_exchange(self) -> None: + """Update the grafana-datasource-exchange relations with what we receive from grafana-source.""" + + # leader operation only + if not self.unit.is_leader(): + return + + # Each grafana we're sending our data to gives us back a mapping from unit names to datasource UIDs. + # so if we have two tempo units, and we're related to two grafanas, + # the grafana-source databag will look something like this: + # {"grafana_uid": "1234-1234-1234-1234", + # "datasource_uids": { + # "grafana/0": { + # "tempo/0": "0000-0000-0000-0000", + # "tempo/1": "0000-0000-0000-0001" + # }, + # "grafana/1": { + # "tempo/0": "0000-0000-0000-0000", + # "tempo/1": "0000-0000-0000-0001" + # }, + # }} + # This is an implementation detail, but the UID is 'unique-per-grafana' and ATM it turns out to be + # deterministic given tempo's jujutopology, so if we are related to multiple grafanas, + # the UIDs they assign to our units will be the same. + # Either way, we assume for simplicity (also, why not) that we're sending the same data to + # different grafanas, and not somehow different subsets to different places. Therefore, it should not + # matter which grafana you talk to, to cross-reference the data this unit is presenting to the world. + # However, if at some point grafana's algorithm to generate source UIDs changes, we'd be in trouble since + # the unit we are sharing our data to might be talking to 'another grafana' which might be using a + # different UID convention! + + # we might have multiple grafana-source relations, this method collects them all and returns a mapping from + # the `grafana_uid` to the contents of the `datasource_uids` field + grafana_uids_to_units_to_uids = self.grafana_source_provider.get_source_uids() + raw_datasources: List[DatasourceDict] = [] + + for grafana_uid, ds_uids in grafana_uids_to_units_to_uids.items(): + # we don't need the grafana unit name + for _, ds_uid in ds_uids.items(): + # we also don't care about which unit's server we're looking at, since hopefully the data is the same. + raw_datasources.append( + {"type": "tempo", "uid": ds_uid, "grafana_uid": grafana_uid} + ) + + # publish() already sorts the data for us, to prevent databag flapping and ensuing event storms + self.coordinator.datasource_exchange.publish(datasources=raw_datasources) + def _reconcile(self): # This method contains unconditional update logic, i.e. logic that should be executed # regardless of the event we are processing. @@ -450,6 +500,7 @@ def _reconcile(self): # we need to 'remember' to run this logic as soon as we become ready, which is hard and error-prone self._update_ingress_relation() self._update_tracing_relations() + self._update_source_exchange() if __name__ == "__main__": # pragma: nocover diff --git a/tests/interface/conftest.py b/tests/interface/conftest.py index 16a1217..14870cf 100644 --- a/tests/interface/conftest.py +++ b/tests/interface/conftest.py @@ -1,6 +1,9 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. +import json +import pathlib from contextlib import ExitStack +from shutil import rmtree from unittest.mock import MagicMock, patch import pytest @@ -63,6 +66,11 @@ }, ) +grafana_source_relation = Relation( + "grafana-source", + remote_app_data={"datasources": json.dumps({"tempo/0": {"type": "tempo", "uid": "01234"}})}, +) + peers = PeerRelation("peers", peers_data={1: {}}) k8s_resource_patch_ready = MagicMock(return_value=True) @@ -82,9 +90,13 @@ def patch_all(): ) ) stack.enter_context(charm_tracing_disabled()) - yield + # cleanup: some tests create a spurious src folder for alert rules in ./ + src_root = pathlib.Path(__file__).parent / "src" + if src_root.exists(): + rmtree(src_root) + # Interface tests are centrally hosted at https://github.com/canonical/charm-relation-interfaces. # this fixture is used by the test runner of charm-relation-interfaces to test tempo's compliance @@ -138,7 +150,20 @@ def grafana_datasource_tester(interface_tester: InterfaceTester): state_template=State( leader=True, containers=[nginx_container, nginx_prometheus_exporter_container], - relations=[peers, cluster_relation], + relations=[peers, s3_relation, cluster_relation], + ), + ) + yield interface_tester + + +@pytest.fixture +def grafana_datasource_exchange_tester(interface_tester: InterfaceTester): + interface_tester.configure( + charm_type=TempoCoordinatorCharm, + state_template=State( + leader=True, + containers=[nginx_container, nginx_prometheus_exporter_container], + relations=[peers, s3_relation, cluster_relation, grafana_source_relation], ), ) yield interface_tester diff --git a/tests/interface/test_grafana_datasource_exchange.py b/tests/interface/test_grafana_datasource_exchange.py new file mode 100644 index 0000000..b2301de --- /dev/null +++ b/tests/interface/test_grafana_datasource_exchange.py @@ -0,0 +1,13 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. +from interface_tester import InterfaceTester + + +def test_grafana_datasource_exchange_v0_interface( + grafana_datasource_exchange_tester: InterfaceTester, +): + grafana_datasource_exchange_tester.configure( + interface_name="grafana_datasource_exchange", + interface_version=0, + ) + grafana_datasource_exchange_tester.run() diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index 06c2550..d7523b9 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -1,4 +1,6 @@ import json +from pathlib import Path +from shutil import rmtree from unittest.mock import MagicMock, patch import pytest @@ -18,6 +20,14 @@ def patch_buffer_file_for_charm_tracing(tmp_path): yield +@pytest.fixture(autouse=True, scope="session") +def cleanup_prometheus_alert_rules(tmp_path): + # some tests trigger the charm to generate prometheus alert rules file in ./src; clean it up + yield + src_path = Path(__file__).parent / "src" + rmtree(src_path) + + @pytest.fixture(autouse=True, scope="session") def disable_charm_tracing(): with charm_tracing_disabled(): diff --git a/tests/scenario/test_datasources.py b/tests/scenario/test_datasources.py new file mode 100644 index 0000000..e1f555c --- /dev/null +++ b/tests/scenario/test_datasources.py @@ -0,0 +1,87 @@ +import json +from unittest.mock import patch + +import scenario +from cosl.interfaces.datasource_exchange import ( + DatasourceExchange, + DSExchangeAppData, + GrafanaDatasource, +) +from scenario import PeerRelation, Relation, State + + +@patch("charm.TempoCoordinatorCharm.is_workload_ready", return_value=True) +def test_datasource_receive( + workload_ready_mock, + context, + s3, + all_worker, + nginx_container, + nginx_prometheus_exporter_container, +): + # GIVEN a regular HA deployment and two ds_exchange integrations with a mimir and a loki + ds_loki = [ + {"type": "loki", "uid": "3", "grafana_uid": "4"}, + ] + + ds_mimir = [ + {"type": "prometheus", "uid": "8", "grafana_uid": "9"}, + ] + + mimir_dsx = Relation( + "receive-datasource", + remote_app_data=DSExchangeAppData( + datasources=json.dumps(sorted(ds_mimir, key=lambda raw_ds: raw_ds["uid"])) + ).dump(), + ) + loki_dsx = Relation( + "receive-datasource", + remote_app_data=DSExchangeAppData( + datasources=json.dumps(sorted(ds_loki, key=lambda raw_ds: raw_ds["uid"])) + ).dump(), + ) + + ds = Relation( + "grafana-source", + remote_app_data={ + "grafana_uid": "foo-something-bars", + "datasource_uids": json.dumps({"tempo/0": "1234"}), + }, + ) + + state_in = State( + relations=[ + PeerRelation("peers", peers_data={1: {}, 2: {}}), + s3, + all_worker, + ds, + mimir_dsx, + loki_dsx, + ], + containers=[nginx_container, nginx_prometheus_exporter_container], + unit_status=scenario.ActiveStatus(), + leader=True, + ) + + # WHEN we receive any event + with context(context.on.update_status(), state_in) as mgr: + charm = mgr.charm + # THEN we can find all received datasource uids in the coordinator + dsx: DatasourceExchange = charm.coordinator.datasource_exchange + received = dsx.received_datasources + assert received == ( + GrafanaDatasource(type="loki", uid="3", grafana_uid="4"), + GrafanaDatasource(type="prometheus", uid="8", grafana_uid="9"), + ) + state_out = mgr.run() + + # AND THEN we forward our own datasource information to mimir and loki + assert state_out.unit_status.name == "active" + published_dsx_mimir = state_out.get_relation(mimir_dsx.id).local_app_data + published_dsx_loki = state_out.get_relation(loki_dsx.id).local_app_data + assert published_dsx_loki == published_dsx_mimir + assert json.loads(published_dsx_loki["datasources"])[0] == { + "type": "tempo", + "uid": "1234", + "grafana_uid": "foo-something-bars", + }