Charm tracing cache (#57)
* buffering

* fmt

* typing fixes

* docstring fix

* refactored buffering logic to avoid losing traces

* log on no buffer

* ops pin

* refactor test

* itest for buffer

* lower flush retry period

* fmt

* adjusted buffer path to PVC and logs

* fmt

* pr feedback

* mb-> mib and docs

* rename cleanup

* accept 2xx status codes

* more logs pushed to devlogger debug

* more patience in itests

* more patience in itests

* reduced max buffer size default to 10

* wait for 100

* test cleanup and xfailed flaky one

* fixed tls certs union type

* skip with reason instead of xfail
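
Taken together, these commits add a file-backed cache for charm traces: spans that cannot be pushed to the OTLP endpoint are buffered locally and flushed on a later successful export. Enabling it from a charm looks roughly like the decorator changes further down in this diff. The snippet below is a sketch only: `buffer_path` and `buffer_max_events` are the options actually shown in this commit, while `MyCharm` and its placeholder properties are illustrative.

```python
from pathlib import Path

from charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
from ops import CharmBase


@trace_charm(
    tracing_endpoint="tempo_otlp_http_endpoint",  # name of a property returning the OTLP http endpoint
    server_cert="server_ca_cert",
    # Keep the buffer on persistent storage so it survives pod churn
    # (the coordinator charm below uses its PVC-backed /tempo-data mount).
    buffer_path=Path("/tempo-data/.charm_tracing_buffer.raw"),
    buffer_max_events=100,
)
class MyCharm(CharmBase):
    """Illustrative charm; the real users in this commit are the coordinator and tester charms."""

    @property
    def tempo_otlp_http_endpoint(self) -> str:
        return "http://tempo:4318"  # placeholder endpoint

    @property
    def server_ca_cert(self) -> str:
        return "/path/to/ca.crt"  # placeholder CA certificate path
```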
PietroPasotti authored Oct 18, 2024
1 parent 87d7d9b commit 65018e2
Showing 19 changed files with 1,276 additions and 137 deletions.
386 changes: 356 additions & 30 deletions lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py

Large diffs are not rendered by default.
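Since the `charm_tracing.py` diff is not rendered here, the following is only a rough sketch of the buffering behaviour the commit messages describe: append span batches to a file when export fails, cap the buffer by event count (default lowered to 10) and size in MiB, drop the buffer once a later export succeeds, and treat any 2xx response as success. The `_SpanBuffer` class, the record separator, the constant values and `_export_ok` below are illustrative assumptions, not the library's actual internals.

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List

# Illustrative values only; the real defaults live in charm_tracing.py.
DEFAULT_BUFFER_MAX_EVENTS = 10
DEFAULT_BUFFER_MAX_SIZE_MIB = 10
_SEPARATOR = b"__CHARM_TRACING_BUFFER_SPLIT__"  # hypothetical record separator


@dataclass
class _SpanBuffer:
    """File-backed FIFO of encoded span batches, capped by event count and total size."""

    path: Path
    max_events: int = DEFAULT_BUFFER_MAX_EVENTS
    max_size_mib: int = DEFAULT_BUFFER_MAX_SIZE_MIB

    def load(self) -> List[bytes]:
        if not self.path.exists():
            return []
        return [rec for rec in self.path.read_bytes().split(_SEPARATOR) if rec]

    def append(self, encoded_batch: bytes) -> None:
        records = self.load()
        records.append(encoded_batch)
        # Evict oldest batches first whenever either cap is exceeded.
        while records and (
            len(records) > self.max_events
            or sum(map(len, records)) > self.max_size_mib * 1024**2
        ):
            records.pop(0)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.path.write_bytes(_SEPARATOR.join(records))

    def flush(self, export: Callable[[bytes], bool]) -> None:
        """Re-send buffered batches after a successful live export; keep whatever still fails."""
        remaining = [rec for rec in self.load() if not export(rec)]
        if remaining:
            self.path.write_bytes(_SEPARATOR.join(remaining))
        elif self.path.exists():
            self.path.unlink()


def _export_ok(status_code: int) -> bool:
    # Per the commit history, any 2xx response now counts as a successful export.
    return 200 <= status_code < 300
```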

30 changes: 26 additions & 4 deletions lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 20
LIBPATCH = 22

PYDEPS = ["cryptography", "jsonschema"]

@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
)
else:
try:
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
logger.debug(
"Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
)
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
# Juju < 3.6 will create a new revision even if the content is the same
if (
secret.get_content(refresh=True).get("certificate", "")
== certificate.certificate
):
logger.debug(
"Secret %s with correct certificate already exists",
f"{LIBID}-{csr_in_sha256_hex}",
)
return
secret.set_content(
{"certificate": certificate.certificate, "csr": certificate.csr}
)
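
The new check above skips the `set_content` call when the stored certificate already matches, because Juju versions before 3.6 create a fresh secret revision even when the content is unchanged. Below is a minimal, self-contained sketch of the same compare-before-set pattern; it is an illustrative helper only, the library does this inline in `_on_relation_changed`.

```python
from ops import CharmBase, SecretNotFoundError


def set_certificate_secret(charm: CharmBase, label: str, certificate: str, csr: str) -> None:
    """Create or update a certificate secret without producing no-op revisions.

    Juju < 3.6 creates a new revision even for identical content, so compare the
    current content before writing.
    """
    try:
        secret = charm.model.get_secret(label=label)
    except SecretNotFoundError:
        charm.unit.add_secret({"certificate": certificate, "csr": csr}, label=label)
        return
    if secret.get_content(refresh=True).get("certificate", "") == certificate:
        return  # already up to date: do not trigger a new revision
    secret.set_content({"certificate": certificate, "csr": csr})
```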
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
provider_certificate = self._find_certificate_in_relation_data(csr)
if not provider_certificate:
# A secret expired but we did not find matching certificate. Cleaning up
logger.warning(
"Failed to find matching certificate for csr, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

if not provider_certificate.expiry_time:
# A secret expired but matching certificate is invalid. Cleaning up
logger.warning(
"Certificate matching csr is invalid, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
return provider_certificate
return None

def _get_csr_from_secret(self, secret: Secret) -> str:
def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
"""Extract the CSR from the secret label or content.
This function is a workaround to maintain backwards compatibility
and fix the issue reported in
https://github.com/canonical/tls-certificates-interface/issues/228
"""
if not (csr := secret.get_content().get("csr", "")):
try:
content = secret.get_content(refresh=True)
except SecretNotFoundError:
return None
if not (csr := content.get("csr", None)):
# In versions <14 of the Lib we were storing the CSR in the label of the secret
# The CSR is now stored in the content of the secret, which was a breaking change
# Here we get the CSR if the secret was created by an app using libpatch 14 or lower
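
The tail of this helper is truncated above. As a rough sketch of its overall shape, assuming the label fallback the comments describe (the exact label handling here is an assumption, not the library's code):

```python
from typing import Union

from ops import Secret, SecretNotFoundError


def get_csr_from_secret(secret: Secret) -> Union[str, None]:
    """Return the CSR for a tls-certificates secret, or None if it cannot be recovered.

    Libpatch >= 14 stores the CSR in the secret content; older revisions stored it
    in the secret label, so fall back to the label when the content has no "csr" key.
    """
    try:
        content = secret.get_content(refresh=True)
    except SecretNotFoundError:
        return None
    csr = content.get("csr")
    if csr:
        return csr
    # Fallback for secrets created by libpatch <= 13: the CSR was kept in the label.
    return secret.label or None
```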
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
# pin importlib-metadata version else charmcraft pack will fail to resolve the dependencies for the pydeps-installed otlp library
importlib-metadata~=6.0.0
ops
ops>=2.17
crossplane
jsonschema==4.17.0
lightkube>=0.15.4
2 changes: 2 additions & 0 deletions src/charm.py
@@ -48,6 +48,8 @@ class PeerData(DatabagModel):
tracing_endpoint="tempo_otlp_http_endpoint",
server_cert="server_ca_cert",
extra_types=(Tempo, TracingEndpointProvider, Coordinator, ClusterRolesConfig),
# use PVC path for buffer data, so we don't lose it on pod churn
buffer_path=Path("/tempo-data/.charm_tracing_buffer.raw"),
)
class TempoCoordinatorCharm(CharmBase):
"""Charmed Operator for Tempo; a distributed tracing backend."""
8 changes: 4 additions & 4 deletions tests/integration/conftest.py
@@ -4,10 +4,12 @@
import logging
import os
import random
import shlex
import shutil
import subprocess
import tempfile
from pathlib import Path
from subprocess import check_output

from pytest import fixture
from pytest_operator.plugin import OpsTest
@@ -67,8 +69,7 @@ def copy_charm_libs_into_tester_charm(ops_test):
yield

# cleanup: remove all libs
for path in copies:
Path(path).unlink()
check_output(shlex.split("rm -rf ./tests/integration/tester/lib"))


@fixture(scope="module", autouse=True)
@@ -89,8 +90,7 @@ def copy_charm_libs_into_tester_grpc_charm(ops_test):
yield

# cleanup: remove all libs
for path in copies:
Path(path).unlink()
check_output(shlex.split("rm -rf ./tests/integration/tester-grpc/lib"))


@fixture(scope="function")
73 changes: 35 additions & 38 deletions tests/integration/test_integration.py
@@ -8,6 +8,8 @@
from helpers import WORKER_NAME, deploy_cluster
from pytest_operator.plugin import OpsTest

from tests.integration.helpers import get_traces_patiently

METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text())
APP_NAME = "tempo"
TESTER_METADATA = yaml.safe_load(Path("./tests/integration/tester/metadata.yaml").read_text())
@@ -22,7 +24,7 @@

@pytest.mark.setup
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest, tempo_charm: Path):
async def test_build_deploy_testers(ops_test: OpsTest, tempo_charm: Path):
# Given a fresh build of the charm
# When deploying it together with testers
# Then applications should eventually be created
@@ -91,29 +93,40 @@ async def test_relate(ops_test: OpsTest):
async def test_verify_traces_http(ops_test: OpsTest):
# given a relation between charms
# when traces endpoint is queried
# then it should contain traces from tester charm
# then it should contain traces from the tester charm
status = await ops_test.model.get_status()
app = status["applications"][APP_NAME]
endpoint = app.public_address + ":3200/api/search"
cmd = [
"curl",
endpoint,
]
rc, stdout, stderr = await ops_test.run(*cmd)
logger.info("%s: %s", endpoint, (rc, stdout, stderr))
assert rc == 0, (
f"curl exited with rc={rc} for {endpoint}; "
f"non-zero return code means curl encountered a >= 400 HTTP code; "
f"cmd={cmd}"
traces = await get_traces_patiently(
tempo_host=app.public_address, service_name="TempoTesterCharm", tls=False
)
traces = json.loads(stdout)["traces"]
assert (
traces
), f"There's no trace of charm exec traces in tempo. {json.dumps(traces, indent=2)}"

found = False
for trace in traces:
if trace["rootServiceName"] == "TempoTesterCharm":
found = True

assert found, f"There's no trace of charm exec traces in tempo. {json.dumps(traces, indent=2)}"
@pytest.mark.skip(reason="fails because search query results are not stable")
# keep an eye on https://github.com/grafana/tempo/issues/3777 and see if they fix it
async def test_verify_buffered_charm_traces_http(ops_test: OpsTest):
# given a relation between charms
# when traces endpoint is queried
# then it should contain all traces from the tester charm since the setup phase, thanks to the buffer
status = await ops_test.model.get_status()
app = status["applications"][APP_NAME]
traces = await get_traces_patiently(
tempo_host=app.public_address, service_name="TempoTesterCharm", tls=False
)

# charm-tracing trace names are in the format:
# "mycharm/0: <event-name> event"
captured_events = {trace["rootTraceName"].split(" ")[1] for trace in traces}
expected_setup_events = {
"start",
"install",
"leader-elected",
"tracing-relation-created",
"replicas-relation-created",
}
assert expected_setup_events.issubset(captured_events)


async def test_verify_traces_grpc(ops_test: OpsTest):
@@ -122,27 +135,11 @@ async def test_verify_traces_grpc(ops_test: OpsTest):
status = await ops_test.model.get_status()
app = status["applications"][APP_NAME]
logger.info(app.public_address)
endpoint = app.public_address + ":3200/api/search"
cmd = [
"curl",
endpoint,
]
rc, stdout, stderr = await ops_test.run(*cmd)
logger.info("%s: %s", endpoint, (rc, stdout, stderr))
assert rc == 0, (
f"curl exited with rc={rc} for {endpoint}; "
f"non-zero return code means curl encountered a >= 400 HTTP code; "
f"cmd={cmd}"
traces = await get_traces_patiently(
tempo_host=app.public_address, service_name="TempoTesterGrpcCharm", tls=False
)
traces = json.loads(stdout)["traces"]

found = False
for trace in traces:
if trace["rootServiceName"] == "TempoTesterGrpcCharm":
found = True

assert (
found
traces
), f"There's no trace of generated grpc traces in tempo. {json.dumps(traces, indent=2)}"


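
These tests now go through a `get_traces_patiently` helper from `tests/integration/helpers.py`, which is not part of this diff. Below is a plausible shape for it, assuming a plain retry loop against Tempo's `/api/search` endpoint on port 3200 (the port and endpoint appear in the removed curl-based code above; the retry cadence, query and TLS handling are assumptions).

```python
import asyncio
import json
import ssl
from urllib.request import urlopen


async def get_traces_patiently(
    tempo_host: str,
    service_name: str,
    tls: bool = True,
    attempts: int = 30,
    delay: float = 10.0,
):
    """Poll Tempo's search API until traces for `service_name` appear.

    Ingested spans only become searchable after Tempo flushes them, so retry
    for a while (here: every 10s, up to 30 attempts) before giving up.
    """
    scheme = "https" if tls else "http"
    url = f"{scheme}://{tempo_host}:3200/api/search"
    context = ssl._create_unverified_context() if tls else None

    for _ in range(attempts):
        try:
            with urlopen(url, context=context) as response:
                body = json.loads(response.read())
            traces = [
                t for t in body.get("traces", [])
                if t.get("rootServiceName") == service_name
            ]
            if traces:
                return traces
        except OSError:
            pass  # endpoint not reachable yet
        await asyncio.sleep(delay)
    raise AssertionError(f"no traces found for service {service_name!r}")
```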
1 change: 0 additions & 1 deletion tests/integration/tester-grpc/lib/.gitignore

This file was deleted.

Empty file.
Empty file.
Empty file.
1 change: 0 additions & 1 deletion tests/integration/tester/lib/.gitignore

This file was deleted.

Empty file.
Empty file.
Empty file.
6 changes: 5 additions & 1 deletion tests/integration/tester/src/charm.py
@@ -26,7 +26,11 @@
TRACING_APP_NAME = "TempoTesterCharm"


@trace_charm(tracing_endpoint="tempo_otlp_http_endpoint", service_name=TRACING_APP_NAME)
@trace_charm(
tracing_endpoint="tempo_otlp_http_endpoint",
service_name=TRACING_APP_NAME,
buffer_max_events=100,
)
class TempoTesterCharm(CharmBase):
"""Charm the service."""

35 changes: 26 additions & 9 deletions tests/scenario/conftest.py
@@ -2,29 +2,46 @@
from unittest.mock import MagicMock, patch

import pytest
from charms.tempo_coordinator_k8s.v0.charm_tracing import charm_tracing_disabled
from ops import ActiveStatus
from scenario import Container, Context, PeerRelation, Relation

from charm import PEERS_RELATION_ENDPOINT_NAME, TempoCoordinatorCharm


@pytest.fixture(autouse=True)
def patch_buffer_file_for_charm_tracing(tmp_path):
with patch(
"charms.tempo_coordinator_k8s.v0.charm_tracing.BUFFER_DEFAULT_CACHE_FILE_NAME",
str(tmp_path / "foo.json"),
):
yield


@pytest.fixture(autouse=True, scope="session")
def disable_charm_tracing():
with charm_tracing_disabled():
yield


@pytest.fixture()
def coordinator():
return MagicMock()


@pytest.fixture
def tempo_charm():
def tempo_charm(tmp_path):
with patch("lightkube.core.client.GenericSyncClient"):
with patch("charm.TempoCoordinatorCharm.are_certificates_on_disk", False):
with patch.multiple(
"cosl.coordinated_workers.coordinator.KubernetesComputeResourcesPatch",
_namespace="test-namespace",
_patch=lambda _: None,
get_status=lambda _: ActiveStatus(""),
is_ready=lambda _: True,
):
yield TempoCoordinatorCharm
with patch("tempo.Tempo.tls_ca_path", str(tmp_path / "cert.tmp")):
with patch.multiple(
"cosl.coordinated_workers.coordinator.KubernetesComputeResourcesPatch",
_namespace="test-namespace",
_patch=lambda _: None,
get_status=lambda _: ActiveStatus(""),
is_ready=lambda _: True,
):
yield TempoCoordinatorCharm


@pytest.fixture(scope="function")