Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of tls config #524

Merged
merged 9 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,4 @@ resources:
prometheus-image:
type: oci-image
description: Container image for Prometheus
upstream-source: ghcr.io/canonical/prometheus:latest
upstream-source: ghcr.io/canonical/prometheus:dev
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed until canonical/oci-factory#58 is applied.

84 changes: 47 additions & 37 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import logging
import re
import socket
import subprocess
from pathlib import Path
from typing import Dict, Optional, cast
from urllib.parse import urlparse
Expand Down Expand Up @@ -73,8 +74,6 @@
CA_CERT_PATH = f"{PROMETHEUS_DIR}/ca.cert"
WEB_CONFIG_PATH = f"{PROMETHEUS_DIR}/prometheus-web-config.yml"

WAITING_FOR_TLS = "Waiting for TLS certificates to be written to file"

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -144,8 +143,6 @@ def __init__(self, *args):
relation_name="alertmanager",
)

external_url = urlparse(self.external_url)

self._scraping = MetricsEndpointProvider(
self,
relation_name="self-metrics-endpoint",
Expand All @@ -158,7 +155,7 @@ def __init__(self, *args):
self.cert_handler.on.cert_changed,
],
)
self._prometheus_client = Prometheus(f"{external_url.scheme}://localhost:9090")
self._prometheus_client = Prometheus(self.internal_url)
sed-i marked this conversation as resolved.
Show resolved Hide resolved

self.remote_write_provider = PrometheusRemoteWriteProvider(
charm=self,
Expand Down Expand Up @@ -345,6 +342,7 @@ def external_url(self) -> str:
return self.internal_url

def _is_tls_enabled(self):
"""Return True if the certificates relations is in place (cert may not be ready yet!)."""
return self.cert_handler.enabled

@property
Expand All @@ -370,10 +368,6 @@ def _prometheus_layer(self) -> Layer:

return Layer(layer_config) # pyright: ignore

def stop(self) -> None:
"""Stop Prometheus."""
self.container.stop("prometheus")

def _resource_reqs_from_config(self):
limits = {
"cpu": self.model.config.get("cpu"),
Expand All @@ -394,37 +388,56 @@ def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent):
self.unit.status = BlockedStatus(cast(str, event.message))

def _on_server_cert_changed(self, _):
for path in [KEY_PATH, CERT_PATH, CA_CERT_PATH]:
self.container.remove_path(path, recursive=True)
self._update_cert()

if self.cert_handler.ca:
self.container.push(
CA_CERT_PATH,
self.cert_handler.ca,
make_dirs=True,
)
if self.cert_handler.cert and self.cert_handler.key:
self.grafana_source_provider.update_source(self.external_url)
self.ingress.provide_ingress_requirements(
scheme=urlparse(self.internal_url).scheme, port=self._port
)
self._configure(_)

def _update_cert(self):
ca_cert_path = Path("/usr/local/share/ca-certificates/ca.crt")

if self.cert_handler.cert and self.cert_handler.key and self.cert_handler.ca:
# Save the workload certificates
self.container.push(
CERT_PATH,
self.cert_handler.cert,
make_dirs=True,
)
self.container.push(
"/etc/grafana/grafana.key",
KEY_PATH,
self.cert_handler.key,
make_dirs=True,
)
# Save the CA among the trusted CAs and trust it
self.container.push(
ca_cert_path,
self.cert_handler.ca,
make_dirs=True,
)
# FIXME with the update-ca-certificates machinery prometheus shouldn't need
# CA_CERT_PATH.
self.container.push(
CA_CERT_PATH,
self.cert_handler.ca,
make_dirs=True,
)

self.grafana_source_provider.update_source(self.external_url)
self.ingress.provide_ingress_requirements(
scheme="https" if self._is_tls_enabled() else "http", port=self._port
)
self._configure(_)
if (
isinstance(self.unit.status, WaitingStatus)
and self.unit.status.message == WAITING_FOR_TLS
):
self.unit.status = ActiveStatus()
# Repeat for the charm container. We need it there for prometheus client requests.
ca_cert_path.parent.mkdir(exist_ok=True, parents=True)
ca_cert_path.write_text(self.cert_handler.ca)
else:
self.container.remove_path(CERT_PATH, recursive=True)
self.container.remove_path(KEY_PATH, recursive=True)
self.container.remove_path(ca_cert_path, recursive=True)
self.container.remove_path(CA_CERT_PATH, recursive=True) # TODO: remove (see FIXME ^)
# Repeat for the charm container.
ca_cert_path.unlink(missing_ok=True)

self.container.exec(["update-ca-certificates", "--fresh"]).wait()
subprocess.run(["update-ca-certificates", "--fresh"])

def _configure(self, _):
"""Reconfigure and either reload or restart Prometheus.
Expand Down Expand Up @@ -539,7 +552,11 @@ def _configure(self, _):

self.remote_write_provider.update_endpoint()
self.grafana_source_provider.update_source(self.external_url)
self.unit.status = ActiveStatus()

if self._is_tls_enabled() and not self.container.exists(CERT_PATH):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be more clear if we abstracted this to a self._is_tls_ready() method.
Enabled but not ready -> push cert to container, then restart. Regardless of how we got there, this should be the reasonable course of action. Or what am I missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking of writing the cert only on cert-changed event, and:

Relation present Cert file on disk Hypothetical is_tls_ready() -> bool
Isolated charm No No No
tls-cert rel just joined Yes No No
on-cert-changed (after rel join) Yes Yes Yes
cert revoked/expired Yes Yes (keep on disk even if expired) Yes
relation departed / broken Yes ? No (how? code ordering...)
any event after rel-broken No No No

Wdyt?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems reasonable. I think departed/broken should immediately remove the cert from disk and from that moment onwards is_tls_ready should return false (assuming it checks the file presence synchronously).
Or we refactor to use manifests, but I'm not convinced that will make code ordering less of a headache

self.unit.status = WaitingStatus("Waiting for TLS certificates to be written to file")
else:
self.unit.status = ActiveStatus()

def _on_pebble_ready(self, event) -> None:
"""Pebble ready hook.
Expand Down Expand Up @@ -877,13 +894,6 @@ def _generate_prometheus_config(self) -> bool:
for filename, contents in certs.items():
self._push(filename, contents)

if self._is_tls_enabled() and not self.container.exists(CERT_PATH):
# After a `stop`, the service will autostart on next call to `_configure`, which is
# expected to happen as soon as the related CA replies with a cert.
self.stop()
if isinstance(self.unit.status, ActiveStatus):
self.unit.status = WaitingStatus(WAITING_FOR_TLS)

if web_config:
self._push(WEB_CONFIG_PATH, yaml.safe_dump(web_config))
else:
Expand Down