Skip to content

Commit

Permalink
[DPE-5275] patroni passwords (#661)
Browse files Browse the repository at this point in the history
* Revert to 14.12

* Add Patroni password

* Add auth to patroni calls

* Integration test password

* Generate Patroni pass during upgrade

* Reload Patroni password
  • Loading branch information
dragomirp authored Aug 29, 2024
1 parent de7c929 commit 85c8055
Show file tree
Hide file tree
Showing 17 changed files with 149 additions and 43 deletions.
2 changes: 1 addition & 1 deletion actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ get-password:
username:
type: string
description: The username, the default value 'operator'.
Possible values - backup, operator, replication, rewind.
Possible values - backup, operator, replication, rewind, patroni.
list-backups:
description: Lists backups in s3 storage in AWS.
pre-upgrade-check:
Expand Down
2 changes: 1 addition & 1 deletion metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ resources:
postgresql-image:
type: oci-image
description: OCI image for PostgreSQL
upstream-source: ghcr.io/canonical/charmed-postgresql@sha256:7ef86a352c94e2a664f621a1cc683d7a983fd86e923d98c32b863f717cb1c173 # renovate: oci-image tag: 14.12-22.04_edge
upstream-source: ghcr.io/canonical/charmed-postgresql@sha256:2c066876e80d60058d79835c8b5d18090963b3a0f84261385afa1fc652477605 # renovate: oci-image tag: 14.12-22.04_edge

peers:
database-peers:
Expand Down
8 changes: 6 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
METRICS_PORT,
MONITORING_PASSWORD_KEY,
MONITORING_USER,
PATRONI_PASSWORD_KEY,
PEER,
POSTGRES_LOG_FILES,
REPLICATION_PASSWORD_KEY,
Expand Down Expand Up @@ -131,6 +132,7 @@
logging.getLogger("httpx").setLevel(logging.ERROR)

Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
PASSWORD_USERS = [*SYSTEM_USERS, "patroni"]


@trace_charm(
Expand Down Expand Up @@ -810,6 +812,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
REPLICATION_PASSWORD_KEY,
REWIND_PASSWORD_KEY,
MONITORING_PASSWORD_KEY,
PATRONI_PASSWORD_KEY,
}:
if self.get_secret(APP_SCOPE, password) is None:
self.set_secret(APP_SCOPE, password, new_password())
Expand Down Expand Up @@ -1159,10 +1162,10 @@ def _on_get_password(self, event: ActionEvent) -> None:
If no user is provided, the password of the operator user is returned.
"""
username = event.params.get("username", USER)
if username not in SYSTEM_USERS:
if username not in PASSWORD_USERS:
event.fail(
f"The action can be run only for users used by the charm or Patroni:"
f" {', '.join(SYSTEM_USERS)} not {username}"
f" {', '.join(PASSWORD_USERS)} not {username}"
)
return
event.set_results({"password": self.get_secret(APP_SCOPE, f"{username}-password")})
Expand Down Expand Up @@ -1477,6 +1480,7 @@ def _patroni(self):
self.get_secret(APP_SCOPE, REPLICATION_PASSWORD_KEY),
self.get_secret(APP_SCOPE, REWIND_PASSWORD_KEY),
bool(self.unit_peer_data.get("tls")),
self.get_secret(APP_SCOPE, PATRONI_PASSWORD_KEY),
)

@property
Expand Down
1 change: 1 addition & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
REWIND_PASSWORD_KEY = "rewind-password"
MONITORING_USER = "monitoring"
MONITORING_PASSWORD_KEY = "monitoring-password"
PATRONI_PASSWORD_KEY = "patroni-password"
TLS_KEY_FILE = "key.pem"
TLS_CA_FILE = "ca.pem"
TLS_CERT_FILE = "cert.pem"
Expand Down
64 changes: 50 additions & 14 deletions src/patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__(
replication_password: str,
rewind_password: str,
tls_enabled: bool,
patroni_password: str,
):
self._charm = charm
self._endpoint = endpoint
Expand All @@ -79,11 +80,16 @@ def __init__(
self._replication_password = replication_password
self._rewind_password = rewind_password
self._tls_enabled = tls_enabled
self._patroni_password = patroni_password
# Variable mapping to requests library verify parameter.
# The CA bundle file is used to validate the server certificate when
# TLS is enabled, otherwise True is set because it's the default value.
self._verify = f"{self._storage_path}/{TLS_CA_FILE}" if tls_enabled else True

@property
def _patroni_auth(self) -> requests.auth.HTTPBasicAuth:
"""HTTP basic-auth credentials (user "patroni") passed as `auth=` to Patroni REST API requests."""
return requests.auth.HTTPBasicAuth("patroni", self._patroni_password)

@property
def _patroni_url(self) -> str:
"""Patroni REST API URL."""
Expand Down Expand Up @@ -134,7 +140,9 @@ def get_primary(self, unit_name_pattern=False, alternative_endpoints: List[str]
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt, alternative_endpoints)
r = requests.get(f"{url}/cluster", verify=self._verify, timeout=5)
r = requests.get(
f"{url}/cluster", verify=self._verify, timeout=5, auth=self._patroni_auth
)
for member in r.json()["members"]:
if member["role"] == "leader":
primary = member["name"]
Expand All @@ -161,7 +169,7 @@ def get_standby_leader(
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
r = requests.get(f"{url}/cluster", verify=self._verify)
r = requests.get(f"{url}/cluster", verify=self._verify, auth=self._patroni_auth)
for member in r.json()["members"]:
if member["role"] == "standby_leader":
if check_whether_is_running and member["state"] not in RUNNING_STATES:
Expand All @@ -181,7 +189,7 @@ def get_sync_standby_names(self) -> List[str]:
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
r = requests.get(f"{url}/cluster", verify=self._verify)
r = requests.get(f"{url}/cluster", verify=self._verify, auth=self._patroni_auth)
for member in r.json()["members"]:
if member["role"] == "sync_standby":
sync_standbys.append("/".join(member["name"].rsplit("-", 1)))
Expand All @@ -192,7 +200,9 @@ def get_sync_standby_names(self) -> List[str]:
def cluster_members(self) -> set:
"""Get the current cluster members."""
# Request info from cluster endpoint (which returns all members of the cluster).
r = requests.get(f"{self._patroni_url}/cluster", verify=self._verify)
r = requests.get(
f"{self._patroni_url}/cluster", verify=self._verify, auth=self._patroni_auth
)
return {member["name"] for member in r.json()["members"]}

def are_all_members_ready(self) -> bool:
Expand All @@ -207,7 +217,11 @@ def are_all_members_ready(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
with attempt:
r = requests.get(f"{self._patroni_url}/cluster", verify=self._verify)
r = requests.get(
f"{self._patroni_url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
)
except RetryError:
return False

Expand All @@ -222,7 +236,11 @@ def is_creating_backup(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
with attempt:
r = requests.get(f"{self._patroni_url}/cluster", verify=self._verify)
r = requests.get(
f"{self._patroni_url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
)
except RetryError:
return False

Expand All @@ -247,7 +265,9 @@ def is_replication_healthy(self) -> bool:
"leader" if member_endpoint == primary_endpoint else "replica?lag=16kB"
)
url = self._patroni_url.replace(self._endpoint, member_endpoint)
member_status = requests.get(f"{url}/{endpoint}", verify=self._verify)
member_status = requests.get(
f"{url}/{endpoint}", verify=self._verify, auth=self._patroni_auth
)
if member_status.status_code != 200:
raise Exception
except RetryError:
Expand All @@ -270,6 +290,7 @@ def primary_endpoint_ready(self) -> bool:
r = requests.get(
f"{'https' if self._tls_enabled else 'http'}://{self._primary_endpoint}:8008/health",
verify=self._verify,
auth=self._patroni_auth,
)
if r.json()["state"] not in RUNNING_STATES:
raise EndpointNotReadyError
Expand All @@ -288,6 +309,7 @@ def member_replication_lag(self) -> str:
f"{self._patroni_url}/cluster",
verify=self._verify,
timeout=5,
auth=self._patroni_auth,
)
except RetryError:
return "unknown"
Expand All @@ -309,7 +331,9 @@ def member_started(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
with attempt:
r = requests.get(f"{self._patroni_url}/health", verify=self._verify)
r = requests.get(
f"{self._patroni_url}/health", verify=self._verify, auth=self._patroni_auth
)
except RetryError:
return False

Expand All @@ -326,7 +350,9 @@ def member_streaming(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
with attempt:
r = requests.get(f"{self._patroni_url}/health", verify=self._verify)
r = requests.get(
f"{self._patroni_url}/health", verify=self._verify, auth=self._patroni_auth
)
except RetryError:
return False

Expand Down Expand Up @@ -355,15 +381,21 @@ def bulk_update_parameters_controller_by_patroni(self, parameters: Dict[str, Any
f"{self._patroni_url}/config",
verify=self._verify,
json={"postgresql": {"parameters": parameters}},
auth=self._patroni_auth,
)

def promote_standby_cluster(self) -> None:
"""Promote a standby cluster to be a regular cluster."""
config_response = requests.get(f"{self._patroni_url}/config", verify=self._verify)
config_response = requests.get(
f"{self._patroni_url}/config", verify=self._verify, auth=self._patroni_auth
)
if "standby_cluster" not in config_response.json():
raise StandbyClusterAlreadyPromotedError("standby cluster is already promoted")
requests.patch(
f"{self._patroni_url}/config", verify=self._verify, json={"standby_cluster": None}
f"{self._patroni_url}/config",
verify=self._verify,
json={"standby_cluster": None},
auth=self._patroni_auth,
)
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
Expand All @@ -373,7 +405,9 @@ def promote_standby_cluster(self) -> None:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def reinitialize_postgresql(self) -> None:
"""Reinitialize PostgreSQL."""
requests.post(f"{self._patroni_url}/reinitialize", verify=self._verify)
requests.post(
f"{self._patroni_url}/reinitialize", verify=self._verify, auth=self._patroni_auth
)

def _render_file(self, path: str, content: str, mode: int) -> None:
"""Write a content rendered from a template to a file.
Expand Down Expand Up @@ -457,13 +491,14 @@ def render_patroni_yml_file(
pg_parameters=parameters,
primary_cluster_endpoint=self._charm.async_replication.get_primary_cluster_endpoint(),
extra_replication_endpoints=self._charm.async_replication.get_standby_endpoints(),
patroni_password=self._patroni_password,
)
self._render_file(f"{self._storage_path}/patroni.yml", rendered, 0o644)

@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30))
def reload_patroni_configuration(self) -> None:
"""Reloads the configuration after it was updated in the file."""
requests.post(f"{self._patroni_url}/reload", verify=self._verify)
requests.post(f"{self._patroni_url}/reload", verify=self._verify, auth=self._patroni_auth)

def last_postgresql_logs(self) -> str:
"""Get last log file content of Postgresql service in the container.
Expand Down Expand Up @@ -492,7 +527,7 @@ def last_postgresql_logs(self) -> str:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def restart_postgresql(self) -> None:
"""Restart PostgreSQL."""
requests.post(f"{self._patroni_url}/restart", verify=self._verify)
requests.post(f"{self._patroni_url}/restart", verify=self._verify, auth=self._patroni_auth)

def switchover(self, candidate: str = None) -> None:
"""Trigger a switchover."""
Expand All @@ -507,6 +542,7 @@ def switchover(self, candidate: str = None) -> None:
f"{self._patroni_url}/switchover",
json={"leader": primary, "candidate": candidate},
verify=self._verify,
auth=self._patroni_auth,
)

# Check whether the switchover was unsuccessful.
Expand Down
15 changes: 11 additions & 4 deletions src/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import logging
from signal import SIGHUP

from charms.data_platform_libs.v0.upgrade import (
ClusterNotReadyError,
Expand All @@ -21,7 +22,7 @@
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
from typing_extensions import override

from constants import APP_SCOPE, MONITORING_PASSWORD_KEY, MONITORING_USER
from constants import APP_SCOPE, MONITORING_PASSWORD_KEY, MONITORING_USER, PATRONI_PASSWORD_KEY
from patroni import SwitchoverFailedError
from utils import new_password

Expand Down Expand Up @@ -143,12 +144,17 @@ def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None:
"upgrade failed. Check logs for rollback instruction"
)

def _on_upgrade_changed(self, _) -> None:
def _on_upgrade_changed(self, event) -> None:
"""Update the Patroni nosync tag in the unit if needed, then send SIGHUP to the postgresql service."""
if not self.peer_relation or not self.charm._patroni.member_started:
return

self.charm.update_config()
container = self.charm.unit.get_container("postgresql")
if not container.can_connect():
event.defer()
return
container.send_signal(SIGHUP, "postgresql")

def _on_upgrade_charm_check_legacy(self, event: UpgradeCharmEvent) -> None:
if not self.peer_relation:
Expand Down Expand Up @@ -271,8 +277,9 @@ def _set_first_rolling_update_partition(self) -> None:

def _set_up_new_credentials_for_legacy(self) -> None:
"""Create missing password and user."""
if self.charm.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY) is None:
self.charm.set_secret(APP_SCOPE, MONITORING_PASSWORD_KEY, new_password())
for key in (MONITORING_PASSWORD_KEY, PATRONI_PASSWORD_KEY):
if self.charm.get_secret(APP_SCOPE, key) is None:
self.charm.set_secret(APP_SCOPE, key, new_password())
users = self.charm.postgresql.list_users()
if MONITORING_USER not in users:
self.charm.postgresql.create_user(
Expand Down
3 changes: 3 additions & 0 deletions templates/patroni.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ log:
restapi:
connect_address: '{{ endpoint }}:8008'
listen: 0.0.0.0:8008
authentication:
username: patroni
# Quoted so the rendered value is always parsed as a YAML string.
password: '{{ patroni_password }}'
{%- if enable_tls %}
cafile: {{ storage_path }}/ca.pem
certfile: {{ storage_path }}/cert.pem
Expand Down
11 changes: 8 additions & 3 deletions tests/integration/ha_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_delay, wait_fixed

from ..helpers import app_name
from ..helpers import app_name, get_password
from .helpers import (
APPLICATION_NAME,
change_patroni_setting,
Expand Down Expand Up @@ -62,10 +62,13 @@ async def primary_start_timeout(ops_test: OpsTest) -> None:
"""Temporary change the primary start timeout configuration."""
# Change the parameter that makes the primary reelection faster.
initial_primary_start_timeout = await get_patroni_setting(ops_test, "primary_start_timeout")
await change_patroni_setting(ops_test, "primary_start_timeout", 0)
patroni_password = await get_password(ops_test, "patroni")
await change_patroni_setting(ops_test, "primary_start_timeout", 0, patroni_password)
yield
# Rollback to the initial configuration.
await change_patroni_setting(ops_test, "primary_start_timeout", initial_primary_start_timeout)
await change_patroni_setting(
ops_test, "primary_start_timeout", initial_primary_start_timeout, patroni_password
)


@pytest.fixture()
Expand All @@ -78,13 +81,15 @@ async def wal_settings(ops_test: OpsTest) -> None:
yield
# Rollback to the initial settings.
app = await app_name(ops_test)
patroni_password = await get_password(ops_test, "patroni")
for unit in ops_test.model.applications[app].units:
await change_wal_settings(
ops_test,
unit.name,
initial_max_wal_size,
initial_min_wal_size,
initial_wal_keep_segments,
patroni_password,
)


Expand Down
Loading

0 comments on commit 85c8055

Please sign in to comment.