Skip to content

Commit

Permalink
Reinitialize PostgreSQL only if there is a healthy majority
Browse files (browse the repository at this point in the history)
  • Loading branch information
delgod committed Oct 26, 2024
1 parent 8e98f55 commit 75224dd
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 3 deletions.
2 changes: 2 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ def _on_peer_relation_changed(self, event: HookEvent): # noqa: C901
or int(self._patroni.member_replication_lag) > 1000
)
):
logger.warning("Reinitialising replica because of stuck on the starting state after backup recovery")
self._patroni.reinitialize_postgresql()
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
self.unit.status = MaintenanceStatus("reinitialising replica")
Expand Down Expand Up @@ -1483,6 +1484,7 @@ def _handle_workload_failures(self) -> bool:
and "postgresql_restarted" in self._peers.data[self.unit]
and self._patroni.member_replication_lag == "unknown"
):
logger.warning("Reinitialising replica because of stuck on the starting state on status update")
self._patroni.reinitialize_postgresql()
return True

Expand Down
23 changes: 20 additions & 3 deletions src/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pwd
import re
import subprocess
from typing import Any
from typing import Any, List

import requests
from charms.operator_libs_linux.v2 import snap
Expand Down Expand Up @@ -425,11 +425,16 @@ def is_creating_backup(self) -> bool:
for member in r.json()["members"]
)

def is_replication_healthy(self) -> bool:
def is_replication_healthy(self, majority_check :bool = False) -> bool:
"""Return whether the replication is healthy."""
expected_healthy_replicas_count = self.planned_units -1
if majority_check:
expected_healthy_replicas_count = self.planned_units // 2
try:
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
healthy_primary = False
healthy_replicas_count = 0
primary = self.get_primary()
primary_ip = self.get_member_ip(primary)
members_ips = {self.unit_ip}
Expand All @@ -447,7 +452,13 @@ def is_replication_healthy(self) -> bool:
logger.debug(
f"Failed replication check for {members_ip} with code {member_status.status_code}"
)
raise Exception
continue
if members_ip == primary_ip:
healthy_primary = True
else:
healthy_replicas_count += 1
if not healthy_primary or healthy_replicas_count < expected_healthy_replicas_count:
raise Exception
except RetryError:
logger.exception("replication is not healthy")
return False
Expand Down Expand Up @@ -816,6 +827,12 @@ def restart_postgresql(self) -> None:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def reinitialize_postgresql(self) -> None:
"""Reinitialize PostgreSQL."""

if not self.is_replication_healthy(majority_check=True):
logger.debug("skipping reinitialize PostgreSQL, because of lack of healthy majority")
raise Exception

logger.debug("reinitialize PostgreSQL")
requests.post(
f"{self._patroni_url}/reinitialize",
verify=self.verify,
Expand Down

0 comments on commit 75224dd

Please sign in to comment.