Skip to content

Commit

Permalink
Prevent stuck raft cluster on leader departure
Browse files Browse the repository at this point in the history
  • Loading branch information
dragomirp committed Feb 29, 2024
1 parent baef092 commit 1b187f5
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 6 deletions.
12 changes: 11 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 8 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ cosl = "^0.0.8"
pydantic = "^1.10.14"
poetry-core = "^1.9.0"
pyOpenSSL = "^24.0.0"
pysyncobj = "^0.3.12"

[tool.poetry.group.charm-libs.dependencies]
# data_platform_libs/v0/data_interfaces.py
Expand Down Expand Up @@ -104,10 +105,12 @@ target-version = ["py38"]
[tool.ruff]
# preview and explicit preview are enabled for CPY001
preview = true
explicit-preview-rules = true
target-version = "py38"
src = ["src", "."]
line-length = 99

[tool.ruff.lint]
explicit-preview-rules = true
select = ["A", "E", "W", "F", "C", "N", "D", "I001", "CPY001"]
extend-ignore = [
"D203",
Expand All @@ -126,16 +129,16 @@ extend-ignore = [
# Ignore D107 Missing docstring in __init__
ignore = ["E501", "D107"]

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"tests/*" = ["D100", "D101", "D102", "D103", "D104"]

[tool.ruff.flake8-copyright]
[tool.ruff.lint.flake8-copyright]
# Check for properly formatted copyright header in each file
author = "Canonical Ltd."
notice-rgx = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+"

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
max-complexity = 10

[tool.ruff.pydocstyle]
[tool.ruff.lint.pydocstyle]
convention = "google"
48 changes: 48 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
Unit,
WaitingStatus,
)
from pysyncobj.utility import TcpUtility
from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed

from backups import PostgreSQLBackups
Expand Down Expand Up @@ -147,6 +148,7 @@ def __init__(self, *args):
self.framework.observe(self.on[PEER].relation_departed, self._on_peer_relation_departed)
self.framework.observe(self.on.pgdata_storage_detaching, self._on_pgdata_storage_detaching)
self.framework.observe(self.on.start, self._on_start)
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.update_status, self._on_update_status)
Expand Down Expand Up @@ -992,6 +994,52 @@ def _on_start(self, event: StartEvent) -> None:
# Bootstrap the cluster in the leader unit.
self._start_primary(event)

def _remove_raft_node(self, syncobj_util: TcpUtility, partner: str, current: str) -> None:
"""Try to remove a raft member calling a partner node."""
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
if not self._patroni.stop_patroni():
logger.warning("cannot remove raft member: failed to stop Patroni")
raise Exception("Failed to stop service")
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
status = syncobj_util.executeCommand(partner, ["status"])
if f"partner_node_status_server_{current}" not in status:
logger.debug("cannot remove raft member: not part of the cluster")
return
removal_result = syncobj_util.executeCommand(partner, ["remove", current])
if removal_result.startswith("FAIL"):
logger.warning("failed to remove raft member: %s", removal_result)
raise Exception("Failed to remove the unit")

def _on_stop(self, _) -> None:
syncobj_util = TcpUtility(timeout=3)
raft_host = "localhost:2222"
# Try to call a different unit
for ip in self._units_ips:
if ip != self._unit_ip:
raft_host = f"{ip}:2222"
break
status = syncobj_util.executeCommand(raft_host, ["status"])
partners = []
ready = []
for key in status.keys():
if key.startswith("partner_node_status_server_") and status[key]:
partner = key.split("partner_node_status_server_")[-1]
partners.append(partner)
if status[key] == 2:
ready.append(partner)
if not ready and not partners:
logger.debug("Terminating last node")
self._patroni.stop_patroni()
return
if not ready:
raise Exception("Cannot stop unit: All other members are connecting")

try:
self._remove_raft_node(syncobj_util, ready[0], status["self"].address)
except Exception:
self._patroni.start_patroni()
raise

def _setup_exporter(self) -> None:
"""Set up postgresql_exporter options."""
cache = snap.SnapCache()
Expand Down

0 comments on commit 1b187f5

Please sign in to comment.