Skip to content

Commit

Permalink
Try all raft partners while removing
Browse files (browse the repository at this point in the history)
  • Loading branch information
dragomirp committed Mar 1, 2024
1 parent db94025 commit a030215
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 12 deletions.
40 changes: 29 additions & 11 deletions src/charm.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -985,27 +985,45 @@ def _on_start(self, event: StartEvent) -> None:
# Bootstrap the cluster in the leader unit.
self._start_primary(event)

def _remove_raft_node(self, syncobj_util: TcpUtility, partner: str, current: str) -> None:
def _remove_raft_status_check(self, status: Dict, current: str) -> None:
if not status:
raise Exception("Failed to get raft status")
if status["leader"].address == current:
logger.warning("cannot remove raft member: member is leader")
raise Exception("Failed to remove raft leader")

def _remove_raft_node(
self, syncobj_util: TcpUtility, partners: List[str], current: str
) -> None:
"""Try to remove a raft member calling a partner node."""
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
with attempt:
if not self._patroni.stop_patroni():
logger.warning("cannot remove raft member: failed to stop Patroni")
raise Exception("Failed to stop service")

for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
with attempt:
if not (status := self._get_raft_status(syncobj_util, partner)):
raise Exception("Cannot remove raft member: cannot get status")
status = None
for partner in partners:
if not (status := self._get_raft_status(syncobj_util, partner)):
continue
self._remove_raft_status_check(status, current)

if f"partner_node_status_server_{current}" not in status:
logger.debug("Raft member already removed")
return
if status["leader"].address == current:
logger.warning("cannot remove raft member: member is leader")
raise Exception("Raft member is leader")
removal_result = syncobj_util.executeCommand(partner, ["remove", current])
if not removal_result.startswith("SUCCESS"):
logger.warning("failed to remove raft member: %s", removal_result)
raise Exception("Failed to remove raft member")

# If removing multiple units partner list will drift
_, partners = self._parse_raft_partners(status)

for partner in partners:
removal_result = syncobj_util.executeCommand(partner, ["remove", current])
if not removal_result.startswith("SUCCESS"):
logger.warning("failed to remove raft member: %s", removal_result)
continue
return
raise Exception("Failed to remove raft member")

def _get_raft_status(self, syncobj_util: TcpUtility, host: str) -> Optional[Dict]:
"""Get raft status."""
Expand Down Expand Up @@ -1053,7 +1071,7 @@ def _on_stop(self, _) -> None:
raise Exception("Cannot stop unit: All other members are still connecting")

try:
self._remove_raft_node(syncobj_util, ready[0], status["self"].address)
self._remove_raft_node(syncobj_util, ready, status["self"].address)
except Exception:
self._patroni.start_patroni()
raise
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/new_relations/test_new_relations.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -544,7 +544,7 @@ async def test_invalid_extra_user_roles(ops_test: OpsTest):
for app in data_integrator_apps_names:
await ops_test.model.add_relation(f"{app}:postgresql", f"{DATABASE_APP_NAME}:database")
await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME])
ops_test.model.block_until(
await ops_test.model.block_until(
lambda: any(
unit.workload_status_message == INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE
for unit in ops_test.model.applications[DATABASE_APP_NAME].units
Expand Down

0 comments on commit a030215

Please sign in to comment.