diff --git a/poetry.lock b/poetry.lock index 86abecb433..68c9342c58 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1494,6 +1494,16 @@ files = [ [package.dependencies] pytz = "*" +[[package]] +name = "pysyncobj" +version = "0.3.12" +description = "A library for replicating your python class between multiple servers, based on raft protocol" +optional = false +python-versions = "*" +files = [ + {file = "pysyncobj-0.3.12.tar.gz", hash = "sha256:a4c82f080d3dcbf4a4fd627a6a0c466a60169297d3b7797acff9ff40cc44daf6"}, +] + [[package]] name = "pytest" version = "7.4.4" @@ -2146,4 +2156,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "f07208dda5aac45012b944eb647b3718e3d754a0b7b05da0ab2f3895800cec33" +content-hash = "56afcb6f081fe8ade64311fcc5cc0a20755332b5b0134b959a2f7e050efde9d8" diff --git a/pyproject.toml b/pyproject.toml index 5296f27186..71d2cdb103 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ cosl = "^0.0.8" pydantic = "^1.10.14" poetry-core = "^1.9.0" pyOpenSSL = "^24.0.0" +pysyncobj = "^0.3.12" [tool.poetry.group.charm-libs.dependencies] # data_platform_libs/v0/data_interfaces.py @@ -104,10 +105,12 @@ target-version = ["py38"] [tool.ruff] # preview and explicit preview are enabled for CPY001 preview = true -explicit-preview-rules = true target-version = "py38" src = ["src", "."] line-length = 99 + +[tool.ruff.lint] +explicit-preview-rules = true select = ["A", "E", "W", "F", "C", "N", "D", "I001", "CPY001"] extend-ignore = [ "D203", @@ -126,16 +129,16 @@ extend-ignore = [ # Ignore D107 Missing docstring in __init__ ignore = ["E501", "D107"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "tests/*" = ["D100", "D101", "D102", "D103", "D104"] -[tool.ruff.flake8-copyright] +[tool.ruff.lint.flake8-copyright] # Check for properly formatted copyright header in each file author = "Canonical Ltd." notice-rgx = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+" -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] max-complexity = 10 -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "google" diff --git a/src/charm.py b/src/charm.py index 9c489e16a6..8d26d49abb 100755 --- a/src/charm.py +++ b/src/charm.py @@ -44,6 +44,7 @@ Unit, WaitingStatus, ) +from pysyncobj.utility import TcpUtility from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed from backups import PostgreSQLBackups @@ -147,6 +148,7 @@ def __init__(self, *args): self.framework.observe(self.on[PEER].relation_departed, self._on_peer_relation_departed) self.framework.observe(self.on.pgdata_storage_detaching, self._on_pgdata_storage_detaching) self.framework.observe(self.on.start, self._on_start) + self.framework.observe(self.on.stop, self._on_stop) self.framework.observe(self.on.get_password_action, self._on_get_password) self.framework.observe(self.on.set_password_action, self._on_set_password) self.framework.observe(self.on.update_status, self._on_update_status) @@ -992,6 +994,52 @@ def _on_start(self, event: StartEvent) -> None: # Bootstrap the cluster in the leader unit. self._start_primary(event) + def _remove_raft_node(self, syncobj_util: TcpUtility, partner: str, current: str) -> None: + """Try to remove a raft member calling a partner node.""" + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True): + if not self._patroni.stop_patroni(): + logger.warning("cannot remove raft member: failed to stop Patroni") + raise Exception("Failed to stop service") + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True): + status = syncobj_util.executeCommand(partner, ["status"]) + if f"partner_node_status_server_{current}" not in status: + logger.debug("cannot remove raft member: not part of the cluster") + return + removal_result = syncobj_util.executeCommand(partner, ["remove", current]) + if removal_result.startswith("FAIL"): + logger.warning("failed to remove raft member: %s", removal_result) + raise Exception("Failed to remove the unit") + + def _on_stop(self, _) -> None: + syncobj_util = TcpUtility(timeout=3) + raft_host = "localhost:2222" + # Try to call a different unit + for ip in self._units_ips: + if ip != self._unit_ip: + raft_host = f"{ip}:2222" + break + status = syncobj_util.executeCommand(raft_host, ["status"]) + partners = [] + ready = [] + for key in status.keys(): + if key.startswith("partner_node_status_server_") and status[key]: + partner = key.split("partner_node_status_server_")[-1] + partners.append(partner) + if status[key] == 2: + ready.append(partner) + if not ready and not partners: + logger.debug("Terminating last node") + self._patroni.stop_patroni() + return + if not ready: + raise Exception("Cannot stop unit: All other members are connecting") + + try: + self._remove_raft_node(syncobj_util, ready[0], status["self"].address) + except Exception: + self._patroni.start_patroni() + raise + def _setup_exporter(self) -> None: """Set up postgresql_exporter options.""" cache = snap.SnapCache()