@property
def member_replication_lag(self) -> str:
    """Replication lag that Patroni reports for this unit's member.

    Queries the Patroni ``/cluster`` endpoint, retrying every 3 seconds for
    up to 60 seconds while the request keeps raising.

    Returns:
        The member's ``lag`` value converted to a string, or ``"unknown"``
        when the request never succeeds, when no member entry matches this
        unit, or when the matching entry carries no ``lag`` key.
    """
    try:
        for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
            with attempt:
                cluster_status = requests.get(
                    f"{self._patroni_url}/cluster",
                    verify=self._verify,
                    timeout=5,
                )
    except RetryError:
        return "unknown"

    # Members are matched on the unit name with "/" replaced by "-";
    # computed once instead of on every loop iteration.
    member_name = self._charm.unit.name.replace("/", "-")

    # NOTE(review): the HTTP status is not checked here, so a body that is not
    # the expected JSON document makes .json() / ["members"] raise.
    for member in cluster_status.json()["members"]:
        if member["name"] == member_name:
            # The JSON value may be a number; normalise it so the declared
            # ``str`` return type holds and callers can compare against
            # "unknown" or call int() on the result uniformly.
            return str(member.get("lag", "unknown"))

    return "unknown"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def reinitialize_postgresql(self) -> None:
    """Ask Patroni to reinitialize PostgreSQL via its ``/reinitialize`` endpoint.

    The POST is attempted up to 3 times, with an exponential wait bounded
    between 2 and 10 seconds, whenever the request raises. The response
    status code is not inspected, so an HTTP error reply does not trigger a
    retry.

    Raises:
        tenacity.RetryError: presumably, once all attempts have raised
            (tenacity's default when ``reraise`` is not set).
    """
    requests.post(
        f"{self._patroni_url}/reinitialize",
        verify=self._verify,
        # Without a timeout requests waits forever on a stalled API, which
        # would hang the caller and keep the retry policy from ever firing.
        timeout=10,
    )