From 2c3926f0b229c32de4a21f1bfba2299388565e63 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasenko Date: Tue, 2 Jul 2024 17:20:43 +0300 Subject: [PATCH] [DPE-2582] Point In Time Recovery (#391) * Add restore-to-time parameter. * Add PITR restore test. * Fix patroni failed PITR check. * Add Patroni service restart condition override with PITR. * Improve restore-to-time parameter processing. * Improve PITR comments. * Improve PITR comments. * Fix unit tests errors caused by PITR. * Improve unit tests with PITR. * Fix PITR. * Improve PITR test. * Fix PITR, apply format. * Add PITR unit test. * Fix PITR integration test. * Format. * Add format check for restore-to-time parameter. Improve PITR fail detection. * Typo fix. * Add ability to restore only with 'restore-to-time' parameter. * Add requiring to move to another bucket after restore. * Add last transaction time logging on PITR fail. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix PITR integration test. * Improve restore-to-time input format. * Lint. * PITR PR suggestions Co-authored-by: Marcelo Henrique Neppel * PITR PR suggestions * PITR, improve Patroni restart condition overriding. * Lint. * PITR, s3 stanza wal check. * Add restore-to-time "latest" option. * Fix unit tests. * Fix unit tests. * Fix tls vars naming in integration test_backups. * PITR improvements and fixes. Improve integration backup test. * Format. * Fix backups integration test. * Increase timeout for pitr test. * Fix library Signed-off-by: Marcelo Henrique Neppel * Minor PITR test improvement. * Lint. --------- Signed-off-by: Marcelo Henrique Neppel Co-authored-by: Marcelo Henrique Neppel Co-authored-by: Marcelo Henrique Neppel --- actions.yaml | 3 + lib/charms/postgresql_k8s/v0/postgresql.py | 12 +- src/backups.py | 117 +++++++-- src/charm.py | 145 ++++++++++- src/cluster.py | 85 +++++- src/constants.py | 2 + templates/patroni.yml.j2 | 12 +- tests/integration/helpers.py | 13 +- tests/integration/test_backups.py | 25 +- tests/integration/test_backups_pitr.py | 290 +++++++++++++++++++++ tests/unit/test_backups.py | 11 + tests/unit/test_charm.py | 107 ++++++++ 12 files changed, 782 insertions(+), 40 deletions(-) create mode 100644 tests/integration/test_backups_pitr.py diff --git a/actions.yaml b/actions.yaml index 6e3ced3d8b..96932bbba8 100644 --- a/actions.yaml +++ b/actions.yaml @@ -45,6 +45,9 @@ restore: backup-id: type: string description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ) + restore-to-time: + type: string + description: Point-in-time-recovery target in PSQL format. set-password: description: Change the system user's password, which is used by charm. It is for internal charm users and SHOULD NOT be used by applications. 
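For context, the new parameter accepts either a PostgreSQL-style timestamp (for example "2024-07-02 17:20:43.123456+00") or the literal string "latest", and the restore action requires at least one of backup-id / restore-to-time (both may be combined). A small sketch of the validation the charm applies, mirroring the checks added in src/backups.py below (timestamp values are illustrative):

import re

def validate_restore_params(backup_id: str | None, restore_to_time: str | None) -> str | None:
    """Return an error message, or None when the parameters look acceptable."""
    if not backup_id and not restore_to_time:
        return "Missing backup-id and/or restore-to-time parameter"
    if restore_to_time and restore_to_time != "latest" and not re.match(
        "^[0-9-]+ [0-9:.+]+$", restore_to_time  # same quick format check as _on_restore_action
    ):
        return "Bad restore-to-time format"
    return None

assert validate_restore_params("20230101-090000F", None) is None          # plain backup restore
assert validate_restore_params(None, "2024-07-02 17:20:43+00") is None    # point-in-time restore
assert validate_restore_params(None, "latest") is None                    # replay all available WAL
assert validate_restore_params(None, None) == "Missing backup-id and/or restore-to-time parameter"
assert validate_restore_params(None, "tomorrow at noon") == "Bad restore-to-time format"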
diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 08b559649c..f8f3ad2b23 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 29 +LIBPATCH = 30 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -383,6 +383,16 @@ def _generate_database_privileges_statements( ) return statements + def get_last_archived_wal(self) -> str: + """Get the name of the last archived wal for the current PostgreSQL cluster.""" + try: + with self._connect_to_database() as connection, connection.cursor() as cursor: + cursor.execute("SELECT last_archived_wal FROM pg_stat_archiver;") + return cursor.fetchone()[0] + except psycopg2.Error as e: + logger.error(f"Failed to get PostgreSQL last archived WAL: {e}") + raise PostgreSQLGetPostgreSQLVersionError() + def get_postgresql_text_search_configs(self) -> Set[str]: """Returns the PostgreSQL available text search configs. diff --git a/src/backups.py b/src/backups.py index b2e0297669..217c17b8aa 100644 --- a/src/backups.py +++ b/src/backups.py @@ -47,11 +47,14 @@ "failed to access/create the bucket, check your S3 settings" ) FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings" +CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details" +MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" S3_BLOCK_MESSAGES = [ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, + MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, ] @@ -198,9 +201,29 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]: if self.charm._patroni.member_started: self.charm._patroni.reload_patroni_configuration() return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE + return self._is_s3_wal_compatible(stanza) return True, None + def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]: + """Returns whether the S3 stanza is compatible with current PostgreSQL cluster by WAL parity.""" + charm_last_archived_wal = self.charm.postgresql.get_last_archived_wal() + logger.debug(f"last archived wal: {charm_last_archived_wal}") + s3_archive = stanza.get("archive", []) + if len(s3_archive) > 0: + s3_last_archived_wal = s3_archive[0].get("max") + logger.debug(f"last s3 wal: {str(s3_last_archived_wal)}") + if ( + charm_last_archived_wal + and s3_last_archived_wal + and charm_last_archived_wal.split(".", 1)[0] != str(s3_last_archived_wal) + ): + if bool(self.charm.app_peer_data.get("require-change-bucket-after-restore", None)): + return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET + else: + return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE + return True, None + def _change_connectivity_to_database(self, connectivity: bool) -> None: """Enable or disable the connectivity to the database.""" self.charm.unit_peer_data.update({"connectivity": "on" if connectivity else "off"}) @@ -423,11 +446,7 @@ def _initialise_stanza(self) -> None: # Enable stanza initialisation if the backup settings were fixed after being invalid # or pointing to a repository where there are backups from another cluster. 
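# Aside: one of the S3_BLOCK_MESSAGES checked below is produced by the new WAL-parity check in
# _is_s3_wal_compatible() above, which compares the last WAL archived by this cluster (from
# pg_stat_archiver) with the newest WAL name reported by the pgBackRest stanza in S3. A minimal
# sketch of that comparison, using made-up WAL names (the split drops a possible suffix such as
# a ".00000028.backup" history file):
def wal_parity(charm_last_archived_wal: str, s3_last_archived_wal: str) -> bool:
    return charm_last_archived_wal.split(".", 1)[0] == str(s3_last_archived_wal)

assert wal_parity("000000010000000000000003", "000000010000000000000003")
assert wal_parity("000000010000000000000003.00000028.backup", "000000010000000000000003")
assert not wal_parity("000000010000000000000003", "0000000A0000000000000042")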
- if self.charm.is_blocked and self.charm.unit.status.message not in [ - ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, - FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE, - FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, - ]: + if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES: logger.warning("couldn't initialize stanza due to a blocked status") return @@ -554,6 +573,18 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): event.defer() return + # Prevents config change in bad state, so DB peer relations change event will not cause patroni related errors. + if self.charm.unit.status.message == CANNOT_RESTORE_PITR: + logger.info("Cannot change S3 configuration in bad PITR restore status") + event.defer() + return + + # Prevents S3 change in the middle of restoring backup and patroni / pgbackrest errors caused by that. + if "restoring-backup" in self.charm.app_peer_data: + logger.info("Cannot change S3 configuration during restore") + event.defer() + return + if not self._render_pgbackrest_conf_file(): logger.debug("Cannot set pgBackRest configurations, missing configurations.") return @@ -567,6 +598,8 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): if not self.charm.is_primary: return + self.charm.app_peer_data.pop("require-change-bucket-after-restore", None) + try: self._create_bucket_if_not_exists() except (ClientError, ValueError): @@ -582,7 +615,11 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): def _on_s3_credential_gone(self, _) -> None: if self.charm.unit.is_leader(): - self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""}) + self.charm.app_peer_data.update({ + "stanza": "", + "init-pgbackrest": "", + "require-change-bucket-after-restore": "", + }) self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES: self.charm.unit.status = ActiveStatus() @@ -753,20 +790,42 @@ def _on_restore_action(self, event): return backup_id = event.params.get("backup-id") - logger.info(f"A restore with backup-id {backup_id} has been requested on unit") + restore_to_time = event.params.get("restore-to-time") + logger.info( + f"A restore" + f"{' with backup-id ' + backup_id if backup_id else ''}" + f"{' to time point ' + restore_to_time if restore_to_time else ''}" + f" has been requested on the unit" + ) - # Validate the provided backup id. - logger.info("Validating provided backup-id") + # Validate the provided backup id and restore to time. 
+ logger.info("Validating provided backup-id and restore-to-time") try: backups = self._list_backups(show_failed=False) - if backup_id not in backups.keys(): + if backup_id and backup_id not in backups.keys(): error_message = f"Invalid backup-id: {backup_id}" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return + if not backup_id and restore_to_time and not backups: + error_message = "Cannot restore PITR without any backups created" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + return except ListBackupsError as e: logger.exception(e) - error_message = "Failed to retrieve backup id" + error_message = "Failed to retrieve backups list" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + return + + # Quick check for timestamp format + if ( + restore_to_time + and restore_to_time != "latest" + and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time) + ): + error_message = "Bad restore-to-time format" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return @@ -781,6 +840,17 @@ def _on_restore_action(self, event): event.fail(error_message) return + # Temporarily disabling patroni service auto-restart. This is required as point-in-time-recovery can fail + # on restore, therefore during cluster bootstrapping process. In this case, we need be able to check patroni + # service status and logs. Disabling auto-restart feature is essential to prevent wrong status indicated + # and logs reading race condition (as logs cleared / moved with service restarts). + if not self.charm.override_patroni_restart_condition("no", "restore-backup"): + error_message = "Failed to override Patroni restart condition" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + self._restart_database() + return + logger.info("Removing the contents of the data directory") if not self._empty_data_files(): error_message = "Failed to remove contents of the data directory" @@ -792,8 +862,12 @@ def _on_restore_action(self, event): # Mark the cluster as in a restoring backup state and update the Patroni configuration. 
logger.info("Configuring Patroni to restore the backup") self.charm.app_peer_data.update({ - "restoring-backup": self._fetch_backup_from_id(backup_id), - "restore-stanza": backups[backup_id], + "restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "", + "restore-stanza": backups[backup_id] + if backup_id + else self.charm.app_peer_data.get("stanza", self.stanza_name), + "restore-to-time": restore_to_time or "", + "require-change-bucket-after-restore": "True", }) self.charm.update_config() @@ -865,17 +939,20 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool: event.fail(validation_message) return False - if not event.params.get("backup-id"): - error_message = "Missing backup-id to restore" + if not event.params.get("backup-id") and not event.params.get("restore-to-time"): + error_message = ( + "Missing backup-id or/and restore-to-time parameter to be able to do restore" + ) logger.error(f"Restore failed: {error_message}") event.fail(error_message) return False logger.info("Checking if cluster is in blocked state") - if ( - self.charm.is_blocked - and self.charm.unit.status.message != ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE - ): + if self.charm.is_blocked and self.charm.unit.status.message not in [ + ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, + CANNOT_RESTORE_PITR, + MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + ]: error_message = "Cluster or unit is in a blocking state" logger.error(f"Restore failed: {error_message}") event.fail(error_message) @@ -941,7 +1018,7 @@ def _render_pgbackrest_conf_file(self) -> bool: def _restart_database(self) -> None: """Removes the restoring backup flag and restart the database.""" - self.charm.app_peer_data.update({"restoring-backup": ""}) + self.charm.app_peer_data.update({"restoring-backup": "", "restore-to-time": ""}) self.charm.update_config() self.charm._patroni.start_patroni() diff --git a/src/charm.py b/src/charm.py index f7f9675d59..8a3e3ab41a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,11 +8,12 @@ import logging import os import platform +import re import subprocess import sys from datetime import datetime from pathlib import Path -from typing import Dict, List, Literal, Optional, Set, get_args +from typing import Dict, List, Literal, Optional, Set, Tuple, get_args import psycopg2 from charms.data_platform_libs.v0.data_interfaces import DataPeerData, DataPeerUnitData @@ -53,7 +54,7 @@ ) from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed -from backups import PostgreSQLBackups +from backups import CANNOT_RESTORE_PITR, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, PostgreSQLBackups from cluster import ( NotReadyError, Patroni, @@ -539,6 +540,13 @@ def _on_peer_relation_changed(self, event: HookEvent): logger.error("Invalid configuration: %s", str(e)) return + # If PITR restore failed, then wait it for resolve. + if ( + "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data + ) and isinstance(self.unit.status, BlockedStatus): + event.defer() + return + # Start can be called here multiple times as it's idempotent. # At this moment, it starts Patroni at the first time the data is received # in the relation. @@ -1047,7 +1055,9 @@ def _can_start(self, event: StartEvent) -> bool: # Doesn't try to bootstrap the cluster if it's in a blocked state # caused, for example, because a failed installation of packages. 
- if self.is_blocked: + if self.is_blocked and self.unit.status.message not in [ + MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET + ]: logger.debug("Early exit on_start: Unit blocked") return False @@ -1280,7 +1290,16 @@ def _on_update_status(self, _) -> None: if not self._can_run_on_update_status(): return - if "restoring-backup" in self.app_peer_data: + if "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data: + if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed()): + logger.error( + "Restore failed: database service failed to reach point-in-time-recovery target. " + "You can launch another restore with different parameters" + ) + self.log_pitr_last_transaction_time() + self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) + return + if "failed" in self._patroni.get_member_status(self._member_name): logger.error("Restore failed: database service failed to start") self.unit.status = BlockedStatus("Failed to restore backup") @@ -1291,8 +1310,13 @@ def _on_update_status(self, _) -> None: return # Remove the restoring backup flag and the restore stanza name. - self.app_peer_data.update({"restoring-backup": "", "restore-stanza": ""}) + self.app_peer_data.update({ + "restoring-backup": "", + "restore-stanza": "", + "restore-to-time": "", + }) self.update_config() + self.restore_patroni_restart_condition() logger.info("Restore succeeded") can_use_s3_repository, validation_message = self.backup.can_use_s3_repository() @@ -1385,6 +1409,9 @@ def _handle_workload_failures(self) -> bool: def _set_primary_status_message(self) -> None: """Display 'Primary' in the unit status message if the current unit is the primary.""" try: + if "require-change-bucket-after-restore" in self.app_peer_data: + self.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET) + return if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name: self.unit.status = ActiveStatus("Primary") elif self.is_standby_leader: @@ -1584,8 +1611,13 @@ def update_config(self, is_creating_backup: bool = False) -> bool: is_creating_backup=is_creating_backup, enable_tls=enable_tls, backup_id=self.app_peer_data.get("restoring-backup"), + pitr_target=self.app_peer_data.get("restore-to-time"), + restore_to_latest=self.app_peer_data.get("restore-to-time", None) == "latest", stanza=self.app_peer_data.get("stanza"), restore_stanza=self.app_peer_data.get("restore-stanza"), + disable_pgbackrest_archiving=bool( + self.app_peer_data.get("require-change-bucket-after-restore", None) + ), parameters=pg_parameters, ) if not self._is_workload_running: @@ -1700,6 +1732,109 @@ def client_relations(self) -> List[Relation]: relations.append(relation) return relations + def override_patroni_restart_condition( + self, new_condition: str, repeat_cause: str | None + ) -> bool: + """Temporary override Patroni systemd service restart condition. + + Executes only on current unit. + + Args: + new_condition: new Patroni systemd service restart condition. + repeat_cause: whether this field is equal to the last success override operation repeat cause, Patroni + restart condition will be overridden (keeping the original restart condition reference untouched) and + success code will be returned. But if this field is distinct from previous repeat cause or None, + repeated operation will cause failure code will be returned. 
+ """ + current_condition = self._patroni.get_patroni_restart_condition() + if "overridden-patroni-restart-condition" in self.unit_peer_data: + original_condition = self.unit_peer_data["overridden-patroni-restart-condition"] + if repeat_cause is None: + logger.error( + f"failure trying to override patroni restart condition to {new_condition}" + f"as it already overridden from {original_condition} to {current_condition}" + ) + return False + previous_repeat_cause = self.unit_peer_data.get( + "overridden-patroni-restart-condition-repeat-cause", None + ) + if previous_repeat_cause != repeat_cause: + logger.error( + f"failure trying to override patroni restart condition to {new_condition}" + f"as it already overridden from {original_condition} to {current_condition}" + f"and repeat cause is not equal: {previous_repeat_cause} != {repeat_cause}" + ) + return False + # There repeat cause is equal + self._patroni.update_patroni_restart_condition(new_condition) + logger.debug( + f"Patroni restart condition re-overridden to {new_condition} within repeat cause {repeat_cause}" + f"(original restart condition reference is untouched and is {original_condition})" + ) + return True + self._patroni.update_patroni_restart_condition(new_condition) + self.unit_peer_data["overridden-patroni-restart-condition"] = current_condition + if repeat_cause is not None: + self.unit_peer_data["overridden-patroni-restart-condition-repeat-cause"] = repeat_cause + logger.debug( + f"Patroni restart condition overridden from {current_condition} to {new_condition}" + f"{' with repeat cause ' + repeat_cause if repeat_cause is not None else ''}" + ) + return True + + def restore_patroni_restart_condition(self) -> None: + """Restore Patroni systemd service restart condition that was before overriding. + + Will do nothing if not overridden. Executes only on current unit. + """ + if "overridden-patroni-restart-condition" in self.unit_peer_data: + original_condition = self.unit_peer_data["overridden-patroni-restart-condition"] + self._patroni.update_patroni_restart_condition(original_condition) + self.unit_peer_data.update({ + "overridden-patroni-restart-condition": "", + "overridden-patroni-restart-condition-repeat-cause": "", + }) + logger.debug(f"restored Patroni restart condition to {original_condition}") + else: + logger.warning("not restoring patroni restart condition as it's not overridden") + + def is_pitr_failed(self) -> Tuple[bool, bool]: + """Check if Patroni service failed to bootstrap cluster during point-in-time-recovery. + + Typically, this means that database service failed to reach point-in-time-recovery target or has been + supplied with bad PITR parameter. Also, remembers last state and can provide info is it new event, or + it belongs to previous action. Executes only on current unit. + + Returns: + Tuple[bool, bool]: + - Is patroni service failed to bootstrap cluster. + - Is it new fail, that wasn't observed previously. 
+ """ + patroni_logs = self._patroni.patroni_logs() + patroni_exceptions = re.findall( + r"^([0-9-:TZ]+).*patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$", + patroni_logs, + re.MULTILINE, + ) + if len(patroni_exceptions) > 0: + old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None) + self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1] + return True, patroni_exceptions[-1] != old_pitr_fail_id + return False, False + + def log_pitr_last_transaction_time(self) -> None: + """Log to user last completed transaction time acquired from postgresql logs.""" + postgresql_logs = self._patroni.last_postgresql_logs() + log_time = re.findall( + r"last completed transaction was at log time (.*)$", + postgresql_logs, + re.MULTILINE, + ) + if len(log_time) > 0: + logger.info(f"Last completed transaction was at {log_time[-1]}") + else: + logger.error("Can't tell last completed transaction time") + if __name__ == "__main__": main(PostgresqlOperatorCharm) diff --git a/src/cluster.py b/src/cluster.py index 14651400e9..57420632c1 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -4,9 +4,11 @@ """Helper class used to manage cluster lifecycle.""" +import glob import logging import os import pwd +import re import subprocess from typing import Any, Dict, List, Optional, Set @@ -30,6 +32,7 @@ PATRONI_CLUSTER_STATUS_ENDPOINT, PATRONI_CONF_PATH, PATRONI_LOGS_PATH, + PATRONI_SERVICE_DEFAULT_PATH, PGBACKREST_CONFIGURATION_FILE, POSTGRESQL_CONF_PATH, POSTGRESQL_DATA_PATH, @@ -518,7 +521,10 @@ def render_patroni_yml_file( enable_tls: bool = False, stanza: str = None, restore_stanza: Optional[str] = None, + disable_pgbackrest_archiving: bool = False, backup_id: Optional[str] = None, + pitr_target: Optional[str] = None, + restore_to_latest: bool = False, parameters: Optional[dict[str, str]] = None, ) -> None: """Render the Patroni configuration file. @@ -529,7 +535,10 @@ def render_patroni_yml_file( enable_tls: whether to enable TLS. stanza: name of the stanza created by pgBackRest. restore_stanza: name of the stanza used when restoring a backup. + disable_pgbackrest_archiving: whether to force disable pgBackRest WAL archiving. backup_id: id of the backup that is being restored. + pitr_target: point-in-time-recovery target for the backup. + restore_to_latest: restore all the WAL transaction logs from the stanza. parameters: PostgreSQL parameters to be added to the postgresql.conf file. """ # Open the template patroni.yml file. @@ -555,9 +564,12 @@ def render_patroni_yml_file( replication_password=self.replication_password, rewind_user=REWIND_USER, rewind_password=self.rewind_password, - enable_pgbackrest=stanza is not None, - restoring_backup=backup_id is not None, + enable_pgbackrest_archiving=stanza is not None + and disable_pgbackrest_archiving is False, + restoring_backup=backup_id is not None or pitr_target is not None, backup_id=backup_id, + pitr_target=pitr_target if not restore_to_latest else None, + restore_to_latest=restore_to_latest, stanza=stanza, restore_stanza=restore_stanza, version=self.get_postgresql_version().split(".")[0], @@ -584,6 +596,44 @@ def start_patroni(self) -> bool: logger.exception(error_message, exc_info=e) return False + def patroni_logs(self, num_lines: int | None = 10) -> str: + """Get Patroni snap service logs. Executes only on current unit. + + Args: + num_lines: number of log last lines being returned. + + Returns: + Multi-line logs string. 
+ """ + try: + cache = snap.SnapCache() + selected_snap = cache["charmed-postgresql"] + return selected_snap.logs(services=["patroni"], num_lines=num_lines) + except snap.SnapError as e: + error_message = "Failed to get logs from patroni snap service" + logger.exception(error_message, exc_info=e) + return "" + + def last_postgresql_logs(self) -> str: + """Get last log file content of Postgresql service. + + If there is no available log files, empty line will be returned. + + Returns: + Content of last log file of Postgresql service. + """ + log_files = glob.glob(f"{POSTGRESQL_LOGS_PATH}/*.log") + if len(log_files) == 0: + return "" + log_files.sort(reverse=True) + try: + with open(log_files[0], "r") as last_log_file: + return last_log_file.read() + except OSError as e: + error_message = "Failed to read last postgresql log file" + logger.exception(error_message, exc_info=e) + return "" + def stop_patroni(self) -> bool: """Stop Patroni service using systemd. @@ -720,3 +770,34 @@ def update_synchronous_node_count(self, units: int = None) -> None: # Check whether the update was unsuccessful. if r.status_code != 200: raise UpdateSyncNodeCountError(f"received {r.status_code}") + + def get_patroni_restart_condition(self) -> str: + """Get current restart condition for Patroni systemd service. Executes only on current unit. + + Returns: + Patroni systemd service restart condition. + """ + with open(PATRONI_SERVICE_DEFAULT_PATH, "r") as patroni_service_file: + patroni_service = patroni_service_file.read() + found_restart = re.findall(r"Restart=(\w+)", patroni_service) + if len(found_restart) == 1: + return str(found_restart[0]) + raise RuntimeError("failed to find patroni service restart condition") + + def update_patroni_restart_condition(self, new_condition: str) -> None: + """Override restart condition for Patroni systemd service by rewriting service file and doing daemon-reload. + + Executes only on current unit. + + Args: + new_condition: new Patroni systemd service restart condition. + """ + logger.info(f"setting restart-condition to {new_condition} for patroni service") + with open(PATRONI_SERVICE_DEFAULT_PATH, "r") as patroni_service_file: + patroni_service = patroni_service_file.read() + logger.debug(f"patroni service file: {patroni_service}") + new_patroni_service = re.sub(r"Restart=\w+", f"Restart={new_condition}", patroni_service) + logger.debug(f"new patroni service file: {new_patroni_service}") + with open(PATRONI_SERVICE_DEFAULT_PATH, "w") as patroni_service_file: + patroni_service_file.write(new_patroni_service) + subprocess.run(["/bin/systemctl", "daemon-reload"]) diff --git a/src/constants.py b/src/constants.py index b48faa17fa..189ec73d54 100644 --- a/src/constants.py +++ b/src/constants.py @@ -27,6 +27,8 @@ MONITORING_USER = "monitoring" MONITORING_PASSWORD_KEY = "monitoring-password" MONITORING_SNAP_SERVICE = "prometheus-postgres-exporter" +PATRONI_SERVICE_NAME = "snap.charmed-postgresql.patroni.service" +PATRONI_SERVICE_DEFAULT_PATH = f"/etc/systemd/system/{PATRONI_SERVICE_NAME}" # List of system usernames needed for correct work of the charm/workload. 
SYSTEM_USERS = [BACKUP_USER, REPLICATION_USER, REWIND_USER, USER, MONITORING_USER] diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2 index 64c00381f7..d1be14e22c 100644 --- a/templates/patroni.yml.j2 +++ b/templates/patroni.yml.j2 @@ -60,7 +60,7 @@ bootstrap: remove_data_directory_on_diverged_timelines: true parameters: synchronous_standby_names: "*" - {%- if enable_pgbackrest %} + {%- if enable_pgbackrest_archiving %} archive_command: 'pgbackrest {{ pgbackrest_configuration_file }} --stanza={{ stanza }} archive-push %p' {% else %} archive_command: /bin/true @@ -101,7 +101,13 @@ bootstrap: {%- if restoring_backup %} method: pgbackrest pgbackrest: - command: pgbackrest {{ pgbackrest_configuration_file }} --stanza={{ restore_stanza }} --pg1-path={{ data_path }} --set={{ backup_id }} --type=immediate --target-action=promote restore + command: > + pgbackrest {{ pgbackrest_configuration_file }} --stanza={{ restore_stanza }} --pg1-path={{ data_path }} + {%- if backup_id %} --set={{ backup_id }} {%- endif %} + {%- if restore_to_latest %} --type=default {%- else %} + --target-action=promote {%- if pitr_target %} --target="{{ pitr_target }}" --type=time {%- else %} --type=immediate {%- endif %} + {%- endif %} + restore no_params: True keep_existing_recovery_conf: True {% elif primary_cluster_endpoint %} @@ -122,7 +128,7 @@ postgresql: bin_dir: /snap/charmed-postgresql/current/usr/lib/postgresql/{{ version }}/bin data_dir: {{ data_path }} parameters: - {%- if enable_pgbackrest %} + {%- if enable_pgbackrest_archiving %} archive_command: 'pgbackrest {{ pgbackrest_configuration_file }} --stanza={{ stanza }} archive-push %p' {% else %} archive_command: /bin/true diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 27b9367ede..d9b491e59a 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -37,6 +37,7 @@ DATABASE_APP_NAME = METADATA["name"] STORAGE_PATH = METADATA["storage"]["pgdata"]["location"] APPLICATION_NAME = "postgresql-test-app" +MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" logger = logging.getLogger(__name__) @@ -1212,7 +1213,11 @@ async def backup_operations( # Wait for the restore to complete. async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.block_until( + lambda: remaining_unit.workload_status_message + == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + timeout=1000, + ) # Check that the backup was correctly restored by having only the first created table. logger.info("checking that the backup was correctly restored") @@ -1257,7 +1262,11 @@ async def backup_operations( # Wait for the restore to complete. async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.block_until( + lambda: remaining_unit.workload_status_message + == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + timeout=1000, + ) # Check that the backup was correctly restored by having only the first created table. 
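# Aside: the patroni.yml.j2 change earlier in this patch assembles the pgBackRest restore command
# in three flavours — plain backup restore (--type=immediate), point-in-time restore (--type=time
# with --target) and restore to the latest WAL (--type=default). A Python paraphrase of that
# branching, with placeholder values for the config file, stanza and data path:
def pgbackrest_restore_command(backup_id, pitr_target, restore_to_latest,
                               config="--config=/placeholder/pgbackrest.conf",
                               stanza="model.postgresql", data_path="/placeholder/pgdata"):
    parts = ["pgbackrest", config, f"--stanza={stanza}", f"--pg1-path={data_path}"]
    if backup_id:
        parts.append(f"--set={backup_id}")
    if restore_to_latest:
        parts.append("--type=default")
    else:
        parts.append("--target-action=promote")
        parts += [f'--target="{pitr_target}"', "--type=time"] if pitr_target else ["--type=immediate"]
    parts.append("restore")
    return " ".join(parts)

assert "--type=immediate" in pgbackrest_restore_command("20230101-090000F", None, False)
assert "--type=time" in pgbackrest_restore_command(None, "2024-07-02 17:20:43+00", False)
assert "--type=default" in pgbackrest_restore_command("20230101-090000F", None, True)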
primary = await get_primary(ops_test, remaining_unit.name) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index c5d02138a7..b813001cbd 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -14,13 +14,13 @@ from .helpers import ( CHARM_SERIES, DATABASE_APP_NAME, + MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, backup_operations, construct_endpoint, db_connect, get_password, get_primary, get_unit_address, - scale_application, switchover, wait_for_idle_on_blocked, ) @@ -30,7 +30,6 @@ FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE = ( "failed to access/create the bucket, check your S3 settings" ) -FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings" S3_INTEGRATOR_APP_NAME = "s3-integrator" if juju_major_version < 3: tls_certificates_app_name = "tls-certificates-operator" @@ -54,7 +53,7 @@ @pytest.fixture(scope="module") -async def cloud_configs(ops_test: OpsTest, github_secrets) -> None: +async def cloud_configs(github_secrets) -> None: # Define some configurations and credentials. configs = { AWS: { @@ -122,14 +121,20 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], c await ops_test.model.applications[database_app_name].remove_relation( f"{database_app_name}:certificates", f"{tls_certificates_app_name}:certificates" ) - await ops_test.model.wait_for_idle(apps=[database_app_name], status="active", timeout=1000) - # Scale up to be able to test primary and leader being different. + new_unit_name = f"{database_app_name}/2" + async with ops_test.fast_forward(): - await scale_application(ops_test, database_app_name, 2) + # Scale up to be able to test primary and leader being different. + await ops_test.model.applications[database_app_name].add_units(1) + # Ensure that new unit become in blocked status, but is fully functional. + await ops_test.model.block_until( + lambda: ops_test.model.units.get(new_unit_name).workload_status_message + == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + timeout=1000, + ) # Ensure replication is working correctly. - new_unit_name = f"{database_app_name}/2" address = get_unit_address(ops_test, new_unit_name) password = await get_password(ops_test, new_unit_name) with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: @@ -167,6 +172,12 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], c await action.wait() backups = action.results.get("backups") assert backups, "backups not outputted" + + # Remove S3 relation to ensure "move to another cluster" blocked status is gone + await ops_test.model.applications[database_app_name].remove_relation( + f"{database_app_name}:s3-parameters", f"{S3_INTEGRATOR_APP_NAME}:s3-credentials" + ) + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Remove the database app. diff --git a/tests/integration/test_backups_pitr.py b/tests/integration/test_backups_pitr.py new file mode 100644 index 0000000000..2a080e5dab --- /dev/null +++ b/tests/integration/test_backups_pitr.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. +import logging +import uuid +from typing import Dict, Tuple + +import boto3 +import pytest as pytest +from pytest_operator.plugin import OpsTest +from tenacity import Retrying, stop_after_attempt, wait_exponential + +from . 
import architecture +from .helpers import ( + CHARM_SERIES, + DATABASE_APP_NAME, + MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + construct_endpoint, + db_connect, + get_password, + get_primary, + get_unit_address, +) +from .juju_ import juju_major_version + +CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details" +S3_INTEGRATOR_APP_NAME = "s3-integrator" +if juju_major_version < 3: + TLS_CERTIFICATES_APP_NAME = "tls-certificates-operator" + if architecture.architecture == "arm64": + TLS_CHANNEL = "legacy/edge" + else: + TLS_CHANNEL = "legacy/stable" + TLS_CONFIG = {"generate-self-signed-certificates": "true", "ca-common-name": "Test CA"} +else: + TLS_CERTIFICATES_APP_NAME = "self-signed-certificates" + if architecture.architecture == "arm64": + TLS_CHANNEL = "latest/edge" + else: + TLS_CHANNEL = "latest/stable" + TLS_CONFIG = {"ca-common-name": "Test CA"} + +logger = logging.getLogger(__name__) + +AWS = "AWS" +GCP = "GCP" + + +@pytest.fixture(scope="module") +async def cloud_configs(github_secrets) -> None: + # Define some configurations and credentials. + configs = { + AWS: { + "endpoint": "https://s3.amazonaws.com", + "bucket": "data-charms-testing", + "path": f"/postgresql-vm/{uuid.uuid1()}", + "region": "us-east-1", + }, + GCP: { + "endpoint": "https://storage.googleapis.com", + "bucket": "data-charms-testing", + "path": f"/postgresql-vm/{uuid.uuid1()}", + "region": "", + }, + } + credentials = { + AWS: { + "access-key": github_secrets["AWS_ACCESS_KEY"], + "secret-key": github_secrets["AWS_SECRET_KEY"], + }, + GCP: { + "access-key": github_secrets["GCP_ACCESS_KEY"], + "secret-key": github_secrets["GCP_SECRET_KEY"], + }, + } + yield configs, credentials + # Delete the previously created objects. + logger.info("deleting the previously created backups") + for cloud, config in configs.items(): + session = boto3.session.Session( + aws_access_key_id=credentials[cloud]["access-key"], + aws_secret_access_key=credentials[cloud]["secret-key"], + region_name=config["region"], + ) + s3 = session.resource( + "s3", endpoint_url=construct_endpoint(config["endpoint"], config["region"]) + ) + bucket = s3.Bucket(config["bucket"]) + # GCS doesn't support batch delete operation, so delete the objects one by one. + for bucket_object in bucket.objects.filter(Prefix=config["path"].lstrip("/")): + bucket_object.delete() + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_pitr_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], charm) -> None: + """Build, deploy two units of PostgreSQL and do backup. Then, write new data into DB, switch WAL file and test point-in-time-recovery restore action.""" + # Deploy S3 Integrator and TLS Certificates Operator. + await ops_test.model.deploy(S3_INTEGRATOR_APP_NAME) + await ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, config=TLS_CONFIG, channel=TLS_CHANNEL) + + for cloud, config in cloud_configs[0].items(): + # Deploy and relate PostgreSQL to S3 integrator (one database app for each cloud for now + # as archive_mode is disabled after restoring the backup) and to TLS Certificates Operator + # (to be able to create backups from replicas). 
+ database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}" + await ops_test.model.deploy( + charm, + application_name=database_app_name, + num_units=2, + series=CHARM_SERIES, + config={"profile": "testing"}, + ) + + await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME) + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[database_app_name], status="active", timeout=1000 + ) + await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) + + # Configure and set access and secret keys. + logger.info(f"configuring S3 integrator for {cloud}") + await ops_test.model.applications[S3_INTEGRATOR_APP_NAME].set_config(config) + action = await ops_test.model.units.get(f"{S3_INTEGRATOR_APP_NAME}/0").run_action( + "sync-s3-credentials", + **cloud_configs[1][cloud], + ) + await action.wait() + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1500 + ) + + primary = await get_primary(ops_test, f"{database_app_name}/0") + for unit in ops_test.model.applications[database_app_name].units: + if unit.name != primary: + replica = unit.name + break + + # Write some data. + password = await get_password(ops_test, primary) + address = get_unit_address(ops_test, primary) + logger.info("creating a table in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "CREATE TABLE IF NOT EXISTS backup_table_1 (test_column INT);" + ) + connection.close() + + # Run the "create backup" action. + logger.info("creating a backup") + action = await ops_test.model.units.get(replica).run_action("create-backup") + await action.wait() + backup_status = action.results.get("backup-status") + assert backup_status, "backup hasn't succeeded" + await ops_test.model.wait_for_idle( + apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 + ) + + # Run the "list backups" action. + logger.info("listing the available backups") + action = await ops_test.model.units.get(replica).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert backups, "backups not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Write some data. + logger.info("creating after-backup data in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "INSERT INTO backup_table_1 (test_column) VALUES (1), (2), (3), (4), (5);" + ) + connection.close() + with db_connect( + host=address, password=password + ) as connection, connection.cursor() as cursor: + cursor.execute("SELECT current_timestamp;") + after_backup_ts = str(cursor.fetchone()[0]) + connection.close() + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "CREATE TABLE IF NOT EXISTS backup_table_2 (test_column INT);" + ) + connection.close() + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("SELECT pg_switch_wal();") + connection.close() + + # Scale down to be able to restore. 
+ async with ops_test.fast_forward(): + await ops_test.model.destroy_unit(replica) + await ops_test.model.block_until( + lambda: len(ops_test.model.applications[database_app_name].units) == 1 + ) + + for unit in ops_test.model.applications[database_app_name].units: + remaining_unit = unit + break + + most_recent_backup = backups.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + # Wrong timestamp pointing to one year ahead + wrong_ts = after_backup_ts.replace( + after_backup_ts[:4], str(int(after_backup_ts[:4]) + 1), 1 + ) + + # Run the "restore backup" action with bad PITR parameter. + logger.info("restoring the backup with bad restore-to-time parameter") + action = await remaining_unit.run_action( + "restore", **{"backup-id": backup_id, "restore-to-time": "bad data"} + ) + await action.wait() + assert ( + action.status == "failed" + ), "action must fail with bad restore-to-time parameter, but it succeeded" + + # Run the "restore backup" action with unreachable PITR parameter. + logger.info("restoring the backup with unreachable restore-to-time parameter") + action = await remaining_unit.run_action( + "restore", **{"backup-id": backup_id, "restore-to-time": wrong_ts} + ) + await action.wait() + logger.info("waiting for the database charm to become blocked") + async with ops_test.fast_forward(): + await ops_test.model.block_until( + lambda: remaining_unit.workload_status_message == CANNOT_RESTORE_PITR, + timeout=1000, + ) + logger.info( + "database charm become in blocked state, as supposed to be with unreachable PITR parameter" + ) + + # Run the "restore backup" action. + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + action = await remaining_unit.run_action( + "restore", **{"backup-id": backup_id, "restore-to-time": after_backup_ts} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" + + # Wait for the restore to complete. + async with ops_test.fast_forward(): + await ops_test.model.block_until( + lambda: remaining_unit.workload_status_message + == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + timeout=1000, + ) + + # Check that the backup was correctly restored. + primary = await get_primary(ops_test, remaining_unit.name) + address = get_unit_address(ops_test, primary) + logger.info("checking that the backup was correctly restored") + with db_connect( + host=address, password=password + ) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" + cursor.execute("SELECT COUNT(1) FROM backup_table_1;") + assert ( + int(cursor.fetchone()[0]) == 5 + ), "backup wasn't correctly restored: table 'backup_table_1' doesn't have 5 rows" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_2' exists" + connection.close() + + # Remove the database app. + await ops_test.model.remove_application(database_app_name, block_until_done=True) + # Remove the TLS operator. 
+ await ops_test.model.remove_application(TLS_CERTIFICATES_APP_NAME, block_until_done=True) diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 868a1bc3fc..53018825e9 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -233,6 +233,10 @@ def test_can_use_s3_repository(harness): patch( "charm.Patroni.get_postgresql_version", return_value="14.10" ) as _get_postgresql_version, + patch("charm.PostgresqlOperatorCharm.postgresql") as _postgresql, + patch( + "charms.postgresql_k8s.v0.postgresql.PostgreSQL.get_last_archived_wal" + ) as _get_last_archived_wal, ): peer_rel_id = harness.model.get_relation(PEER).id # Define the stanza name inside the unit relation data. @@ -1367,6 +1371,12 @@ def test_on_restore_action(harness): patch("charm.PostgreSQLBackups._list_backups") as _list_backups, patch("charm.PostgreSQLBackups._fetch_backup_from_id") as _fetch_backup_from_id, patch("charm.PostgreSQLBackups._pre_restore_checks") as _pre_restore_checks, + patch( + "charm.PostgresqlOperatorCharm.override_patroni_restart_condition" + ) as _override_patroni_restart_condition, + patch( + "charm.PostgresqlOperatorCharm.restore_patroni_restart_condition" + ) as _restore_patroni_restart_condition, ): peer_rel_id = harness.model.get_relation(PEER).id # Test when pre restore checks fail. @@ -1443,6 +1453,7 @@ def test_on_restore_action(harness): { "restoring-backup": "20230101-090000F", "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}", + "require-change-bucket-after-restore": "True", }, ) _execute_command.assert_called_once_with( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 1e0cc81d02..ee2a254fed 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -27,6 +27,7 @@ from psycopg2 import OperationalError from tenacity import RetryError, wait_fixed +from backups import CANNOT_RESTORE_PITR from charm import ( EXTENSIONS_DEPENDENCY_MESSAGE, PRIMARY_NOT_REACHABLE_MESSAGE, @@ -873,6 +874,14 @@ def test_on_update_status(harness): ) as _primary_endpoint, patch("charm.PostgreSQLProvider.oversee_users") as _oversee_users, patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), + patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs, + patch("charm.Patroni.patroni_logs") as _patroni_logs, + patch("charm.Patroni.get_member_status") as _get_member_status, + patch( + "charm.PostgreSQLBackups.can_use_s3_repository", return_value=(True, None) + ) as _can_use_s3_repository, + patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, + patch("charm.PostgresqlOperatorCharm.log_pitr_last_transaction_time"), ): rel_id = harness.model.get_relation(PEER).id # Test before the cluster is initialised. @@ -881,6 +890,7 @@ def test_on_update_status(harness): # Test after the cluster was initialised, but with the unit in a blocked state. with harness.hooks_disabled(): + harness.set_leader() harness.update_relation_data( rel_id, harness.charm.app.name, {"cluster_initialised": "True"} ) @@ -888,7 +898,30 @@ def test_on_update_status(harness): harness.charm.on.update_status.emit() _set_primary_status_message.assert_not_called() + # Test the point-in-time-recovery fail. 
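# Aside: the test below drives the new PITR failure detection. is_pitr_failed() scans the Patroni
# snap logs with the regular expression shown here, uses the captured timestamp as a
# de-duplication id ("last_pitr_fail_id"), and _on_update_status() requires all() of the returned
# (failed, is_new_failure) pair before blocking with CANNOT_RESTORE_PITR:
import re

sample = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster"
pattern = r"^([0-9-:TZ]+).*patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$"
assert re.findall(pattern, sample, re.MULTILINE) == ["2022-02-24"]

assert all((True, True))        # failed and not seen before -> block the unit
assert not all((True, False))   # same failure already reported -> ignore on this hook
assert not all((False, False))  # no failure at all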
+ with harness.hooks_disabled(): + harness.update_relation_data( + rel_id, + harness.charm.app.name, + { + "cluster_initialised": "True", + "restoring-backup": "valid", + "restore-to-time": "valid", + }, + ) + harness.charm.unit.status = ActiveStatus() + _patroni_logs.return_value = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster" + harness.charm.on.update_status.emit() + _set_primary_status_message.assert_not_called() + assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR + # Test with the unit in a status different that blocked. + with harness.hooks_disabled(): + harness.update_relation_data( + rel_id, + harness.charm.app.name, + {"cluster_initialised": "True", "restoring-backup": "", "restore-to-time": ""}, + ) harness.charm.unit.status = ActiveStatus() harness.charm.on.update_status.emit() _set_primary_status_message.assert_called_once() @@ -1228,6 +1261,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + pitr_target=None, + restore_to_latest=False, + disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once_with(False) @@ -1248,6 +1284,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + pitr_target=None, + restore_to_latest=False, + disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once() @@ -2377,3 +2416,71 @@ def test_set_primary_status_message(harness): _get_primary.return_value = None harness.charm._set_primary_status_message() tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) + + @patch("charm.Patroni.update_patroni_restart_condition") + @patch("charm.Patroni.get_patroni_restart_condition") + @patch("charm.PostgresqlOperatorCharm._unit_ip") + def test_override_patroni_restart_condition( + self, _unit_ip, get_restart_condition, update_restart_condition + ): + get_restart_condition.return_value = "always" + + # Do override without repeat_cause + assert self.charm.override_patroni_restart_condition("no") is True + get_restart_condition.assert_called_once() + update_restart_condition.assert_called_once_with("no") + get_restart_condition.reset_mock() + update_restart_condition.reset_mock() + + get_restart_condition.return_value = "no" + + # Must not be overridden twice without repeat_cause + assert self.charm.override_patroni_restart_condition("on-failure") is False + get_restart_condition.assert_called_once() + update_restart_condition.assert_not_called() + get_restart_condition.reset_mock() + update_restart_condition.reset_mock() + + # Reset override + self.charm.restore_patroni_restart_condition() + update_restart_condition.assert_called_once_with("always") + update_restart_condition.reset_mock() + + # Must not be reset twice + self.charm.restore_patroni_restart_condition() + update_restart_condition.assert_not_called() + update_restart_condition.reset_mock() + + get_restart_condition.return_value = "always" + + # Do override with repeat_cause + assert self.charm.override_patroni_restart_condition("no", "test_charm") is True + get_restart_condition.assert_called_once() + update_restart_condition.assert_called_once_with("no") + get_restart_condition.reset_mock() + update_restart_condition.reset_mock() + + get_restart_condition.return_value = "no" + + # Do re-override with repeat_cause + assert self.charm.override_patroni_restart_condition("on-success", "test_charm") is True + 
get_restart_condition.assert_called_once() + update_restart_condition.assert_called_once_with("on-success") + get_restart_condition.reset_mock() + update_restart_condition.reset_mock() + + get_restart_condition.return_value = "on-success" + + # Must not be re-overridden with different repeat_cause + assert ( + self.charm.override_patroni_restart_condition("on-failure", "test_not_charm") is False + ) + get_restart_condition.assert_called_once() + update_restart_condition.assert_not_called() + get_restart_condition.reset_mock() + update_restart_condition.reset_mock() + + # Reset override + self.charm.restore_patroni_restart_condition() + update_restart_condition.assert_called_once_with("always") + update_restart_condition.reset_mock()
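The tests above exercise the override bookkeeping added in src/charm.py: the original restart condition is stashed in unit peer data under "overridden-patroni-restart-condition", and an optional repeat cause lets the same caller (the restore action uses "restore-backup") re-apply the override without losing the original value. A simplified sketch of that state machine, with a plain dict standing in for the peer databag, shortened key names, and the systemd update omitted:

def override(peer: dict, current: str, new_condition: str, repeat_cause: str | None) -> bool:
    """Simplified model of override_patroni_restart_condition (no systemd calls)."""
    if "overridden-patroni-restart-condition" in peer:
        # Already overridden: only the same non-None repeat cause may override again.
        if repeat_cause is None or peer.get("repeat-cause") != repeat_cause:
            return False
        return True  # re-applied; the stored original reference stays untouched
    peer["overridden-patroni-restart-condition"] = current
    if repeat_cause is not None:
        peer["repeat-cause"] = repeat_cause
    return True

def restore(peer: dict) -> str | None:
    """Simplified model of restore_patroni_restart_condition; returns the restored condition."""
    original = peer.pop("overridden-patroni-restart-condition", None)
    peer.pop("repeat-cause", None)
    return original

peer: dict = {}
assert override(peer, "always", "no", "restore-backup")       # first override succeeds
assert override(peer, "no", "no", "restore-backup")           # same repeat cause may re-apply
assert not override(peer, "no", "on-failure", "other-cause")  # different cause is rejected
assert restore(peer) == "always"                              # the original condition comes back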