diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 0a13a765de..e603c12ebf 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 35 +LIBPATCH = 36 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -83,6 +83,10 @@ class PostgreSQLGetLastArchivedWALError(Exception): """Exception raised when retrieving last archived WAL fails.""" +class PostgreSQLGetCurrentTimelineError(Exception): + """Exception raised when retrieving current timeline id for the PostgreSQL unit fails.""" + + class PostgreSQLGetPostgreSQLVersionError(Exception): """Exception raised when retrieving PostgreSQL version fails.""" @@ -419,6 +423,16 @@ def get_last_archived_wal(self) -> str: logger.error(f"Failed to get PostgreSQL last archived WAL: {e}") raise PostgreSQLGetLastArchivedWALError() + def get_current_timeline(self) -> str: + """Get the timeline id for the current PostgreSQL unit.""" + try: + with self._connect_to_database() as connection, connection.cursor() as cursor: + cursor.execute("SELECT timeline_id FROM pg_control_checkpoint();") + return cursor.fetchone()[0] + except psycopg2.Error as e: + logger.error(f"Failed to get PostgreSQL current timeline id: {e}") + raise PostgreSQLGetCurrentTimelineError() + def get_postgresql_text_search_configs(self) -> Set[str]: """Returns the PostgreSQL available text search configs. diff --git a/src/backups.py b/src/backups.py index 1acddf5150..e6b14d0d86 100644 --- a/src/backups.py +++ b/src/backups.py @@ -13,7 +13,7 @@ from datetime import datetime, timezone from pathlib import Path from subprocess import PIPE, TimeoutExpired, run -from typing import Dict, List, Optional, OrderedDict, Tuple +from typing import Dict, List, Optional, Tuple import boto3 as boto3 import botocore @@ -48,13 +48,11 @@ ) FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings" CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details" -MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" S3_BLOCK_MESSAGES = [ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, ] @@ -201,29 +199,9 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]: if self.charm._patroni.member_started: self.charm._patroni.reload_patroni_configuration() return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE - return self._is_s3_wal_compatible(stanza) return True, None - def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]: - """Returns whether the S3 stanza is compatible with current PostgreSQL cluster by WAL parity.""" - charm_last_archived_wal = self.charm.postgresql.get_last_archived_wal() - logger.debug(f"last archived wal: {charm_last_archived_wal}") - s3_archive = stanza.get("archive", []) - if len(s3_archive) > 0: - s3_last_archived_wal = s3_archive[0].get("max") - logger.debug(f"last s3 wal: {str(s3_last_archived_wal)}") - if ( - charm_last_archived_wal - and s3_last_archived_wal - and charm_last_archived_wal.split(".", 1)[0] != str(s3_last_archived_wal) - ): - if bool(self.charm.app_peer_data.get("require-change-bucket-after-restore", None)): - return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET - 
else:
-                    return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
-        return True, None
-
     def _change_connectivity_to_database(self, connectivity: bool) -> None:
         """Enable or disable the connectivity to the database."""
         self.charm.unit_peer_data.update({"connectivity": "on" if connectivity else "off"})
@@ -346,37 +324,40 @@ def _format_backup_list(self, backup_list) -> str:
         backups = [
             "Storage bucket name: {:s}".format(s3_parameters["bucket"]),
             "Backups base path: {:s}/backup/\n".format(s3_parameters["path"]),
-            "{:<20s} | {:<12s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:s}".format(
+            "{:<20s} | {:<19s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:<8s} | {:s}".format(
                 "backup-id",
-                "type",
+                "action",
                 "status",
                 "reference-backup-id",
                 "LSN start/stop",
                 "start-time",
                 "finish-time",
+                "timeline",
                 "backup-path",
             ),
         ]
         backups.append("-" * len(backups[2]))
         for (
             backup_id,
-            backup_type,
+            backup_action,
             backup_status,
             reference,
             lsn_start_stop,
             start,
             stop,
+            backup_timeline,
             path,
         ) in backup_list:
             backups.append(
-                "{:<20s} | {:<12s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:s}".format(
+                "{:<20s} | {:<19s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:<8s} | {:s}".format(
                     backup_id,
-                    backup_type,
+                    backup_action,
                     backup_status,
                     reference,
                     lsn_start_stop,
                     start,
                     stop,
+                    backup_timeline,
                     path,
                 )
             )
@@ -400,6 +381,7 @@ def _generate_backup_list_output(self) -> str:
         backups = json.loads(output)[0]["backup"]
         for backup in backups:
             backup_id, backup_type = self._parse_backup_id(backup["label"])
+            backup_action = f"{backup_type} backup"
             backup_reference = "None"
             if backup["reference"]:
                 backup_reference, _ = self._parse_backup_id(backup["reference"][-1])
@@ -410,6 +392,11 @@
                 )
                 for stamp in backup["timestamp"].values()
             )
+            backup_timeline = (
+                backup["archive"]["start"][:8].lstrip("0")
+                if backup["archive"] and backup["archive"]["start"]
+                else ""
+            )
             backup_path = f'/{self.stanza_name}/{backup["label"]}'
             error = backup["error"]
             backup_status = "finished"
@@ -417,17 +404,34 @@
                 backup_status = f"failed: {error}"
             backup_list.append((
                 backup_id,
-                backup_type,
+                backup_action,
                 backup_status,
                 backup_reference,
                 lsn_start_stop,
                 time_start,
                 time_stop,
+                backup_timeline,
                 backup_path,
             ))
+
+        for timeline, (timeline_stanza, timeline_id) in self._list_timelines().items():
+            backup_list.append((
+                timeline,
+                "restore",
+                "finished",
+                "None",
+                "n/a",
+                timeline,
+                "n/a",
+                timeline_id,
+                "n/a",
+            ))
+
+        backup_list.sort(key=lambda x: x[0])
+
         return self._format_backup_list(backup_list)

-    def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]:
+    def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, str]]:
         """Retrieve the list of backups.

         Args:
@@ -435,8 +439,8 @@
             parse: whether to convert backup labels to their IDs or not.

         Returns:
-            a dict of previously created backups (id + stanza name) or an empty list
-            if there is no backups in the S3 bucket.
+            a dict of previously created backups: id => (stanza, timeline), or an empty dict if there are no
+            backups in the S3 bucket.
         """
         return_code, output, stderr = self._execute_command([
             PGBACKREST_EXECUTABLE,
@@ -451,21 +455,95 @@
         # If there are no backups, returns an empty dict.
         if repository_info is None:
-            return OrderedDict[str, str]()
+            return dict[str, tuple[str, str]]()

         backups = repository_info["backup"]
         stanza_name = repository_info["name"]
-        return OrderedDict[str, str](
-            (
-                self._parse_backup_id(backup["label"])[0] if parse else backup["label"],
+        return dict[str, tuple[str, str]]({
+            self._parse_backup_id(backup["label"])[0] if parse else backup["label"]: (
                 stanza_name,
+                backup["archive"]["start"][:8].lstrip("0")
+                if backup["archive"] and backup["archive"]["start"]
+                else "",
             )
             for backup in backups
             if show_failed or not backup["error"]
-        )
+        })
+
+    def _list_timelines(self) -> dict[str, tuple[str, str]]:
+        """Lists the timelines from the pgBackRest stanza.
+
+        Returns:
+            a dict of timelines: id => (stanza, timeline), or an empty dict if there are no timelines in the
+            S3 bucket.
+        """
+        return_code, output, stderr = self._execute_command([
+            PGBACKREST_EXECUTABLE,
+            PGBACKREST_CONFIGURATION_FILE,
+            "repo-ls",
+            "--recurse",
+            "--output=json",
+        ])
+        if return_code != 0:
+            raise ListBackupsError(f"Failed to list repository with error: {stderr}")
+
+        repository = json.loads(output).items()
+        if repository is None:
+            return dict[str, tuple[str, str]]()
+
+        return dict[str, tuple[str, str]]({
+            datetime.strftime(
+                datetime.fromtimestamp(timeline_object["time"], timezone.utc),
+                BACKUP_ID_FORMAT,
+            ): (
+                timeline.split("/")[1],
+                timeline.split("/")[-1].split(".")[0].lstrip("0"),
+            )
+            for timeline, timeline_object in repository
+            if timeline.endswith(".history") and not timeline.endswith("backup.history")
+        })
+
+    def _get_nearest_timeline(self, timestamp: str) -> tuple[str, str] | None:
+        """Finds the nearest timeline or backup prior to the specified timestamp.

+        Returns:
+            (stanza, timeline) of the nearest timeline or backup, or None if there are no matches.
+        """
+        timelines = self._list_backups(show_failed=False) | self._list_timelines()
+        filtered_timelines = [
+            (timeline_key, timeline_object)
+            for timeline_key, timeline_object in timelines.items()
+            if datetime.strptime(timeline_key, BACKUP_ID_FORMAT)
+            <= self._parse_psql_timestamp(timestamp)
+        ]
+        return max(filtered_timelines)[1] if len(filtered_timelines) > 0 else None
+
+    def _is_psql_timestamp(self, timestamp: str) -> bool:
+        if not re.match(
+            r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,6})?([-+](?:\d{2}|\d{4}|\d{2}:\d{2}))?$",
+            timestamp,
+        ):
+            return False
+        try:
+            self._parse_psql_timestamp(timestamp)
+            return True
+        except ValueError:
+            return False
+
+    def _parse_psql_timestamp(self, timestamp: str) -> datetime:
+        """Intended to be used only on timestamps that have already passed the _is_psql_timestamp check."""
+        # On Python >= 3.11, datetime.fromisoformat alone would be sufficient, without any regexes. Until
+        # then, the _is_psql_timestamp check above guarantees that the substitutions below behave as intended.
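+        # Illustrative examples of the normalization performed below (derived from the
+        # substitutions, not exhaustive):
+        #   "2022-02-24 05:00:00+03"   -> "2022-02-24 05:00:00+03:00"
+        #   "2022-02-24 05:00:00+0300" -> "2022-02-24 05:00:00+03:00"
+        #   "2022-02-24 05:00:00.5"    -> "2022-02-24 05:00:00.500000"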
+        t = re.sub(r"([-+]\d{2})$", r"\1:00", timestamp)
+        t = re.sub(r"([-+]\d{2})(\d{2})$", r"\1:\2", t)
+        t = re.sub(r"\.(\d+)", lambda x: f".{x[1]:0<6}", t)
+        dt = datetime.fromisoformat(t)
+        # Convert to a timezone-naive UTC timestamp.
+        if dt.tzinfo is not None and dt.tzinfo is not timezone.utc:
+            dt = dt.astimezone(tz=timezone.utc)
+        return dt.replace(tzinfo=None)

     def _parse_backup_id(self, label) -> Tuple[str, str]:
-        """Parse backup ID as a timestamp."""
+        """Parse backup ID as a timestamp and its type."""
         if label[-1] == "F":
             timestamp = label
             backup_type = "full"
@@ -646,9 +724,6 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
             event.defer()
             return

-        if self.charm.unit.is_leader():
-            self.charm.app_peer_data.pop("require-change-bucket-after-restore", None)
-
         # Verify the s3 relation only on the primary.
         if not self.charm.is_primary:
             return
@@ -671,7 +746,6 @@ def _on_s3_credential_gone(self, _) -> None:
         self.charm.app_peer_data.update({
             "stanza": "",
             "init-pgbackrest": "",
-            "require-change-bucket-after-restore": "",
         })
         self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
         if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
@@ -848,7 +922,7 @@ def _on_list_backups_action(self, event) -> None:
             logger.exception(e)
             event.fail(f"Failed to list PostgreSQL backups with error: {str(e)}")

-    def _on_restore_action(self, event):
+    def _on_restore_action(self, event):  # noqa: C901
         """Request that pgBackRest restores a backup."""
         if not self._pre_restore_checks(event):
             return

         backup_id = event.params.get("backup-id")
         restore_to_time = event.params.get("restore-to-time")
         logger.info(
-            f"A restore"
-            f"{' with backup-id ' + backup_id if backup_id else ''}"
-            f"{' to time point ' + restore_to_time if restore_to_time else ''}"
+            f"A restore with backup-id {backup_id}"
+            f"{f' to time point {restore_to_time}' if restore_to_time else ''}"
             f" has been requested on the unit"
         )
@@ -866,16 +939,35 @@
         logger.info("Validating provided backup-id and restore-to-time")
         try:
             backups = self._list_backups(show_failed=False)
-            if backup_id and backup_id not in backups.keys():
+            timelines = self._list_timelines()
+            is_backup_id_real = backup_id and backup_id in backups.keys()
+            is_backup_id_timeline = (
+                backup_id and not is_backup_id_real and backup_id in timelines.keys()
+            )
+            if backup_id and not is_backup_id_real and not is_backup_id_timeline:
                 error_message = f"Invalid backup-id: {backup_id}"
                 logger.error(f"Restore failed: {error_message}")
                 event.fail(error_message)
                 return
-            if not backup_id and restore_to_time and not backups:
-                error_message = "Cannot restore PITR without any backups created"
+            if is_backup_id_timeline and not restore_to_time:
+                error_message = "Cannot restore to the timeline without restore-to-time parameter"
                 logger.error(f"Restore failed: {error_message}")
                 event.fail(error_message)
                 return
+            if is_backup_id_real:
+                restore_stanza_timeline = backups[backup_id]
+            elif is_backup_id_timeline:
+                restore_stanza_timeline = timelines[backup_id]
+            else:
+                restore_stanza_timeline = self._get_nearest_timeline(restore_to_time)
+                if not restore_stanza_timeline:
+                    error_message = f"Can't find the nearest timeline before timestamp {restore_to_time} to restore"
+                    logger.error(f"Restore failed: {error_message}")
+                    event.fail(error_message)
+                    return
+                logger.info(
+                    f"Chosen timeline {restore_stanza_timeline[1]} as nearest for the specified timestamp 
{restore_to_time}" + ) except ListBackupsError as e: logger.exception(e) error_message = "Failed to retrieve backups list" @@ -883,17 +975,6 @@ def _on_restore_action(self, event): event.fail(error_message) return - # Quick check for timestamp format - if ( - restore_to_time - and restore_to_time != "latest" - and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time) - ): - error_message = "Bad restore-to-time format" - logger.error(f"Restore failed: {error_message}") - event.fail(error_message) - return - self.charm.unit.status = MaintenanceStatus("restoring backup") # Stop the database service before performing the restore. @@ -926,12 +1007,10 @@ def _on_restore_action(self, event): # Mark the cluster as in a restoring backup state and update the Patroni configuration. logger.info("Configuring Patroni to restore the backup") self.charm.app_peer_data.update({ - "restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "", - "restore-stanza": backups[backup_id] - if backup_id - else self.charm.app_peer_data.get("stanza", self.stanza_name), + "restoring-backup": self._fetch_backup_from_id(backup_id) if is_backup_id_real else "", + "restore-stanza": restore_stanza_timeline[0], + "restore-timeline": restore_stanza_timeline[1] if restore_to_time else "", "restore-to-time": restore_to_time or "", - "require-change-bucket-after-restore": "True", }) self.charm.update_config() @@ -1003,10 +1082,23 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool: event.fail(validation_message) return False - if not event.params.get("backup-id") and not event.params.get("restore-to-time"): - error_message = ( - "Missing backup-id or/and restore-to-time parameter to be able to do restore" - ) + if not event.params.get("backup-id") and event.params.get("restore-to-time") in ( + None, + "latest", + ): + error_message = "Missing backup-id or non-latest restore-to-time parameter to be able to do restore" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + return False + + # Quick check for timestamp format + restore_to_time = event.params.get("restore-to-time") + if ( + restore_to_time + and restore_to_time != "latest" + and not self._is_psql_timestamp(restore_to_time) + ): + error_message = "Bad restore-to-time format" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return False @@ -1015,7 +1107,6 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool: if self.charm.is_blocked and self.charm.unit.status.message not in [ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, CANNOT_RESTORE_PITR, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, ]: error_message = "Cluster or unit is in a blocking state" logger.error(f"Restore failed: {error_message}") diff --git a/src/charm.py b/src/charm.py index 64e3a9175f..ed29fc930e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -25,6 +25,7 @@ PostgreSQL, PostgreSQLCreateUserError, PostgreSQLEnableDisableExtensionError, + PostgreSQLGetCurrentTimelineError, PostgreSQLListUsersError, PostgreSQLUpdateUserPasswordError, ) @@ -52,7 +53,7 @@ ) from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed -from backups import CANNOT_RESTORE_PITR, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, PostgreSQLBackups +from backups import CANNOT_RESTORE_PITR, PostgreSQLBackups from cluster import ( NotReadyError, Patroni, @@ -1069,9 +1070,7 @@ def _can_start(self, event: StartEvent) -> bool: # Doesn't try to bootstrap the cluster if it's in a blocked state # caused, for example, because a failed 
installation of packages. - if self.is_blocked and self.unit.status.message not in [ - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET - ]: + if self.is_blocked: logger.debug("Early exit on_start: Unit blocked") return False @@ -1316,39 +1315,10 @@ def _on_update_status(self, _) -> None: if not self._can_run_on_update_status(): return - if "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data: - if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed()): - logger.error( - "Restore failed: database service failed to reach point-in-time-recovery target. " - "You can launch another restore with different parameters" - ) - self.log_pitr_last_transaction_time() - self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) - return - - if "failed" in self._patroni.get_member_status(self._member_name): - logger.error("Restore failed: database service failed to start") - self.unit.status = BlockedStatus("Failed to restore backup") - return - - if not self._patroni.member_started: - logger.debug("on_update_status early exit: Patroni has not started yet") - return - - # Remove the restoring backup flag and the restore stanza name. - self.app_peer_data.update({ - "restoring-backup": "", - "restore-stanza": "", - "restore-to-time": "", - }) - self.update_config() - self.restore_patroni_restart_condition() - logger.info("Restore succeeded") - - can_use_s3_repository, validation_message = self.backup.can_use_s3_repository() - if not can_use_s3_repository: - self.unit.status = BlockedStatus(validation_message) - return + if ( + "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data + ) and not self._was_restore_successful(): + return if self._handle_processes_failures(): return @@ -1368,6 +1338,59 @@ def _on_update_status(self, _) -> None: # Restart topology observer if it is gone self._observer.start_observer() + def _was_restore_successful(self) -> bool: + if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed()): + logger.error( + "Restore failed: database service failed to reach point-in-time-recovery target. " + "You can launch another restore with different parameters" + ) + self.log_pitr_last_transaction_time() + self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) + return False + + if "failed" in self._patroni.get_member_status(self._member_name): + logger.error("Restore failed: database service failed to start") + self.unit.status = BlockedStatus("Failed to restore backup") + return False + + if not self._patroni.member_started: + logger.debug("Restore check early exit: Patroni has not started yet") + return False + + restoring_backup = self.app_peer_data.get("restoring-backup") + restore_timeline = self.app_peer_data.get("restore-timeline") + restore_to_time = self.app_peer_data.get("restore-to-time") + try: + current_timeline = self.postgresql.get_current_timeline() + except PostgreSQLGetCurrentTimelineError: + logger.debug("Restore check early exit: can't get current wal timeline") + return False + + # Remove the restoring backup flag and the restore stanza name. 
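+        # All four restore-related peer-data keys are cleared together so that the
+        # update_config() call below renders the Patroni configuration without any
+        # restore settings.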
+ self.app_peer_data.update({ + "restoring-backup": "", + "restore-stanza": "", + "restore-to-time": "", + "restore-timeline": "", + }) + self.update_config() + self.restore_patroni_restart_condition() + + logger.info( + "Restored" + f"{f' to {restore_to_time}' if restore_to_time else ''}" + f"{f' from timeline {restore_timeline}' if restore_timeline and not restoring_backup else ''}" + f"{f' from backup {self.backup._parse_backup_id(restoring_backup)[0]}' if restoring_backup else ''}" + f". Currently tracking the newly created timeline {current_timeline}." + ) + + can_use_s3_repository, validation_message = self.backup.can_use_s3_repository() + if not can_use_s3_repository: + self.unit.status = BlockedStatus(validation_message) + return False + + return True + def _can_run_on_update_status(self) -> bool: if "cluster_initialised" not in self._peers.data[self.app]: return False @@ -1438,15 +1461,6 @@ def _handle_workload_failures(self) -> bool: def _set_primary_status_message(self) -> None: """Display 'Primary' in the unit status message if the current unit is the primary.""" try: - if "require-change-bucket-after-restore" in self.app_peer_data: - if self.unit.is_leader(): - self.app_peer_data.update({ - "restoring-backup": "", - "restore-stanza": "", - "restore-to-time": "", - }) - self.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET) - return if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name: self.unit.status = ActiveStatus("Primary") elif self.is_standby_leader: @@ -1638,12 +1652,10 @@ def update_config(self, is_creating_backup: bool = False) -> bool: enable_tls=enable_tls, backup_id=self.app_peer_data.get("restoring-backup"), pitr_target=self.app_peer_data.get("restore-to-time"), + restore_timeline=self.app_peer_data.get("restore-timeline"), restore_to_latest=self.app_peer_data.get("restore-to-time", None) == "latest", stanza=self.app_peer_data.get("stanza"), restore_stanza=self.app_peer_data.get("restore-stanza"), - disable_pgbackrest_archiving=bool( - self.app_peer_data.get("require-change-bucket-after-restore", None) - ), parameters=pg_parameters, ) if not self._is_workload_running: diff --git a/src/cluster.py b/src/cluster.py index a45e6f889b..a280d3d057 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -555,6 +555,7 @@ def render_patroni_yml_file( disable_pgbackrest_archiving: bool = False, backup_id: Optional[str] = None, pitr_target: Optional[str] = None, + restore_timeline: Optional[str] = None, restore_to_latest: bool = False, parameters: Optional[dict[str, str]] = None, ) -> None: @@ -568,7 +569,8 @@ def render_patroni_yml_file( restore_stanza: name of the stanza used when restoring a backup. disable_pgbackrest_archiving: whether to force disable pgBackRest WAL archiving. backup_id: id of the backup that is being restored. - pitr_target: point-in-time-recovery target for the backup. + pitr_target: point-in-time-recovery target for the restore. + restore_timeline: timeline to restore from. restore_to_latest: restore all the WAL transaction logs from the stanza. parameters: PostgreSQL parameters to be added to the postgresql.conf file. 
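+
+            For illustration: restore_timeline="2" together with a pitr_target renders a pgBackRest
+            restore command containing --target-timeline="0x2" --target="<pitr_target>" --type=time
+            (see templates/patroni.yml.j2). The "0x" prefix is added because timeline ids are taken
+            from the hex-named history files with their leading zeros stripped.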
""" @@ -600,6 +602,7 @@ def render_patroni_yml_file( restoring_backup=backup_id is not None or pitr_target is not None, backup_id=backup_id, pitr_target=pitr_target if not restore_to_latest else None, + restore_timeline=restore_timeline, restore_to_latest=restore_to_latest, stanza=stanza, restore_stanza=restore_stanza, diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2 index aeb76f57d7..bd3f87154a 100644 --- a/templates/patroni.yml.j2 +++ b/templates/patroni.yml.j2 @@ -115,6 +115,7 @@ bootstrap: command: > pgbackrest {{ pgbackrest_configuration_file }} --stanza={{ restore_stanza }} --pg1-path={{ data_path }} {%- if backup_id %} --set={{ backup_id }} {%- endif %} + {%- if restore_timeline %} --target-timeline="0x{{ restore_timeline }}" {% endif %} {%- if restore_to_latest %} --type=default {%- else %} --target-action=promote {%- if pitr_target %} --target="{{ pitr_target }}" --type=time {%- else %} --type=immediate {%- endif %} {%- endif %} diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index bbc84b084d..21eea9a20e 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -36,7 +36,6 @@ DATABASE_APP_NAME = METADATA["name"] STORAGE_PATH = METADATA["storage"]["pgdata"]["location"] APPLICATION_NAME = "postgresql-test-app" -MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" logger = logging.getLogger(__name__) @@ -1238,11 +1237,7 @@ async def backup_operations( # Wait for the restore to complete. async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Check that the backup was correctly restored by having only the first created table. logger.info("checking that the backup was correctly restored") @@ -1287,11 +1282,7 @@ async def backup_operations( # Wait for the restore to complete. async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Check that the backup was correctly restored by having only the first created table. primary = await get_primary(ops_test, remaining_unit.name) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index d1961f3478..d3d24486bf 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -14,13 +14,13 @@ from .helpers import ( CHARM_BASE, DATABASE_APP_NAME, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, backup_operations, construct_endpoint, db_connect, get_password, get_primary, get_unit_address, + scale_application, switchover, wait_for_idle_on_blocked, ) @@ -124,15 +124,9 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], c new_unit_name = f"{database_app_name}/2" + # Scale up to be able to test primary and leader being different. async with ops_test.fast_forward(): - # Scale up to be able to test primary and leader being different. - await ops_test.model.applications[database_app_name].add_units(1) - # Ensure that new unit become in blocked status, but is fully functional. 
-        await ops_test.model.block_until(
-            lambda: ops_test.model.units.get(new_unit_name).workload_status_message
-            == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
-            timeout=1000,
-        )
+        await scale_application(ops_test, database_app_name, 2)

     # Ensure replication is working correctly.
     address = get_unit_address(ops_test, new_unit_name)
@@ -174,11 +168,6 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], c
     backups = action.results.get("backups")
     assert backups, "backups not outputted"

-    # Remove S3 relation to ensure "move to another cluster" blocked status is gone
-    await ops_test.model.applications[database_app_name].remove_relation(
-        f"{database_app_name}:s3-parameters", f"{S3_INTEGRATOR_APP_NAME}:s3-credentials"
-    )
-
     await ops_test.model.wait_for_idle(status="active", timeout=1000)

     # Remove the database app.
@@ -221,12 +210,16 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets, charm)
     previous_database_app_name = f"{DATABASE_APP_NAME}-gcp"
     database_app_name = f"new-{DATABASE_APP_NAME}"
     await ops_test.model.deploy(
-        charm, application_name=previous_database_app_name, base=CHARM_BASE
+        charm,
+        application_name=previous_database_app_name,
+        base=CHARM_BASE,
+        config={"profile": "testing"},
     )
     await ops_test.model.deploy(
         charm,
         application_name=database_app_name,
         base=CHARM_BASE,
+        config={"profile": "testing"},
     )
     await ops_test.model.relate(previous_database_app_name, S3_INTEGRATOR_APP_NAME)
     await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME)
@@ -277,8 +270,9 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets, charm)
     ):
         with attempt:
             logger.info("restoring the backup")
-            most_recent_backup = backups.split("\n")[-1]
-            backup_id = most_recent_backup.split()[0]
+            # The last two entries are 'restore' actions, which cannot be used without the restore-to-time parameter
+            most_recent_real_backup = backups.split("\n")[-3]
+            backup_id = most_recent_real_backup.split()[0]
             action = await ops_test.model.units.get(unit_name).run_action(
                 "restore", **{"backup-id": backup_id}
             )
@@ -290,7 +284,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets, charm)
     async with ops_test.fast_forward():
         unit = ops_test.model.units.get(f"{database_app_name}/0")
         await ops_test.model.block_until(
-            lambda: unit.workload_status_message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
+            lambda: unit.workload_status_message == ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
         )

     # Check that the backup was correctly restored by having only the first created table.
diff --git a/tests/integration/test_backups_pitr.py b/tests/integration/test_backups_pitr.py
index 94ef8ae5a8..442325e3d2 100644
--- a/tests/integration/test_backups_pitr.py
+++ b/tests/integration/test_backups_pitr.py
@@ -14,7 +14,6 @@
 from .helpers import (
     CHARM_BASE,
     DATABASE_APP_NAME,
-    MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
     construct_endpoint,
     db_connect,
     get_password,
@@ -102,14 +101,20 @@ async def pitr_backup_operations(
     config,
     charm,
 ) -> None:
-    """Basic set of operations for PITR backup testing."""
-    # Deploy S3 Integrator and TLS Certificates Operator.
+    """Basic set of operations for PITR backup and timeline management testing.
+
+    The test algorithm is presented below in the format "(timeline): action_1 -> action_2".
+    1: table -> backup_b1 -> test_data_td1 -> timestamp_ts1 -> test_data_td2 -> restore_ts1 => 2
+    2: check_td1 -> check_not_td2 -> test_data_td3 -> timestamp_ts2 -> test_data_td4 -> restore_b1_latest => 3
+    3: check_td1 -> check_td2 -> check_not_td3 -> check_not_td4 -> restore_t2_latest => 4
+    4: check_td1 -> check_not_td2 -> check_td3 -> check_td4 -> restore_ts2 => 5
+    5: check_td1 -> check_not_td2 -> check_td3 -> check_not_td4
+    """
+    # Set up the environment.
+    database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}"
+
+    logger.info("deploying the following charms: s3-integrator, self-signed-certificates, postgresql")
     await ops_test.model.deploy(s3_integrator_app_name)
     await ops_test.model.deploy(tls_certificates_app_name, config=tls_config, channel=tls_channel)
-    # Deploy and relate PostgreSQL to S3 integrator (one database app for each cloud for now
-    # as archive_mode is disabled after restoring the backup) and to TLS Certificates Operator
-    # (to be able to create backups from replicas).
-    database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}"
     await ops_test.model.deploy(
         charm,
         application_name=database_app_name,
@@ -118,166 +123,258 @@
         config={"profile": "testing"},
     )

+    logger.info(
+        "integrating self-signed-certificates with postgresql and waiting for them to stabilize"
+    )
     await ops_test.model.relate(database_app_name, tls_certificates_app_name)
     async with ops_test.fast_forward(fast_interval="60s"):
-        await ops_test.model.wait_for_idle(apps=[database_app_name], status="active", timeout=1000)
-    await ops_test.model.relate(database_app_name, s3_integrator_app_name)
+        await ops_test.model.wait_for_idle(
+            apps=[database_app_name, tls_certificates_app_name], status="active", timeout=1000
+        )

-    # Configure and set access and secret keys.
-    logger.info(f"configuring S3 integrator for {cloud}")
+    logger.info(f"configuring s3-integrator for {cloud}")
     await ops_test.model.applications[s3_integrator_app_name].set_config(config)
     action = await ops_test.model.units.get(f"{s3_integrator_app_name}/0").run_action(
         "sync-s3-credentials",
         **credentials,
     )
     await action.wait()
+
+    logger.info("integrating s3-integrator with postgresql and waiting for the model to stabilize")
+    await ops_test.model.relate(database_app_name, s3_integrator_app_name)
     async with ops_test.fast_forward(fast_interval="60s"):
-        await ops_test.model.wait_for_idle(
-            apps=[database_app_name, s3_integrator_app_name], status="active", timeout=1500
-        )
+        await ops_test.model.wait_for_idle(status="active", timeout=1000)

     primary = await get_primary(ops_test, f"{database_app_name}/0")
     for unit in ops_test.model.applications[database_app_name].units:
         if unit.name != primary:
             replica = unit.name
             break
-
-    # Write some data.
     password = await get_password(ops_test, primary)
     address = get_unit_address(ops_test, primary)
-    logger.info("creating a table in the database")
-    with db_connect(host=address, password=password) as connection:
-        connection.autocommit = True
-        connection.cursor().execute("CREATE TABLE IF NOT EXISTS backup_table_1 (test_column INT);")
-        connection.close()

-    # Run the "create backup" action.
-    logger.info("creating a backup")
+    logger.info("1: creating table")
+    _create_table(address, password)
+
+    logger.info("1: creating backup b1")
     action = await ops_test.model.units.get(replica).run_action("create-backup")
     await action.wait()
     backup_status = action.results.get("backup-status")
     assert backup_status, "backup hasn't succeeded"
-    await ops_test.model.wait_for_idle(
-        apps=[database_app_name, s3_integrator_app_name], status="active", timeout=1000
-    )
-
-    # Run the "list backups" action.
- logger.info("listing the available backups") - action = await ops_test.model.units.get(replica).run_action("list-backups") - await action.wait() - backups = action.results.get("backups") - assert backups, "backups not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) + backup_b1 = await _get_most_recent_backup(ops_test, ops_test.model.units.get(replica)) - # Write some data. - logger.info("creating after-backup data in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute( - "INSERT INTO backup_table_1 (test_column) VALUES (1), (2), (3), (4), (5);" - ) - connection.close() + logger.info("1: creating test data td1") + _insert_test_data("test_data_td1", address, password) + + logger.info("1: get timestamp ts1") with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: cursor.execute("SELECT current_timestamp;") - after_backup_ts = str(cursor.fetchone()[0]) - connection.close() - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("CREATE TABLE IF NOT EXISTS backup_table_2 (test_column INT);") - connection.close() - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("SELECT pg_switch_wal();") + timestamp_ts1 = str(cursor.fetchone()[0]) connection.close() + # Wrong timestamp pointing to one year ahead + unreachable_timestamp_ts1 = timestamp_ts1.replace( + timestamp_ts1[:4], str(int(timestamp_ts1[:4]) + 1), 1 + ) - # Scale down to be able to restore. + logger.info("1: creating test data td2") + _insert_test_data("test_data_td2", address, password) + + logger.info("1: switching wal") + _switch_wal(address, password) + + logger.info("1: scaling down to do restore") async with ops_test.fast_forward(): await ops_test.model.destroy_unit(replica) - await ops_test.model.block_until( - lambda: len(ops_test.model.applications[database_app_name].units) == 1 - ) - + await ops_test.model.wait_for_idle(status="active", timeout=1000) for unit in ops_test.model.applications[database_app_name].units: remaining_unit = unit break - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - # Wrong timestamp pointing to one year ahead - wrong_ts = after_backup_ts.replace(after_backup_ts[:4], str(int(after_backup_ts[:4]) + 1), 1) - - # Run the "restore backup" action with bad PITR parameter. - logger.info("restoring the backup with bad restore-to-time parameter") + logger.info("1: restoring the backup b1 with bad restore-to-time parameter") action = await remaining_unit.run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": "bad data"} + "restore", **{"backup-id": backup_b1, "restore-to-time": "bad data"} ) await action.wait() assert ( action.status == "failed" - ), "action must fail with bad restore-to-time parameter, but it succeeded" + ), "1: restore must fail with bad restore-to-time parameter, but that action succeeded" - # Run the "restore backup" action with unreachable PITR parameter. 
- logger.info("restoring the backup with unreachable restore-to-time parameter") + logger.info("1: restoring the backup b1 with unreachable restore-to-time parameter") action = await remaining_unit.run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": wrong_ts} + "restore", **{"backup-id": backup_b1, "restore-to-time": unreachable_timestamp_ts1} ) await action.wait() - logger.info("waiting for the database charm to become blocked") + logger.info("1: waiting for the database charm to become blocked after restore") async with ops_test.fast_forward(): await ops_test.model.block_until( lambda: remaining_unit.workload_status_message == CANNOT_RESTORE_PITR, timeout=1000, ) logger.info( - "database charm become in blocked state, as supposed to be with unreachable PITR parameter" + "1: database charm become in blocked state after restore, as supposed to be with unreachable PITR parameter" ) - # Run the "restore backup" action. for attempt in Retrying( stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) ): with attempt: - logger.info("restoring the backup") + logger.info("1: restoring to the timestamp ts1") action = await remaining_unit.run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": after_backup_ts} + "restore", **{"restore-to-time": timestamp_ts1} ) await action.wait() restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" - - # Wait for the restore to complete. - async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + assert restore_status, "1: restore to the timestamp ts1 hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) - # Check that the backup was correctly restored. 
+ logger.info("2: successful restore") primary = await get_primary(ops_test, remaining_unit.name) address = get_unit_address(ops_test, primary) - logger.info("checking that the backup was correctly restored") + timeline_t2 = await _get_most_recent_backup(ops_test, remaining_unit) + assert backup_b1 != timeline_t2, "2: timeline 2 do not exist in list-backups action or bad" + + logger.info("2: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "2: test data td1 should exist" + + logger.info("2: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "2: test data td2 shouldn't exist" + + logger.info("2: creating test data td3") + _insert_test_data("test_data_td3", address, password) + + logger.info("2: get timestamp ts2") with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" - ) - assert cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" - cursor.execute("SELECT COUNT(1) FROM backup_table_1;") - assert ( - int(cursor.fetchone()[0]) == 5 - ), "backup wasn't correctly restored: table 'backup_table_1' doesn't have 5 rows" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" - ) - assert not cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_2' exists" + cursor.execute("SELECT current_timestamp;") + timestamp_ts2 = str(cursor.fetchone()[0]) connection.close() + logger.info("2: creating test data td4") + _insert_test_data("test_data_td4", address, password) + + logger.info("2: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("2: restoring the backup b1 to the latest") + action = await remaining_unit.run_action( + "restore", **{"backup-id": backup_b1, "restore-to-time": "latest"} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "2: restore the backup b1 to the latest hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + + logger.info("3: successful restore") + primary = await get_primary(ops_test, remaining_unit.name) + address = get_unit_address(ops_test, primary) + timeline_t3 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t3 and timeline_t2 != timeline_t3 + ), "3: timeline 3 do not exist in list-backups action or bad" + + logger.info("3: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "3: test data td1 should exist" + + logger.info("3: checking test data td2") + assert _check_test_data("test_data_td2", address, password), "3: test data td2 should exist" + + logger.info("3: checking not test data td3") + assert not _check_test_data( + "test_data_td3", address, password + ), "3: test data td3 shouldn't exist" + + logger.info("3: checking not test data td4") + assert not _check_test_data( + "test_data_td4", address, password + ), "3: test data td4 shouldn't exist" + + logger.info("3: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, 
min=2, max=30) + ): + with attempt: + logger.info("3: restoring the timeline 2 to the latest") + action = await remaining_unit.run_action( + "restore", **{"backup-id": timeline_t2, "restore-to-time": "latest"} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "3: restore the timeline 2 to the latest hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + + logger.info("4: successful restore") + primary = await get_primary(ops_test, remaining_unit.name) + address = get_unit_address(ops_test, primary) + timeline_t4 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t4 and timeline_t2 != timeline_t4 and timeline_t3 != timeline_t4 + ), "4: timeline 4 do not exist in list-backups action or bad" + + logger.info("4: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "4: test data td1 should exist" + + logger.info("4: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "4: test data td2 shouldn't exist" + + logger.info("4: checking test data td3") + assert _check_test_data("test_data_td3", address, password), "4: test data td3 should exist" + + logger.info("4: checking test data td4") + assert _check_test_data("test_data_td4", address, password), "4: test data td4 should exist" + + logger.info("4: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("4: restoring to the timestamp ts2") + action = await remaining_unit.run_action( + "restore", **{"restore-to-time": timestamp_ts2} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "4: restore to the timestamp ts2 hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + + logger.info("5: successful restore") + primary = await get_primary(ops_test, remaining_unit.name) + address = get_unit_address(ops_test, primary) + timeline_t5 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t5 + and timeline_t2 != timeline_t5 + and timeline_t3 != timeline_t5 + and timeline_t4 != timeline_t5 + ), "5: timeline 5 do not exist in list-backups action or bad" + + logger.info("5: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "5: test data td1 should exist" + + logger.info("5: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "5: test data td2 shouldn't exist" + + logger.info("5: checking test data td3") + assert _check_test_data("test_data_td3", address, password), "5: test data td3 should exist" + + logger.info("5: checking not test data td4") + assert not _check_test_data( + "test_data_td4", address, password + ), "5: test data td4 shouldn't exist" + # Remove the database app. await ops_test.model.remove_application(database_app_name, block_until_done=True) # Remove the TLS operator. 
@@ -322,3 +419,49 @@ async def test_pitr_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dic config, charm, ) + + +def _create_table(host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("CREATE TABLE IF NOT EXISTS backup_table (test_column TEXT);") + connection.close() + + +def _insert_test_data(td: str, host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "INSERT INTO backup_table (test_column) VALUES (%s);", + (td,), + ) + connection.close() + + +def _check_test_data(td: str, host: str, password: str) -> bool: + with db_connect(host=host, password=password) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT 1 FROM backup_table WHERE test_column = %s);", + (td,), + ) + res = cursor.fetchone()[0] + connection.close() + return res + + +def _switch_wal(host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("SELECT pg_switch_wal();") + connection.close() + + +async def _get_most_recent_backup(ops_test: OpsTest, unit: any) -> str: + logger.info("listing the available backups") + action = await unit.run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert backups, "backups not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + most_recent_backup = backups.split("\n")[-1] + return most_recent_backup.split()[0] diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 4769e9251f..d674985f2d 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -2,7 +2,6 @@ # See LICENSE file for licensing details. from pathlib import PosixPath from subprocess import PIPE, CompletedProcess, TimeoutExpired -from typing import OrderedDict from unittest.mock import ANY, MagicMock, PropertyMock, call, mock_open, patch import botocore as botocore @@ -576,42 +575,56 @@ def test_format_backup_list(harness): == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------""" ) # Test when there are backups. 
backup_list = [ ( "2023-01-01T09:00:00Z", - "full", + "full backup", "failed: fake error", "None", "0/3000000 / 0/5000000", "2023-01-01T09:00:00Z", "2023-01-01T09:00:05Z", + "1", "a/b/c", ), ( "2023-01-01T10:00:00Z", - "full", + "full backup", "finished", "None", "0/5000000 / 0/7000000", "2023-01-01T10:00:00Z", - "2023-01-01T010:00:07Z", + "2023-01-01T10:00:07Z", + "A", "a/b/d", ), + ( + "2023-01-01T11:00:00Z", + "restore", + "finished", + "None", + "n/a", + "2023-01-01T11:00:00Z", + "n/a", + "B", + "n/a", + ), ] assert ( harness.charm.backup._format_backup_list(backup_list) == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------ -2023-01-01T09:00:00Z | full | failed: fake error | None | 0/3000000 / 0/5000000 | 2023-01-01T09:00:00Z | 2023-01-01T09:00:05Z | a/b/c -2023-01-01T10:00:00Z | full | finished | None | 0/5000000 / 0/7000000 | 2023-01-01T10:00:00Z | 2023-01-01T010:00:07Z | a/b/d""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +2023-01-01T09:00:00Z | full backup | failed: fake error | None | 0/3000000 / 0/5000000 | 2023-01-01T09:00:00Z | 2023-01-01T09:00:05Z | 1 | a/b/c +2023-01-01T10:00:00Z | full backup | finished | None | 0/5000000 / 0/7000000 | 2023-01-01T10:00:00Z | 2023-01-01T10:00:07Z | A | a/b/d +2023-01-01T11:00:00Z | restore | finished | None | n/a | 2023-01-01T11:00:00Z | n/a | B | n/a""" ) @@ -629,30 +642,38 @@ def test_generate_backup_list_output(harness): "path": " test-path/ ", } # Test when no backups are returned. - _execute_command.return_value = (0, '[{"backup":[]}]', "") + _execute_command.side_effect = [(0, '[{"backup":[]}]', ""), (0, "{}", "")] assert ( harness.charm.backup._generate_backup_list_output() == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------""" ) # Test when backups are returned. 
- _execute_command.return_value = ( - 0, - '[{"backup":[{"label":"20230101-090000F","error":"fake error","reference":null,"lsn":{"start":"0/3000000","stop":"0/5000000"},"timestamp":{"start":1719866711,"stop":1719866714}}]}]', - "", - ) + _execute_command.side_effect = [ + ( + 0, + '[{"backup":[{"archive":{"start":"00000001000000000000000B"},"label":"20230101-090000F","error":"fake error","reference":null,"lsn":{"start":"0/3000000","stop":"0/5000000"},"timestamp":{"start":1719866711,"stop":1719866714}}]}]', + "", + ), + ( + 0, + '{".":{"type":"path"},"archive/None.postgresql/14-1/00000002.history":{"type": "file","size": 32,"time": 1728937652}}', + "", + ), + ] assert ( harness.charm.backup._generate_backup_list_output() == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------ -2023-01-01T09:00:00Z | full | failed: fake error | None | 0/3000000 / 0/5000000 | 2024-07-01T20:45:11Z | 2024-07-01T20:45:14Z | /None.postgresql/20230101-090000F""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +2023-01-01T09:00:00Z | full backup | failed: fake error | None | 0/3000000 / 0/5000000 | 2024-07-01T20:45:11Z | 2024-07-01T20:45:14Z | 1 | /None.postgresql/20230101-090000F +2024-10-14T20:27:32Z | restore | finished | None | n/a | 2024-10-14T20:27:32Z | n/a | 2 | n/a""" ) @@ -666,22 +687,22 @@ def test_list_backups(harness): # Test when no backups are available. _execute_command.return_value = (0, "[]", "") - assert harness.charm.backup._list_backups(show_failed=True) == OrderedDict[str, str]() + assert harness.charm.backup._list_backups(show_failed=True) == dict[str, tuple[str, str]]() # Test when some backups are available. _execute_command.return_value = ( 0, - '[{"backup":[{"label":"20230101-090000F","error":"fake error"},{"label":"20230101-100000F","error":null}],"name":"test-stanza"}]', + '[{"backup":[{"archive":{"start":"00000001000000000000000B"},"label":"20230101-090000F","error":"fake error"},{"archive":{"start":"0000000A000000000000000B"},"label":"20230101-100000F","error":null}],"name":"test-stanza"}]', "", ) - assert harness.charm.backup._list_backups(show_failed=True) == OrderedDict[str, str]([ - ("2023-01-01T09:00:00Z", "test-stanza"), - ("2023-01-01T10:00:00Z", "test-stanza"), + assert harness.charm.backup._list_backups(show_failed=True) == dict[str, tuple[str, str]]([ + ("2023-01-01T09:00:00Z", ("test-stanza", "1")), + ("2023-01-01T10:00:00Z", ("test-stanza", "A")), ]) # Test when some backups are available, but it's not desired to list failed backups. 
- assert harness.charm.backup._list_backups(show_failed=False) == OrderedDict[str, str]([ - ("2023-01-01T10:00:00Z", "test-stanza") + assert harness.charm.backup._list_backups(show_failed=False) == dict[str, tuple[str, str]]([ + ("2023-01-01T10:00:00Z", ("test-stanza", "A")) ]) @@ -1033,80 +1054,12 @@ def test_on_s3_credential_changed(harness): _can_initialise_stanza.assert_called_once() _is_primary.assert_not_called() - # Test that followers will not initialise the bucket (and that only the leader will - # remove the "require-change-bucket-after-restore" flag from the application databag). + # Test when it's not possible to use the S3 repository due to backups from another cluster. with harness.hooks_disabled(): harness.set_leader() - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"require-change-bucket-after-restore": "True"}, - ) - harness.charm.unit.status = ActiveStatus() - _render_pgbackrest_conf_file.reset_mock() + _create_bucket_if_not_exists.reset_mock() _can_initialise_stanza.return_value = True - _is_primary.return_value = False - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"cluster_initialised": "True"}, - ) - harness.charm.backup.s3_client.on.credentials_changed.emit( - relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) - ) - _render_pgbackrest_conf_file.assert_called_once() - assert "require-change-bucket-after-restore" not in harness.get_relation_data( - peer_rel_id, harness.charm.app - ) - _is_primary.assert_called_once() - _create_bucket_if_not_exists.assert_not_called() - assert isinstance(harness.charm.unit.status, ActiveStatus) - _can_use_s3_repository.assert_not_called() - _initialise_stanza.assert_not_called() - - # Test when the charm render the pgBackRest configuration file, but fails to - # access or create the S3 bucket (and assert that a non-leader unit won't - # remove the "require-change-bucket-after-restore" flag from the application - # databag). _is_primary.return_value = True - for error in [ - ClientError( - error_response={"Error": {"Code": 1, "message": "fake error"}}, - operation_name="fake operation name", - ), - ValueError, - ]: - with harness.hooks_disabled(): - harness.set_leader(False) - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"require-change-bucket-after-restore": "True"}, - ) - _render_pgbackrest_conf_file.reset_mock() - _create_bucket_if_not_exists.reset_mock() - _create_bucket_if_not_exists.side_effect = error - harness.charm.backup.s3_client.on.credentials_changed.emit( - relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) - ) - _render_pgbackrest_conf_file.assert_called_once() - assert ( - harness.get_relation_data(peer_rel_id, harness.charm.app)[ - "require-change-bucket-after-restore" - ] - == "True" - ) - _create_bucket_if_not_exists.assert_called_once() - assert isinstance(harness.charm.unit.status, BlockedStatus) - assert ( - harness.charm.unit.status.message == FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE - ) - _can_use_s3_repository.assert_not_called() - _initialise_stanza.assert_not_called() - - # Test when it's not possible to use the S3 repository due to backups from another cluster. 
- _create_bucket_if_not_exists.reset_mock() _create_bucket_if_not_exists.side_effect = None _can_use_s3_repository.return_value = (False, "fake validation message") harness.charm.backup.s3_client.on.credentials_changed.emit( @@ -1376,6 +1329,70 @@ def test_on_list_backups_action(harness): mock_event.fail.assert_not_called() +def test_list_timelines(harness): + with patch("charm.PostgreSQLBackups._execute_command") as _execute_command: + _execute_command.return_value = (0, "{}", "") + assert harness.charm.backup._list_timelines() == dict[str, tuple[str, str]]() + + _execute_command.return_value = ( + 0, + '{".":{"type":"path"},"archive/test-stanza/14-1/00000002.history":{"type": "file","size": 32,"time": 1728937652}}', + "", + ) + assert harness.charm.backup._list_timelines() == dict[str, tuple[str, str]]([ + ("2024-10-14T20:27:32Z", ("test-stanza", "2")) + ]) + + +def test_get_nearest_timeline(harness): + with ( + patch("charm.PostgreSQLBackups._list_backups") as _list_backups, + patch("charm.PostgreSQLBackups._list_timelines") as _list_timelines, + ): + _list_backups.return_value = dict[str, tuple[str, str]]() + _list_timelines.return_value = dict[str, tuple[str, str]]() + assert harness.charm.backup._get_nearest_timeline("2022-02-24 05:00:00") is None + + _list_backups.return_value = dict[str, tuple[str, str]]({ + "2022-02-24T05:00:00Z": ("test-stanza", "1"), + "2024-02-24T05:00:00Z": ("test-stanza", "2"), + }) + _list_timelines.return_value = dict[str, tuple[str, str]]({ + "2023-02-24T05:00:00Z": ("test-stanza", "2") + }) + assert harness.charm.backup._get_nearest_timeline("2025-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "2")) + assert harness.charm.backup._get_nearest_timeline("2024-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "2")) + assert harness.charm.backup._get_nearest_timeline("2023-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "1")) + assert harness.charm.backup._get_nearest_timeline("2022-01-01 00:00:00") is None + + +def test_is_psql_timestamp(harness): + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("bad data") is False + assert harness.charm.backup._is_psql_timestamp("2022-02-24T05:00:00.5000001-01:00") is False + assert harness.charm.backup._is_psql_timestamp("2022-24-24 05:00:00") is False + + def test_on_restore_action(harness): with ( patch("charm.Patroni.start_patroni") as _start_patroni, @@ -1385,6 +1402,7 @@ def test_on_restore_action(harness): patch("charm.PostgreSQLBackups._execute_command") as 
_execute_command,
         patch("charm.Patroni.stop_patroni") as _stop_patroni,
         patch("charm.PostgreSQLBackups._list_backups") as _list_backups,
+        patch("charm.PostgreSQLBackups._list_timelines") as _list_timelines,
         patch("charm.PostgreSQLBackups._fetch_backup_from_id") as _fetch_backup_from_id,
         patch("charm.PostgreSQLBackups._pre_restore_checks") as _pre_restore_checks,
         patch(
@@ -1414,10 +1432,32 @@ def test_on_restore_action(harness):
     # Test when the user provides an invalid backup id.
     mock_event.params = {"backup-id": "2023-01-01T10:00:00Z"}
     _pre_restore_checks.return_value = True
-    _list_backups.return_value = {"2023-01-01T09:00:00Z": harness.charm.backup.stanza_name}
+    _list_backups.return_value = {
+        "2023-01-01T09:00:00Z": (harness.charm.backup.stanza_name, "1")
+    }
+    _list_timelines.return_value = {
+        "2024-02-24T05:00:00Z": (harness.charm.backup.stanza_name, "2")
+    }
     harness.charm.unit.status = ActiveStatus()
     harness.charm.backup._on_restore_action(mock_event)
     _list_backups.assert_called_once_with(show_failed=False)
+    _list_timelines.assert_called_once()
+    _fetch_backup_from_id.assert_not_called()
+    mock_event.fail.assert_called_once()
+    _stop_patroni.assert_not_called()
+    _execute_command.assert_not_called()
+    _restart_database.assert_not_called()
+    _empty_data_files.assert_not_called()
+    _update_config.assert_not_called()
+    _start_patroni.assert_not_called()
+    mock_event.set_results.assert_not_called()
+    assert not isinstance(harness.charm.unit.status, MaintenanceStatus)
+
+    # Test when the user provides only a timeline id as the backup id, leading to an error.
+    mock_event.reset_mock()
+    mock_event.params = {"backup-id": "2024-02-24T05:00:00Z"}
+    harness.charm.unit.status = ActiveStatus()
+    harness.charm.backup._on_restore_action(mock_event)
     _fetch_backup_from_id.assert_not_called()
     mock_event.fail.assert_called_once()
     _stop_patroni.assert_not_called()
@@ -1467,7 +1507,6 @@ def test_on_restore_action(harness):
     assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {
         "restoring-backup": "20230101-090000F",
         "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}",
-        "require-change-bucket-after-restore": "True",
     }
     _execute_command.assert_called_once_with(
         [
@@ -1496,6 +1535,39 @@ def test_on_restore_action(harness):
     mock_event.fail.assert_not_called()
     mock_event.set_results.assert_called_once_with({"restore-status": "restore started"})
 
+    # Test a successful PITR with the real backup id to the latest.
+    mock_event.reset_mock()
+    mock_event.params = {"backup-id": "2023-01-01T09:00:00Z", "restore-to-time": "latest"}
+    _restart_database.reset_mock()
+    _execute_command.return_value = (0, "fake stdout", "")
+    _fetch_backup_from_id.return_value = "20230101-090000F"
+    harness.charm.backup._on_restore_action(mock_event)
+    assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {
+        "restoring-backup": "20230101-090000F",
+        "restore-timeline": "1",
+        "restore-to-time": "latest",
+        "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}",
+    }
+    _restart_database.assert_not_called()
+    mock_event.fail.assert_not_called()
+    mock_event.set_results.assert_called_once_with({"restore-status": "restore started"})
+
+    # Test a successful PITR with only the timestamp.
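In the timestamp-only scenario that follows, the charm has to resolve which timeline to restore onto by itself: the databag ends up with `"restore-timeline": "2"` because the 2024 timeline event is the most recent one before the 2025 target. A sketch consistent with the `test_get_nearest_timeline` expectations above (function name and normalization are assumptions):

```python
from datetime import datetime
from typing import Optional, Tuple


def get_nearest_timeline(
    target: str,
    backups: dict[str, tuple[str, str]],
    timelines: dict[str, tuple[str, str]],
) -> Optional[Tuple[str, str]]:
    # Hypothetical sketch: merge backup events and timeline-switch events,
    # then pick the most recent event at or before the PITR target.
    def parse(ts: str) -> datetime:
        # Normalizes both "2022-02-24T05:00:00Z" keys and PostgreSQL-style
        # targets such as "2025-02-24 05:00:00.001+00" by truncating to seconds.
        return datetime.strptime(ts[:19].replace("T", " "), "%Y-%m-%d %H:%M:%S")

    events = {**backups, **timelines}
    candidates = [ts for ts in events if parse(ts) <= parse(target)]
    return events[max(candidates, key=parse)] if candidates else None
```

With the fixtures from `test_get_nearest_timeline`, a `2023-01-01 00:00:00` target resolves to `("test-stanza", "1")` and a pre-2022 target yields `None`, matching the assertions there.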
+ mock_event.reset_mock() + mock_event.params = {"restore-to-time": "2025-02-24 05:00:00.001+00"} + _restart_database.reset_mock() + _execute_command.return_value = (0, "fake stdout", "") + _fetch_backup_from_id.return_value = "20230101-090000F" + harness.charm.backup._on_restore_action(mock_event) + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { + "restore-timeline": "2", + "restore-to-time": "2025-02-24 05:00:00.001+00", + "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}", + } + _restart_database.assert_not_called() + mock_event.fail.assert_not_called() + mock_event.set_results.assert_called_once_with({"restore-status": "restore started"}) + def test_pre_restore_checks(harness): with ( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9f4dd50535..a377a792a6 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -737,6 +737,7 @@ def test_on_start_no_patroni_member(harness): patch( "charm.PostgresqlOperatorCharm._is_storage_attached", return_value=True ) as _is_storage_attached, + patch("charm.PostgresqlOperatorCharm.get_available_memory") as _get_available_memory, ): # Mock the passwords. patroni.return_value.member_started = False @@ -1001,11 +1002,15 @@ def test_on_update_status_after_restore_operation(harness): "charm.PostgresqlOperatorCharm._handle_processes_failures" ) as _handle_processes_failures, patch("charm.PostgreSQLBackups.can_use_s3_repository") as _can_use_s3_repository, + patch( + "charms.postgresql_k8s.v0.postgresql.PostgreSQL.get_current_timeline" + ) as _get_current_timeline, patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, patch("charm.Patroni.get_member_status") as _get_member_status, patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), ): + _get_current_timeline.return_value = "2" rel_id = harness.model.get_relation(PEER).id # Test when the restore operation fails. with harness.hooks_disabled(): @@ -1013,7 +1018,7 @@ def test_on_update_status_after_restore_operation(harness): harness.update_relation_data( rel_id, harness.charm.app.name, - {"cluster_initialised": "True", "restoring-backup": "2023-01-01T09:00:00Z"}, + {"cluster_initialised": "True", "restoring-backup": "20230101-090000F"}, ) _get_member_status.return_value = "failed" harness.charm.on.update_status.emit() @@ -1041,7 +1046,7 @@ def test_on_update_status_after_restore_operation(harness): # Assert that the backup id is still in the application relation databag. assert harness.get_relation_data(rel_id, harness.charm.app) == { "cluster_initialised": "True", - "restoring-backup": "2023-01-01T09:00:00Z", + "restoring-backup": "20230101-090000F", } # Test when the restore operation finished successfully. 
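The hunks in this test also switch the `restoring-backup` databag value from the ISO backup id (`2023-01-01T09:00:00Z`) to the raw pgbackrest label (`20230101-090000F`). The two formats are mechanically convertible; a sketch (helper names hypothetical, not from the diff):

```python
from datetime import datetime


def label_to_backup_id(label: str) -> str:
    # pgbackrest full-backup label "20230101-090000F" -> "2023-01-01T09:00:00Z"
    return datetime.strptime(label[:15], "%Y%m%d-%H%M%S").strftime("%Y-%m-%dT%H:%M:%SZ")


def backup_id_to_label(backup_id: str) -> str:
    # The trailing "F" is pgbackrest's marker for a full backup; differential
    # and incremental labels carry a reference-backup prefix instead.
    return datetime.strptime(backup_id, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d-%H%M%S") + "F"
```

One plausible motivation for storing the label directly is that the update-status handler can then consume it as-is, without re-running a lookup such as `_fetch_backup_from_id`.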
@@ -1074,7 +1079,7 @@ def test_on_update_status_after_restore_operation(harness): harness.update_relation_data( rel_id, harness.charm.app.name, - {"restoring-backup": "2023-01-01T09:00:00Z"}, + {"restoring-backup": "20230101-090000F"}, ) _can_use_s3_repository.return_value = (False, "fake validation message") harness.charm.on.update_status.emit() @@ -1249,6 +1254,7 @@ def test_update_config(harness): "charm.PostgresqlOperatorCharm.is_tls_enabled", new_callable=PropertyMock ) as _is_tls_enabled, patch.object(PostgresqlOperatorCharm, "postgresql", Mock()) as postgresql_mock, + patch("charm.PostgresqlOperatorCharm.get_available_memory") as _get_available_memory, ): rel_id = harness.model.get_relation(PEER).id # Mock some properties. @@ -1269,9 +1275,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + restore_timeline=None, pitr_target=None, restore_to_latest=False, - disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once_with(False) @@ -1292,9 +1298,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + restore_timeline=None, pitr_target=None, restore_to_latest=False, - disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once() @@ -2376,35 +2382,6 @@ def test_set_primary_status_message(harness, is_leader): ) as _is_standby_leader, patch("charm.Patroni.get_primary") as _get_primary, ): - # Test scenario when it's needed to move to another S3 bucket after a restore. - databag_containing_restore_data = { - "require-change-bucket-after-restore": "True", - "restoring-backup": "2024-01-01T09:00:00Z", - "restore-stanza": "fake-stanza", - "restore-to-time": "", - } - with harness.hooks_disabled(): - harness.update_relation_data( - harness.model.get_relation(PEER).id, - harness.charm.app.name, - databag_containing_restore_data, - ) - harness.set_leader(is_leader) - harness.charm._set_primary_status_message() - harness.get_relation_data(harness.model.get_relation(PEER).id, harness.charm.app.name) == ( - {"require-change-bucket-after-restore": "True"} - if is_leader - else databag_containing_restore_data - ) - assert isinstance(harness.charm.unit.status, BlockedStatus) - - # Test other scenarios. - with harness.hooks_disabled(): - harness.update_relation_data( - harness.model.get_relation(PEER).id, - harness.charm.app.name, - {"require-change-bucket-after-restore": ""}, - ) for values in itertools.product( [ RetryError(last_attempt=1),