diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 0a13a765de..e603c12ebf 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 35 +LIBPATCH = 36 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -83,6 +83,10 @@ class PostgreSQLGetLastArchivedWALError(Exception): """Exception raised when retrieving last archived WAL fails.""" +class PostgreSQLGetCurrentTimelineError(Exception): + """Exception raised when retrieving current timeline id for the PostgreSQL unit fails.""" + + class PostgreSQLGetPostgreSQLVersionError(Exception): """Exception raised when retrieving PostgreSQL version fails.""" @@ -419,6 +423,16 @@ def get_last_archived_wal(self) -> str: logger.error(f"Failed to get PostgreSQL last archived WAL: {e}") raise PostgreSQLGetLastArchivedWALError() + def get_current_timeline(self) -> str: + """Get the timeline id for the current PostgreSQL unit.""" + try: + with self._connect_to_database() as connection, connection.cursor() as cursor: + cursor.execute("SELECT timeline_id FROM pg_control_checkpoint();") + return cursor.fetchone()[0] + except psycopg2.Error as e: + logger.error(f"Failed to get PostgreSQL current timeline id: {e}") + raise PostgreSQLGetCurrentTimelineError() + def get_postgresql_text_search_configs(self) -> Set[str]: """Returns the PostgreSQL available text search configs. diff --git a/src/backups.py b/src/backups.py index 535c536be4..7c1700ab0d 100644 --- a/src/backups.py +++ b/src/backups.py @@ -9,7 +9,7 @@ import re import tempfile from datetime import datetime, timezone -from typing import Dict, List, Optional, OrderedDict, Tuple +from typing import Dict, List, Optional, Tuple import boto3 as boto3 import botocore @@ -36,13 +36,11 @@ ) FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings" CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details" -MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" S3_BLOCK_MESSAGES = [ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, ] @@ -171,29 +169,9 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]: if self.charm._patroni.member_started: self.charm._patroni.reload_patroni_configuration() return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE - return self._is_s3_wal_compatible(stanza) return True, None - def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]: - """Returns whether the S3 stanza is compatible with current PostgreSQL cluster by WAL parity.""" - charm_last_archived_wal = self.charm.postgresql.get_last_archived_wal() - logger.debug(f"last archived wal: {charm_last_archived_wal}") - s3_archive = stanza.get("archive", []) - if len(s3_archive) > 0: - s3_last_archived_wal = s3_archive[0].get("max") - logger.debug(f"last s3 wal: {str(s3_last_archived_wal)}") - if ( - charm_last_archived_wal - and s3_last_archived_wal - and charm_last_archived_wal.split(".", 1)[0] != str(s3_last_archived_wal) - ): - if bool(self.charm.app_peer_data.get("require-change-bucket-after-restore", None)): - return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET - else: - return False, 
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE - return True, None - def _construct_endpoint(self, s3_parameters: Dict) -> str: """Construct the S3 service endpoint using the region. @@ -311,37 +289,40 @@ def _format_backup_list(self, backup_list) -> str: backups = [ "Storage bucket name: {:s}".format(s3_parameters["bucket"]), "Backups base path: {:s}/backup/\n".format(s3_parameters["path"]), - "{:<20s} | {:<12s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:s}".format( + "{:<20s} | {:<19s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:<8s} | {:s}".format( "backup-id", - "type", + "action", "status", "reference-backup-id", "LSN start/stop", "start-time", "finish-time", + "timeline", "backup-path", ), ] backups.append("-" * len(backups[2])) for ( backup_id, - backup_type, + backup_action, backup_status, reference, lsn_start_stop, start, stop, + backup_timeline, path, ) in backup_list: backups.append( - "{:<20s} | {:<12s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:s}".format( + "{:<20s} | {:<19s} | {:<8s} | {:<20s} | {:<23s} | {:<20s} | {:<20s} | {:<8s} | {:s}".format( backup_id, - backup_type, + backup_action, backup_status, reference, lsn_start_stop, start, stop, + backup_timeline, path, ) ) @@ -357,6 +338,7 @@ def _generate_backup_list_output(self) -> str: backups = json.loads(output)[0]["backup"] for backup in backups: backup_id, backup_type = self._parse_backup_id(backup["label"]) + backup_action = f"{backup_type} backup" backup_reference = "None" if backup["reference"]: backup_reference, _ = self._parse_backup_id(backup["reference"][-1]) @@ -367,6 +349,11 @@ def _generate_backup_list_output(self) -> str: ) for stamp in backup["timestamp"].values() ) + backup_timeline = ( + backup["archive"]["start"][:8].lstrip("0") + if backup["archive"] and backup["archive"]["start"] + else "" + ) backup_path = f'/{self.stanza_name}/{backup["label"]}' error = backup["error"] backup_status = "finished" @@ -374,17 +361,34 @@ def _generate_backup_list_output(self) -> str: backup_status = f"failed: {error}" backup_list.append(( backup_id, - backup_type, + backup_action, backup_status, backup_reference, lsn_start_stop, time_start, time_stop, + backup_timeline, backup_path, )) + + for timeline, (timeline_stanza, timeline_id) in self._list_timelines().items(): + backup_list.append(( + timeline, + "restore", + "finished", + "None", + "n/a", + timeline, + "n/a", + timeline_id, + "n/a", + )) + + backup_list.sort(key=lambda x: x[0]) + return self._format_backup_list(backup_list) - def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]: + def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, str]]: """Retrieve the list of backups. Args: @@ -392,29 +396,100 @@ def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]: parse: whether to convert backup labels to their IDs or not. Returns: - a dict of previously created backups (id + stanza name) or an empty list - if there is no backups in the S3 bucket. + a dict of previously created backups: id => (stanza, timeline) or an empty dict if there is no backups in + the S3 bucket. """ output, _ = self._execute_command(["pgbackrest", "info", "--output=json"]) repository_info = next(iter(json.loads(output)), None) # If there are no backups, returns an empty dict. 
         if repository_info is None:
-            return OrderedDict[str, str]()
+            return dict[str, tuple[str, str]]()
 
         backups = repository_info["backup"]
         stanza_name = repository_info["name"]
-        return OrderedDict[str, str](
-            (
-                self._parse_backup_id(backup["label"])[0] if parse else backup["label"],
+        return dict[str, tuple[str, str]]({
+            self._parse_backup_id(backup["label"])[0] if parse else backup["label"]: (
                 stanza_name,
+                backup["archive"]["start"][:8].lstrip("0")
+                if backup["archive"] and backup["archive"]["start"]
+                else "",
             )
             for backup in backups
             if show_failed or not backup["error"]
-        )
+        })
+
+    def _list_timelines(self) -> dict[str, tuple[str, str]]:
+        """Lists the timelines from the pgBackRest stanza.
+
+        Returns:
+            a dict of timelines: id => (stanza, timeline) or an empty dict if there are no timelines in the S3 bucket.
+        """
+        output, _ = self._execute_command([
+            "pgbackrest",
+            "repo-ls",
+            "--recurse",
+            "--output=json",
+        ])
+
+        repository = json.loads(output).items()
+        if repository is None:
+            return dict[str, tuple[str, str]]()
+
+        return dict[str, tuple[str, str]]({
+            datetime.strftime(
+                datetime.fromtimestamp(timeline_object["time"], timezone.utc),
+                "%Y-%m-%dT%H:%M:%SZ",
+            ): (
+                timeline.split("/")[1],
+                timeline.split("/")[-1].split(".")[0].lstrip("0"),
+            )
+            for timeline, timeline_object in repository
+            if timeline.endswith(".history") and not timeline.endswith("backup.history")
+        })
+
+    def _get_nearest_timeline(self, timestamp: str) -> tuple[str, str] | None:
+        """Finds the nearest timeline or backup prior to the specified timestamp.
+
+        Returns:
+            (stanza, timeline) of the nearest timeline or backup. None, if there are no matches.
+        """
+        timelines = self._list_backups(show_failed=False) | self._list_timelines()
+        filtered_timelines = [
+            (timeline_key, timeline_object)
+            for timeline_key, timeline_object in timelines.items()
+            if datetime.strptime(timeline_key, "%Y-%m-%dT%H:%M:%SZ")
+            <= self._parse_psql_timestamp(timestamp)
+        ]
+        return max(filtered_timelines)[1] if len(filtered_timelines) > 0 else None
+
+    def _is_psql_timestamp(self, timestamp: str) -> bool:
+        if not re.match(
+            r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,6})?([-+](?:\d{2}|\d{4}|\d{2}:\d{2}))?$",
+            timestamp,
+        ):
+            return False
+        try:
+            self._parse_psql_timestamp(timestamp)
+            return True
+        except ValueError:
+            return False
+
+    def _parse_psql_timestamp(self, timestamp: str) -> datetime:
+        """Intended to be used only on data that has already passed the _is_psql_timestamp check."""
+        # With Python >= 3.11, datetime.fromisoformat alone would be sufficient without any regexes. Therefore,
+        # this preprocessing would no longer be required; the _is_psql_timestamp check ensures the intended regex input.
+        t = re.sub(r"([-+]\d{2})$", r"\1:00", timestamp)
+        t = re.sub(r"([-+]\d{2})(\d{2})$", r"\1:\2", t)
+        t = re.sub(r"\.(\d+)", lambda x: f".{x[1]:06}", t)
+        dt = datetime.fromisoformat(t)
+        # Convert to a timezone-naive UTC datetime
+        if dt.tzinfo is not None and dt.tzinfo is not timezone.utc:
+            dt = dt.astimezone(tz=timezone.utc)
+        return dt.replace(tzinfo=None)
 
     def _parse_backup_id(self, label) -> Tuple[str, str]:
-        """Parse backup ID as a timestamp."""
+        """Parse backup ID as a timestamp and its type."""
         if label[-1] == "F":
             timestamp = label
             backup_type = "full"
@@ -593,9 +668,6 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
             event.defer()
             return
 
-        if self.charm.unit.is_leader():
-            self.charm.app_peer_data.pop("require-change-bucket-after-restore", None)
-
         # Verify the s3 relation only on the primary.
if not self.charm.is_primary: return @@ -757,7 +829,6 @@ def _on_s3_credential_gone(self, _) -> None: self.charm.app_peer_data.update({ "stanza": "", "init-pgbackrest": "", - "require-change-bucket-after-restore": "", }) self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES: @@ -778,7 +849,7 @@ def _on_list_backups_action(self, event) -> None: logger.exception(e) event.fail(f"Failed to list PostgreSQL backups with error: {str(e)}") - def _on_restore_action(self, event): + def _on_restore_action(self, event): # noqa: C901 """Request that pgBackRest restores a backup.""" if not self._pre_restore_checks(event): return @@ -786,36 +857,43 @@ def _on_restore_action(self, event): backup_id = event.params.get("backup-id") restore_to_time = event.params.get("restore-to-time") logger.info( - f"A restore" - f"{' with backup-id ' + backup_id if backup_id else ''}" - f"{' to time point ' + restore_to_time if restore_to_time else ''}" + f"A restore with backup-id {backup_id}" + f"{f' to time point {restore_to_time}' if restore_to_time else ''}" f" has been requested on the unit" ) # Validate the provided backup id and restore to time. logger.info("Validating provided backup-id and restore-to-time") backups = self._list_backups(show_failed=False) - if backup_id and backup_id not in backups.keys(): + timelines = self._list_timelines() + is_backup_id_real = backup_id and backup_id in backups.keys() + is_backup_id_timeline = ( + backup_id and not is_backup_id_real and backup_id in timelines.keys() + ) + if backup_id and not is_backup_id_real and not is_backup_id_timeline: error_message = f"Invalid backup-id: {backup_id}" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return - if not backup_id and restore_to_time and not backups: - error_message = "Cannot restore PITR without any backups created" - logger.error(f"Restore failed: {error_message}") - event.fail(error_message) - return - - # Quick check for timestamp format - if ( - restore_to_time - and restore_to_time != "latest" - and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time) - ): - error_message = "Bad restore-to-time format" + if is_backup_id_timeline and not restore_to_time: + error_message = "Cannot restore to the timeline without restore-to-time parameter" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return + if is_backup_id_real: + restore_stanza_timeline = backups[backup_id] + elif is_backup_id_timeline: + restore_stanza_timeline = timelines[backup_id] + else: + restore_stanza_timeline = self._get_nearest_timeline(restore_to_time) + if not restore_stanza_timeline: + error_message = f"Can't find the nearest timeline before timestamp {restore_to_time} to restore" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + return + logger.info( + f"Chosen timeline {restore_stanza_timeline[1]} as nearest for the specified timestamp {restore_to_time}" + ) self.charm.unit.status = MaintenanceStatus("restoring backup") @@ -883,12 +961,10 @@ def _on_restore_action(self, event): # Mark the cluster as in a restoring backup state and update the Patroni configuration. 
logger.info("Configuring Patroni to restore the backup") self.charm.app_peer_data.update({ - "restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "", - "restore-stanza": backups[backup_id] - if backup_id - else self.charm.app_peer_data.get("stanza", self.stanza_name), + "restoring-backup": self._fetch_backup_from_id(backup_id) if is_backup_id_real else "", + "restore-stanza": restore_stanza_timeline[0], + "restore-timeline": restore_stanza_timeline[1] if restore_to_time else "", "restore-to-time": restore_to_time or "", - "require-change-bucket-after-restore": "True", }) self.charm.update_config() @@ -941,10 +1017,23 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool: event.fail(validation_message) return False - if not event.params.get("backup-id") and not event.params.get("restore-to-time"): - error_message = ( - "Missing backup-id or/and restore-to-time parameter to be able to do restore" - ) + if not event.params.get("backup-id") and event.params.get("restore-to-time") in ( + None, + "latest", + ): + error_message = "Missing backup-id or non-latest restore-to-time parameter to be able to do restore" + logger.error(f"Restore failed: {error_message}") + event.fail(error_message) + return False + + # Quick check for timestamp format + restore_to_time = event.params.get("restore-to-time") + if ( + restore_to_time + and restore_to_time != "latest" + and not self._is_psql_timestamp(restore_to_time) + ): + error_message = "Bad restore-to-time format" logger.error(f"Restore failed: {error_message}") event.fail(error_message) return False @@ -959,7 +1048,6 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool: if self.charm.is_blocked and self.charm.unit.status.message not in [ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, CANNOT_RESTORE_PITR, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, ]: error_message = "Cluster or unit is in a blocking state" logger.error(f"Restore failed: {error_message}") diff --git a/src/charm.py b/src/charm.py index b3e3b6f5aa..9cf89caf76 100755 --- a/src/charm.py +++ b/src/charm.py @@ -39,6 +39,7 @@ REQUIRED_PLUGINS, PostgreSQL, PostgreSQLEnableDisableExtensionError, + PostgreSQLGetCurrentTimelineError, PostgreSQLUpdateUserPasswordError, ) from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS @@ -82,7 +83,7 @@ from requests import ConnectionError from tenacity import RetryError, Retrying, stop_after_attempt, stop_after_delay, wait_fixed -from backups import CANNOT_RESTORE_PITR, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, PostgreSQLBackups +from backups import CANNOT_RESTORE_PITR, PostgreSQLBackups from config import CharmConfig from constants import ( APP_SCOPE, @@ -966,15 +967,6 @@ def _set_active_status(self): # The charm should not override this status outside of the function checking disk space. 
if self.unit.status.message == INSUFFICIENT_SIZE_WARNING: return - if "require-change-bucket-after-restore" in self.app_peer_data: - if self.unit.is_leader(): - self.app_peer_data.update({ - "restoring-backup": "", - "restore-stanza": "", - "restore-to-time": "", - }) - self.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET) - return try: if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name: self.unit.status = ActiveStatus("Primary") @@ -1364,12 +1356,6 @@ def _on_update_status_early_exit_checks(self, container) -> bool: self.enable_disable_extensions() return True - if ( - self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET - and "require-change-bucket-after-restore" not in self.app_peer_data - ): - return True - logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status") return False return True @@ -1408,6 +1394,7 @@ def _on_update_status(self, _) -> None: if ( "restoring-backup" not in self.app_peer_data + and "restore-to-time" not in self.app_peer_data and "stopped" not in self.unit_peer_data and services[0].current != ServiceStatus.ACTIVE ): @@ -1459,15 +1446,32 @@ def _was_restore_successful(self, container: Container, service: ServiceInfo) -> logger.debug("Restore check early exit: Patroni has not started yet") return False + restoring_backup = self.app_peer_data.get("restoring-backup") + restore_timeline = self.app_peer_data.get("restore-timeline") + restore_to_time = self.app_peer_data.get("restore-to-time") + try: + current_timeline = self.postgresql.get_current_timeline() + except PostgreSQLGetCurrentTimelineError: + logger.debug("Restore check early exit: can't get current wal timeline") + return False + # Remove the restoring backup flag and the restore stanza name. self.app_peer_data.update({ "restoring-backup": "", "restore-stanza": "", "restore-to-time": "", + "restore-timeline": "", }) self.update_config() self.restore_patroni_on_failure_condition() - logger.info("Restore succeeded") + + logger.info( + "Restored" + f"{f' to {restore_to_time}' if restore_to_time else ''}" + f"{f' from timeline {restore_timeline}' if restore_timeline and not restoring_backup else ''}" + f"{f' from backup {self.backup._parse_backup_id(restoring_backup)[0]}' if restoring_backup else ''}" + f". Currently tracking the newly created timeline {current_timeline}." 
+ ) can_use_s3_repository, validation_message = self.backup.can_use_s3_repository() if not can_use_s3_repository: @@ -1821,12 +1825,10 @@ def update_config(self, is_creating_backup: bool = False) -> bool: is_no_sync_member=self.upgrade.is_no_sync_member, backup_id=self.app_peer_data.get("restoring-backup"), pitr_target=self.app_peer_data.get("restore-to-time"), + restore_timeline=self.app_peer_data.get("restore-timeline"), restore_to_latest=self.app_peer_data.get("restore-to-time", None) == "latest", stanza=self.app_peer_data.get("stanza"), restore_stanza=self.app_peer_data.get("restore-stanza"), - disable_pgbackrest_archiving=bool( - self.app_peer_data.get("require-change-bucket-after-restore", None) - ), parameters=postgresql_parameters, ) diff --git a/src/patroni.py b/src/patroni.py index fd985b7e86..8cbb122a4d 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -442,6 +442,7 @@ def render_patroni_yml_file( disable_pgbackrest_archiving: bool = False, backup_id: Optional[str] = None, pitr_target: Optional[str] = None, + restore_timeline: Optional[str] = None, restore_to_latest: bool = False, parameters: Optional[dict[str, str]] = None, ) -> None: @@ -457,7 +458,8 @@ def render_patroni_yml_file( restore_stanza: name of the stanza used when restoring a backup. disable_pgbackrest_archiving: whether to force disable pgBackRest WAL archiving. backup_id: id of the backup that is being restored. - pitr_target: point-in-time-recovery target for the backup. + pitr_target: point-in-time-recovery target for the restore. + restore_timeline: timeline to restore from. restore_to_latest: restore all the WAL transaction logs from the stanza. parameters: PostgreSQL parameters to be added to the postgresql.conf file. """ @@ -483,6 +485,7 @@ def render_patroni_yml_file( restoring_backup=backup_id is not None or pitr_target is not None, backup_id=backup_id, pitr_target=pitr_target if not restore_to_latest else None, + restore_timeline=restore_timeline, restore_to_latest=restore_to_latest, stanza=stanza, restore_stanza=restore_stanza, diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2 index f84a9000cf..a74956a224 100644 --- a/templates/patroni.yml.j2 +++ b/templates/patroni.yml.j2 @@ -57,6 +57,7 @@ bootstrap: command: > pgbackrest --stanza={{ restore_stanza }} --pg1-path={{ storage_path }}/pgdata {%- if backup_id %} --set={{ backup_id }} {%- endif %} + {%- if restore_timeline %} --target-timeline="0x{{ restore_timeline }}" {% endif %} {%- if restore_to_latest %} --type=default {%- else %} --target-action=promote {%- if pitr_target %} --target="{{ pitr_target }}" --type=time {%- else %} --type=immediate {%- endif %} {%- endif %} diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 25f1a04130..778057d8d8 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -38,7 +38,6 @@ DATABASE_APP_NAME = METADATA["name"] APPLICATION_NAME = "postgresql-test-app" STORAGE_PATH = METADATA["storage"]["pgdata"]["location"] -MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket" try: check_call(["kubectl", "version", "--client=true"]) @@ -957,11 +956,7 @@ async def backup_operations( # Wait for the restore to complete. 
async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Check that the backup was correctly restored by having only the first created table. logger.info("checking that the backup was correctly restored") @@ -1006,11 +1001,7 @@ async def backup_operations( # Wait for the restore to complete. async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Check that the backup was correctly restored by having only the first created table. logger.info("checking that the backup was correctly restored") diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 47c6e4d426..b31c7120fa 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -15,7 +15,6 @@ from . import architecture from .helpers import ( DATABASE_APP_NAME, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, backup_operations, build_and_deploy, cat_file_from_unit, @@ -24,6 +23,7 @@ get_password, get_primary, get_unit_address, + scale_application, switchover, wait_for_idle_on_blocked, ) @@ -130,13 +130,8 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) - new_unit_name = f"{database_app_name}/1" # Scale up to be able to test primary and leader being different. - await ops_test.model.applications[database_app_name].scale(2) - await ops_test.model.block_until( - lambda: len(ops_test.model.applications[database_app_name].units) == 2 - and ops_test.model.units.get(new_unit_name).workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + async with ops_test.fast_forward(): + await scale_application(ops_test, database_app_name, 2) logger.info("ensuring that the replication is working correctly") address = await get_unit_address(ops_test, new_unit_name) @@ -186,11 +181,6 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) - backups = action.results.get("backups") assert backups, "backups not outputted" - # Remove S3 relation to ensure "move to another cluster" blocked status is gone - await ops_test.model.applications[database_app_name].remove_relation( - f"{database_app_name}:s3-parameters", f"{S3_INTEGRATOR_APP_NAME}:s3-credentials" - ) - await ops_test.model.wait_for_idle(status="active", timeout=1000) # Remove the database app. @@ -293,8 +283,9 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None ): with attempt: logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] + # Last two entries are 'action: restore', that cannot be used without restore-to-time parameter + most_recent_real_backup = backups.split("\n")[-3] + backup_id = most_recent_real_backup.split()[0] action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( "restore", **{"backup-id": backup_id} ) diff --git a/tests/integration/test_backups_pitr.py b/tests/integration/test_backups_pitr.py index 3129a79c4c..1408d7f8b7 100644 --- a/tests/integration/test_backups_pitr.py +++ b/tests/integration/test_backups_pitr.py @@ -13,7 +13,6 @@ from . 
import architecture
 from .helpers import (
     DATABASE_APP_NAME,
-    MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
     build_and_deploy,
     construct_endpoint,
     db_connect,
@@ -102,22 +101,33 @@ async def pitr_backup_operations(
     cloud,
     config,
 ) -> None:
-    """Utility function containing PITR backup operations for both cloud tests."""
-    # Deploy S3 Integrator and TLS Certificates Operator.
+    """Utility function containing PITR backup and timeline management operations for both cloud tests.
+
+    The test algorithm is outlined below in the format "(timeline): action_1 -> action_2".
+    1: table -> backup_b1 -> test_data_td1 -> timestamp_ts1 -> test_data_td2 -> restore_ts1 => 2
+    2: check_td1 -> check_not_td2 -> test_data_td3 -> restore_b1_latest => 3
+    3: check_td1 -> check_td2 -> check_not_td3 -> test_data_td4 -> restore_t2_latest => 4
+    4: check_td1 -> check_not_td2 -> check_td3 -> check_not_td4
+    """
+    # Set up the environment
+    database_app_name = f"{DATABASE_APP_NAME}-{cloud}"
+
+    logger.info("deploying the s3-integrator, self-signed-certificates and postgresql charms")
     await ops_test.model.deploy(s3_integrator_app_name)
     await ops_test.model.deploy(tls_certificates_app_name, config=tls_config, channel=tls_channel)
-    # Deploy and relate PostgreSQL to S3 integrator (one database app for each cloud for now
-    # as archivo_mode is disabled after restoring the backup) and to TLS Certificates Operator
-    # (to be able to create backups from replicas).
-    database_app_name = f"{DATABASE_APP_NAME}-{cloud}"
     await build_and_deploy(ops_test, 2, database_app_name=database_app_name, wait_for_idle=False)
+    logger.info(
+        "integrating self-signed-certificates with postgresql and waiting for them to stabilize"
+    )
     await ops_test.model.relate(database_app_name, tls_certificates_app_name)
     async with ops_test.fast_forward(fast_interval="60s"):
         await ops_test.model.wait_for_idle(
-            apps=[database_app_name], status="active", timeout=1000, raise_on_error=False
+            apps=[database_app_name, tls_certificates_app_name],
+            status="active",
+            timeout=1000,
+            raise_on_error=False,
         )
-    await ops_test.model.relate(database_app_name, s3_integrator_app_name)
 
     # Configure and set access and secret keys.
     logger.info(f"configuring S3 integrator for {cloud}")
@@ -127,95 +137,71 @@ async def pitr_backup_operations(
         **credentials,
     )
     await action.wait()
+
+    logger.info("integrating s3-integrator with postgresql and waiting for the model to stabilize")
+    await ops_test.model.relate(database_app_name, s3_integrator_app_name)
     async with ops_test.fast_forward(fast_interval="60s"):
-        await ops_test.model.wait_for_idle(
-            apps=[database_app_name, s3_integrator_app_name], status="active", timeout=1000
-        )
+        await ops_test.model.wait_for_idle(status="active", timeout=1000)
 
     primary = await get_primary(ops_test, database_app_name)
     for unit in ops_test.model.applications[database_app_name].units:
         if unit.name != primary:
             replica = unit.name
             break
-
-    # Write some data.
password = await get_password(ops_test, database_app_name=database_app_name) address = await get_unit_address(ops_test, primary) - logger.info("creating a table in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute( - "CREATE TABLE IF NOT EXISTS backup_table_1 (test_column INT );" - ) - connection.close() - # With a stable cluster, Run the "create backup" action - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) - logger.info("creating a backup") + logger.info("1: creating table") + _create_table(address, password) + + logger.info("1: creating backup b1") action = await ops_test.model.units.get(replica).run_action("create-backup") await action.wait() backup_status = action.results.get("backup-status") assert backup_status, "backup hasn't succeeded" async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(status="active", timeout=1000) + backup_b1 = await _get_most_recent_backup(ops_test, ops_test.model.units.get(replica)) - # Run the "list backups" action. - logger.info("listing the available backups") - action = await ops_test.model.units.get(replica).run_action("list-backups") - await action.wait() - backups = action.results.get("backups") - # 5 lines for header output, 1 backup line ==> 6 total lines - assert len(backups.split("\n")) == 6, "full backup is not outputted" - await ops_test.model.wait_for_idle(status="active", timeout=1000) + logger.info("1: creating test data td1") + _insert_test_data("test_data_td1", address, password) - # Write some data. - logger.info("creating after-backup data in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute( - "INSERT INTO backup_table_1 (test_column) VALUES (1), (2), (3), (4), (5);" - ) - connection.close() + logger.info("1: get timestamp ts1") with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: cursor.execute("SELECT current_timestamp;") - after_backup_ts = str(cursor.fetchone()[0]) - connection.close() - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("CREATE TABLE IF NOT EXISTS backup_table_2 (test_column INT);") - connection.close() - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("SELECT pg_switch_wal();") + timestamp_ts1 = str(cursor.fetchone()[0]) connection.close() + # Wrong timestamp pointing to one year ahead + unreachable_timestamp_ts1 = timestamp_ts1.replace( + timestamp_ts1[:4], str(int(timestamp_ts1[:4]) + 1), 1 + ) + logger.info("1: creating test data td2") + _insert_test_data("test_data_td2", address, password) + + logger.info("1: switching wal") + _switch_wal(address, password) + + logger.info("1: scaling down to do restore") async with ops_test.fast_forward(fast_interval="60s"): await scale_application(ops_test, database_app_name, 1) remaining_unit = ops_test.model.units.get(f"{database_app_name}/0") - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - # Wrong timestamp pointing to one year ahead - wrong_ts = after_backup_ts.replace(after_backup_ts[:4], str(int(after_backup_ts[:4]) + 1), 1) - - # Run the "restore backup" action with bad PITR parameter. 
- logger.info("restoring the backup with bad restore-to-time parameter") + logger.info("1: restoring the backup b1 with bad restore-to-time parameter") action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": "bad data"} + "restore", **{"backup-id": backup_b1, "restore-to-time": "bad data"} ) await action.wait() assert ( action.status == "failed" - ), "action must fail with bad restore-to-time parameter, but it succeeded" + ), "1: restore must fail with bad restore-to-time parameter, but that action succeeded" - # Run the "restore backup" action with unreachable PITR parameter. - logger.info("restoring the backup with unreachable restore-to-time parameter") + logger.info("1: restoring the backup b1 with unreachable restore-to-time parameter") action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": wrong_ts} + "restore", **{"backup-id": backup_b1, "restore-to-time": unreachable_timestamp_ts1} ) await action.wait() - logger.info("waiting for the database charm to become blocked") + logger.info("1: waiting for the database charm to become blocked after restore") async with ops_test.fast_forward(): await ops_test.model.block_until( lambda: ops_test.model.units.get(f"{database_app_name}/0").workload_status_message @@ -223,63 +209,170 @@ async def pitr_backup_operations( timeout=1000, ) logger.info( - "database charm become in blocked state, as supposed to be with unreachable PITR parameter" + "1: database charm become in blocked state after restore, as supposed to be with unreachable PITR parameter" ) - # Run the "restore backup" action. for attempt in Retrying( stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) ): with attempt: - logger.info("restoring the backup") + logger.info("1: restoring to the timestamp ts1") action = await remaining_unit.run_action( - "restore", **{"backup-id": backup_id, "restore-to-time": after_backup_ts} + "restore", **{"restore-to-time": timestamp_ts1} ) await action.wait() restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + assert restore_status, "1: restore to the timestamp ts1 hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) - # Wait for the restore to complete. - async with ops_test.fast_forward(): - await ops_test.model.block_until( - lambda: remaining_unit.workload_status_message - == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, - timeout=1000, - ) + logger.info("2: successful restore") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + timeline_t2 = await _get_most_recent_backup(ops_test, remaining_unit) + assert backup_b1 != timeline_t2, "2: timeline 2 do not exist in list-backups action or bad" + + logger.info("2: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "2: test data td1 should exist" + + logger.info("2: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "2: test data td2 shouldn't exist" - # Check that the backup was correctly restored. 
- primary = await get_primary(ops_test, database_app_name) - address = await get_unit_address(ops_test, primary) - logger.info("checking that the backup was correctly restored") - with db_connect( - host=address, password=password - ) as connection, connection.cursor() as cursor: - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + logger.info("2: creating test data td3") + _insert_test_data("test_data_td3", address, password) + + logger.info("2: get timestamp ts2") + with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: + cursor.execute("SELECT current_timestamp;") + timestamp_ts2 = str(cursor.fetchone()[0]) + connection.close() + + logger.info("2: creating test data td4") + _insert_test_data("test_data_td4", address, password) + + logger.info("2: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("2: restoring the backup b1 to the latest") + action = await remaining_unit.run_action( + "restore", **{"backup-id": backup_b1, "restore-to-time": "latest"} ) - assert cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" - cursor.execute("SELECT COUNT(1) FROM backup_table_1;") - assert ( - int(cursor.fetchone()[0]) == 5 - ), "backup wasn't correctly restored: table 'backup_table_1' doesn't have 5 rows" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "2: restore the backup b1 to the latest hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + + logger.info("3: successful restore") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + timeline_t3 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t3 and timeline_t2 != timeline_t3 + ), "3: timeline 3 do not exist in list-backups action or bad" + + logger.info("3: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "3: test data td1 should exist" + + logger.info("3: checking test data td2") + assert _check_test_data("test_data_td2", address, password), "3: test data td2 should exist" + + logger.info("3: checking not test data td3") + assert not _check_test_data( + "test_data_td3", address, password + ), "3: test data td3 shouldn't exist" + + logger.info("3: checking not test data td4") + assert not _check_test_data( + "test_data_td4", address, password + ), "3: test data td4 shouldn't exist" + + logger.info("3: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("3: restoring the timeline 2 to the latest") + action = await remaining_unit.run_action( + "restore", **{"backup-id": timeline_t2, "restore-to-time": "latest"} ) - assert not cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_2' exists" - connection.close() - - # Remove S3 relation to ensure "move to another cluster" blocked status is gone - await ops_test.model.applications[database_app_name].remove_relation( - 
f"{database_app_name}:s3-parameters", f"{s3_integrator_app_name}:s3-credentials" - ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "3: restore the timeline 2 to the latest hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) - await ops_test.model.wait_for_idle(status="active", timeout=1000) + logger.info("4: successful restore") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + timeline_t4 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t4 and timeline_t2 != timeline_t4 and timeline_t3 != timeline_t4 + ), "4: timeline 4 do not exist in list-backups action or bad" + + logger.info("4: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "4: test data td1 should exist" + + logger.info("4: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "4: test data td2 shouldn't exist" + + logger.info("4: checking test data td3") + assert _check_test_data("test_data_td3", address, password), "4: test data td3 should exist" + + logger.info("4: checking test data td4") + assert _check_test_data("test_data_td4", address, password), "4: test data td4 should exist" + + logger.info("4: switching wal") + _switch_wal(address, password) + + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("4: restoring to the timestamp ts2") + action = await remaining_unit.run_action( + "restore", **{"restore-to-time": timestamp_ts2} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "4: restore to the timestamp ts2 hasn't succeeded" + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + + logger.info("5: successful restore") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + timeline_t5 = await _get_most_recent_backup(ops_test, remaining_unit) + assert ( + backup_b1 != timeline_t5 + and timeline_t2 != timeline_t5 + and timeline_t3 != timeline_t5 + and timeline_t4 != timeline_t5 + ), "5: timeline 5 do not exist in list-backups action or bad" + + logger.info("5: checking test data td1") + assert _check_test_data("test_data_td1", address, password), "5: test data td1 should exist" + + logger.info("5: checking not test data td2") + assert not _check_test_data( + "test_data_td2", address, password + ), "5: test data td2 shouldn't exist" + + logger.info("5: checking test data td3") + assert _check_test_data("test_data_td3", address, password), "5: test data td3 should exist" + + logger.info("5: checking not test data td4") + assert not _check_test_data( + "test_data_td4", address, password + ), "5: test data td4 shouldn't exist" + + await ops_test.model.wait_for_idle(status="active", timeout=1000) # Remove the database app. 
await ops_test.model.remove_application(database_app_name) @@ -331,3 +424,49 @@ async def test_pitr_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dic cloud, config, ) + + +def _create_table(host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("CREATE TABLE IF NOT EXISTS backup_table (test_column TEXT);") + connection.close() + + +def _insert_test_data(td: str, host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "INSERT INTO backup_table (test_column) VALUES (%s);", + (td,), + ) + connection.close() + + +def _check_test_data(td: str, host: str, password: str) -> bool: + with db_connect(host=host, password=password) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT 1 FROM backup_table WHERE test_column = %s);", + (td,), + ) + res = cursor.fetchone()[0] + connection.close() + return res + + +def _switch_wal(host: str, password: str): + with db_connect(host=host, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("SELECT pg_switch_wal();") + connection.close() + + +async def _get_most_recent_backup(ops_test: OpsTest, unit: any) -> str: + logger.info("listing the available backups") + action = await unit.run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert backups, "backups not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + most_recent_backup = backups.split("\n")[-1] + return most_recent_backup.split()[0] diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index bd22e5e477..42b533073e 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -1,7 +1,6 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. import datetime -from typing import OrderedDict from unittest.mock import MagicMock, PropertyMock, call, mock_open, patch import pytest @@ -572,42 +571,56 @@ def test_format_backup_list(harness): == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------""" ) # Test when there are backups. 
backup_list = [ ( "2023-01-01T09:00:00Z", - "full", + "full backup", "failed: fake error", "None", "0/3000000 / 0/5000000", "2023-01-01T09:00:00Z", "2023-01-01T09:00:05Z", + "1", "a/b/c", ), ( "2023-01-01T10:00:00Z", - "full", + "full backup", "finished", "None", "0/5000000 / 0/7000000", "2023-01-01T10:00:00Z", - "2023-01-01T010:00:07Z", + "2023-01-01T10:00:07Z", + "A", "a/b/d", ), + ( + "2023-01-01T11:00:00Z", + "restore", + "finished", + "None", + "n/a", + "2023-01-01T11:00:00Z", + "n/a", + "B", + "n/a", + ), ] assert ( harness.charm.backup._format_backup_list(backup_list) == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------ -2023-01-01T09:00:00Z | full | failed: fake error | None | 0/3000000 / 0/5000000 | 2023-01-01T09:00:00Z | 2023-01-01T09:00:05Z | a/b/c -2023-01-01T10:00:00Z | full | finished | None | 0/5000000 / 0/7000000 | 2023-01-01T10:00:00Z | 2023-01-01T010:00:07Z | a/b/d""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +2023-01-01T09:00:00Z | full backup | failed: fake error | None | 0/3000000 / 0/5000000 | 2023-01-01T09:00:00Z | 2023-01-01T09:00:05Z | 1 | a/b/c +2023-01-01T10:00:00Z | full backup | finished | None | 0/5000000 / 0/7000000 | 2023-01-01T10:00:00Z | 2023-01-01T10:00:07Z | A | a/b/d +2023-01-01T11:00:00Z | restore | finished | None | n/a | 2023-01-01T11:00:00Z | n/a | B | n/a""" ) @@ -625,29 +638,36 @@ def test_generate_backup_list_output(harness): "path": " test-path/ ", } # Test when no backups are returned. - _execute_command.return_value = ('[{"backup":[]}]', None) + _execute_command.side_effect = [('[{"backup":[]}]', None), ("{}", None)] assert ( harness.charm.backup._generate_backup_list_output() == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------""" ) # Test when backups are returned. 
- _execute_command.return_value = ( - '[{"backup":[{"label":"20230101-090000F","error":"fake error","reference":null,"lsn":{"start":"0/3000000","stop":"0/5000000"},"timestamp":{"start":1719866711,"stop":1719866714}}]}]', - None, - ) + _execute_command.side_effect = [ + ( + '[{"backup":[{"archive":{"start":"00000001000000000000000B"},"label":"20230101-090000F","error":"fake error","reference":null,"lsn":{"start":"0/3000000","stop":"0/5000000"},"timestamp":{"start":1719866711,"stop":1719866714}}]}]', + None, + ), + ( + '{".":{"type":"path"},"archive/None.patroni-postgresql-k8s/14-1/00000002.history":{"type": "file","size": 32,"time": 1728937652}}', + None, + ), + ] assert ( harness.charm.backup._generate_backup_list_output() == """Storage bucket name: test-bucket Backups base path: /test-path/backup/ -backup-id | type | status | reference-backup-id | LSN start/stop | start-time | finish-time | backup-path ------------------------------------------------------------------------------------------------------------------------------------------------------------ -2023-01-01T09:00:00Z | full | failed: fake error | None | 0/3000000 / 0/5000000 | 2024-07-01T20:45:11Z | 2024-07-01T20:45:14Z | /None.patroni-postgresql-k8s/20230101-090000F""" +backup-id | action | status | reference-backup-id | LSN start/stop | start-time | finish-time | timeline | backup-path +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +2023-01-01T09:00:00Z | full backup | failed: fake error | None | 0/3000000 / 0/5000000 | 2024-07-01T20:45:11Z | 2024-07-01T20:45:14Z | 1 | /None.patroni-postgresql-k8s/20230101-090000F +2024-10-14T20:27:32Z | restore | finished | None | n/a | 2024-10-14T20:27:32Z | n/a | 2 | n/a""" ) @@ -655,24 +675,87 @@ def test_list_backups(harness): with patch("charm.PostgreSQLBackups._execute_command") as _execute_command: # Test when no backups are available. _execute_command.return_value = ("[]", None) - assert harness.charm.backup._list_backups(show_failed=True) == OrderedDict[str, str]() + assert harness.charm.backup._list_backups(show_failed=True) == dict[str, tuple[str, str]]() # Test when some backups are available. _execute_command.return_value = ( - '[{"backup":[{"label":"20230101-090000F","error":"fake error"},{"label":"20230101-100000F","error":null}],"name":"test-stanza"}]', + '[{"backup":[{"archive":{"start":"00000001000000000000000B"},"label":"20230101-090000F","error":"fake error"},{"archive":{"start":"0000000A000000000000000B"},"label":"20230101-100000F","error":null}],"name":"test-stanza"}]', None, ) - assert harness.charm.backup._list_backups(show_failed=True) == OrderedDict[str, str]([ - ("2023-01-01T09:00:00Z", "test-stanza"), - ("2023-01-01T10:00:00Z", "test-stanza"), + assert harness.charm.backup._list_backups(show_failed=True) == dict[str, tuple[str, str]]([ + ("2023-01-01T09:00:00Z", ("test-stanza", "1")), + ("2023-01-01T10:00:00Z", ("test-stanza", "A")), ]) # Test when some backups are available, but it's not desired to list failed backups. 
- assert harness.charm.backup._list_backups(show_failed=False) == OrderedDict[str, str]([ - ("2023-01-01T10:00:00Z", "test-stanza") + assert harness.charm.backup._list_backups(show_failed=False) == dict[str, tuple[str, str]]([ + ("2023-01-01T10:00:00Z", ("test-stanza", "A")) ]) +def test_list_timelines(harness): + with patch("charm.PostgreSQLBackups._execute_command") as _execute_command: + _execute_command.return_value = ("{}", None) + assert harness.charm.backup._list_timelines() == dict[str, tuple[str, str]]() + + _execute_command.return_value = ( + '{".":{"type":"path"},"archive/test-stanza/14-1/00000002.history":{"type": "file","size": 32,"time": 1728937652}}', + None, + ) + assert harness.charm.backup._list_timelines() == dict[str, tuple[str, str]]([ + ("2024-10-14T20:27:32Z", ("test-stanza", "2")) + ]) + + +def test_get_nearest_timeline(harness): + with ( + patch("charm.PostgreSQLBackups._list_backups") as _list_backups, + patch("charm.PostgreSQLBackups._list_timelines") as _list_timelines, + ): + _list_backups.return_value = dict[str, tuple[str, str]]() + _list_timelines.return_value = dict[str, tuple[str, str]]() + assert harness.charm.backup._get_nearest_timeline("2022-02-24 05:00:00") is None + + _list_backups.return_value = dict[str, tuple[str, str]]({ + "2022-02-24T05:00:00Z": ("test-stanza", "1"), + "2024-02-24T05:00:00Z": ("test-stanza", "2"), + }) + _list_timelines.return_value = dict[str, tuple[str, str]]({ + "2023-02-24T05:00:00Z": ("test-stanza", "2") + }) + assert harness.charm.backup._get_nearest_timeline("2025-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "2")) + assert harness.charm.backup._get_nearest_timeline("2024-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "2")) + assert harness.charm.backup._get_nearest_timeline("2023-01-01 00:00:00") == tuple[ + str, str + ](("test-stanza", "1")) + assert harness.charm.backup._get_nearest_timeline("2022-01-01 00:00:00") is None + + +def test_is_psql_timestamp(harness): + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.01-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001+0000") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001+03") is True + assert harness.charm.backup._is_psql_timestamp("2022-02-24 05:00:00.500001-01:00") is True + + assert harness.charm.backup._is_psql_timestamp("bad data") is False + assert harness.charm.backup._is_psql_timestamp("2022-02-24T05:00:00.5000001-01:00") is False + assert harness.charm.backup._is_psql_timestamp("2022-24-24 05:00:00") is False + + def test_initialise_stanza(harness): with ( patch("charm.Patroni.reload_patroni_configuration") as _reload_patroni_configuration, @@ -1389,6 +1472,7 @@ def test_on_restore_action(harness): patch("lightkube.Client.delete") as _delete, 
patch("ops.model.Container.stop") as _stop, patch("charm.PostgreSQLBackups._list_backups") as _list_backups, + patch("charm.PostgreSQLBackups._list_timelines") as _list_timelines, patch("charm.PostgreSQLBackups._fetch_backup_from_id") as _fetch_backup_from_id, patch("charm.PostgreSQLBackups._pre_restore_checks") as _pre_restore_checks, patch( @@ -1419,10 +1503,33 @@ def test_on_restore_action(harness): # Test when the user provides an invalid backup id. mock_event.params = {"backup-id": "2023-01-01T10:00:00Z"} _pre_restore_checks.return_value = True - _list_backups.return_value = {"2023-01-01T09:00:00Z": harness.charm.backup.stanza_name} + _list_backups.return_value = { + "2023-01-01T09:00:00Z": (harness.charm.backup.stanza_name, "1") + } + _list_timelines.return_value = { + "2024-02-24T05:00:00Z": (harness.charm.backup.stanza_name, "2") + } harness.charm.unit.status = ActiveStatus() harness.charm.backup._on_restore_action(mock_event) _list_backups.assert_called_once_with(show_failed=False) + _list_timelines.assert_called_once() + _fetch_backup_from_id.assert_not_called() + mock_event.fail.assert_called_once() + _stop.assert_not_called() + _delete.assert_not_called() + _restart_database.assert_not_called() + _empty_data_files.assert_not_called() + _create_pgdata.assert_not_called() + _update_config.assert_not_called() + _start.assert_not_called() + mock_event.set_results.assert_not_called() + assert not isinstance(harness.charm.unit.status, MaintenanceStatus) + + # Test when the user provides an only the timeline backup id leading to the error. + mock_event.reset_mock() + mock_event.params = {"backup-id": "2024-02-24T05:00:00Z"} + harness.charm.unit.status = ActiveStatus() + harness.charm.backup._on_restore_action(mock_event) _fetch_backup_from_id.assert_not_called() mock_event.fail.assert_called_once() _stop.assert_not_called() @@ -1505,7 +1612,47 @@ def test_on_restore_action(harness): assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { "restoring-backup": "20230101-090000F", "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}", - "require-change-bucket-after-restore": "True", + } + _create_pgdata.assert_called_once() + _update_config.assert_called_once() + _start.assert_called_once_with("postgresql") + mock_event.fail.assert_not_called() + mock_event.set_results.assert_called_once_with({"restore-status": "restore started"}) + + # Test a successful PITR with the real backup id to the latest. + mock_event.reset_mock() + _restart_database.reset_mock() + _create_pgdata.reset_mock() + _update_config.reset_mock() + _start.reset_mock() + mock_event.params = {"backup-id": "2023-01-01T09:00:00Z", "restore-to-time": "latest"} + harness.charm.backup._on_restore_action(mock_event) + _restart_database.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { + "restoring-backup": "20230101-090000F", + "restore-timeline": "1", + "restore-to-time": "latest", + "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}", + } + _create_pgdata.assert_called_once() + _update_config.assert_called_once() + _start.assert_called_once_with("postgresql") + mock_event.fail.assert_not_called() + mock_event.set_results.assert_called_once_with({"restore-status": "restore started"}) + + # Test a successful PITR with only the timestamp. 
+ mock_event.reset_mock() + _restart_database.reset_mock() + _create_pgdata.reset_mock() + _update_config.reset_mock() + _start.reset_mock() + mock_event.params = {"restore-to-time": "2025-02-24 05:00:00.001+00"} + harness.charm.backup._on_restore_action(mock_event) + _restart_database.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { + "restore-timeline": "2", + "restore-to-time": "2025-02-24 05:00:00.001+00", + "restore-stanza": f"{harness.charm.model.name}.{harness.charm.cluster_name}", } _create_pgdata.assert_called_once() _update_config.assert_called_once() @@ -1527,6 +1674,7 @@ def test_pre_restore_checks(harness): # Test when no backup id is provided. mock_event.reset_mock() + mock_event.params = {} _are_backup_settings_ok.return_value = (True, None) assert harness.charm.backup._pre_restore_checks(mock_event) is False mock_event.fail.assert_called_once() @@ -1566,6 +1714,30 @@ def test_pre_restore_checks(harness): assert harness.charm.backup._pre_restore_checks(mock_event) is True mock_event.fail.assert_not_called() + # Test with bad restore-to-time parameter + mock_event.reset_mock() + mock_event.params = {"restore-to-time": "bad"} + assert harness.charm.backup._pre_restore_checks(mock_event) is False + mock_event.fail.assert_called_once() + + # Test with good restore-to-time parameter + mock_event.reset_mock() + mock_event.params = {"restore-to-time": "2022-02-24 05:00:00"} + assert harness.charm.backup._pre_restore_checks(mock_event) is True + mock_event.fail.assert_not_called() + + # Test with single (bad) restore-to-time=latest parameter + mock_event.reset_mock() + mock_event.params = {"restore-to-time": "latest"} + assert harness.charm.backup._pre_restore_checks(mock_event) is False + mock_event.fail.assert_called_once() + + # Test with good restore-to-time=latest parameter + mock_event.reset_mock() + mock_event.params = {"backup-id": "2023-01-01T09:00:00Z", "restore-to-time": "latest"} + assert harness.charm.backup._pre_restore_checks(mock_event) is True + mock_event.fail.assert_not_called() + @pytest.mark.parametrize( "tls_ca_chain_filename", ["", "/var/lib/postgresql/data/pgbackrest-tls-ca-chain.crt"] diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index e5b12813c7..2437958e31 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -24,7 +24,6 @@ from requests import ConnectionError from tenacity import RetryError, wait_fixed -from backups import MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm from constants import PEER, SECRET_INTERNAL_LABEL from patroni import NotReadyError @@ -485,15 +484,6 @@ def test_on_update_status(harness): _enable_disable_extensions.assert_called_once() _pebble.get_services.assert_called_once() - # Test unit in blocked status due to restored cluster. - _pebble.get_services.reset_mock() - _enable_disable_extensions.reset_mock() - harness.model.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET) - harness.charm.on.update_status.emit() - _enable_disable_extensions.assert_not_called() - _pebble.get_services.assert_called_once() - harness.model.unit.status = ActiveStatus() - # Test when a failure need to be handled. 
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)] _handle_processes_failures.return_value = True @@ -721,6 +711,9 @@ def test_on_update_status_after_restore_operation(harness): "charm.PostgresqlOperatorCharm._handle_processes_failures" ) as _handle_processes_failures, patch("charm.PostgreSQLBackups.can_use_s3_repository") as _can_use_s3_repository, + patch( + "charms.postgresql_k8s.v0.postgresql.PostgreSQL.get_current_timeline" + ) as _get_current_timeline, patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, patch("ops.model.Container.pebble") as _pebble, @@ -729,6 +722,7 @@ def test_on_update_status_after_restore_operation(harness): rel_id = harness.model.get_relation(PEER).id # Mock the access to the list of Pebble services to test a failed restore. _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)] + _get_current_timeline.return_value = "2" # Test when the restore operation fails. with harness.hooks_disabled(): @@ -736,7 +730,7 @@ def test_on_update_status_after_restore_operation(harness): harness.update_relation_data( rel_id, harness.charm.app.name, - {"restoring-backup": "2023-01-01T09:00:00Z"}, + {"restoring-backup": "20230101-090000F"}, ) harness.set_can_connect(POSTGRESQL_CONTAINER, True) harness.charm.on.update_status.emit() @@ -758,7 +752,7 @@ def test_on_update_status_after_restore_operation(harness): # Assert that the backup id is still in the application relation databag. tc.assertEqual( harness.get_relation_data(rel_id, harness.charm.app), - {"restoring-backup": "2023-01-01T09:00:00Z"}, + {"restoring-backup": "20230101-090000F"}, ) # Test when the restore operation finished successfully. @@ -782,7 +776,7 @@ def test_on_update_status_after_restore_operation(harness): harness.update_relation_data( rel_id, harness.charm.app.name, - {"restoring-backup": "2023-01-01T09:00:00Z"}, + {"restoring-backup": "20230101-090000F"}, ) _can_use_s3_repository.return_value = (False, "fake validation message") harness.charm.on.update_status.emit() @@ -1636,9 +1630,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + restore_timeline=None, pitr_target=None, restore_to_latest=False, - disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once() @@ -1660,9 +1654,9 @@ def test_update_config(harness): backup_id=None, stanza=None, restore_stanza=None, + restore_timeline=None, pitr_target=None, restore_to_latest=False, - disable_pgbackrest_archiving=False, parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once()