[DPE-4839] Point In Time Recovery (#554)
* Initial PITR work.

* Initial PITR work.

* PITR.

* Lint.

* S3 WAL stanza check.

* PITR: fix Patroni.last_postgresql_logs.

* Unit tests.

* Format.

* Fix s3 change event.

* Fix PITR s3 statuses.

* PITR integration tests.

* PITR integration tests.

* Backup test works.

* PITR tests.

* Fix backup test.
Zvirovyi authored Jul 26, 2024
1 parent d027e56 commit 8ab2074
Showing 11 changed files with 693 additions and 80 deletions.
3 changes: 3 additions & 0 deletions actions.yaml
@@ -45,6 +45,9 @@ restore:
backup-id:
type: string
description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ)
restore-to-time:
type: string
description: Point-in-time recovery target in PostgreSQL timestamp format.
resume-upgrade:
description: Resume a rolling upgrade after asserting successful upgrade of a new revision.
set-password:
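For illustration, the new parameter could be used roughly like this (Juju 3.x action syntax, the application name postgresql-k8s, and the timestamp are assumptions, not part of this commit):

juju run postgresql-k8s/leader restore restore-to-time="2024-07-26 10:00:00+00"

backup-id can still be supplied on its own, and the special value "latest" is also accepted by the format check added in src/backups.py below.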
143 changes: 115 additions & 28 deletions src/backups.py
@@ -35,11 +35,14 @@
"failed to access/create the bucket, check your S3 settings"
)
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings"
CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details"
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket"

S3_BLOCK_MESSAGES = [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
]


@@ -168,9 +171,29 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]:
if self.charm._patroni.member_started:
self.charm._patroni.reload_patroni_configuration()
return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
return self._is_s3_wal_compatible(stanza)

return True, None

def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]:
"""Returns whether the S3 stanza is compatible with current PostgreSQL cluster by WAL parity."""
charm_last_archived_wal = self.charm.postgresql.get_last_archived_wal()
logger.debug(f"last archived wal: {charm_last_archived_wal}")
s3_archive = stanza.get("archive", [])
if len(s3_archive) > 0:
s3_last_archived_wal = s3_archive[0].get("max")
logger.debug(f"last s3 wal: {str(s3_last_archived_wal)}")
if (
charm_last_archived_wal
and s3_last_archived_wal
and charm_last_archived_wal.split(".", 1)[0] != str(s3_last_archived_wal)
):
if bool(self.charm.app_peer_data.get("require-change-bucket-after-restore", None)):
return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
else:
return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
return True, None

def _construct_endpoint(self, s3_parameters: Dict) -> str:
"""Construct the S3 service endpoint using the region.
@@ -240,10 +263,12 @@ def _empty_data_files(self) -> None:
try:
self.container.exec("rm -r /var/lib/postgresql/data/pgdata".split()).wait_output()
except ExecError as e:
logger.exception(
"Failed to empty data directory in prep for backup restore", exc_info=e
)
raise
# If the previous PITR restore was unsuccessful, this directory does not exist.
if "No such file or directory" not in e.stderr:
logger.exception(
"Failed to empty data directory in prep for backup restore", exc_info=e
)
raise

def _change_connectivity_to_database(self, connectivity: bool) -> None:
"""Enable or disable the connectivity to the database."""
@@ -419,11 +444,7 @@ def _initialise_stanza(self) -> None:

# Enable stanza initialisation if the backup settings were fixed after being invalid
# or pointing to a repository where there are backups from another cluster.
if self.charm.is_blocked and self.charm.unit.status.message not in [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
]:
if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES:
logger.warning("couldn't initialize stanza due to a blocked status")
return

@@ -545,6 +566,21 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
event.defer()
return

# Prevent config changes while in the failed PITR restore state, so that the DB peer relation-changed event does not cause Patroni-related errors.
if self.charm.unit.status.message == CANNOT_RESTORE_PITR:
logger.info("Cannot change S3 configuration in bad PITR restore status")
event.defer()
return

# Prevent S3 configuration changes in the middle of a backup restore, and the Patroni / pgBackRest errors that would cause.
if (
"restoring-backup" in self.charm.app_peer_data
or "restore-to-time" in self.charm.app_peer_data
):
logger.info("Cannot change S3 configuration during restore")
event.defer()
return

if not self._render_pgbackrest_conf_file():
logger.debug("Cannot set pgBackRest configurations, missing configurations.")
return
@@ -554,6 +590,9 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
event.defer()
return

if self.charm.unit.is_leader():
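# Clear the move-to-another-bucket requirement (set after a restore) once the new S3
# configuration has been accepted.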
self.charm.app_peer_data.pop("require-change-bucket-after-restore", None)

# Verify the s3 relation only on the primary.
if not self.charm.is_primary:
return
@@ -712,7 +751,11 @@ def _on_create_backup_action(self, event) -> None: # noqa: C901

def _on_s3_credential_gone(self, _) -> None:
if self.charm.unit.is_leader():
self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""})
self.charm.app_peer_data.update({
"stanza": "",
"init-pgbackrest": "",
"require-change-bucket-after-restore": "",
})
self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
self.charm.unit.status = ActiveStatus()
@@ -738,19 +781,52 @@ def _on_restore_action(self, event):
return

backup_id = event.params.get("backup-id")
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
restore_to_time = event.params.get("restore-to-time")
logger.info(
f"A restore"
f"{' with backup-id ' + backup_id if backup_id else ''}"
f"{' to time point ' + restore_to_time if restore_to_time else ''}"
f" has been requested on the unit"
)

# Validate the provided backup id.
logger.info("Validating provided backup-id")
# Validate the provided backup id and restore to time.
logger.info("Validating provided backup-id and restore-to-time")
backups = self._list_backups(show_failed=False)
if backup_id not in backups.keys():
if backup_id and backup_id not in backups.keys():
error_message = f"Invalid backup-id: {backup_id}"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
if not backup_id and restore_to_time and not backups:
error_message = "Cannot restore PITR without any backups created"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return

# Quick check for timestamp format
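# Accepted examples: "2024-07-26 10:00:00" or "2024-07-26 10:00:00.000+00";
# the special value "latest" bypasses this check.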
if (
restore_to_time
and restore_to_time != "latest"
and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time)
):
error_message = "Bad restore-to-time format"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return

self.charm.unit.status = MaintenanceStatus("restoring backup")

# Temporarily disable the Patroni (PostgreSQL) Pebble service's automatic restart on failure. This is
# required because point-in-time recovery can fail during the restore, i.e. while the cluster is being
# bootstrapped, and in that case we need to be able to inspect the Patroni service status and logs.
# Disabling auto-restart prevents a misleading service status and a log-reading race condition
# (logs are cleared / moved when the service restarts).
if not self.charm.override_patroni_on_failure_condition("ignore", "restore-backup"):
error_message = "Failed to override Patroni on-failure condition"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return

# Stop the database service before performing the restore.
logger.info("Stopping database service")
try:
@@ -778,11 +854,15 @@ def _on_restore_action(self, event):
namespace=self.charm._namespace,
)
except ApiError as e:
error_message = f"Failed to remove previous cluster information with error: {str(e)}"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
self._restart_database()
return
# If the previous PITR restore was unsuccessful, these endpoints do not exist.
if "restore-to-time" not in self.charm.app_peer_data:
error_message = (
f"Failed to remove previous cluster information with error: {str(e)}"
)
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
self._restart_database()
return

logger.info("Removing the contents of the data directory")
try:
@@ -800,8 +880,12 @@ def _on_restore_action(self, event):
# Mark the cluster as in a restoring backup state and update the Patroni configuration.
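# For PITR without a backup-id, "restoring-backup" stays empty and the recovery target comes from
# "restore-to-time"; the WAL archive of the existing stanza (or the default stanza name) is reused.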
logger.info("Configuring Patroni to restore the backup")
self.charm.app_peer_data.update({
"restoring-backup": self._fetch_backup_from_id(backup_id),
"restore-stanza": backups[backup_id],
"restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "",
"restore-stanza": backups[backup_id]
if backup_id
else self.charm.app_peer_data.get("stanza", self.stanza_name),
"restore-to-time": restore_to_time or "",
"require-change-bucket-after-restore": "True",
})
self.charm.update_config()

@@ -854,8 +938,10 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
event.fail(validation_message)
return False

if not event.params.get("backup-id"):
error_message = "Missing backup-id to restore"
if not event.params.get("backup-id") and not event.params.get("restore-to-time"):
error_message = (
"Missing backup-id and/or restore-to-time parameter; at least one is required to restore"
)
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return False
@@ -867,10 +953,11 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
return False

logger.info("Checking if cluster is in blocked state")
if (
self.charm.is_blocked
and self.charm.unit.status.message != ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
):
if self.charm.is_blocked and self.charm.unit.status.message not in [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
CANNOT_RESTORE_PITR,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
]:
error_message = "Cluster or unit is in a blocking state"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
@@ -956,7 +1043,7 @@ def _render_pgbackrest_conf_file(self) -> bool:

def _restart_database(self) -> None:
"""Removes the restoring backup flag and restart the database."""
self.charm.app_peer_data.update({"restoring-backup": ""})
self.charm.app_peer_data.update({"restoring-backup": "", "restore-to-time": ""})
self.charm.update_config()
self.container.start(self.charm._postgresql_service)
