Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into dpe-4809-enforce-juju…
Browse files Browse the repository at this point in the history
…-versions

Signed-off-by: Marcelo Henrique Neppel <[email protected]>
  • Loading branch information
marceloneppel committed Jul 2, 2024
2 parents b1650f8 + 2c3926f commit 865732c
Show file tree
Hide file tree
Showing 12 changed files with 782 additions and 40 deletions.
3 changes: 3 additions & 0 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ restore:
backup-id:
type: string
description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ)
restore-to-time:
type: string
description: Point-in-time-recovery target in PSQL format.
set-password:
description: Change the system user's password, which is used by charm.
It is for internal charm users and SHOULD NOT be used by applications.
Expand Down
12 changes: 11 additions & 1 deletion lib/charms/postgresql_k8s/v0/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 29
LIBPATCH = 30

INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"

Expand Down Expand Up @@ -383,6 +383,16 @@ def _generate_database_privileges_statements(
)
return statements

def get_last_archived_wal(self) -> str:
    """Get the name of the last archived WAL for the current PostgreSQL cluster.

    Returns:
        The ``last_archived_wal`` column of the ``pg_stat_archiver`` view.
        NOTE(review): this column is presumably NULL (``None`` here) while no
        WAL file has been archived yet -- callers should treat the result as
        possibly empty; confirm against the PostgreSQL documentation.

    Raises:
        PostgreSQLGetPostgreSQLVersionError: if the query fails with a
            ``psycopg2.Error``. The original database error is attached as
            the cause of the raised exception.
    """
    try:
        with self._connect_to_database() as connection, connection.cursor() as cursor:
            cursor.execute("SELECT last_archived_wal FROM pg_stat_archiver;")
            return cursor.fetchone()[0]
    except psycopg2.Error as e:
        logger.error(f"Failed to get PostgreSQL last archived WAL: {e}")
        # NOTE(review): the exception class is named after the version lookup,
        # not the WAL lookup; it is kept so existing callers keep working.
        # Chain the driver error explicitly so the root cause stays visible.
        raise PostgreSQLGetPostgreSQLVersionError() from e

def get_postgresql_text_search_configs(self) -> Set[str]:
"""Returns the PostgreSQL available text search configs.
Expand Down
117 changes: 97 additions & 20 deletions src/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,14 @@
"failed to access/create the bucket, check your S3 settings"
)
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings"
# Unit status message that makes the S3 credentials-changed handler defer, and that
# the pre-restore checks accept as a blocked state from which a restore may still run.
# Not part of S3_BLOCK_MESSAGES.
CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details"
# Returned by _is_s3_wal_compatible when the WAL names differ and the
# "require-change-bucket-after-restore" flag is set in the application peer data.
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket"

# Blocked-status messages caused by the S3 / pgBackRest repository settings.
# A unit blocked with one of these may still (re)initialise the stanza, and the
# blocked status is reset to active when the S3 credentials relation is gone.
S3_BLOCK_MESSAGES = [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
]


Expand Down Expand Up @@ -198,9 +201,29 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]:
if self.charm._patroni.member_started:
self.charm._patroni.reload_patroni_configuration()
return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
return self._is_s3_wal_compatible(stanza)

return True, None

def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]:
    """Tell whether the S3 stanza matches this PostgreSQL cluster, judged by WAL names.

    The last WAL archived by the local cluster is compared with the ``max``
    entry of the first element of the stanza's ``archive`` list.

    Args:
        stanza: mapping describing the stanza; only its ``archive`` key is read.

    Returns:
        ``(True, None)`` when there is no archive information, when either WAL
        name is missing/empty, or when both names agree. Otherwise ``False``
        together with the blocking message to show: the "move to another
        bucket" message if ``require-change-bucket-after-restore`` is set in
        the application peer data, else the "another cluster" message.
    """
    local_wal = self.charm.postgresql.get_last_archived_wal()
    logger.debug(f"last archived wal: {local_wal}")

    archives = stanza.get("archive", [])
    if len(archives) == 0:
        return True, None

    remote_wal = archives[0].get("max")
    logger.debug(f"last s3 wal: {str(remote_wal)}")

    # Nothing to compare against: treat the repository as compatible.
    if not local_wal or not remote_wal:
        return True, None

    # Only the part of the local WAL name before the first "." is compared.
    local_wal_name = local_wal.split(".", 1)[0]
    if local_wal_name == str(remote_wal):
        return True, None

    if self.charm.app_peer_data.get("require-change-bucket-after-restore", None):
        return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
    return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE

def _change_connectivity_to_database(self, connectivity: bool) -> None:
"""Enable or disable the connectivity to the database."""
self.charm.unit_peer_data.update({"connectivity": "on" if connectivity else "off"})
Expand Down Expand Up @@ -423,11 +446,7 @@ def _initialise_stanza(self) -> None:

# Enable stanza initialisation if the backup settings were fixed after being invalid
# or pointing to a repository where there are backups from another cluster.
if self.charm.is_blocked and self.charm.unit.status.message not in [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
]:
if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES:
logger.warning("couldn't initialize stanza due to a blocked status")
return

Expand Down Expand Up @@ -554,6 +573,18 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
event.defer()
return

# Prevents config change in bad state, so DB peer relations change event will not cause patroni related errors.
if self.charm.unit.status.message == CANNOT_RESTORE_PITR:
logger.info("Cannot change S3 configuration in bad PITR restore status")
event.defer()
return

# Prevents S3 change in the middle of restoring backup and patroni / pgbackrest errors caused by that.
if "restoring-backup" in self.charm.app_peer_data:
logger.info("Cannot change S3 configuration during restore")
event.defer()
return

if not self._render_pgbackrest_conf_file():
logger.debug("Cannot set pgBackRest configurations, missing configurations.")
return
Expand All @@ -567,6 +598,8 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
if not self.charm.is_primary:
return

self.charm.app_peer_data.pop("require-change-bucket-after-restore", None)

try:
self._create_bucket_if_not_exists()
except (ClientError, ValueError):
Expand All @@ -582,7 +615,11 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):

def _on_s3_credential_gone(self, _) -> None:
if self.charm.unit.is_leader():
self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""})
self.charm.app_peer_data.update({
"stanza": "",
"init-pgbackrest": "",
"require-change-bucket-after-restore": "",
})
self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
self.charm.unit.status = ActiveStatus()
Expand Down Expand Up @@ -753,20 +790,42 @@ def _on_restore_action(self, event):
return

backup_id = event.params.get("backup-id")
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
restore_to_time = event.params.get("restore-to-time")
logger.info(
f"A restore"
f"{' with backup-id ' + backup_id if backup_id else ''}"
f"{' to time point ' + restore_to_time if restore_to_time else ''}"
f" has been requested on the unit"
)

# Validate the provided backup id.
logger.info("Validating provided backup-id")
# Validate the provided backup id and restore to time.
logger.info("Validating provided backup-id and restore-to-time")
try:
backups = self._list_backups(show_failed=False)
if backup_id not in backups.keys():
if backup_id and backup_id not in backups.keys():
error_message = f"Invalid backup-id: {backup_id}"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
if not backup_id and restore_to_time and not backups:
error_message = "Cannot restore PITR without any backups created"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
except ListBackupsError as e:
logger.exception(e)
error_message = "Failed to retrieve backup id"
error_message = "Failed to retrieve backups list"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return

# Quick check for timestamp format
if (
restore_to_time
and restore_to_time != "latest"
and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time)
):
error_message = "Bad restore-to-time format"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
Expand All @@ -781,6 +840,17 @@ def _on_restore_action(self, event):
event.fail(error_message)
return

# Temporarily disable Patroni service auto-restart. This is required because point-in-time recovery can fail
# during the restore, i.e. during the cluster bootstrapping process. In that case, we need to be able to check the
# Patroni service status and logs. Disabling auto-restart is essential to prevent a wrong status from being
# reported and a race condition when reading the logs (as logs are cleared / moved when the service restarts).
if not self.charm.override_patroni_restart_condition("no", "restore-backup"):
error_message = "Failed to override Patroni restart condition"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
self._restart_database()
return

logger.info("Removing the contents of the data directory")
if not self._empty_data_files():
error_message = "Failed to remove contents of the data directory"
Expand All @@ -792,8 +862,12 @@ def _on_restore_action(self, event):
# Mark the cluster as in a restoring backup state and update the Patroni configuration.
logger.info("Configuring Patroni to restore the backup")
self.charm.app_peer_data.update({
"restoring-backup": self._fetch_backup_from_id(backup_id),
"restore-stanza": backups[backup_id],
"restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "",
"restore-stanza": backups[backup_id]
if backup_id
else self.charm.app_peer_data.get("stanza", self.stanza_name),
"restore-to-time": restore_to_time or "",
"require-change-bucket-after-restore": "True",
})
self.charm.update_config()

Expand Down Expand Up @@ -865,17 +939,20 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
event.fail(validation_message)
return False

if not event.params.get("backup-id"):
error_message = "Missing backup-id to restore"
if not event.params.get("backup-id") and not event.params.get("restore-to-time"):
error_message = (
"Missing backup-id or/and restore-to-time parameter to be able to do restore"
)
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return False

logger.info("Checking if cluster is in blocked state")
if (
self.charm.is_blocked
and self.charm.unit.status.message != ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
):
if self.charm.is_blocked and self.charm.unit.status.message not in [
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
CANNOT_RESTORE_PITR,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
]:
error_message = "Cluster or unit is in a blocking state"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
Expand Down Expand Up @@ -941,7 +1018,7 @@ def _render_pgbackrest_conf_file(self) -> bool:

def _restart_database(self) -> None:
    """Clear the restore markers in the app peer data, refresh the config and start Patroni."""
    charm = self.charm
    # Empty values reset both the backup being restored and the PITR target.
    cleared_restore_flags = {"restoring-backup": "", "restore-to-time": ""}
    charm.app_peer_data.update(cleared_restore_flags)
    charm.update_config()
    charm._patroni.start_patroni()

Expand Down
Loading

0 comments on commit 865732c

Please sign in to comment.