PITR improvements and fixes. Improve integration backup test.
Zvirovyi committed Jun 18, 2024
1 parent d8d8210 commit 3601f82
Showing 4 changed files with 46 additions and 21 deletions.
4 changes: 2 additions & 2 deletions lib/charms/postgresql_k8s/v0/postgresql.py
@@ -36,7 +36,7 @@

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 28
+LIBPATCH = 29

 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"

@@ -641,5 +641,5 @@ def get_last_archived_wal(self) -> str | None:
             cursor.execute("SELECT last_archived_wal FROM pg_stat_archiver;")
             return cursor.fetchone()[0]
     except psycopg2.Error as e:
-        logger.error(f"Failed to get PostgreSQL version: {e}")
+        logger.error(f"Failed to get PostgreSQL last archived WAL: {e}")
         raise PostgreSQLGetPostgreSQLVersionError()
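The corrected log message belongs to get_last_archived_wal, which reads the last_archived_wal column from pg_stat_archiver. A minimal standalone sketch of the same query with psycopg2 (the DSN below is a placeholder, not the connection string the charm builds):

import psycopg2

# Placeholder DSN; the charm library obtains its own connection instead.
dsn = "dbname=postgres user=operator password=secret host=127.0.0.1"

try:
    with psycopg2.connect(dsn) as connection, connection.cursor() as cursor:
        # Ask the archiver statistics view for the last WAL segment it archived.
        cursor.execute("SELECT last_archived_wal FROM pg_stat_archiver;")
        print(cursor.fetchone()[0])
except psycopg2.Error as e:
    print(f"Failed to get PostgreSQL last archived WAL: {e}")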
25 changes: 19 additions & 6 deletions src/backups.py
@@ -54,7 +54,7 @@
     ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
     FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
     FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
-    CANNOT_RESTORE_PITR,
+    MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
 ]

@@ -446,11 +446,7 @@ def _initialise_stanza(self) -> None:

         # Enable stanza initialisation if the backup settings were fixed after being invalid
         # or pointing to a repository where there are backups from another cluster.
-        if self.charm.is_blocked and self.charm.unit.status.message not in [
-            ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
-            FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
-            FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
-        ]:
+        if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES:
             logger.warning("couldn't initialize stanza due to a blocked status")
             return

@@ -577,6 +573,18 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
             event.defer()
             return

+        # Prevents config change in bad state, so DB peer relations change event will not cause patroni related errors.
+        if self.charm.unit.status.message == CANNOT_RESTORE_PITR:
+            logger.info("Cannot change S3 configuration in bad PITR restore status")
+            event.defer()
+            return
+
+        # Prevents S3 change in the middle of restoring backup and patroni / pgbackrest errors caused by that.
+        if "restoring-backup" in self.charm.app_peer_data:
+            logger.info("Cannot change S3 configuration during restore")
+            event.defer()
+            return
+
         if not self._render_pgbackrest_conf_file():
             logger.debug("Cannot set pgBackRest configurations, missing configurations.")
             return

@@ -806,6 +814,11 @@ def _on_restore_action(self, event):
             logger.error(f"Restore failed: {error_message}")
             event.fail(error_message)
             return
+        elif not self._list_backups(show_failed=False):
+            error_message = "Cannot restore PITR without any backups created"
+            logger.error(f"Restore failed: {error_message}")
+            event.fail(error_message)
+            return

         # Quick check for timestamp format
         if (
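The new blocks in _on_s3_credential_changed follow the charm's existing early-return style: if the unit is blocked with CANNOT_RESTORE_PITR, or a restore is still flagged in the peer data, the credential change is deferred so Patroni and pgBackRest are not reconfigured mid-restore. A simplified sketch of that guard shape, using hypothetical stand-ins instead of the real ops event and charm objects:

class FakeEvent:
    """Stand-in for an ops event; only defer() is modelled."""

    def __init__(self):
        self.deferred = False

    def defer(self):
        self.deferred = True


def on_s3_credential_changed(event, unit_status_message: str, app_peer_data: dict) -> None:
    # Defer while the unit reports a failed PITR restore (hypothetical message text).
    if unit_status_message == "cannot restore PITR":
        event.defer()
        return
    # Defer while a backup restore is still in progress.
    if "restoring-backup" in app_peer_data:
        event.defer()
        return
    # ... normal S3 reconfiguration would happen here ...


event = FakeEvent()
on_s3_credential_changed(event, "active", {"restoring-backup": "True"})
print(event.deferred)  # True: the change is postponed until the restore finishes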
10 changes: 5 additions & 5 deletions src/charm.py
@@ -1260,11 +1260,6 @@ def _on_update_status(self, _) -> None:
             return

         if "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data:
-            if "failed" in self._patroni.get_member_status(self._member_name):
-                logger.error("Restore failed: database service failed to start")
-                self.unit.status = BlockedStatus("Failed to restore backup")
-                return
-
             if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed()):
                 logger.error(
                     "Restore failed: database service failed to reach point-in-time-recovery target. "

@@ -1274,6 +1269,11 @@ def _on_update_status(self, _) -> None:
                 self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
                 return

+            if "failed" in self._patroni.get_member_status(self._member_name):
+                logger.error("Restore failed: database service failed to start")
+                self.unit.status = BlockedStatus("Failed to restore backup")
+                return
+
         if not self._patroni.member_started:
             logger.debug("on_update_status early exit: Patroni has not started yet")
             return
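Read together, the two charm.py hunks reorder the restore checks in _on_update_status: the point-in-time-recovery target check now runs before the generic "service failed to start" check, so a missed PITR target is reported as CANNOT_RESTORE_PITR rather than the less specific "Failed to restore backup". A condensed sketch of the resulting order, with the Patroni helper results passed in as plain values since those helpers are not shown here:

def classify_restore_failure(
    restoring_backup: bool, restore_to_time: bool, pitr_failed: bool, member_failed: bool
) -> str | None:
    """Return the blocked-status message a unit would set, or None if there is no failure."""
    if not (restoring_backup or restore_to_time):
        return None
    # The PITR target failure is checked first, so it wins over the generic message.
    if restore_to_time and pitr_failed:
        return "cannot restore PITR"  # stands in for CANNOT_RESTORE_PITR
    if member_failed:
        return "Failed to restore backup"
    return None


# A unit that both missed its PITR target and failed to start now reports the PITR message.
print(classify_restore_failure(False, True, True, True))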
28 changes: 20 additions & 8 deletions tests/integration/test_backups.py
@@ -19,7 +19,6 @@
     get_password,
     get_primary,
     get_unit_address,
-    scale_application,
     switchover,
     wait_for_idle_on_blocked,
 )

@@ -277,7 +276,10 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], charm

     # Wait for the restore to complete.
     async with ops_test.fast_forward():
-        await ops_test.model.wait_for_idle(status="active", timeout=1000)
+        await ops_test.model.block_until(
+            lambda: remaining_unit.workload_status_message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
+            timeout=1000,
+        )

     # Check that the backup was correctly restored by having only the first created table.
     primary = await get_primary(ops_test, remaining_unit.name)

@@ -315,16 +317,20 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], charm
     await ops_test.model.applications[database_app_name].remove_relation(
         f"{database_app_name}:certificates", f"{TLS_CERTIFICATES_APP_NAME}:certificates"
     )
-    await ops_test.model.wait_for_idle(
-        apps=[database_app_name], status="active", timeout=1000
-    )

-    # Scale up to be able to test primary and leader being different.
+    new_unit_name = f"{database_app_name}/2"
+
     async with ops_test.fast_forward():
-        await scale_application(ops_test, database_app_name, 2)
+        # Scale up to be able to test primary and leader being different.
+        await ops_test.model.applications[database_app_name].add_units(1)
+        # Ensure that new unit become in blocked status, but is fully functional.
+        await ops_test.model.block_until(
+            lambda: ops_test.model.units.get(new_unit_name).workload_status_message
+            == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
+            timeout=1000,
+        )

     # Ensure replication is working correctly.
-    new_unit_name = f"{database_app_name}/2"
     address = get_unit_address(ops_test, new_unit_name)
     with db_connect(
         host=address, password=password

@@ -362,6 +368,12 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict], charm
     await action.wait()
     backups = action.results.get("backups")
     assert backups, "backups not outputted"
+
+    # Remove S3 relation to ensure "move to another cluster" blocked status is gone
+    await ops_test.model.applications[database_app_name].remove_relation(
+        f"{database_app_name}:s3-parameters", f"{S3_INTEGRATOR_APP_NAME}:s3-credentials"
+    )
+
     await ops_test.model.wait_for_idle(status="active", timeout=1000)

     # Remove the database app.
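The integration test now waits for a specific blocked-status message instead of an "active" idle state, because a cluster restored from another cluster's backups is expected to stay blocked with MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET until the S3 relation is removed. Waiting on a unit's workload_status_message via Model.block_until is a common python-libjuju pattern; a hedged sketch of that wait in isolation (the application name, unit name, and message text below are illustrative, not taken from this repository):

import pytest
from pytest_operator.plugin import OpsTest

EXPECTED_MESSAGE = "Move restored cluster to another S3 bucket"  # illustrative text


@pytest.mark.abort_on_fail
async def test_wait_for_blocked_message(ops_test: OpsTest) -> None:
    unit = ops_test.model.units.get("postgresql/0")  # illustrative unit name
    # Block until the unit advertises the expected workload status message.
    await ops_test.model.block_until(
        lambda: unit.workload_status_message == EXPECTED_MESSAGE,
        timeout=1000,
    )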
