diff --git a/src/backups.py b/src/backups.py index 95e4be6ac6..ca12443759 100644 --- a/src/backups.py +++ b/src/backups.py @@ -10,20 +10,25 @@ import re import shutil import tempfile +import time from datetime import datetime, timezone +from io import BytesIO from pathlib import Path from subprocess import TimeoutExpired, run import boto3 as boto3 import botocore from botocore.exceptions import ClientError -from charms.data_platform_libs.v0.s3 import CredentialsChangedEvent, S3Requirer +from charms.data_platform_libs.v0.s3 import ( + CredentialsChangedEvent, + S3Requirer, +) from charms.operator_libs_linux.v2 import snap from jinja2 import Template -from ops.charm import ActionEvent +from ops.charm import ActionEvent, HookEvent from ops.framework import Object from ops.jujuversion import JujuVersion -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus +from ops.model import ActiveStatus, MaintenanceStatus from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from constants import ( @@ -74,6 +79,9 @@ def __init__(self, charm, relation_name: str): self.framework.observe( self.s3_client.on.credentials_changed, self._on_s3_credential_changed ) + # When the leader unit is being removed, s3_client.on.credentials_gone is performed on it (and only on it). + # After a new leader is elected, the S3 connection must be reinitialized. + self.framework.observe(self.charm.on.leader_elected, self._on_s3_credential_changed) self.framework.observe(self.s3_client.on.credentials_gone, self._on_s3_credential_gone) self.framework.observe(self.charm.on.create_backup_action, self._on_create_backup_action) self.framework.observe(self.charm.on.list_backups_action, self._on_list_backups_action) @@ -157,7 +165,18 @@ def _can_unit_perform_backup(self) -> tuple[bool, str | None]: def can_use_s3_repository(self) -> tuple[bool, str | None]: """Returns whether the charm was configured to use another cluster repository.""" - # Prevent creating backups and storing in another cluster repository. + # Check model uuid + s3_parameters, _ = self._retrieve_s3_parameters() + s3_model_uuid = self._read_content_from_s3( + "model-uuid.txt", + s3_parameters, + ) + if s3_model_uuid and s3_model_uuid.strip() != self.model.uuid: + logger.debug( + f"can_use_s3_repository: incompatible model-uuid s3={s3_model_uuid.strip()}, local={self.model.uuid}" + ) + return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE + try: return_code, stdout, stderr = self._execute_command( [PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, "info", "--output=json"], @@ -175,29 +194,30 @@ def can_use_s3_repository(self) -> tuple[bool, str | None]: logger.error(stderr) return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE - if self.charm.unit.is_leader(): - for stanza in json.loads(stdout): - return_code, system_identifier_from_instance, error = self._execute_command([ - f'/snap/charmed-postgresql/current/usr/lib/postgresql/{self.charm._patroni.get_postgresql_version().split(".")[0]}/bin/pg_controldata', - POSTGRESQL_DATA_PATH, - ]) - if return_code != 0: - raise Exception(error) - system_identifier_from_instance = next( - line - for line in system_identifier_from_instance.splitlines() - if "Database system identifier" in line - ).split(" ")[-1] - system_identifier_from_stanza = str(stanza.get("db")[0]["system-id"]) - if system_identifier_from_instance != system_identifier_from_stanza or stanza.get( - "name" - ) != self.charm.app_peer_data.get("stanza", self.stanza_name): - # Prevent archiving of WAL files. 
- self.charm.app_peer_data.update({"stanza": ""}) - self.charm.update_config() - if self.charm._patroni.member_started: - self.charm._patroni.reload_patroni_configuration() - return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE + for stanza in json.loads(stdout): + if stanza.get("name") != self.stanza_name: + logger.debug( + f"can_use_s3_repository: incompatible stanza name s3={stanza.get('name', '')}, local={self.stanza_name}" + ) + return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE + + return_code, system_identifier_from_instance, error = self._execute_command([ + f'/snap/charmed-postgresql/current/usr/lib/postgresql/{self.charm._patroni.get_postgresql_version().split(".")[0]}/bin/pg_controldata', + POSTGRESQL_DATA_PATH, + ]) + if return_code != 0: + raise Exception(error) + system_identifier_from_instance = next( + line + for line in system_identifier_from_instance.splitlines() + if "Database system identifier" in line + ).split(" ")[-1] + system_identifier_from_stanza = str(stanza.get("db")[0]["system-id"]) + if system_identifier_from_instance != system_identifier_from_stanza: + logger.debug( + f"can_use_s3_repository: incompatible system identifier s3={system_identifier_from_stanza}, local={system_identifier_from_instance}" + ) + return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE return True, None @@ -551,71 +571,90 @@ def _parse_backup_id(self, label) -> tuple[str, str]: backup_type, ) - def _initialise_stanza(self) -> None: + def _initialise_stanza(self, event: HookEvent) -> bool: """Initialize the stanza. - A stanza is the configuration for a PostgreSQL database cluster that defines where it is - located, how it will be backed up, archiving options, etc. (more info in + A stanza is the configuration for a PostgreSQL database cluster that defines where it is located, how it will + be backed up, archiving options, etc. (more info in https://pgbackrest.org/user-guide.html#quickstart/configure-stanza). - """ - if not self.charm.is_primary: - return + Returns: + whether stanza initialization was successful. + """ # Enable stanza initialisation if the backup settings were fixed after being invalid # or pointing to a repository where there are backups from another cluster. if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES: - logger.warning("couldn't initialize stanza due to a blocked status") - return + logger.warning("couldn't initialize stanza due to a blocked status, deferring event") + event.defer() + return False self.charm.unit.status = MaintenanceStatus("initialising stanza") # Create the stanza. - return_code, _, stderr = self._execute_command([ - PGBACKREST_EXECUTABLE, - PGBACKREST_CONFIGURATION_FILE, - f"--stanza={self.stanza_name}", - "stanza-create", - ]) - if return_code == 49: - # Raise an error if the connection timeouts, so the user has the possibility to - # fix network issues and call juju resolve to re-trigger the hook that calls - # this method. - logger.error( - f"error: {stderr} - please fix the error and call juju resolve on this unit" - ) - raise TimeoutError - if return_code != 0: - logger.error(stderr) - self.charm.unit.status = BlockedStatus(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) - return + try: + # If TLS is enabled, all the units in the cluster must run the pgBackRest service for validation to + # complete successfully, and the other units start that service upon receiving the same parent event.
# Therefore, the first retries may fail while those units are still starting the service. 60s is given + # for that, or else the s3 initialization sequence will fail. + for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True): + with attempt: + return_code, _, stderr = self._execute_command([ + PGBACKREST_EXECUTABLE, + PGBACKREST_CONFIGURATION_FILE, + f"--stanza={self.stanza_name}", + "stanza-create", + ]) + if return_code == 49: + # Raise an error if the connection times out, so the user has the possibility to + # fix network issues and call juju resolve to re-trigger the hook that calls + # this method. + logger.error( + f"error: {stderr} - please fix the error and call juju resolve on this unit" + ) + raise TimeoutError + if return_code != 0: + raise Exception(stderr) + except TimeoutError as e: + raise e + except Exception as e: + # If the stanza creation fails, remove the stanza name + # and roll back the configuration. + logger.exception(e) + self._s3_initialization_set_failure(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) + return False self.start_stop_pgbackrest_service() - # Store the stanza name to be used in configurations updates. + # The rest of the successful s3 initialization sequence, such as clearing s3-initialization-start and + # setting s3-initialization-done, is left to the check_stanza func. if self.charm.unit.is_leader(): self.charm.app_peer_data.update({ "stanza": self.stanza_name, - "init-pgbackrest": "True", }) else: self.charm.unit_peer_data.update({ "stanza": self.stanza_name, - "init-pgbackrest": "True", }) - def check_stanza(self) -> None: - """Runs the pgbackrest stanza validation.""" - if not self.charm.is_primary or "init-pgbackrest" not in self.charm.app_peer_data: - return + return True + + def check_stanza(self) -> bool: + """Runs the pgbackrest stanza validation. + + Returns: + whether stanza validation was successful. + """ # Update the configuration to use pgBackRest as the archiving mechanism. self.charm.update_config() self.charm.unit.status = MaintenanceStatus("checking stanza") try: - # Check that the stanza is correctly configured. - for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)): + # If TLS is enabled, all the units in the cluster must run the pgBackRest service for validation to + # complete successfully, and the other units start that service upon receiving the same parent event. + # Therefore, the first retries may fail while those units are still starting the service. 60s is given + # for that, or else the s3 initialization sequence will fail. + for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True): + with attempt: + if self.charm._patroni.member_started: + self.charm._patroni.reload_patroni_configuration() @@ -627,44 +666,50 @@ def check_stanza(self) -> None: ]) if return_code != 0: raise Exception(stderr) - self.charm.unit.status = ActiveStatus() - except RetryError as e: + self.charm._set_primary_status_message() + except Exception as e: # If the check command doesn't succeed, remove the stanza name # and rollback the configuration.
- self.charm.app_peer_data.update({"stanza": ""}) - self.charm.app_peer_data.pop("init-pgbackrest", None) - self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) - self.charm.update_config() - logger.exception(e) - self.charm.unit.status = BlockedStatus(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) + self._s3_initialization_set_failure(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) + self.charm.update_config() + return False if self.charm.unit.is_leader(): - self.charm.app_peer_data.pop("init-pgbackrest", None) - self.charm.unit_peer_data.pop("init-pgbackrest", None) + self.charm.app_peer_data.update({ + "s3-initialization-start": "", + }) + else: + self.charm.unit_peer_data.update({"s3-initialization-done": "True"}) + + return True def coordinate_stanza_fields(self) -> None: """Coordinate the stanza name between the primary and the leader units.""" - for unit, unit_data in self.charm._peers.data.items(): - if "stanza" not in unit_data: + if ( + not self.charm.unit.is_leader() + or "s3-initialization-start" not in self.charm.app_peer_data + ): + return + + for _unit, unit_data in self.charm._peers.data.items(): + if "s3-initialization-done" not in unit_data: continue - # If the stanza name is not set in the application databag, then the primary is not - # the leader unit, and it's needed to set the stanza name in the application databag. - if "stanza" not in self.charm.app_peer_data and self.charm.unit.is_leader(): - self.charm.app_peer_data.update({ - "stanza": self.stanza_name, - "init-pgbackrest": "True", - }) - break - # If the stanza was already checked and its name is still in the unit databag, mark - # the stanza as already checked in the application databag and remove it from the - # unit databag. - if "init-pgbackrest" not in unit_data: - if self.charm.unit.is_leader(): - self.charm.app_peer_data.pop("init-pgbackrest", None) - if "init-pgbackrest" not in self.charm.app_peer_data and unit == self.charm.unit: - self.charm.unit_peer_data.update({"stanza": ""}) - break + + self.charm.app_peer_data.update({ + "stanza": unit_data.get("stanza", ""), + "s3-initialization-block-message": unit_data.get( + "s3-initialization-block-message", "" + ), + "s3-initialization-start": "", + "s3-initialization-done": "True", + }) + + self.charm.update_config() + if self.charm._patroni.member_started: + self.charm._patroni.reload_patroni_configuration() + + break @property def _is_primary_pgbackrest_service_running(self) -> bool: @@ -697,7 +742,10 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): return # Prevents S3 change in the middle of restoring backup and patroni / pgbackrest errors caused by that. - if "restoring-backup" in self.charm.app_peer_data: + if ( + "restoring-backup" in self.charm.app_peer_data + or "restore-to-time" in self.charm.app_peer_data + ): logger.info("Cannot change S3 configuration during restore") event.defer() return - # Verify the s3 relation only on the primary. - if not self.charm.is_primary: - return + # Start the pgBackRest service so that check_stanza can succeed. When TLS is enabled, the service must run on all the units.
+ self.start_stop_pgbackrest_service() + + if self.charm.unit.is_leader(): + self.charm.app_peer_data.update({ + "s3-initialization-block-message": "", + "s3-initialization-start": time.asctime(time.gmtime()), + "stanza": "", + "s3-initialization-done": "", + }) + if not self.charm.is_primary: + self.charm._set_primary_status_message() + + if self.charm.is_primary and "s3-initialization-done" not in self.charm.unit_peer_data: + self._on_s3_credential_changed_primary(event) + + if self.charm.is_standby_leader: + logger.info( + "S3 credentials will not be connected on standby cluster until it becomes primary" + ) + + def _on_s3_credential_changed_primary(self, event: HookEvent) -> bool: + """Stanza must be cleared before calling this function.""" + self.charm.update_config() + if self.charm._patroni.member_started: + self.charm._patroni.reload_patroni_configuration() try: self._create_bucket_if_not_exists() except (ClientError, ValueError): - self.charm.unit.status = BlockedStatus(FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE) - return + self._s3_initialization_set_failure(FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE) + return False can_use_s3_repository, validation_message = self.can_use_s3_repository() if not can_use_s3_repository: - self.charm.unit.status = BlockedStatus(validation_message) - return + self._s3_initialization_set_failure(validation_message) + return False - self._initialise_stanza() + if not self._initialise_stanza(event): + return False + + if not self.check_stanza(): + return False + + s3_parameters, _ = self._retrieve_s3_parameters() + self._upload_content_to_s3( + self.model.uuid, + "model-uuid.txt", + s3_parameters, + ) + + return True def _on_s3_credential_gone(self, _) -> None: if self.charm.unit.is_leader(): self.charm.app_peer_data.update({ "stanza": "", - "init-pgbackrest": "", + "s3-initialization-start": "", + "s3-initialization-done": "", + "s3-initialization-block-message": "", }) - self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) + self.charm.unit_peer_data.update({ + "stanza": "", + "s3-initialization-start": "", + "s3-initialization-done": "", + "s3-initialization-block-message": "", + }) if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES: - self.charm.unit.status = ActiveStatus() + self.charm._set_primary_status_message() def _on_create_backup_action(self, event) -> None: """Request that pgBackRest creates a backup.""" @@ -779,10 +870,7 @@ def _on_create_backup_action(self, event) -> None: """ if not self._upload_content_to_s3( metadata, - os.path.join( - s3_parameters["path"], - f"backup/{self.stanza_name}/latest", - ), + f"backup/{self.stanza_name}/latest", s3_parameters, ): error_message = "Failed to upload metadata to provided S3" @@ -853,10 +941,7 @@ def _run_backup( """ self._upload_content_to_s3( logs, - os.path.join( - s3_parameters["path"], - f"backup/{self.stanza_name}/{backup_id}/backup.log", - ), + f"backup/{self.stanza_name}/{backup_id}/backup.log", s3_parameters, ) error_message = f"Failed to backup PostgreSQL with error: {stderr}" @@ -881,10 +966,7 @@ def _run_backup( """ if not self._upload_content_to_s3( logs, - os.path.join( - s3_parameters["path"], - f"backup/{self.stanza_name}/{backup_id}/backup.log", - ), + f"backup/{self.stanza_name}/{backup_id}/backup.log", s3_parameters, ): error_message = "Error uploading logs to S3" @@ -996,6 +1078,7 @@ def _on_restore_action(self, event): # noqa: C901 "restore-stanza": restore_stanza_timeline[0], "restore-timeline": restore_stanza_timeline[1] 
if restore_to_time else "", "restore-to-time": restore_to_time or "", + "s3-initialization-block-message": "", }) self.charm.update_config() @@ -1230,7 +1313,11 @@ def start_stop_pgbackrest_service(self) -> bool: return False # Stop the service if TLS is not enabled or there are no replicas. - if not self.charm.is_tls_enabled or len(self.charm._peer_members_ips) == 0: + if ( + not self.charm.is_tls_enabled + or len(self.charm._peer_members_ips) == 0 + or self.charm._patroni.get_standby_leader() + ): charmed_postgresql_snap.stop(services=["pgbackrest-service"]) return True @@ -1243,12 +1330,12 @@ def start_stop_pgbackrest_service(self) -> bool: return True def _upload_content_to_s3( - self: str, + self, content: str, s3_path: str, s3_parameters: dict, ) -> bool: - """Uploads the provided contents to the provided S3 bucket. + """Uploads the provided contents to the provided S3 bucket relative to the path from the S3 config. Args: content: The content to upload to S3 @@ -1261,10 +1348,9 @@ def _upload_content_to_s3( a boolean indicating success. """ bucket_name = s3_parameters["bucket"] - s3_path = os.path.join(s3_parameters["path"], s3_path).lstrip("/") - logger.info(f"Uploading content to bucket={s3_parameters['bucket']}, path={s3_path}") + processed_s3_path = os.path.join(s3_parameters["path"], s3_path).lstrip("/") try: - logger.info(f"Uploading content to bucket={bucket_name}, path={s3_path}") + logger.info(f"Uploading content to bucket={bucket_name}, path={processed_s3_path}") session = boto3.session.Session( aws_access_key_id=s3_parameters["access-key"], aws_secret_access_key=s3_parameters["secret-key"], @@ -1281,11 +1367,89 @@ def _upload_content_to_s3( with tempfile.NamedTemporaryFile() as temp_file: temp_file.write(content.encode("utf-8")) temp_file.flush() - bucket.upload_file(temp_file.name, s3_path) + bucket.upload_file(temp_file.name, processed_s3_path) except Exception as e: logger.exception( - f"Failed to upload content to S3 bucket={bucket_name}, path={s3_path}", exc_info=e + f"Failed to upload content to S3 bucket={bucket_name}, path={processed_s3_path}", + exc_info=e, ) return False return True + + def _read_content_from_s3(self, s3_path: str, s3_parameters: dict) -> str | None: + """Reads the specified content from the provided S3 bucket relative to the path from the S3 config. + + Args: + s3_path: The S3 path from which to download the content + s3_parameters: A dictionary containing the S3 parameters + The following are expected keys in the dictionary: bucket, region, + endpoint, access-key and secret-key + + Returns: + a string with the content if the object was successfully downloaded, or None if the file does not + exist or an error occurred during the download.
+ """ + bucket_name = s3_parameters["bucket"] + processed_s3_path = os.path.join(s3_parameters["path"], s3_path).lstrip("/") + try: + logger.info(f"Reading content from bucket={bucket_name}, path={processed_s3_path}") + session = boto3.session.Session( + aws_access_key_id=s3_parameters["access-key"], + aws_secret_access_key=s3_parameters["secret-key"], + region_name=s3_parameters["region"], + ) + s3 = session.resource( + "s3", + endpoint_url=self._construct_endpoint(s3_parameters), + verify=(self._tls_ca_chain_filename or None), + ) + bucket = s3.Bucket(bucket_name) + with BytesIO() as buf: + bucket.download_fileobj(processed_s3_path, buf) + return buf.getvalue().decode("utf-8") + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + logger.info( + f"No such object to read from S3 bucket={bucket_name}, path={processed_s3_path}" + ) + else: + logger.exception( + f"Failed to read content from S3 bucket={bucket_name}, path={processed_s3_path}", + exc_info=e, + ) + except Exception as e: + logger.exception( + f"Failed to read content from S3 bucket={bucket_name}, path={processed_s3_path}", + exc_info=e, + ) + + return None + + def _s3_initialization_set_failure( + self, block_message: str, update_leader_status: bool = True + ): + """Sets failed s3 initialization status with corresponding block_message in the app_peer_data (leader) or unit_peer_data (primary). + + Args: + block_message: s3 initialization block message + update_leader_status: whether to update leader status (with s3-initialization-block-message set already) + immediately; defaults to True + """ + if self.charm.unit.is_leader(): + # If performed on the leader, then leader == primary and there is no need to sync s3 data between two + # different units. Therefore, there is no need for s3-initialization-done key, and it is sufficient to clear + # s3-initialization-start key only. 
+ self.charm.app_peer_data.update({ + "s3-initialization-block-message": block_message, + "s3-initialization-start": "", + "stanza": "", + }) + if update_leader_status: + self.charm._set_primary_status_message() + else: + self.charm.unit_peer_data.update({ + "s3-initialization-block-message": block_message, + "s3-initialization-done": "True", + "stanza": "", + }) diff --git a/src/charm.py b/src/charm.py index 5a82e993c9..4ffcdd632b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -12,6 +12,7 @@ import re import subprocess import sys +import time from datetime import datetime from pathlib import Path from typing import Literal, get_args @@ -54,7 +55,7 @@ ) from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed -from backups import CANNOT_RESTORE_PITR, PostgreSQLBackups +from backups import CANNOT_RESTORE_PITR, S3_BLOCK_MESSAGES, PostgreSQLBackups from cluster import ( NotReadyError, Patroni, @@ -296,8 +297,7 @@ def get_secret(self, scope: Scopes, key: str) -> str | None: if scope not in get_args(Scopes): raise RuntimeError("Unknown secret scope.") - peers = self.model.get_relation(PEER) - if not peers: + if not (peers := self.model.get_relation(PEER)): return None secret_key = self._translate_field_to_secret_key(key) # Old translation in databag is to be taken @@ -314,7 +314,8 @@ def set_secret(self, scope: Scopes, key: str, value: str | None) -> str | None: if not value: return self.remove_secret(scope, key) - peers = self.model.get_relation(PEER) + if not (peers := self.model.get_relation(PEER)): + return None secret_key = self._translate_field_to_secret_key(key) # Old translation in databag is to be deleted self.peer_relation_data(scope).delete_relation_data(peers.id, [key]) @@ -325,7 +326,8 @@ def remove_secret(self, scope: Scopes, key: str) -> None: if scope not in get_args(Scopes): raise RuntimeError("Unknown secret scope.") - peers = self.model.get_relation(PEER) + if not (peers := self.model.get_relation(PEER)): + return None secret_key = self._translate_field_to_secret_key(key) if scope == APP_SCOPE: self.peer_relation_app.delete_relation_data(peers.id, [secret_key]) @@ -509,7 +511,7 @@ def _on_pgdata_storage_detaching(self, _) -> None: if self.primary_endpoint: self._update_relation_endpoints() - def _on_peer_relation_changed(self, event: HookEvent): + def _on_peer_relation_changed(self, event: HookEvent): # noqa: C901 """Reconfigure cluster members when something changes.""" # Prevents the cluster to be reconfigured before it's bootstrapped in the leader. if "cluster_initialised" not in self._peers.data[self.app]: @@ -542,11 +544,15 @@ def _on_peer_relation_changed(self, event: HookEvent): logger.error("Invalid configuration: %s", str(e)) return - # If PITR restore failed, then wait it for resolve. + # Should not override a blocked status + if isinstance(self.unit.status, BlockedStatus): + logger.debug("on_peer_relation_changed early exit: Unit in blocked status") + return + if ( "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data - ) and isinstance(self.unit.status, BlockedStatus): - event.defer() + ) and not self._was_restore_successful(): + logger.debug("on_peer_relation_changed early exit: Backup restore check failed") return # Start can be called here multiple times as it's idempotent. 
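Reviewer note on the hunk below: this diff replaces the old `init-pgbackrest` flag with a four-key handshake between the primary (which runs the pgBackRest checks) and the leader (which owns the app databag). A minimal sketch of that protocol, with plain dicts standing in for the Juju peer databags — the key names and values mirror this diff, while the helper functions themselves are illustrative, not charm code:

```python
def leader_start(app_data: dict) -> None:
    # _on_s3_credential_changed: the leader stamps a start marker and clears
    # any previous result before the primary re-runs the S3 checks.
    app_data.update({
        "s3-initialization-start": "Thu Feb 24 05:00:00 2022",  # time.asctime(time.gmtime())
        "stanza": "",
        "s3-initialization-done": "",
        "s3-initialization-block-message": "",
    })


def primary_report(unit_data: dict, stanza: str = "", block_message: str = "") -> None:
    # check_stanza / _s3_initialization_set_failure: a non-leader primary
    # publishes either the validated stanza name or a block message.
    unit_data.update({
        "stanza": stanza,
        "s3-initialization-block-message": block_message,
        "s3-initialization-done": "True",
    })


def leader_sync(app_data: dict, unit_databags: list[dict]) -> None:
    # coordinate_stanza_fields: once any unit reports done, the leader copies
    # the result into the app databag and closes the sequence. (Juju drops
    # keys set to "", hence the truthiness check instead of `in`.)
    if not app_data.get("s3-initialization-start"):
        return
    for unit_data in unit_databags:
        if unit_data.get("s3-initialization-done") != "True":
            continue
        app_data.update({
            "stanza": unit_data.get("stanza", ""),
            "s3-initialization-block-message": unit_data.get(
                "s3-initialization-block-message", ""
            ),
            "s3-initialization-start": "",
            "s3-initialization-done": "True",
        })
        break


app_data, primary_unit_data = {}, {}
leader_start(app_data)
primary_report(primary_unit_data, stanza="mymodel.postgresql")
leader_sync(app_data, [primary_unit_data])
assert app_data["stanza"] == "mymodel.postgresql"
```

The `_on_peer_relation_changed` hunk that follows is the other half of the handshake: a primary that sees `s3-initialization-start` in the app databag (for instance after a leader change) re-runs `_on_s3_credential_changed_primary`, and `coordinate_stanza_fields` (also called from `_on_update_status`) later syncs the result back to the leader.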
@@ -579,6 +585,24 @@ def _on_peer_relation_changed(self, event: HookEvent): self._start_stop_pgbackrest_service(event) + # This is intended to be executed only when leader is reinitializing S3 connection due to the leader change. + if ( + "s3-initialization-start" in self.app_peer_data + and "s3-initialization-done" not in self.unit_peer_data + and self.is_primary + and not self.backup._on_s3_credential_changed_primary(event) + ): + return + + # Clean-up unit initialization data after successful sync to the leader. + if "s3-initialization-done" in self.app_peer_data and not self.unit.is_leader(): + self.unit_peer_data.update({ + "stanza": "", + "s3-initialization-block-message": "", + "s3-initialization-done": "", + "s3-initialization-start": "", + }) + self._update_new_unit_status() # Split off into separate function, because of complexity _on_peer_relation_changed @@ -593,8 +617,6 @@ def _start_stop_pgbackrest_service(self, event: HookEvent) -> None: self.backup.coordinate_stanza_fields() - self.backup.check_stanza() - if "exporter-started" not in self.unit_peer_data: self._setup_exporter() @@ -1341,6 +1363,8 @@ def _on_update_status(self, _) -> None: # Update the sync-standby endpoint in the async replication data. self.async_replication.update_async_replication_data() + self.backup.coordinate_stanza_fields() + self._set_primary_status_message() # Restart topology observer if it is gone @@ -1394,8 +1418,12 @@ def _was_restore_successful(self) -> bool: can_use_s3_repository, validation_message = self.backup.can_use_s3_repository() if not can_use_s3_repository: - self.unit.status = BlockedStatus(validation_message) - return False + self.app_peer_data.update({ + "stanza": "", + "s3-initialization-start": "", + "s3-initialization-done": "", + "s3-initialization-block-message": validation_message, + }) return True @@ -1407,7 +1435,7 @@ def _can_run_on_update_status(self) -> bool: logger.debug("Early exit on_update_status: upgrade in progress") return False - if self.is_blocked: + if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES: # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: self.enable_disable_extensions() @@ -1469,6 +1497,11 @@ def _handle_workload_failures(self) -> bool: def _set_primary_status_message(self) -> None: """Display 'Primary' in the unit status message if the current unit is the primary.""" try: + if self.unit.is_leader() and "s3-initialization-block-message" in self.app_peer_data: + self.unit.status = BlockedStatus( + self.app_peer_data["s3-initialization-block-message"] + ) + return if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name: self.unit.status = ActiveStatus("Primary") elif self.is_standby_leader: @@ -1663,7 +1696,7 @@ def update_config(self, is_creating_backup: bool = False) -> bool: pitr_target=self.app_peer_data.get("restore-to-time"), restore_timeline=self.app_peer_data.get("restore-timeline"), restore_to_latest=self.app_peer_data.get("restore-to-time", None) == "latest", - stanza=self.app_peer_data.get("stanza"), + stanza=self.app_peer_data.get("stanza", self.unit_peer_data.get("stanza")), restore_stanza=self.app_peer_data.get("restore-stanza"), parameters=pg_parameters, ) @@ -1858,16 +1891,26 @@ def is_pitr_failed(self) -> tuple[bool, bool]: - Is patroni service failed to bootstrap cluster. - Is it new fail, that wasn't observed previously. 
""" - patroni_logs = self._patroni.patroni_logs() - patroni_exceptions = re.findall( - r"^([0-9-:TZ]+).*patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$", - patroni_logs, - re.MULTILINE, - ) + patroni_exceptions = [] + count = 0 + while len(patroni_exceptions) == 0 and count < 10: + if count > 0: + time.sleep(3) + patroni_logs = self._patroni.patroni_logs(num_lines="all") + patroni_exceptions = re.findall( + r"^([0-9-:TZ]+).*patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$", + patroni_logs, + re.MULTILINE, + ) + count += 1 + if len(patroni_exceptions) > 0: + logger.debug("Failures to bootstrap cluster detected on Patroni service logs") old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None) self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1] return True, patroni_exceptions[-1] != old_pitr_fail_id + + logger.debug("No failures detected on Patroni service logs") return False, False def log_pitr_last_transaction_time(self) -> None: diff --git a/src/cluster.py b/src/cluster.py index 0e0e1f5641..c77d801375 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -654,7 +654,7 @@ def start_patroni(self) -> bool: logger.exception(error_message, exc_info=e) return False - def patroni_logs(self, num_lines: int | None = 10) -> str: + def patroni_logs(self, num_lines: int | str | None = 10) -> str: """Get Patroni snap service logs. Executes only on current unit. Args: diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 6dc2702248..da24fe6e3c 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -206,24 +206,26 @@ def test_can_use_s3_repository(harness): ) as _get_postgresql_version, patch("charm.PostgresqlOperatorCharm.postgresql") as _postgresql, patch( - "charms.postgresql_k8s.v0.postgresql.PostgreSQL.get_last_archived_wal" - ) as _get_last_archived_wal, + "charm.PostgreSQLBackups._retrieve_s3_parameters", + return_value=({"path": "example"}, None), + ), + patch("charm.PostgreSQLBackups._read_content_from_s3") as _read_content_from_s3, ): - peer_rel_id = harness.model.get_relation(PEER).id - # Define the stanza name inside the unit relation data. - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": harness.charm.backup.stanza_name}, - ) + # Test with bad model-uuid. + _read_content_from_s3.return_value = "bad" + assert harness.charm.backup.can_use_s3_repository() == ( + False, + ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, + ) # Test when nothing is returned from the pgBackRest info command. + _read_content_from_s3.return_value = harness.model.uuid _execute_command.side_effect = TimeoutExpired(cmd="fake command", timeout=30) with pytest.raises(TimeoutError): harness.charm.backup.can_use_s3_repository() assert False + # Test with bad pgBackRest error code. _execute_command.side_effect = None _execute_command.return_value = (1, "", "") assert harness.charm.backup.can_use_s3_repository() == ( @@ -231,35 +233,11 @@ def test_can_use_s3_repository(harness): FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, ) - # Test when the unit is a replica. pgbackrest_info_same_cluster_backup_output = ( 0, f'[{{"db": [{{"system-id": "12345"}}], "name": "{harness.charm.backup.stanza_name}"}}]', "", ) - _execute_command.return_value = pgbackrest_info_same_cluster_backup_output - assert harness.charm.backup.can_use_s3_repository() == (True, None) - - # Assert that the stanza name is still in the unit relation data. 
- assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": harness.charm.backup.stanza_name - } - - # Test when the unit is the leader and the workload is running, - # but an exception happens when retrieving the cluster system id. - _member_started.return_value = True - _execute_command.side_effect = [ - pgbackrest_info_same_cluster_backup_output, - (1, "", "fake error"), - ] - with harness.hooks_disabled(): - harness.set_leader() - with pytest.raises(Exception, match="fake error"): - harness.charm.backup.can_use_s3_repository() - assert False - _update_config.assert_not_called() - _member_started.assert_not_called() - _reload_patroni_configuration.assert_not_called() # Test when the cluster system id can be retrieved, but it's different from the stanza system id. pgbackrest_info_other_cluster_system_id_backup_output = ( @@ -276,27 +254,12 @@ def test_can_use_s3_repository(harness): pgbackrest_info_other_cluster_system_id_backup_output, other_instance_system_identifier_output, ] - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": harness.charm.backup.stanza_name}, - ) assert harness.charm.backup.can_use_s3_repository() == ( False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, ) - _update_config.assert_called_once() - _member_started.assert_called_once() - _reload_patroni_configuration.assert_called_once() - - # Assert that the stanza name is not present in the unit relation data anymore. - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} # Test when the cluster system id can be retrieved, but it's different from the stanza system id. - _update_config.reset_mock() - _member_started.reset_mock() - _reload_patroni_configuration.reset_mock() pgbackrest_info_other_cluster_name_backup_output = ( 0, f'[{{"db": [{{"system-id": "12345"}}], "name": "another-model.{harness.charm.cluster_name}"}}]', @@ -311,34 +274,13 @@ def test_can_use_s3_repository(harness): pgbackrest_info_other_cluster_name_backup_output, same_instance_system_identifier_output, ] - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": harness.charm.backup.stanza_name}, - ) assert harness.charm.backup.can_use_s3_repository() == ( False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, ) - _update_config.assert_called_once() - _member_started.assert_called_once() - _reload_patroni_configuration.assert_called_once() - - # Assert that the stanza name is not present in the unit relation data anymore. - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} # Test when the workload is not running. - _update_config.reset_mock() - _member_started.reset_mock() - _reload_patroni_configuration.reset_mock() _member_started.return_value = False - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": harness.charm.backup.stanza_name}, - ) _execute_command.side_effect = [ pgbackrest_info_same_cluster_backup_output, other_instance_system_identifier_output, @@ -347,31 +289,14 @@ def test_can_use_s3_repository(harness): False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, ) - _update_config.assert_called_once() - _member_started.assert_called_once() - _reload_patroni_configuration.assert_not_called() - - # Assert that the stanza name is not present in the unit relation data anymore. 
- assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} # Test when there is no backup from another cluster in the S3 repository. - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": harness.charm.backup.stanza_name}, - ) _execute_command.side_effect = [ pgbackrest_info_same_cluster_backup_output, same_instance_system_identifier_output, ] assert harness.charm.backup.can_use_s3_repository() == (True, None) - # Assert that the stanza name is still in the unit relation data. - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": harness.charm.backup.stanza_name - } - def test_construct_endpoint(harness): # Test with an AWS endpoint without region. @@ -714,23 +639,22 @@ def test_initialise_stanza(harness): patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch("charm.PostgreSQLBackups._execute_command") as _execute_command, patch( - "charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock - ) as _is_primary, + "charm.PostgreSQLBackups._s3_initialization_set_failure" + ) as _s3_initialization_set_failure, ): peer_rel_id = harness.model.get_relation(PEER).id - # Test when the unit is not the primary. - _is_primary.return_value = False - _execute_command.assert_not_called() - # Test when the unit is the primary, but it's in a blocked state - # other than the ones can be solved by new S3 settings. - _is_primary.return_value = True + mock_event = MagicMock() + + # Test when it's in a blocked state other than the ones can be solved by new S3 settings. harness.charm.unit.status = BlockedStatus("fake blocked state") - harness.charm.backup._initialise_stanza() + harness.charm.backup._initialise_stanza(mock_event) _execute_command.assert_not_called() + mock_event.defer.assert_called_once() # Test when the blocked state is any of the blocked stated that can be solved # by new S3 settings, but the stanza creation fails. + mock_event.reset_mock() stanza_creation_command = [ "charmed-postgresql.pgbackrest", "--config=/var/snap/charmed-postgresql/current/etc/pgbackrest/pgbackrest.conf", @@ -743,60 +667,67 @@ def test_initialise_stanza(harness): FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE, ]: + _s3_initialization_set_failure.reset_mock() _execute_command.reset_mock() harness.charm.unit.status = BlockedStatus(blocked_state) - harness.charm.backup._initialise_stanza() - _execute_command.assert_called_once_with(stanza_creation_command) - assert isinstance(harness.charm.unit.status, BlockedStatus) - assert harness.charm.unit.status.message == FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE - - # Assert there is no stanza name in the application relation databag. - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} + harness.charm.backup._initialise_stanza(mock_event) + _execute_command.assert_called_with(stanza_creation_command) + mock_event.defer.assert_not_called() + # Only the leader will display the blocked status. + assert isinstance(harness.charm.unit.status, MaintenanceStatus) + _s3_initialization_set_failure.assert_called_once_with( + FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE + ) # Test when the failure in the stanza creation is due to a timeout. 
_execute_command.reset_mock() _execute_command.return_value = (49, "", "fake stderr") with pytest.raises(TimeoutError): - harness.charm.backup._initialise_stanza() + harness.charm.backup._initialise_stanza(mock_event) assert False + mock_event.defer.assert_not_called() # Test when the archiving is working correctly (pgBackRest check command succeeds) # and the unit is not the leader. _execute_command.reset_mock() _execute_command.return_value = (0, "fake stdout", "") _member_started.return_value = True - harness.charm.backup._initialise_stanza() + harness.charm.backup._initialise_stanza(mock_event) assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == { "stanza": f"{harness.charm.model.name}.postgresql", - "init-pgbackrest": "True", } assert isinstance(harness.charm.unit.status, MaintenanceStatus) + mock_event.defer.assert_not_called() # Test when the unit is the leader. with harness.hooks_disabled(): harness.set_leader() - harness.update_relation_data( - peer_rel_id, harness.charm.unit.name, {"stanza": "", "init-pgbackrest": ""} - ) - harness.charm.backup._initialise_stanza() + harness.update_relation_data(peer_rel_id, harness.charm.unit.name, {"stanza": ""}) + harness.charm.backup._initialise_stanza(mock_event) _update_config.assert_not_called() assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": "None.postgresql", - "init-pgbackrest": "True", + "stanza": f"{harness.charm.model.name}.postgresql", } _member_started.assert_not_called() _reload_patroni_configuration.assert_not_called() + mock_event.defer.assert_not_called() assert isinstance(harness.charm.unit.status, MaintenanceStatus) def test_check_stanza(harness): with ( - patch("charm.Patroni.reload_patroni_configuration") as _reload_patroni_configuration, - patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch("charm.PostgresqlOperatorCharm.update_config"), patch("backups.wait_fixed", return_value=wait_fixed(0)), - patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, + patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch("charm.Patroni.reload_patroni_configuration") as _reload_patroni_configuration, patch("charm.PostgreSQLBackups._execute_command") as _execute_command, + patch( + "charm.PostgresqlOperatorCharm._set_primary_status_message" + ) as _set_primary_status_message, + patch( + "charm.PostgreSQLBackups._s3_initialization_set_failure" + ) as _s3_initialization_set_failure, patch( "charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock ) as _is_primary, @@ -807,163 +738,146 @@ def test_check_stanza(harness): harness.update_relation_data( peer_rel_id, harness.charm.app.name, - {"stanza": "test-stanza", "init-pgbackrest": "True"}, - ) - harness.update_relation_data( - peer_rel_id, - harness.charm.unit.name, - {"stanza": "test-stanza", "init-pgbackrest": "True"}, + {"s3-initialization-start": "test-stanza"}, ) - # Test when the unit is not the primary. - _is_primary.return_value = False - harness.charm.backup.check_stanza() - _execute_command.assert_not_called() - - # Set the unit as primary. - _is_primary.return_value = True - - # Test when the archiving is not working correctly (pgBackRest check command fails). 
+ _member_started.return_value = False + _execute_command.return_value = (49, "", "fake stderr") + assert not harness.charm.backup.check_stanza() + _member_started.assert_called() + _reload_patroni_configuration.assert_not_called() + _set_primary_status_message.assert_not_called() + _s3_initialization_set_failure.assert_called_once_with( + FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE + ) + + _execute_command.reset_mock() + _s3_initialization_set_failure.reset_mock() + _member_started.return_value = True + _execute_command.return_value = (0, "fake stdout", "") + assert harness.charm.backup.check_stanza() + _reload_patroni_configuration.assert_called_once() + _execute_command.assert_called_once() + _set_primary_status_message.assert_called_once() + _s3_initialization_set_failure.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == { + "s3-initialization-done": "True" + } + + with harness.hooks_disabled(): + harness.set_leader() + assert harness.charm.backup.check_stanza() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} + + +def test_coordinate_stanza_fields(harness): + with ( + patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, + patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch("charm.Patroni.reload_patroni_configuration") as _reload_patroni_configuration, + ): + peer_rel_id = harness.model.get_relation(PEER).id + stanza_name = f"{harness.charm.model.name}.{harness.charm.app.name}" + + peer_data_primary_error = { + "s3-initialization-done": "True", + "s3-initialization-block-message": ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, + } + peer_data_primary_ok = { + "s3-initialization-done": "True", + "stanza": stanza_name, + } + peer_data_leader_start = { + "s3-initialization-start": "Thu Feb 24 05:00:00 2022", + } + peer_data_leader_error = { + "s3-initialization-done": "True", + "s3-initialization-block-message": ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, + } + peer_data_leader_ok = {"s3-initialization-done": "True", "stanza": stanza_name} + peer_data_clean = { + "s3-initialization-start": "", + "s3-initialization-done": "", + "s3-initialization-block-message": "", + "stanza": "", + } + + # Add a new unit to the relation. + new_unit_name = "postgresql-k8s/1" + new_unit = Unit(new_unit_name, None, harness.charm.app._backend, {}) + harness.add_relation_unit(peer_rel_id, new_unit_name) + + # Test with clear values. + harness.charm.backup.coordinate_stanza_fields() + _member_started.assert_not_called() assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} - assert _member_started.call_count == 5 - assert _reload_patroni_configuration.call_count == 5 + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} + assert harness.get_relation_data(peer_rel_id, new_unit) == {} + + # Test when the primary failed before the leader s3 initialization sequence started.
+ with harness.hooks_disabled(): + harness.update_relation_data(peer_rel_id, new_unit_name, peer_data_primary_error) + harness.charm.backup.coordinate_stanza_fields() + _update_config.assert_not_called() + _member_started.assert_not_called() assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert isinstance(harness.charm.unit.status, BlockedStatus) - assert harness.charm.unit.status.message == FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE + assert harness.get_relation_data(peer_rel_id, new_unit) == peer_data_primary_error - # Test when the archiving is working correctly (pgBackRest check command succeeds) - # and the unit is not the leader. + # Test with non-leader unit. with harness.hooks_disabled(): harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"stanza": "test-stanza", "init-pgbackrest": "True"}, - ) - harness.update_relation_data( - peer_rel_id, - harness.charm.unit.name, - {"stanza": "test-stanza", "init-pgbackrest": "True"}, + peer_rel_id, harness.charm.app.name, peer_data_leader_start ) - _execute_command.reset_mock() - _update_config.reset_mock() - _member_started.reset_mock() - _reload_patroni_configuration.reset_mock() - _execute_command.side_effect = None - _execute_command.return_value = (0, "fake stdout", "") - harness.charm.backup.check_stanza() + harness.charm.backup.coordinate_stanza_fields() + _update_config.assert_not_called() + _member_started.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == peer_data_leader_start + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} + assert harness.get_relation_data(peer_rel_id, new_unit) == peer_data_primary_error + + # Leader should sync fail result from the primary. + _member_started.return_value = False + with harness.hooks_disabled(): + harness.set_leader() + harness.charm.backup.coordinate_stanza_fields() _update_config.assert_called_once() _member_started.assert_called_once() - _reload_patroni_configuration.assert_called_once() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": "test-stanza", - "init-pgbackrest": "True", - } - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == { - "stanza": "test-stanza" - } - assert isinstance(harness.charm.unit.status, ActiveStatus) + _reload_patroni_configuration.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == peer_data_leader_error + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} + assert harness.get_relation_data(peer_rel_id, new_unit) == peer_data_primary_error - # Test when the unit is the leader. - harness.charm.unit.status = BlockedStatus("fake blocked state") + # Test with successful result from the primary. 
+ _update_config.reset_mock() + _member_started.return_value = True with harness.hooks_disabled(): - harness.set_leader() - harness.update_relation_data( - peer_rel_id, - harness.charm.app.name, - {"init-pgbackrest": "True"}, - ) + harness.update_relation_data(peer_rel_id, harness.charm.app.name, peer_data_clean) harness.update_relation_data( - peer_rel_id, - harness.charm.unit.name, - {"init-pgbackrest": "True"}, + peer_rel_id, harness.charm.app.name, peer_data_leader_start ) - _update_config.reset_mock() - _member_started.reset_mock() - _reload_patroni_configuration.reset_mock() - harness.charm.backup.check_stanza() + harness.update_relation_data(peer_rel_id, new_unit_name, peer_data_clean) + harness.update_relation_data(peer_rel_id, new_unit_name, peer_data_primary_ok) + harness.charm.backup.coordinate_stanza_fields() _update_config.assert_called_once() - _member_started.assert_called_once() _reload_patroni_configuration.assert_called_once() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": "test-stanza" - } - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == { - "stanza": "test-stanza" - } - assert isinstance(harness.charm.unit.status, ActiveStatus) - - -def test_coordinate_stanza_fields(harness): - peer_rel_id = harness.model.get_relation(PEER).id - # Add a new unit to the relation. - new_unit_name = "postgresql-k8s/1" - new_unit = Unit(new_unit_name, None, harness.charm.app._backend, {}) - harness.add_relation_unit(peer_rel_id, new_unit_name) - - # Test when the stanza name is neither in the application relation databag nor in the unit relation databag. - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == {} - - # Test when the stanza name is in the unit relation databag but the unit is not the leader. - stanza_name = f"{harness.charm.model.name}.{harness.charm.app.name}" - with harness.hooks_disabled(): - harness.update_relation_data( - peer_rel_id, new_unit_name, {"stanza": stanza_name, "init-pgbackrest": "True"} - ) - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {} - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == { - "stanza": stanza_name, - "init-pgbackrest": "True", - } - - # Test when the unit is the leader. - with harness.hooks_disabled(): - harness.set_leader() - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { - "stanza": stanza_name, - "init-pgbackrest": "True", - } - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == { - "stanza": stanza_name, - "init-pgbackrest": "True", - } - - # Test when the stanza was already checked in the primary non-leader unit. 
- with harness.hooks_disabled(): - harness.update_relation_data(peer_rel_id, new_unit_name, {"init-pgbackrest": ""}) - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {"stanza": stanza_name} - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == {"stanza": stanza_name} - - # Test when the "init-pgbackrest" flag was removed from the application relation databag - # and this is the unit that has the stanza name in the unit relation databag. - with harness.hooks_disabled(): - harness.update_relation_data(peer_rel_id, harness.charm.unit.name, {"stanza": stanza_name}) - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {"stanza": stanza_name} - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == {"stanza": stanza_name} + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == peer_data_leader_ok + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} + assert harness.get_relation_data(peer_rel_id, new_unit) == peer_data_primary_ok - # Test when the unit is not the leader. - with harness.hooks_disabled(): - harness.set_leader(False) - harness.update_relation_data(peer_rel_id, harness.charm.unit.name, {"stanza": stanza_name}) - harness.charm.backup.coordinate_stanza_fields() - assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {"stanza": stanza_name} - assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} - assert harness.get_relation_data(peer_rel_id, new_unit) == {"stanza": stanza_name} + # Test when leader is waiting for the primary result. 
+ _update_config.reset_mock() + with harness.hooks_disabled(): + harness.update_relation_data(peer_rel_id, harness.charm.app.name, peer_data_clean) + harness.update_relation_data( + peer_rel_id, harness.charm.app.name, peer_data_leader_start + ) + harness.update_relation_data(peer_rel_id, new_unit_name, peer_data_clean) + harness.charm.backup.coordinate_stanza_fields() + _update_config.assert_not_called() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == peer_data_leader_start + assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {} + assert harness.get_relation_data(peer_rel_id, new_unit) == {} def test_is_primary_pgbackrest_service_running(harness): @@ -996,21 +910,30 @@ def test_is_primary_pgbackrest_service_running(harness): def test_on_s3_credential_changed(harness): with ( - patch("charm.PostgreSQLBackups._initialise_stanza") as _initialise_stanza, - patch("charm.PostgreSQLBackups.can_use_s3_repository") as _can_use_s3_repository, - patch( - "charm.PostgreSQLBackups._create_bucket_if_not_exists" - ) as _create_bucket_if_not_exists, patch( - "charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock - ) as _is_primary, + "charm.PostgreSQLBackups._render_pgbackrest_conf_file" + ) as _render_pgbackrest_conf_file, patch( "charm.PostgreSQLBackups._can_initialise_stanza", new_callable=PropertyMock ) as _can_initialise_stanza, patch( - "charm.PostgreSQLBackups._render_pgbackrest_conf_file" - ) as _render_pgbackrest_conf_file, + "charm.PostgreSQLBackups.start_stop_pgbackrest_service" + ) as _start_stop_pgbackrest_service, + patch( + "charm.PostgresqlOperatorCharm._set_primary_status_message" + ) as _set_primary_status_message, + patch( + "charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock + ) as _is_primary, + patch( + "charm.PostgreSQLBackups._on_s3_credential_changed_primary" + ) as _on_s3_credential_changed_primary, patch("ops.framework.EventBase.defer") as _defer, + patch( + "charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock + ) as _is_standby_leader, + patch("time.gmtime"), + patch("time.asctime", return_value="Thu Feb 24 05:00:00 2022"), ): peer_rel_id = harness.model.get_relation(PEER).id # Test when the cluster was not initialised yet. @@ -1020,9 +943,6 @@ def test_on_s3_credential_changed(harness): ) _defer.assert_called_once() _render_pgbackrest_conf_file.assert_not_called() - _create_bucket_if_not_exists.assert_not_called() - _can_use_s3_repository.assert_not_called() - _initialise_stanza.assert_not_called() # Test when the cluster is already initialised, but the charm fails to render # the pgBackRest configuration file due to missing S3 parameters. @@ -1040,9 +960,6 @@ def test_on_s3_credential_changed(harness): _defer.assert_not_called() _render_pgbackrest_conf_file.assert_called_once() _can_initialise_stanza.assert_not_called() - _create_bucket_if_not_exists.assert_not_called() - _can_use_s3_repository.assert_not_called() - _initialise_stanza.assert_not_called() # Test when it's not possible to initialise the stanza in this unit. 
_render_pgbackrest_conf_file.return_value = True @@ -1050,79 +967,161 @@ def test_on_s3_credential_changed(harness): harness.charm.backup.s3_client.on.credentials_changed.emit( relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) ) - _defer.assert_called_once() _can_initialise_stanza.assert_called_once() - _is_primary.assert_not_called() + _defer.assert_called_once() + _start_stop_pgbackrest_service.assert_not_called() - # Test when it's not possible to use the S3 repository due to backups from another cluster. + # Test when unit is not a leader and can't do any peer data changes + _is_primary.return_value = False + _can_initialise_stanza.return_value = True + harness.charm.backup.s3_client.on.credentials_changed.emit( + relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) + ) + _is_standby_leader.assert_called_once() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { + "cluster_initialised": "True" + } + + # Test when unit is a leader but not primary + _is_standby_leader.reset_mock() with harness.hooks_disabled(): harness.set_leader() - _create_bucket_if_not_exists.reset_mock() - _can_initialise_stanza.return_value = True + harness.charm.backup.s3_client.on.credentials_changed.emit( + relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) + ) + _set_primary_status_message.assert_called_once() + _on_s3_credential_changed_primary.assert_not_called() + _is_standby_leader.assert_called_once() + assert harness.get_relation_data(peer_rel_id, harness.charm.app) == { + "cluster_initialised": "True", + "s3-initialization-start": "Thu Feb 24 05:00:00 2022", + } + + # Test when unit is a leader and primary _is_primary.return_value = True - _create_bucket_if_not_exists.side_effect = None - _can_use_s3_repository.return_value = (False, "fake validation message") + _is_standby_leader.reset_mock() + _set_primary_status_message.reset_mock() + with harness.hooks_disabled(): + harness.set_leader() harness.charm.backup.s3_client.on.credentials_changed.emit( relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id) ) - assert isinstance(harness.charm.unit.status, BlockedStatus) - assert harness.charm.unit.status.message == "fake validation message" + _on_s3_credential_changed_primary.assert_called_once() + _set_primary_status_message.assert_not_called() + _is_standby_leader.assert_called_once() + + +def test_on_s3_credential_changed_primary(harness): + with ( + patch("charm.PostgresqlOperatorCharm.update_config"), + patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch("charm.Patroni.reload_patroni_configuration") as _reload_patroni_configuration, + patch( + "charm.PostgreSQLBackups._create_bucket_if_not_exists" + ) as _create_bucket_if_not_exists, + patch( + "charm.PostgreSQLBackups._s3_initialization_set_failure" + ) as _s3_initialization_set_failure, + patch("charm.PostgreSQLBackups.can_use_s3_repository") as _can_use_s3_repository, + patch("charm.PostgreSQLBackups._initialise_stanza") as _initialise_stanza, + patch("charm.PostgreSQLBackups.check_stanza") as _check_stanza, + patch( + "charm.PostgreSQLBackups._retrieve_s3_parameters", + return_value=({"path": "example"}, None), + ), + patch("charm.PostgreSQLBackups._upload_content_to_s3") as _upload_content_to_s3, + ): + mock_event = MagicMock() + + _member_started.return_value = False + _create_bucket_if_not_exists.side_effect = ValueError() + assert not harness.charm.backup._on_s3_credential_changed_primary(mock_event) + 
_member_started.assert_called_once()
+ _reload_patroni_configuration.assert_not_called()
_create_bucket_if_not_exists.assert_called_once()
+ _s3_initialization_set_failure.assert_called_once_with(
+ FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE
+ )
+ _can_use_s3_repository.assert_not_called()
+
+ _s3_initialization_set_failure.reset_mock()
+ _member_started.return_value = True
+ _create_bucket_if_not_exists.side_effect = None
+ _can_use_s3_repository.return_value = (False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE)
+ assert not harness.charm.backup._on_s3_credential_changed_primary(mock_event)
+ _reload_patroni_configuration.assert_called_once()
_can_use_s3_repository.assert_called_once()
+ _s3_initialization_set_failure.assert_called_once_with(
+ ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
+ )
_initialise_stanza.assert_not_called()

- # Test when the stanza can be initialised and the pgBackRest service can start.
- _can_use_s3_repository.reset_mock()
_can_use_s3_repository.return_value = (True, None)
- harness.charm.backup.s3_client.on.credentials_changed.emit(
- relation=harness.model.get_relation(S3_PARAMETERS_RELATION, s3_rel_id)
- )
- _can_use_s3_repository.assert_called_once()
+ _initialise_stanza.return_value = False
+ assert not harness.charm.backup._on_s3_credential_changed_primary(mock_event)
_initialise_stanza.assert_called_once()
+ _check_stanza.assert_not_called()
+
+ _initialise_stanza.return_value = True
+ _check_stanza.return_value = False
+ assert not harness.charm.backup._on_s3_credential_changed_primary(mock_event)
+ _check_stanza.assert_called_once()
+ _upload_content_to_s3.assert_not_called()
+
+ _check_stanza.return_value = True
+ assert harness.charm.backup._on_s3_credential_changed_primary(mock_event)
+ _upload_content_to_s3.assert_called_once()
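# --- Editor's sketch (not part of the patch): the primary-side path implied by
# the asserts above. The return points, the caught exception types, and the
# final upload are assumptions; the uploaded object is presumably the
# model-uuid marker that can_use_s3_repository compares against.
def _on_s3_credential_changed_primary(self, event) -> bool:
    self.charm.update_config()
    if self.charm._patroni.member_started:
        self.charm._patroni.reload_patroni_configuration()
    try:
        self._create_bucket_if_not_exists()
    except (ClientError, ValueError):
        self._s3_initialization_set_failure(FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE)
        return False
    can_use_s3, validation_message = self.can_use_s3_repository()
    if not can_use_s3:
        self._s3_initialization_set_failure(validation_message)
        return False
    if not self._initialise_stanza(event):
        return False
    if not self.check_stanza():
        return False
    # Mark the repository as owned by this model (assumed final step).
    s3_parameters, _ = self._retrieve_s3_parameters()
    self._upload_content_to_s3(self.model.uuid, "model-uuid.txt", s3_parameters)
    return True


def test_on_s3_credential_gone(harness):
- peer_rel_id = harness.model.get_relation(PEER).id
- # Test that unrelated blocks will remain
- harness.charm.unit.status = BlockedStatus("test block")
- harness.charm.backup._on_s3_credential_gone(None)
- assert isinstance(harness.charm.unit.status, BlockedStatus)
-
- # Test that s3 related blocks will be cleared
- harness.charm.unit.status = BlockedStatus(ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE)
- harness.charm.backup._on_s3_credential_gone(None)
- assert isinstance(harness.charm.unit.status, ActiveStatus)
-
- # Test removal of relation data when the unit is not the leader.
- with harness.hooks_disabled():
- harness.update_relation_data(
- peer_rel_id,
- harness.charm.app.name,
- {"stanza": "test-stanza", "init-pgbackrest": "True"},
- )
- harness.update_relation_data(
- peer_rel_id,
- harness.charm.app.name,
- {"stanza": "test-stanza", "init-pgbackrest": "True"},
- )
- harness.charm.backup._on_s3_credential_gone(None)
- assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {
- "stanza": "test-stanza",
- "init-pgbackrest": "True",
- }
- assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {}
-
- # Test removal of relation data when the unit is the leader.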
- with harness.hooks_disabled():
- harness.set_leader()
- harness.update_relation_data(
- peer_rel_id,
- harness.charm.unit.name,
- {"stanza": "test-stanza", "init-pgbackrest": "True"},
- )
- harness.charm.backup._on_s3_credential_gone(None)
- assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {}
- assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {}
+ with patch(
+ "charm.PostgresqlOperatorCharm._set_primary_status_message"
+ ) as _set_primary_status_message:
+ full_peer_s3_parameters = {
+ "stanza": "test-stanza",
+ "s3-initialization-start": "Thu Feb 24 05:00:00 2022",
+ "s3-initialization-done": "True",
+ "s3-initialization-block-message": ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
+ }
+
+ peer_rel_id = harness.model.get_relation(PEER).id
+ # Test that unrelated blocks will remain
+ harness.charm.unit.status = BlockedStatus("test block")
+ harness.charm.backup._on_s3_credential_gone(None)
+ _set_primary_status_message.assert_not_called()
+
+ # Test that s3 related blocks will be cleared
+ harness.charm.unit.status = BlockedStatus(ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE)
+ harness.charm.backup._on_s3_credential_gone(None)
+ _set_primary_status_message.assert_called_once()
+
+ # Test removal of relation data when the unit is not the leader.
+ with harness.hooks_disabled():
+ harness.update_relation_data(
+ peer_rel_id,
+ harness.charm.app.name,
+ full_peer_s3_parameters,
+ )
+ harness.update_relation_data(
+ peer_rel_id,
+ harness.charm.unit.name,
+ full_peer_s3_parameters,
+ )
+ harness.charm.backup._on_s3_credential_gone(None)
+ assert harness.get_relation_data(peer_rel_id, harness.charm.app) == full_peer_s3_parameters
+ assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {}
+
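# --- Editor's sketch (not part of the patch): the cleanup asserted above and
# in the leader case that follows, written under assumptions: the tuple name
# S3_INIT_KEYS is hypothetical (the keys come from full_peer_s3_parameters),
# and unit_peer_data is assumed to mirror app_peer_data for the unit databag.
S3_INIT_KEYS = (
    "stanza",
    "s3-initialization-start",
    "s3-initialization-done",
    "s3-initialization-block-message",
)

def _on_s3_credential_gone(self, event) -> None:
    # Only the leader may clear the application databag; every unit clears
    # its own databag (in ops, setting a key to "" removes it).
    if self.charm.unit.is_leader():
        self.charm.app_peer_data.update(dict.fromkeys(S3_INIT_KEYS, ""))
    self.charm.unit_peer_data.update(dict.fromkeys(S3_INIT_KEYS, ""))
    # Reset only S3-related blocked statuses, leaving unrelated blocks intact.
    if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
        self.charm._set_primary_status_message()

+ # Test removal of relation data when the unit is the leader.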
+ with harness.hooks_disabled():
+ harness.set_leader()
+ harness.update_relation_data(
+ peer_rel_id,
+ harness.charm.unit.name,
+ full_peer_s3_parameters,
+ )
+ harness.charm.backup._on_s3_credential_gone(None)
+ assert harness.get_relation_data(peer_rel_id, harness.charm.app) == {}
+ assert harness.get_relation_data(peer_rel_id, harness.charm.unit) == {}


def test_on_create_backup_action(harness):
@@ -1184,7 +1183,7 @@ def test_on_create_backup_action(harness):
harness.charm.backup._on_create_backup_action(mock_event)
_upload_content_to_s3.assert_called_once_with(
expected_metadata,
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
mock_s3_parameters,
)
mock_event.fail.assert_called_once()
@@ -1226,12 +1225,12 @@ def test_on_create_backup_action(harness):
_upload_content_to_s3.assert_has_calls([
call(
expected_metadata,
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
mock_s3_parameters,
),
call(
"Stdout:\nfake stdout\n\nStderr:\nfake stderr\n",
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
mock_s3_parameters,
),
])
@@ -1249,12 +1248,12 @@ def test_on_create_backup_action(harness):
_upload_content_to_s3.assert_has_calls([
call(
expected_metadata,
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
mock_s3_parameters,
),
call(
"Stdout:\nfake stdout\n\nStderr:\nfake stderr\n",
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
mock_s3_parameters,
),
])
@@ -1271,12 +1270,12 @@ def test_on_create_backup_action(harness):
_upload_content_to_s3.assert_has_calls([
call(
expected_metadata,
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/latest",
mock_s3_parameters,
),
call(
"Stdout:\nfake stdout\n\nStderr:\nfake stderr\n",
- f"test-path/backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
+ f"backup/{harness.charm.model.name}.{harness.charm.cluster_name}/2023-01-01T09:00:00Z/backup.log",
mock_s3_parameters,
),
])
@@ -1802,6 +1801,7 @@ def test_start_stop_pgbackrest_service(harness):
patch(
"charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock
) as _is_primary,
+ patch("charm.Patroni.get_standby_leader") as _get_standby_leader,
patch("backups.snap.SnapCache") as _snap_cache,
patch(
"charm.PostgresqlOperatorCharm._peer_members_ips", new_callable=PropertyMock
) as _peer_members_ips,
@@ -1845,9 +1845,17 @@ def test_start_stop_pgbackrest_service(harness):
stop.assert_called_once()
restart.assert_not_called()

- # Test when the service hasn't started in the primary yet.
+ # Test when it's a standby.
stop.reset_mock()
_peer_members_ips.return_value = ["1.1.1.1"]
+ _get_standby_leader.return_value = "standby"
+ assert harness.charm.backup.start_stop_pgbackrest_service()
+ stop.assert_called_once()
+ restart.assert_not_called()
+
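# --- Editor's sketch (not part of the patch): the standby branch the new test
# case covers, as it might look inside start_stop_pgbackrest_service. The snap
# name matches the paths used elsewhere in this patch; the pgBackRest service
# name is an assumption.
def start_stop_pgbackrest_service(self) -> bool:
    ...
    if self.charm._patroni.get_standby_leader():
        # A standby cluster must not serve pgBackRest backups: stop the
        # TLS server on this unit and report success.
        snap_cache = snap.SnapCache()
        charmed_postgresql_snap = snap_cache["charmed-postgresql"]
        charmed_postgresql_snap.stop(services=["pgbackrest-service"])
        return True
    ...

+ # Test when the service hasn't started in the primary yet.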
+ stop.reset_mock()
+ _get_standby_leader.return_value = None
_is_primary.return_value = False
_is_primary_pgbackrest_service_running.return_value = False
assert not harness.charm.backup.start_stop_pgbackrest_service()
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 541d971623..650dbe689d 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -1093,10 +1093,6 @@ def test_on_update_status_after_restore_operation(harness):
# Test when it's not possible to use the configured S3 repository.
_update_config.reset_mock()
- _handle_processes_failures.reset_mock()
- _oversee_users.reset_mock()
- _update_relation_endpoints.reset_mock()
- _handle_workload_failures.reset_mock()
_set_primary_status_message.reset_mock()
with harness.hooks_disabled():
harness.update_relation_data(
@@ -1107,17 +1103,11 @@ def test_on_update_status_after_restore_operation(harness):
_can_use_s3_repository.return_value = (False, "fake validation message")
harness.charm.on.update_status.emit()
_update_config.assert_called_once()
- _handle_processes_failures.assert_not_called()
- _oversee_users.assert_not_called()
- _update_relation_endpoints.assert_not_called()
- _handle_workload_failures.assert_not_called()
- _set_primary_status_message.assert_not_called()
- assert isinstance(harness.charm.unit.status, BlockedStatus)
- assert harness.charm.unit.status.message == "fake validation message"
-
+ _set_primary_status_message.assert_called_once()
# Assert that the backup id is not in the application relation databag anymore.
assert harness.get_relation_data(rel_id, harness.charm.app) == {
- "cluster_initialised": "True"
+ "cluster_initialised": "True",
+ "s3-initialization-block-message": "fake validation message",
}
@@ -1482,7 +1472,6 @@ def test_on_peer_relation_changed(harness):
patch(
"charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock
) as _primary_endpoint,
- patch("backups.PostgreSQLBackups.check_stanza") as _check_stanza,
patch("backups.PostgreSQLBackups.coordinate_stanza_fields") as _coordinate_stanza_fields,
patch(
"backups.PostgreSQLBackups.start_stop_pgbackrest_service"
@@ -1563,7 +1552,14 @@ def test_on_peer_relation_changed(harness):
_update_new_unit_status.assert_not_called()
assert isinstance(harness.model.unit.status, BlockedStatus)

+ # Test that the event exits early when the unit is in a blocked status.
+ _update_config.side_effect = None
+ _member_started.return_value = False
+ harness.charm._on_peer_relation_changed(mock_event)
+ _start_patroni.assert_not_called()
+
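# --- Editor's sketch (not part of the patch): the early exit asserted above,
# as it might appear inside PostgresqlOperatorCharm._on_peer_relation_changed;
# the guard's exact position among the handler's other checks is an assumption.
if self.is_blocked:
    logger.debug("Early exit on_peer_relation_changed: unit in blocked status")
    return

# Test when Patroni hasn't started yet in the unit.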
+ harness.model.unit.status = ActiveStatus() _update_config.side_effect = None _member_started.return_value = False harness.charm._on_peer_relation_changed(mock_event) @@ -1577,7 +1573,6 @@ def test_on_peer_relation_changed(harness): _member_started.return_value = True for values in itertools.product([True, False], ["0", "1000", "1001", "unknown"]): _defer.reset_mock() - _check_stanza.reset_mock() _start_stop_pgbackrest_service.reset_mock() _is_primary.return_value = values[0] _is_standby_leader.return_value = values[0] @@ -1586,12 +1581,10 @@ def test_on_peer_relation_changed(harness): harness.charm.on.database_peers_relation_changed.emit(relation) if _is_primary.return_value == values[0] or int(values[1]) <= 1000: _defer.assert_not_called() - _check_stanza.assert_called_once() _start_stop_pgbackrest_service.assert_called_once() assert isinstance(harness.charm.unit.status, ActiveStatus) else: _defer.assert_called_once() - _check_stanza.assert_not_called() _start_stop_pgbackrest_service.assert_not_called() assert isinstance(harness.charm.unit.status, MaintenanceStatus) @@ -1600,12 +1593,10 @@ def test_on_peer_relation_changed(harness): _member_started.return_value = True _defer.reset_mock() _coordinate_stanza_fields.reset_mock() - _check_stanza.reset_mock() _start_stop_pgbackrest_service.return_value = False harness.charm.on.database_peers_relation_changed.emit(relation) _defer.assert_called_once() _coordinate_stanza_fields.assert_not_called() - _check_stanza.assert_not_called() # Test the last calls been made when it was possible to start the # pgBackRest service. @@ -1614,7 +1605,6 @@ def test_on_peer_relation_changed(harness): harness.charm.on.database_peers_relation_changed.emit(relation) _defer.assert_not_called() _coordinate_stanza_fields.assert_called_once() - _check_stanza.assert_called_once() def test_reconfigure_cluster(harness):