From a87c52b03c91f824a439a9d11e276c667e4596eb Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Fri, 9 Feb 2024 08:46:56 -0300 Subject: [PATCH] [DPE-3380] Handle S3 relation in primary non-leader unit (#340) * Handle S3 relation in primary non-leader unit Signed-off-by: Marcelo Henrique Neppel * Fix replication after restore Signed-off-by: Marcelo Henrique Neppel * Fix unit name Signed-off-by: Marcelo Henrique Neppel * Speed up events Signed-off-by: Marcelo Henrique Neppel * Change fast_interval parameter Signed-off-by: Marcelo Henrique Neppel --------- Signed-off-by: Marcelo Henrique Neppel --- poetry.lock | 1 - src/backups.py | 57 +++++-- src/charm.py | 14 ++ tests/integration/test_backups.py | 65 +++++++- tests/unit/test_backups.py | 257 +++++++++++++++++++++++++++--- tests/unit/test_charm.py | 67 +++++++- 6 files changed, 422 insertions(+), 39 deletions(-) diff --git a/poetry.lock b/poetry.lock index a16d15edb9..32d8e61f7d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1633,7 +1633,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/src/backups.py b/src/backups.py index bfd9ecd90a..09c3627798 100644 --- a/src/backups.py +++ b/src/backups.py @@ -102,10 +102,14 @@ def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]: tls_enabled = "tls" in self.charm.unit_peer_data + # Check if this unit is the primary (if it was not possible to retrieve that information, + # then show that the unit cannot perform a backup, because possibly the database is offline). + try: + is_primary = self.charm.is_primary + except RetryError: + return False, "Unit cannot perform backups as the database seems to be offline" + # Only enable backups on primary if there are replicas but TLS is not enabled. - is_primary = self.charm.unit.name == self.charm._patroni.get_primary( - unit_name_pattern=True - ) if is_primary and self.charm.app.planned_units() > 1 and tls_enabled: return False, "Unit cannot perform backups as it is the cluster primary" @@ -363,7 +367,7 @@ def _initialise_stanza(self) -> None: located, how it will be backed up, archiving options, etc. (more info in https://pgbackrest.org/user-guide.html#quickstart/configure-stanza). """ - if not self.charm.unit.is_leader(): + if not self.charm.is_primary: return # Enable stanza initialisation if the backup settings were fixed after being invalid @@ -403,11 +407,18 @@ def _initialise_stanza(self) -> None: self.start_stop_pgbackrest_service() # Store the stanza name to be used in configurations updates. - self.charm.app_peer_data.update({"stanza": self.stanza_name, "init-pgbackrest": "True"}) + if self.charm.unit.is_leader(): + self.charm.app_peer_data.update( + {"stanza": self.stanza_name, "init-pgbackrest": "True"} + ) + else: + self.charm.unit_peer_data.update( + {"stanza": self.stanza_name, "init-pgbackrest": "True"} + ) def check_stanza(self) -> None: """Runs the pgbackrest stanza validation.""" - if not self.charm.unit.is_leader() or "init-pgbackrest" not in self.charm.app_peer_data: + if not self.charm.is_primary or "init-pgbackrest" not in self.charm.app_peer_data: return # Update the configuration to use pgBackRest as the archiving mechanism. @@ -437,12 +448,37 @@ def check_stanza(self) -> None: # and rollback the configuration. self.charm.app_peer_data.update({"stanza": ""}) self.charm.app_peer_data.pop("init-pgbackrest", None) + self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) self.charm.update_config() logger.exception(e) self.charm.unit.status = BlockedStatus(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) - self.charm.app_peer_data.pop("init-pgbackrest", None) + if self.charm.unit.is_leader(): + self.charm.app_peer_data.pop("init-pgbackrest", None) + self.charm.unit_peer_data.pop("init-pgbackrest", None) + + def coordinate_stanza_fields(self) -> None: + """Coordinate the stanza name between the primary and the leader units.""" + for unit, unit_data in self.charm._peers.data.items(): + if "stanza" not in unit_data: + continue + # If the stanza name is not set in the application databag, then the primary is not + # the leader unit, and it's needed to set the stanza name in the application databag. + if "stanza" not in self.charm.app_peer_data and self.charm.unit.is_leader(): + self.charm.app_peer_data.update( + {"stanza": self.stanza_name, "init-pgbackrest": "True"} + ) + break + # If the stanza was already checked and its name is still in the unit databag, mark + # the stanza as already checked in the application databag and remove it from the + # unit databag. + if "init-pgbackrest" not in unit_data: + if self.charm.unit.is_leader(): + self.charm.app_peer_data.pop("init-pgbackrest", None) + if "init-pgbackrest" not in self.charm.app_peer_data and unit == self.charm.unit: + self.charm.unit_peer_data.update({"stanza": ""}) + break @property def _is_primary_pgbackrest_service_running(self) -> bool: @@ -469,8 +505,8 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): logger.debug("Cannot set pgBackRest configurations, missing configurations.") return - # Verify the s3 relation only on the leader - if not self.charm.unit.is_leader(): + # Verify the s3 relation only on the primary. + if not self.charm.is_primary: return try: @@ -487,6 +523,9 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): self._initialise_stanza() def _on_s3_credential_gone(self, _) -> None: + if self.charm.unit.is_leader(): + self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""}) + self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""}) if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES: self.charm.unit.status = ActiveStatus() diff --git a/src/charm.py b/src/charm.py index 6d95f131e8..a80290f900 100755 --- a/src/charm.py +++ b/src/charm.py @@ -477,6 +477,18 @@ def _on_peer_relation_changed(self, event: HookEvent): event.defer() return + # Restart the workload if it's stuck on the starting state after a timeline divergence + # due to a backup that was restored. + if not self.is_primary and ( + self._patroni.member_replication_lag == "unknown" + or int(self._patroni.member_replication_lag) > 1000 + ): + self._patroni.reinitialize_postgresql() + logger.debug("Deferring on_peer_relation_changed: reinitialising replica") + self.unit.status = MaintenanceStatus("reinitialising replica") + event.defer() + return + # Start or stop the pgBackRest TLS server service when TLS certificate change. if not self.backup.start_stop_pgbackrest_service(): logger.debug( @@ -485,6 +497,8 @@ def _on_peer_relation_changed(self, event: HookEvent): event.defer() return + self.backup.coordinate_stanza_fields() + self.backup.check_stanza() if "exporter-started" not in self.unit_peer_data: diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 88fb07a682..334743727e 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -18,6 +18,8 @@ get_password, get_primary, get_unit_address, + scale_application, + switchover, wait_for_idle_on_blocked, ) from .juju_ import juju_major_version @@ -72,6 +74,7 @@ async def cloud_configs(ops_test: OpsTest, github_secrets) -> None: } yield configs, credentials # Delete the previously created objects. + logger.info("deleting the previously created backups") for cloud, config in configs.items(): session = boto3.session.Session( aws_access_key_id=credentials[cloud]["access-key"], @@ -128,9 +131,10 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No **cloud_configs[1][cloud], ) await action.wait() - await ops_test.model.wait_for_idle( - apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 - ) + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 + ) primary = await get_primary(ops_test, f"{database_app_name}/0") for unit in ops_test.model.applications[database_app_name].units: @@ -225,6 +229,61 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No ], "backup wasn't correctly restored: table 'backup_table_2' exists" connection.close() + # Run the following steps only in one cloud (it's enough for those checks). + if cloud == list(cloud_configs[0].keys())[0]: + # Remove the relation to the TLS certificates operator. + await ops_test.model.applications[database_app_name].remove_relation( + f"{database_app_name}:certificates", f"{TLS_CERTIFICATES_APP_NAME}:certificates" + ) + await ops_test.model.wait_for_idle( + apps=[database_app_name], status="active", timeout=1000 + ) + + # Scale up to be able to test primary and leader being different. + async with ops_test.fast_forward(): + await scale_application(ops_test, database_app_name, 2) + + # Ensure replication is working correctly. + new_unit_name = f"{database_app_name}/2" + address = get_unit_address(ops_test, new_unit_name) + with db_connect( + host=address, password=password + ) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + ) + assert cursor.fetchone()[ + 0 + ], f"replication isn't working correctly: table 'backup_table_1' doesn't exist in {new_unit_name}" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + ) + assert not cursor.fetchone()[ + 0 + ], f"replication isn't working correctly: table 'backup_table_2' exists in {new_unit_name}" + connection.close() + + switchover(ops_test, primary, new_unit_name) + + # Get the new primary unit. + primary = await get_primary(ops_test, new_unit_name) + # Check that the primary changed. + for attempt in Retrying( + stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + assert primary == new_unit_name + + # Ensure stanza is working correctly. + logger.info("listing the available backups") + action = await ops_test.model.units.get(new_unit_name).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert backups, "backups not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + # Remove the database app. await ops_test.model.remove_application(database_app_name, block_until_done=True) # Remove the TLS operator. diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 786c521b46..742ddebecb 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -10,9 +10,9 @@ from boto3.exceptions import S3UploadFailedError from botocore.exceptions import ClientError from jinja2 import Template -from ops import ActiveStatus, BlockedStatus, MaintenanceStatus +from ops import ActiveStatus, BlockedStatus, MaintenanceStatus, Unit from ops.testing import Harness -from tenacity import wait_fixed +from tenacity import RetryError, wait_fixed from backups import ListBackupsError from charm import PostgresqlOperatorCharm @@ -76,11 +76,20 @@ def test_are_backup_settings_ok(self): @patch("charm.PostgreSQLBackups._are_backup_settings_ok") @patch("charm.Patroni.member_started", new_callable=PropertyMock) @patch("ops.model.Application.planned_units") - @patch("charm.Patroni.get_primary") + @patch("charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock) def test_can_unit_perform_backup( - self, _get_primary, _planned_units, _member_started, _are_backup_settings_ok + self, _is_primary, _planned_units, _member_started, _are_backup_settings_ok ): + # Test when the charm fails to retrieve the primary. + _is_primary.side_effect = RetryError(last_attempt=1) + self.assertEqual( + self.charm.backup._can_unit_perform_backup(), + (False, "Unit cannot perform backups as the database seems to be offline"), + ) + # Test when the unit is in a blocked state. + _is_primary.side_effect = None + _is_primary.return_value = True self.charm.unit.status = BlockedStatus("fake blocked state") self.assertEqual( self.charm.backup._can_unit_perform_backup(), @@ -89,7 +98,6 @@ def test_can_unit_perform_backup( # Test when running the check in the primary, there are replicas and TLS is enabled. self.charm.unit.status = ActiveStatus() - _get_primary.return_value = self.charm.unit.name _planned_units.return_value = 2 with self.harness.hooks_disabled(): self.harness.update_relation_data( @@ -103,7 +111,7 @@ def test_can_unit_perform_backup( ) # Test when running the check in a replica and TLS is disabled. - _get_primary.return_value = "another-unit" + _is_primary.return_value = False with self.harness.hooks_disabled(): self.harness.update_relation_data( self.peer_rel_id, @@ -116,7 +124,7 @@ def test_can_unit_perform_backup( ) # Test when Patroni or PostgreSQL hasn't started yet. - _get_primary.return_value = self.charm.unit.name + _is_primary.return_value = True _member_started.return_value = False self.assertEqual( self.charm.backup._can_unit_perform_backup(), @@ -586,17 +594,23 @@ def test_list_backups(self, _execute_command): @patch("backups.wait_fixed", return_value=wait_fixed(0)) @patch("charm.PostgresqlOperatorCharm.update_config") @patch("charm.PostgreSQLBackups._execute_command") + @patch("charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock) def test_initialise_stanza( - self, _execute_command, _update_config, _, _member_started, _reload_patroni_configuration + self, + _is_primary, + _execute_command, + _update_config, + _, + _member_started, + _reload_patroni_configuration, ): - # Test when the unit is not the leader. - self.charm.backup._initialise_stanza() + # Test when the unit is not the primary. + _is_primary.return_value = False _execute_command.assert_not_called() - # Test when the unit is the leader, but it's in a blocked state + # Test when the unit is the primary, but it's in a blocked state # other than the ones can be solved by new S3 settings. - with self.harness.hooks_disabled(): - self.harness.set_leader() + _is_primary.return_value = True self.charm.unit.status = BlockedStatus("fake blocked state") self.charm.backup._initialise_stanza() _execute_command.assert_not_called() @@ -633,11 +647,29 @@ def test_initialise_stanza( with self.assertRaises(TimeoutError): self.charm.backup._initialise_stanza() - # Test when the archiving is working correctly (pgBackRest check command succeeds). + # Test when the archiving is working correctly (pgBackRest check command succeeds) + # and the unit is not the leader. _execute_command.reset_mock() _execute_command.return_value = (0, "fake stdout", "") _member_started.return_value = True self.charm.backup._initialise_stanza() + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), + { + "stanza": f"{self.charm.model.name}.postgresql", + "init-pgbackrest": "True", + }, + ) + self.assertIsInstance(self.charm.unit.status, MaintenanceStatus) + + # Test when the unit is the leader. + with self.harness.hooks_disabled(): + self.harness.set_leader() + self.harness.update_relation_data( + self.peer_rel_id, self.charm.unit.name, {"stanza": "", "init-pgbackrest": ""} + ) + self.charm.backup._initialise_stanza() _update_config.assert_not_called() self.assertEqual( self.harness.get_relation_data(self.peer_rel_id, self.charm.app), @@ -653,23 +685,36 @@ def test_initialise_stanza( @patch("backups.wait_fixed", return_value=wait_fixed(0)) @patch("charm.PostgresqlOperatorCharm.update_config") @patch("charm.PostgreSQLBackups._execute_command") + @patch("charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock) def test_check_stanza( - self, _execute_command, _update_config, _, _member_started, _reload_patroni_configuration + self, + _is_primary, + _execute_command, + _update_config, + _, + _member_started, + _reload_patroni_configuration, ): # Set peer data flag with self.harness.hooks_disabled(): self.harness.update_relation_data( self.peer_rel_id, self.charm.app.name, - {"init-pgbackrest": "True"}, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.unit.name, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, ) + # Test when the unit is not the primary. + _is_primary.return_value = False self.charm.backup.check_stanza() _execute_command.assert_not_called() - # Set the unit as leader - with self.harness.hooks_disabled(): - self.harness.set_leader() + # Set the unit as primary. + _is_primary.return_value = True # Test when the archiving is not working correctly (pgBackRest check command fails). _execute_command.return_value = (49, "", "fake stderr") @@ -679,15 +724,23 @@ def test_check_stanza( self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) self.assertEqual(_member_started.call_count, 5) self.assertEqual(_reload_patroni_configuration.call_count, 5) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) self.assertIsInstance(self.charm.unit.status, BlockedStatus) self.assertEqual(self.charm.unit.status.message, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE) - # Test when the archiving is working correctly (pgBackRest check command succeeds). + # Test when the archiving is working correctly (pgBackRest check command succeeds) + # and the unit is not the leader. with self.harness.hooks_disabled(): self.harness.update_relation_data( self.peer_rel_id, self.charm.app.name, - {"init-pgbackrest": "True"}, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.unit.name, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, ) _execute_command.reset_mock() _update_config.reset_mock() @@ -696,15 +749,137 @@ def test_check_stanza( _execute_command.side_effect = None _execute_command.return_value = (0, "fake stdout", "") self.charm.backup.check_stanza() + _update_config.assert_called_once() + _member_started.assert_called_once() + _reload_patroni_configuration.assert_called_once() self.assertEqual( self.harness.get_relation_data(self.peer_rel_id, self.charm.app), - {}, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, ) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), + {"stanza": "test-stanza"}, + ) + self.assertIsInstance(self.charm.unit.status, ActiveStatus) + + # Test when the unit is the leader. + self.charm.unit.status = BlockedStatus("fake blocked state") + with self.harness.hooks_disabled(): + self.harness.set_leader() + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.app.name, + {"init-pgbackrest": "True"}, + ) + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.unit.name, + {"init-pgbackrest": "True"}, + ) + _update_config.reset_mock() + _member_started.reset_mock() + _reload_patroni_configuration.reset_mock() + self.charm.backup.check_stanza() _update_config.assert_called_once() _member_started.assert_called_once() _reload_patroni_configuration.assert_called_once() + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": "test-stanza"}, + ) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), + {"stanza": "test-stanza"}, + ) self.assertIsInstance(self.charm.unit.status, ActiveStatus) + def test_coordinate_stanza_fields(self): + # Add a new unit to the relation. + new_unit_name = "postgresql-k8s/1" + new_unit = Unit(new_unit_name, None, self.harness.charm.app._backend, {}) + self.harness.add_relation_unit(self.peer_rel_id, new_unit_name) + + # Test when the stanza name is neither in the application relation databag nor in the unit relation databag. + self.charm.backup.coordinate_stanza_fields() + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, new_unit), {}) + + # Test when the stanza name is in the unit relation databag but the unit is not the leader. + stanza_name = f"{self.charm.model.name}.{self.charm.app.name}" + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.peer_rel_id, new_unit_name, {"stanza": stanza_name, "init-pgbackrest": "True"} + ) + self.charm.backup.coordinate_stanza_fields() + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, new_unit), + {"stanza": stanza_name, "init-pgbackrest": "True"}, + ) + + # Test when the unit is the leader. + with self.harness.hooks_disabled(): + self.harness.set_leader() + self.charm.backup.coordinate_stanza_fields() + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": stanza_name, "init-pgbackrest": "True"}, + ) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, new_unit), + {"stanza": stanza_name, "init-pgbackrest": "True"}, + ) + + # Test when the stanza was already checked in the primary non-leader unit. + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.peer_rel_id, new_unit_name, {"init-pgbackrest": ""} + ) + self.charm.backup.coordinate_stanza_fields() + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": stanza_name}, + ) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, new_unit), {"stanza": stanza_name} + ) + + # Test when the "init-pgbackrest" flag was removed from the application relation databag + # and this is the unit that has the stanza name in the unit relation databag. + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.peer_rel_id, self.charm.unit.name, {"stanza": stanza_name} + ) + self.charm.backup.coordinate_stanza_fields() + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": stanza_name}, + ) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, new_unit), {"stanza": stanza_name} + ) + + # Test when the unit is not the leader. + with self.harness.hooks_disabled(): + self.harness.set_leader(False) + self.harness.update_relation_data( + self.peer_rel_id, self.charm.unit.name, {"stanza": stanza_name} + ) + self.charm.backup.coordinate_stanza_fields() + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": stanza_name}, + ) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, new_unit), {"stanza": stanza_name} + ) + @patch("charm.PostgreSQLBackups._execute_command") @patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock) @patch("charm.Patroni.get_primary") @@ -733,12 +908,14 @@ def test_is_primary_pgbackrest_service_running( @patch("charm.PostgreSQLBackups._initialise_stanza") @patch("charm.PostgreSQLBackups.can_use_s3_repository") @patch("charm.PostgreSQLBackups._create_bucket_if_not_exists") + @patch("charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock) @patch("charm.PostgreSQLBackups._render_pgbackrest_conf_file") @patch("ops.framework.EventBase.defer") def test_on_s3_credential_changed( self, _defer, _render_pgbackrest_conf_file, + _is_primary, _create_bucket_if_not_exists, _can_use_s3_repository, _initialise_stanza, @@ -764,6 +941,7 @@ def test_on_s3_credential_changed( {"cluster_initialised": "True"}, ) _render_pgbackrest_conf_file.return_value = False + _is_primary.return_value = False self.charm.backup.s3_client.on.credentials_changed.emit( relation=self.harness.model.get_relation(S3_PARAMETERS_RELATION, self.s3_rel_id) ) @@ -793,11 +971,9 @@ def test_on_s3_credential_changed( _can_use_s3_repository.assert_not_called() _initialise_stanza.assert_not_called() - with self.harness.hooks_disabled(): - self.harness.set_leader() - # Test when the charm render the pgBackRest configuration file, but fails to # access or create the S3 bucket. + _is_primary.return_value = True for error in [ ClientError( error_response={"Error": {"Code": 1, "message": "fake error"}}, @@ -853,6 +1029,37 @@ def test_on_s3_credential_gone(self): self.charm.backup._on_s3_credential_gone(None) self.assertIsInstance(self.charm.unit.status, ActiveStatus) + # Test removal of relation data when the unit is not the leader. + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.app.name, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.app.name, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.charm.backup._on_s3_credential_gone(None) + self.assertEqual( + self.harness.get_relation_data(self.peer_rel_id, self.charm.app), + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + + # Test removal of relation data when the unit is the leader. + with self.harness.hooks_disabled(): + self.harness.set_leader() + self.harness.update_relation_data( + self.peer_rel_id, + self.charm.unit.name, + {"stanza": "test-stanza", "init-pgbackrest": "True"}, + ) + self.charm.backup._on_s3_credential_gone(None) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.app), {}) + self.assertEqual(self.harness.get_relation_data(self.peer_rel_id, self.charm.unit), {}) + @patch("charm.PostgresqlOperatorCharm.update_config") @patch("charm.PostgreSQLBackups._change_connectivity_to_database") @patch("charm.PostgreSQLBackups._list_backups") diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 726ffc110f..b48ee31ad2 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1,5 +1,6 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. +import itertools import logging import platform import subprocess @@ -14,7 +15,13 @@ PostgreSQLUpdateUserPasswordError, ) from ops.framework import EventBase -from ops.model import ActiveStatus, BlockedStatus, RelationDataTypeError, WaitingStatus +from ops.model import ( + ActiveStatus, + BlockedStatus, + MaintenanceStatus, + RelationDataTypeError, + WaitingStatus, +) from ops.testing import Harness from parameterized import parameterized from tenacity import RetryError @@ -1295,18 +1302,32 @@ def test_on_cluster_topology_change_clear_blocked( @patch("charm.snap.SnapCache") @patch("charm.PostgresqlOperatorCharm._update_relation_endpoints") @patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock) + @patch("backups.PostgreSQLBackups.check_stanza") + @patch("backups.PostgreSQLBackups.coordinate_stanza_fields") + @patch("backups.PostgreSQLBackups.start_stop_pgbackrest_service") + @patch("charm.Patroni.reinitialize_postgresql") + @patch("charm.Patroni.member_replication_lag", new_callable=PropertyMock) + @patch("charm.PostgresqlOperatorCharm.is_primary") @patch("charm.Patroni.member_started", new_callable=PropertyMock) @patch("charm.Patroni.start_patroni") @patch("charm.PostgresqlOperatorCharm.update_config") @patch("charm.PostgresqlOperatorCharm._update_member_ip") @patch("charm.PostgresqlOperatorCharm._reconfigure_cluster") + @patch("ops.framework.EventBase.defer") def test_on_peer_relation_changed( self, + _defer, _reconfigure_cluster, _update_member_ip, _update_config, _start_patroni, _member_started, + _is_primary, + _member_replication_lag, + _reinitialize_postgresql, + _start_stop_pgbackrest_service, + _coordinate_stanza_fields, + _check_stanza, _primary_endpoint, _update_relation_endpoints, _, @@ -1382,6 +1403,50 @@ def test_on_peer_relation_changed( _update_relation_endpoints.assert_not_called() self.assertIsInstance(self.harness.model.unit.status, WaitingStatus) + # Test when Patroni has already started but this is a replica with a + # huge or unknown lag. + self.relation = self.harness.model.get_relation(self._peer_relation, self.rel_id) + _member_started.return_value = True + for values in itertools.product([True, False], ["0", "1000", "1001", "unknown"]): + _defer.reset_mock() + _check_stanza.reset_mock() + _start_stop_pgbackrest_service.reset_mock() + _is_primary.return_value = values[0] + _member_replication_lag.return_value = values[1] + self.charm.unit.status = ActiveStatus() + self.charm.on.database_peers_relation_changed.emit(self.relation) + if _is_primary.return_value == values[0] or int(values[1]) <= 1000: + _defer.assert_not_called() + _check_stanza.assert_called_once() + _start_stop_pgbackrest_service.assert_called_once() + self.assertIsInstance(self.charm.unit.status, ActiveStatus) + else: + _defer.assert_called_once() + _check_stanza.assert_not_called() + _start_stop_pgbackrest_service.assert_not_called() + self.assertIsInstance(self.charm.unit.status, MaintenanceStatus) + + # Test when it was not possible to start the pgBackRest service yet. + self.relation = self.harness.model.get_relation(self._peer_relation, self.rel_id) + _member_started.return_value = True + _defer.reset_mock() + _coordinate_stanza_fields.reset_mock() + _check_stanza.reset_mock() + _start_stop_pgbackrest_service.return_value = False + self.charm.on.database_peers_relation_changed.emit(self.relation) + _defer.assert_called_once() + _coordinate_stanza_fields.assert_not_called() + _check_stanza.assert_not_called() + + # Test the last calls been made when it was possible to start the + # pgBackRest service. + _defer.reset_mock() + _start_stop_pgbackrest_service.return_value = True + self.charm.on.database_peers_relation_changed.emit(self.relation) + _defer.assert_not_called() + _coordinate_stanza_fields.assert_called_once() + _check_stanza.assert_called_once() + @patch_network_get(private_address="1.1.1.1") @patch("charm.PostgresqlOperatorCharm._add_members") @patch("charm.PostgresqlOperatorCharm._remove_from_members_ips")