From f3998144a58ce4998b21523333933d241fd3aa34 Mon Sep 17 00:00:00 2001
From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com>
Date: Thu, 30 Nov 2023 16:44:08 +0200
Subject: [PATCH 1/2] Move dashboard legends to the bottom of the graph (#337)

---
 .../postgresql-metrics.json | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/grafana_dashboards/postgresql-metrics.json b/src/grafana_dashboards/postgresql-metrics.json
index 04990f0d78..400fb92d0d 100644
--- a/src/grafana_dashboards/postgresql-metrics.json
+++ b/src/grafana_dashboards/postgresql-metrics.json
@@ -1673,7 +1673,7 @@
         "current": true,
         "max": true,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": true,
@@ -1772,7 +1772,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "total": true,
         "values": true
@@ -1873,7 +1873,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sideWidth": null,
         "sort": "current",
@@ -1971,7 +1971,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": true,
@@ -2068,7 +2068,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": true,
@@ -2167,7 +2167,7 @@
         "hideEmpty": false,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": true,
@@ -2265,7 +2265,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "total",
         "sortDesc": true,
@@ -2362,7 +2362,7 @@
         "current": true,
         "max": true,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": false,
@@ -2459,7 +2459,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "sort": "current",
         "sortDesc": true,
@@ -2556,7 +2556,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "total": false,
         "values": true
@@ -2647,7 +2647,7 @@
         "current": true,
         "max": true,
         "min": true,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "total": false,
         "values": true
@@ -2766,7 +2766,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "total": true,
         "values": true
@@ -2864,7 +2864,7 @@
         "current": true,
         "max": false,
         "min": false,
-        "rightSide": true,
+        "rightSide": false,
         "show": true,
         "total": true,
         "values": true

From 31ca568f1332660a1c6e04aabb86a1376c8186c9 Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Neppel
Date: Thu, 30 Nov 2023 15:26:44 -0300
Subject: [PATCH 2/2] [DPE-2728] Handle scaling to zero units (#331)

* Handle scaling to zero units

Signed-off-by: Marcelo Henrique Neppel

* Update unit tests

Signed-off-by: Marcelo Henrique Neppel

* Remove unused constants

Signed-off-by: Marcelo Henrique Neppel

* Don't set unknown status

Signed-off-by: Marcelo Henrique Neppel

---------

Signed-off-by: Marcelo Henrique Neppel
---
 src/charm.py                                    |  29 ++++-
 src/relations/db.py                             |  11 ++
 src/relations/postgresql_provider.py            |  17 ++-
 .../integration/ha_tests/test_self_healing.py   |  41 +++++++
 tests/integration/helpers.py                    |  18 ++-
 tests/unit/test_charm.py                        | 107 ++++++++++++++----
 tests/unit/test_db.py                           |  49 ++++++++
 tests/unit/test_postgresql_provider.py          |  49 ++++++++
 8 files changed, 290 insertions(+), 31 deletions(-)
diff --git a/src/charm.py b/src/charm.py
index 2817830748..a9d1b3fbf6 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -42,6 +42,7 @@
     MaintenanceStatus,
     Relation,
     Unit,
+    UnknownStatus,
     WaitingStatus,
 )
 from ops.pebble import ChangeError, Layer, PathError, ProtocolError, ServiceStatus
@@ -498,12 +499,14 @@ def enable_disable_extensions(self, database: str = None) -> None:
                 continue
             extension = plugins_exception.get(extension, extension)
             extensions[extension] = enable
-        self.unit.status = WaitingStatus("Updating extensions")
+        if not isinstance(original_status, UnknownStatus):
+            self.unit.status = WaitingStatus("Updating extensions")
         try:
             self.postgresql.enable_disable_extensions(extensions, database)
         except PostgreSQLEnableDisableExtensionError as e:
             logger.exception("failed to change plugins: %s", str(e))
-        self.unit.status = original_status
+        if not isinstance(original_status, UnknownStatus):
+            self.unit.status = original_status
 
     def _add_members(self, event) -> None:
         """Add new cluster members.
@@ -603,6 +606,23 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
             self.set_secret(APP_SCOPE, MONITORING_PASSWORD_KEY, new_password())
 
         self._cleanup_old_cluster_resources()
+        client = Client()
+        try:
+            endpoint = client.get(Endpoints, name=self.cluster_name, namespace=self._namespace)
+            if "leader" not in endpoint.metadata.annotations:
+                patch = {
+                    "metadata": {
+                        "annotations": {"leader": self._unit_name_to_pod_name(self._unit)}
+                    }
+                }
+                client.patch(
+                    Endpoints, name=self.cluster_name, namespace=self._namespace, obj=patch
+                )
+                self.app_peer_data.pop("cluster_initialised", None)
+        except ApiError as e:
+            # Ignore the error only when the resource doesn't exist.
+            if e.status.code != 404:
+                raise e
 
         # Create resources and add labels needed for replication.
         try:
@@ -931,6 +951,11 @@ def _on_get_primary(self, event: ActionEvent) -> None:
             logger.error(f"failed to get primary with error {e}")
 
     def _on_stop(self, _):
+        # Remove data from the unit's peer databag when scaling down to zero to
+        # prevent the cluster from getting stuck when scaling back up.
+        if self.app.planned_units() == 0:
+            self.unit_peer_data.clear()
+
         # Patch the services to remove them when the StatefulSet is deleted
         # (i.e. application is removed).
         try:
diff --git a/src/relations/db.py b/src/relations/db.py
index 822dc4d1bd..d9d6a6b9ab 100644
--- a/src/relations/db.py
+++ b/src/relations/db.py
@@ -244,6 +244,13 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
             event.defer()
             return
 
+        # Set a flag to avoid deleting database users when this unit
+        # is removed and receives relation broken events from related applications.
+        # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
+        if event.departing_unit == self.charm.unit:
+            self.charm._peers.data[self.charm.unit].update({"departing": "True"})
+            return
+
         if not self.charm.unit.is_leader():
             return
 
@@ -275,6 +282,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
             event.defer()
             return
 
+        if "departing" in self.charm._peers.data[self.charm.unit]:
+            logger.debug("Early exit on_relation_broken: Skipping departing unit")
+            return
+
         if not self.charm.unit.is_leader():
             return
 
diff --git a/src/relations/postgresql_provider.py b/src/relations/postgresql_provider.py
index b97a7b0a47..e3b654b711 100644
--- a/src/relations/postgresql_provider.py
+++ b/src/relations/postgresql_provider.py
@@ -17,7 +17,7 @@
     PostgreSQLDeleteUserError,
     PostgreSQLGetPostgreSQLVersionError,
 )
-from ops.charm import CharmBase, RelationBrokenEvent
+from ops.charm import CharmBase, RelationBrokenEvent, RelationDepartedEvent
 from ops.framework import Object
 from ops.model import ActiveStatus, BlockedStatus, Relation
 
@@ -45,6 +45,9 @@ def __init__(self, charm: CharmBase, relation_name: str = "database") -> None:
         self.relation_name = relation_name
         super().__init__(charm, self.relation_name)
+        self.framework.observe(
+            charm.on[self.relation_name].relation_departed, self._on_relation_departed
+        )
         self.framework.observe(
             charm.on[self.relation_name].relation_broken, self._on_relation_broken
         )
@@ -128,6 +131,14 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None:
                 else f"Failed to initialize {self.relation_name} relation"
             )
 
+    def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
+        """Set a flag to avoid deleting database users when not wanted."""
+        # Set a flag to avoid deleting database users when this unit
+        # is removed and receives relation broken events from related applications.
+        # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
+        if event.departing_unit == self.charm.unit:
+            self.charm._peers.data[self.charm.unit].update({"departing": "True"})
+
     def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
         """Remove the user created for this relation."""
         # Check for some conditions before trying to access the PostgreSQL instance.
@@ -144,6 +155,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
 
         self._update_unit_status(event.relation)
 
+        if "departing" in self.charm._peers.data[self.charm.unit]:
+            logger.debug("Early exit on_relation_broken: Skipping departing unit")
+            return
+
         if not self.charm.unit.is_leader():
             return
 
diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py
index 0ac5fa74c5..50f113cd25 100644
--- a/tests/integration/ha_tests/test_self_healing.py
+++ b/tests/integration/ha_tests/test_self_healing.py
@@ -39,6 +39,7 @@
     get_password,
     get_unit_address,
     run_command_on_unit,
+    scale_application,
 )
 
 logger = logging.getLogger(__name__)
@@ -387,3 +388,43 @@ async def test_network_cut(
     ), "Connection is not possible after network restore"
 
     await is_cluster_updated(ops_test, primary_name)
+
+
+async def test_scaling_to_zero(ops_test: OpsTest, continuous_writes) -> None:
+    """Scale the database to zero units and scale up again."""
+    # Get the name of the database application.
+    app = await app_name(ops_test)
+
+    # Start an application that continuously writes data to the database.
+    await start_continuous_writes(ops_test, app)
+
+    # Scale the database to zero units.
+    logger.info("scaling database to zero units")
+    await scale_application(ops_test, app, 0)
+
+    # Scale the database to three units.
+    logger.info("scaling database to three units")
+    await scale_application(ops_test, app, 3)
+
+    # Verify all units are up and running.
+    logger.info("waiting for the database service to start in all units")
+    for unit in ops_test.model.applications[app].units:
+        assert await is_postgresql_ready(
+            ops_test, unit.name
+        ), f"unit {unit.name} not restarted after scaling back up."
+
+    logger.info("checking whether writes are increasing")
+    await are_writes_increasing(ops_test)
+
+    # Verify that all units are part of the same cluster.
+    logger.info("checking whether all units are part of the same cluster")
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [
+        await get_unit_address(ops_test, unit.name)
+        for unit in ops_test.model.applications[app].units
+    ]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
+    # Verify that no writes to the database were missed after stopping the writes.
+    logger.info("checking whether no writes to the database were missed after stopping the writes")
+    await check_writes(ops_test)
diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py
index 4b535d0e2d..02314e4c27 100644
--- a/tests/integration/helpers.py
+++ b/tests/integration/helpers.py
@@ -637,12 +637,18 @@ async def scale_application(ops_test: OpsTest, application_name: str, scale: int
         scale: The number of units to scale to
     """
     await ops_test.model.applications[application_name].scale(scale)
-    await ops_test.model.wait_for_idle(
-        apps=[application_name],
-        status="active",
-        timeout=1000,
-        wait_for_exact_units=scale,
-    )
+    if scale == 0:
+        await ops_test.model.block_until(
+            lambda: len(ops_test.model.applications[application_name].units) == scale,
+            timeout=1000,
+        )
+    else:
+        await ops_test.model.wait_for_idle(
+            apps=[application_name],
+            status="active",
+            timeout=1000,
+            wait_for_exact_units=scale,
+        )
 
 
 async def set_password(
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index e278b6b0bf..64f422b2ca 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -40,6 +40,7 @@ def setUp(self):
         self.addCleanup(self.harness.cleanup)
         self.harness.begin()
         self.charm = self.harness.charm
+        self._cluster_name = f"patroni-{self.charm.app.name}"
         self._context = {
             "namespace": self.harness.model.name,
             "app_name": self.harness.model.app.name,
@@ -51,27 +52,75 @@ def use_caplog(self, caplog):
         self._caplog = caplog
 
+    @patch("charm.PostgresqlOperatorCharm._add_members")
+    @patch("charm.Client")
     @patch("charm.new_password", return_value="sekr1t")
     @patch("charm.PostgresqlOperatorCharm.get_secret", return_value=None)
     @patch("charm.PostgresqlOperatorCharm.set_secret")
     @patch("charm.Patroni.reload_patroni_configuration")
     @patch("charm.PostgresqlOperatorCharm._patch_pod_labels")
     @patch("charm.PostgresqlOperatorCharm._create_services")
-    def test_on_leader_elected(self, _, __, ___, _set_secret, _get_secret, _____):
-        # Check that a new password was generated on leader election.
+    def test_on_leader_elected(self, _, __, ___, _set_secret, _get_secret, _____, _client, ______):
+        # Check that a new password was generated on leader election and nothing is done
+        # because the "leader" key is present in the endpoint annotations due to a scale
+        # down to zero units.
+        with self.harness.hooks_disabled():
+            self.harness.update_relation_data(
+                self.rel_id, self.charm.app.name, {"cluster_initialised": "True"}
+            )
+        _client.return_value.get.return_value = MagicMock(
+            metadata=MagicMock(annotations=["leader"])
+        )
+        _client.return_value.list.side_effect = [
+            [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
+            [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
+        ]
         self.harness.set_leader()
         assert _set_secret.call_count == 4
         _set_secret.assert_any_call("app", "operator-password", "sekr1t")
         _set_secret.assert_any_call("app", "replication-password", "sekr1t")
         _set_secret.assert_any_call("app", "rewind-password", "sekr1t")
         _set_secret.assert_any_call("app", "monitoring-password", "sekr1t")
+        _client.return_value.get.assert_called_once_with(
+            Endpoints, name=self._cluster_name, namespace=self.charm.model.name
+        )
+        _client.return_value.patch.assert_not_called()
+        self.assertIn(
+            "cluster_initialised", self.harness.get_relation_data(self.rel_id, self.charm.app)
+        )
 
-        # Trigger a new leader election and check that the password is still the same.
+        # Trigger a new leader election and check that the password is still the same,
+        # and that the charm fixes the missing "leader" key in the endpoint annotations.
+        _client.reset_mock()
+        _client.return_value.get.return_value = MagicMock(metadata=MagicMock(annotations=[]))
         _set_secret.reset_mock()
         _get_secret.return_value = "test"
         self.harness.set_leader(False)
         self.harness.set_leader()
         assert _set_secret.call_count == 0
+        _client.return_value.get.assert_called_once_with(
+            Endpoints, name=self._cluster_name, namespace=self.charm.model.name
+        )
+        _client.return_value.patch.assert_called_once_with(
+            Endpoints,
+            name=self._cluster_name,
+            namespace=self.charm.model.name,
+            obj={"metadata": {"annotations": {"leader": "postgresql-k8s-0"}}},
+        )
+        self.assertNotIn(
+            "cluster_initialised", self.harness.get_relation_data(self.rel_id, self.charm.app)
+        )
+
+        # Test a failure in fixing the "leader" key in the endpoint annotations.
+        _client.return_value.patch.side_effect = _FakeApiError
+        with self.assertRaises(_FakeApiError):
+            self.harness.set_leader(False)
+            self.harness.set_leader()
+
+        # Test no failure if the resource doesn't exist.
+        _client.return_value.patch.side_effect = _FakeApiError(404)
+        self.harness.set_leader(False)
+        self.harness.set_leader()
 
     @patch("charm.Patroni.rock_postgresql_version", new_callable=PropertyMock)
     @patch("charm.Patroni.primary_endpoint_ready", new_callable=PropertyMock)
@@ -653,28 +702,42 @@ def test_set_secret(self, _, __):
 
     @patch("charm.Client")
     def test_on_stop(self, _client):
         # Test a successful run of the hook.
-        with self.assertNoLogs("charm", "ERROR"):
-            _client.return_value.get.return_value = MagicMock(
-                metadata=MagicMock(ownerReferences="fakeOwnerReferences")
-            )
-            _client.return_value.list.side_effect = [
-                [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
-                [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
-            ]
-            self.charm.on.stop.emit()
-            _client.return_value.get.assert_called_once_with(
-                res=Pod, name="postgresql-k8s-0", namespace=self.charm.model.name
-            )
-            for kind in [Endpoints, Service]:
-                _client.return_value.list.assert_any_call(
-                    kind,
-                    namespace=self.charm.model.name,
-                    labels={"app.juju.is/created-by": self.charm.app.name},
+        for planned_units, relation_data in {
+            0: {},
+            1: {"some-relation-data": "some-value"},
+        }.items():
+            self.harness.set_planned_units(planned_units)
+            with self.harness.hooks_disabled():
+                self.harness.update_relation_data(
+                    self.rel_id,
+                    self.charm.unit.name,
+                    {"some-relation-data": "some-value"},
                 )
-            self.assertEqual(_client.return_value.apply.call_count, 2)
+            with self.assertNoLogs("charm", "ERROR"):
+                _client.return_value.get.return_value = MagicMock(
+                    metadata=MagicMock(ownerReferences="fakeOwnerReferences")
+                )
+                _client.return_value.list.side_effect = [
+                    [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
+                    [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
+                ]
+                self.charm.on.stop.emit()
+                _client.return_value.get.assert_called_once_with(
+                    res=Pod, name="postgresql-k8s-0", namespace=self.charm.model.name
+                )
+                for kind in [Endpoints, Service]:
+                    _client.return_value.list.assert_any_call(
+                        kind,
+                        namespace=self.charm.model.name,
+                        labels={"app.juju.is/created-by": self.charm.app.name},
+                    )
+                self.assertEqual(_client.return_value.apply.call_count, 2)
+                self.assertEqual(
+                    self.harness.get_relation_data(self.rel_id, self.charm.unit), relation_data
+                )
+            _client.reset_mock()
 
         # Test when the charm fails to get first pod info.
-        _client.reset_mock()
         _client.return_value.get.side_effect = _FakeApiError
         with self.assertLogs("charm", "ERROR") as logs:
             self.charm.on.stop.emit()
diff --git a/tests/unit/test_db.py b/tests/unit/test_db.py
index a3c217c83b..502c1a759c 100644
--- a/tests/unit/test_db.py
+++ b/tests/unit/test_db.py
@@ -9,6 +9,7 @@
     PostgreSQLCreateUserError,
     PostgreSQLGetPostgreSQLVersionError,
 )
+from ops import Unit
 from ops.framework import EventBase
 from ops.model import ActiveStatus, BlockedStatus
 from ops.testing import Harness
@@ -379,6 +380,54 @@ def test_update_unit_status(self, _has_blocked_status, _check_for_blocking_relat
         _check_for_blocking_relations.assert_called_once_with(relation.id)
         self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus)
 
+    @patch("charm.Patroni.member_started", new_callable=PropertyMock(return_value=True))
+    def test_on_relation_departed(self, _):
+        # Test when this unit is departing the relation (due to a scale down event).
+        self.assertNotIn(
+            "departing", self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        )
+        event = Mock()
+        event.relation.data = {self.harness.charm.app: {}, self.harness.charm.unit: {}}
+        event.departing_unit = self.harness.charm.unit
+        self.harness.charm.legacy_db_relation._on_relation_departed(event)
+        self.assertIn(
+            "departing", self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        )
+
+        # Test when another unit is departing the relation (due to the relation being broken between the apps).
+        with self.harness.hooks_disabled():
+            self.harness.update_relation_data(
+                self.peer_rel_id, self.harness.charm.unit.name, {"departing": ""}
+            )
+        event.relation.data = {self.harness.charm.app: {}, self.harness.charm.unit: {}}
+        event.departing_unit = Unit(
+            f"{self.harness.charm.app}/1", None, self.harness.charm.app._backend, {}
+        )
+        self.harness.charm.legacy_db_relation._on_relation_departed(event)
+        relation_data = self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        self.assertNotIn("departing", relation_data)
+
+    @patch("charm.Patroni.member_started", new_callable=PropertyMock(return_value=True))
+    def test_on_relation_broken(self, _member_started):
+        with self.harness.hooks_disabled():
+            self.harness.set_leader()
+        with patch.object(PostgresqlOperatorCharm, "postgresql", Mock()) as postgresql_mock:
+            # Test when the relation is broken between the apps and this unit is not departing.
+            event = Mock()
+            event.relation.id = self.rel_id
+            self.harness.charm.legacy_db_relation._on_relation_broken(event)
+            user = f"relation_id_{self.rel_id}"
+            postgresql_mock.delete_user.assert_called_once_with(user)
+
+            # Test when this unit is departing the relation (due to a scale down event).
+            postgresql_mock.reset_mock()
+            with self.harness.hooks_disabled():
+                self.harness.update_relation_data(
+                    self.peer_rel_id, self.harness.charm.unit.name, {"departing": "True"}
+                )
+            self.harness.charm.legacy_db_relation._on_relation_broken(event)
+            postgresql_mock.delete_user.assert_not_called()
+
     @patch(
         "charm.PostgresqlOperatorCharm.primary_endpoint",
         new_callable=PropertyMock,
diff --git a/tests/unit/test_postgresql_provider.py b/tests/unit/test_postgresql_provider.py
index 6638029455..bf8e5351b3 100644
--- a/tests/unit/test_postgresql_provider.py
+++ b/tests/unit/test_postgresql_provider.py
@@ -9,6 +9,7 @@
     PostgreSQLCreateUserError,
     PostgreSQLGetPostgreSQLVersionError,
 )
+from ops import Unit
 from ops.framework import EventBase
 from ops.model import ActiveStatus, BlockedStatus
 from ops.testing import Harness
@@ -158,3 +159,51 @@ def test_on_database_requested(self, _member_started, _defer, _new_password):
         # BlockedStatus due to a PostgreSQLGetPostgreSQLVersionError.
         self.request_database()
         self.assertTrue(isinstance(self.harness.model.unit.status, BlockedStatus))
+
+    @patch("charm.Patroni.member_started", new_callable=PropertyMock(return_value=True))
+    def test_on_relation_departed(self, _):
+        # Test when this unit is departing the relation (due to a scale down event).
+        self.assertNotIn(
+            "departing", self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        )
+        event = Mock()
+        event.relation.data = {self.harness.charm.app: {}, self.harness.charm.unit: {}}
+        event.departing_unit = self.harness.charm.unit
+        self.harness.charm.postgresql_client_relation._on_relation_departed(event)
+        self.assertIn(
+            "departing", self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        )
+
+        # Test when another unit is departing the relation (due to the relation being broken between the apps).
+        with self.harness.hooks_disabled():
+            self.harness.update_relation_data(
+                self.peer_rel_id, self.harness.charm.unit.name, {"departing": ""}
+            )
+        event.relation.data = {self.harness.charm.app: {}, self.harness.charm.unit: {}}
+        event.departing_unit = Unit(
+            f"{self.harness.charm.app}/1", None, self.harness.charm.app._backend, {}
+        )
+        self.harness.charm.postgresql_client_relation._on_relation_departed(event)
+        relation_data = self.harness.get_relation_data(self.peer_rel_id, self.harness.charm.unit)
+        self.assertNotIn("departing", relation_data)
+
+    @patch("charm.Patroni.member_started", new_callable=PropertyMock(return_value=True))
+    def test_on_relation_broken(self, _member_started):
+        with self.harness.hooks_disabled():
+            self.harness.set_leader()
+        with patch.object(PostgresqlOperatorCharm, "postgresql", Mock()) as postgresql_mock:
+            # Test when the relation is broken between the apps and this unit is not departing.
+            event = Mock()
+            event.relation.id = self.rel_id
+            self.harness.charm.postgresql_client_relation._on_relation_broken(event)
+            user = f"relation_id_{self.rel_id}"
+            postgresql_mock.delete_user.assert_called_once_with(user)
+
+            # Test when this unit is departing the relation (due to a scale down event).
+            postgresql_mock.reset_mock()
+            with self.harness.hooks_disabled():
+                self.harness.update_relation_data(
+                    self.peer_rel_id, self.harness.charm.unit.name, {"departing": "True"}
+                )
+            self.harness.charm.postgresql_client_relation._on_relation_broken(event)
+            postgresql_mock.delete_user.assert_not_called()
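
Note on the "departing" flag used in both relation handlers above: because of
https://bugs.launchpad.net/juju/+bug/1979811, a unit that is being scaled away
still receives relation-broken events for its remaining relations, so it cannot
tell "the relation was removed" apart from "this unit is going away" without
extra state. The handlers therefore record a "departing" marker in the unit's
peer databag on relation-departed and consult it on relation-broken. The
standalone sketch below restates that guard outside the charm; the class name,
the `_peers` attribute, and the cleanup placeholder are illustrative
assumptions, not code from this patch:

    import logging

    from ops.charm import CharmBase, RelationBrokenEvent, RelationDepartedEvent

    logger = logging.getLogger(__name__)


    class GuardedRelation:
        """Skips per-relation cleanup when this unit itself is being removed."""

        def __init__(self, charm: CharmBase, relation_name: str):
            self.charm = charm
            charm.framework.observe(
                charm.on[relation_name].relation_departed, self._on_relation_departed
            )
            charm.framework.observe(
                charm.on[relation_name].relation_broken, self._on_relation_broken
            )

        @property
        def _unit_databag(self):
            # Assumes the charm exposes its peer relation as `_peers`.
            return self.charm._peers.data[self.charm.unit]

        def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
            # Mark the flag only when *this* unit is the one leaving (a scale
            # down), not when a remote unit departs.
            if event.departing_unit == self.charm.unit:
                self._unit_databag.update({"departing": "True"})

        def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
            if "departing" in self._unit_databag:
                # This unit is being removed: skip cleanup so scaling down
                # (including to zero) does not delete users still in use.
                logger.debug("Early exit on_relation_broken: Skipping departing unit")
                return
            if not self.charm.unit.is_leader():
                return
            # Leader-only cleanup would go here, e.g. deleting the
            # f"relation_id_{event.relation.id}" database user.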
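Note on the scale_application change: wait_for_idle cannot be used when scaling
to zero because, with no units left, there is no workload status for the model
to settle on, so the helper falls back to polling the unit count with
block_until. A minimal usage sketch of the same pattern, assuming a deployed
application whose name is passed in (the helper name here is hypothetical):

    from pytest_operator.plugin import OpsTest


    async def scale_to_zero_and_back(ops_test: OpsTest, app: str) -> None:
        # Scale to zero: only the unit count can be awaited, since there is
        # no active/idle status left for wait_for_idle to converge on.
        await ops_test.model.applications[app].scale(0)
        await ops_test.model.block_until(
            lambda: len(ops_test.model.applications[app].units) == 0, timeout=1000
        )

        # Scale back up: the usual wait_for_idle applies again.
        await ops_test.model.applications[app].scale(3)
        await ops_test.model.wait_for_idle(
            apps=[app], status="active", timeout=1000, wait_for_exact_units=3
        )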