From e3e1533f441894056286499333fd78c6ee8c4b8d Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 4 Jul 2024 22:13:34 -0300 Subject: [PATCH] [DPE-4736] Improve async replication stability (#526) * Stabilise async replication tests Signed-off-by: Marcelo Henrique Neppel * Add unit test for _on_async_relation_broken Signed-off-by: Marcelo Henrique Neppel * Add unit test for _on_async_relation_departed Signed-off-by: Marcelo Henrique Neppel * Add unit test for _on_async_relation_created Signed-off-by: Marcelo Henrique Neppel --------- Signed-off-by: Marcelo Henrique Neppel --- src/relations/async_replication.py | 32 ++++-- tests/unit/test_async_replication.py | 164 +++++++++++++++++++++++++++ 2 files changed, 184 insertions(+), 12 deletions(-) create mode 100644 tests/unit/test_async_replication.py diff --git a/src/relations/async_replication.py b/src/relations/async_replication.py index 5b917cf646..a5cf8153a4 100644 --- a/src/relations/async_replication.py +++ b/src/relations/async_replication.py @@ -66,12 +66,12 @@ def __init__(self, charm): super().__init__(charm, "postgresql") self.charm = charm self.framework.observe( - self.charm.on[REPLICATION_OFFER_RELATION].relation_joined, - self._on_async_relation_joined, + self.charm.on[REPLICATION_OFFER_RELATION].relation_created, + self._on_async_relation_created, ) self.framework.observe( - self.charm.on[REPLICATION_CONSUMER_RELATION].relation_joined, - self._on_async_relation_joined, + self.charm.on[REPLICATION_CONSUMER_RELATION].relation_created, + self._on_async_relation_created, ) self.framework.observe( self.charm.on[REPLICATION_OFFER_RELATION].relation_changed, @@ -468,7 +468,7 @@ def is_primary_cluster(self) -> bool: return self.charm.app == self._get_primary_cluster() def _on_async_relation_broken(self, _) -> None: - if "departing" in self.charm._peers.data[self.charm.unit]: + if self.charm._peers is None or "departing" in self.charm._peers.data[self.charm.unit]: logger.debug("Early exit on_async_relation_broken: Skipping departing unit.") return @@ -526,19 +526,21 @@ def _on_async_relation_changed(self, event: RelationChangedEvent) -> None: if self._wait_for_standby_leader(event): return + if ( + not self.container.can_connect() + or len(self.container.pebble.get_services(names=[self.charm._postgresql_service])) == 0 + ): + logger.debug("Early exit on_async_relation_changed: container hasn't started yet.") + event.defer() + return + # Update the asynchronous replication configuration and start the database. self.charm.update_config() self.container.start(self.charm._postgresql_service) self._handle_database_start(event) - def _on_async_relation_departed(self, event: RelationDepartedEvent) -> None: - """Set a flag to avoid setting a wrong status message on relation broken event handler.""" - # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811. - if event.departing_unit == self.charm.unit: - self.charm._peers.data[self.charm.unit].update({"departing": "True"}) - - def _on_async_relation_joined(self, _) -> None: + def _on_async_relation_created(self, _) -> None: """Publish this unit address in the relation data.""" self._relation.data[self.charm.unit].update({"unit-address": self._get_unit_ip()}) @@ -549,6 +551,12 @@ def _on_async_relation_joined(self, _) -> None: "unit-promoted-cluster-counter": highest_promoted_cluster_counter }) + def _on_async_relation_departed(self, event: RelationDepartedEvent) -> None: + """Set a flag to avoid setting a wrong status message on relation broken event handler.""" + # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811. + if event.departing_unit == self.charm.unit: + self.charm._peers.data[self.charm.unit].update({"departing": "True"}) + def _on_create_replication(self, event: ActionEvent) -> None: """Set up asynchronous replication between two clusters.""" if self._get_primary_cluster() is not None: diff --git a/tests/unit/test_async_replication.py b/tests/unit/test_async_replication.py new file mode 100644 index 0000000000..75574cfa36 --- /dev/null +++ b/tests/unit/test_async_replication.py @@ -0,0 +1,164 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +from unittest.mock import patch + +import pytest +from ops.testing import Harness + +from charm import PostgresqlOperatorCharm +from constants import PEER + +RELATION_NAMES = ["replication-offer", "replication"] + + +@pytest.fixture(autouse=True) +def harness(): + with patch("charm.KubernetesServicePatch", lambda x, y: None): + harness = Harness(PostgresqlOperatorCharm) + + # Set up the initial relation and hooks. + harness.set_leader(True) + harness.begin() + + yield harness + harness.cleanup() + + +@pytest.mark.parametrize("relation_name", RELATION_NAMES) +@pytest.mark.parametrize("is_leader", [True, False]) +def test_on_async_relation_broken(harness, is_leader, relation_name): + with ( + patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, + patch( + "relations.async_replication.PostgreSQLAsyncReplication._set_app_status" + ) as _set_app_status, + patch("charm.Patroni.get_standby_leader") as _get_standby_leader, + patch( + "relations.async_replication.PostgreSQLAsyncReplication._on_async_relation_departed" + ) as _on_async_relation_departed, + ): + # Test before the peer relation is available. + with harness.hooks_disabled(): + harness.set_leader(is_leader) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + harness.add_relation_unit(rel_id, harness.charm.unit.name) + harness.remove_relation(rel_id) + _get_standby_leader.assert_not_called() + _set_app_status.assert_not_called() + + # Test the departing unit. + with harness.hooks_disabled(): + peer_rel_id = harness.add_relation(PEER, harness.charm.app.name) + harness.update_relation_data( + peer_rel_id, + harness.charm.app.name, + {"promoted-cluster-counter": "1"}, + ) + harness.update_relation_data( + peer_rel_id, + harness.charm.unit.name, + {"departing": "True", "stopped": "True", "unit-promoted-cluster-counter": "1"}, + ) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + harness.add_relation_unit(rel_id, harness.charm.unit.name) + harness.remove_relation(rel_id) + # assert harness.get_relation_data(peer_rel_id, harness.charm.app.name) == { + # "promoted-cluster-counter": ("0" if is_leader else "0")} + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == { + "departing": "True", + "stopped": "True", + "unit-promoted-cluster-counter": "1", + } + _get_standby_leader.assert_not_called() + _set_app_status.assert_not_called() + + # Test in a primary cluster. + with harness.hooks_disabled(): + _get_standby_leader.return_value = None + harness.update_relation_data( + peer_rel_id, + harness.charm.unit.name, + {"departing": "", "stopped": "True", "unit-promoted-cluster-counter": "1"}, + ) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + harness.add_relation_unit(rel_id, harness.charm.unit.name) + harness.remove_relation(rel_id) + assert harness.get_relation_data(peer_rel_id, harness.charm.app.name) == ( + {} if is_leader else {"promoted-cluster-counter": "1"} + ) + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == {} + _get_standby_leader.assert_called_once() + _update_config.assert_called_once() + + # Test in a standby cluster. + _update_config.reset_mock() + with harness.hooks_disabled(): + _get_standby_leader.return_value = harness.charm.unit.name + harness.update_relation_data( + peer_rel_id, + harness.charm.unit.name, + {"stopped": "True", "unit-promoted-cluster-counter": "1"}, + ) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + harness.add_relation_unit(rel_id, harness.charm.unit.name) + harness.remove_relation(rel_id) + assert harness.get_relation_data(peer_rel_id, harness.charm.app.name) == { + "promoted-cluster-counter": ("0" if is_leader else "1") + } + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == {} + assert _set_app_status.call_count == (1 if is_leader else 0) + _update_config.assert_not_called() + + +@pytest.mark.parametrize("relation_name", RELATION_NAMES) +def test_on_async_relation_created(harness, relation_name): + with ( + patch( + "relations.async_replication.PostgreSQLAsyncReplication._get_highest_promoted_cluster_counter_value", + side_effect=["0", "1"], + ) as _get_highest_promoted_cluster_counter_value, + patch( + "relations.async_replication.PostgreSQLAsyncReplication._get_unit_ip", + return_value="1.1.1.1", + ) as _get_unit_ip, + ): + # Test in a standby cluster. + with harness.hooks_disabled(): + peer_rel_id = harness.add_relation(PEER, harness.charm.app.name) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + assert harness.get_relation_data(rel_id, harness.charm.unit.name) == { + "unit-address": "1.1.1.1" + } + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == {} + + # Test in a primary cluster. + with harness.hooks_disabled(): + harness.update_relation_data(rel_id, harness.charm.unit.name, {"unit-address": ""}) + harness.remove_relation(rel_id) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + assert harness.get_relation_data(rel_id, harness.charm.unit.name) == { + "unit-address": "1.1.1.1" + } + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == { + "unit-promoted-cluster-counter": "1" + } + + +@pytest.mark.parametrize("relation_name", RELATION_NAMES) +def test_on_async_relation_departed(harness, relation_name): + # Test the departing unit. + with harness.hooks_disabled(): + peer_rel_id = harness.add_relation(PEER, harness.charm.app.name) + rel_id = harness.add_relation(relation_name, harness.charm.app.name) + harness.add_relation_unit(rel_id, harness.charm.unit.name) + harness.remove_relation_unit(rel_id, harness.charm.unit.name) + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == {"departing": "True"} + + # Test the non-departing unit. + other_unit = f"{harness.charm.app.name}/1" + with harness.hooks_disabled(): + harness.update_relation_data(peer_rel_id, harness.charm.unit.name, {"departing": ""}) + harness.add_relation_unit(rel_id, other_unit) + harness.remove_relation_unit(rel_id, other_unit) + assert harness.get_relation_data(peer_rel_id, harness.charm.unit.name) == {}