diff --git a/src/charm.py b/src/charm.py
index afc46729d9..c67a6894c1 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -405,7 +405,7 @@ def _on_get_primary(self, event: ActionEvent) -> None:
         except RetryError as e:
             logger.error(f"failed to get primary with error {e}")
 
-    def _updated_synchronous_node_count(self, num_units: int | None = None) -> bool:
+    def updated_synchronous_node_count(self, num_units: int | None = None) -> bool:
         """Tries to update synchronous_node_count configuration and reports the result."""
         try:
             self._patroni.update_synchronous_node_count(num_units)
@@ -439,7 +439,7 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
 
         if "cluster_initialised" not in self._peers.data[
             self.app
-        ] or not self._updated_synchronous_node_count(len(self._units_ips)):
+        ] or not self.updated_synchronous_node_count(len(self._units_ips)):
             logger.debug("Deferring on_peer_relation_departed: cluster not initialized")
             event.defer()
             return
diff --git a/src/cluster.py b/src/cluster.py
index 374964a5e5..2b27e5e159 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -647,7 +647,8 @@ def render_patroni_yml_file(
             stanza=stanza,
             restore_stanza=restore_stanza,
             version=self.get_postgresql_version().split(".")[0],
-            minority_count=self.planned_units // 2,
+            # -1 for leader
+            synchronous_node_count=self.planned_units - 1,
             pg_parameters=parameters,
             primary_cluster_endpoint=self.charm.async_replication.get_primary_cluster_endpoint(),
             extra_replication_endpoints=self.charm.async_replication.get_standby_endpoints(),
@@ -789,6 +790,7 @@ def remove_raft_member(self, member_ip: str) -> None:
             raise RemoveRaftMemberFailedError() from None
 
         if not result.startswith("SUCCESS"):
+            logger.debug("Remove raft member: Remove call not successful")
             raise RemoveRaftMemberFailedError()
 
     @retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=10))
@@ -860,7 +862,7 @@ def update_synchronous_node_count(self, units: int | None = None) -> None:
             with attempt:
                 r = requests.patch(
                     f"{self._patroni_url}/config",
-                    json={"synchronous_node_count": units // 2},
+                    json={"synchronous_node_count": units - 1},
                     verify=self.verify,
                     auth=self._patroni_auth,
                     timeout=PATRONI_TIMEOUT,
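The runtime half of this change is the PATCH against Patroni's `/config` endpoint shown above. As a minimal standalone sketch of that call (the URL, credentials, and TLS handling below are placeholders, not charm code):

```python
import requests

# Sketch of the dynamic-configuration update performed by
# update_synchronous_node_count(). URL and auth values are hypothetical.
PATRONI_URL = "https://10.0.0.1:8008"  # placeholder Patroni REST endpoint
AUTH = ("patroni", "secret")           # placeholder credentials

def set_synchronous_node_count(units: int) -> None:
    """PATCH Patroni's dynamic config so every non-leader unit is a sync standby."""
    r = requests.patch(
        f"{PATRONI_URL}/config",
        json={"synchronous_node_count": units - 1},  # -1 for the leader
        auth=AUTH,
        verify=False,  # the charm passes its own CA bundle here
        timeout=10,
    )
    r.raise_for_status()
```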
diff --git a/src/upgrade.py b/src/upgrade.py
index 629ba06fa8..2a20787981 100644
--- a/src/upgrade.py
+++ b/src/upgrade.py
@@ -146,6 +146,7 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:
         # Update the configuration.
         self.charm.unit.status = MaintenanceStatus("updating configuration")
         self.charm.update_config()
+        self.charm.updated_synchronous_node_count(len(self.charm._units_ips))
 
         self.charm.unit.status = MaintenanceStatus("refreshing the snap")
         self.charm._install_snap_packages(packages=SNAP_PACKAGES, refresh=True)
diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2
index bd3f87154a..1626926b84 100644
--- a/templates/patroni.yml.j2
+++ b/templates/patroni.yml.j2
@@ -59,7 +59,7 @@ bootstrap:
     retry_timeout: 10
     maximum_lag_on_failover: 1048576
     synchronous_mode: true
-    synchronous_node_count: {{ minority_count }}
+    synchronous_node_count: {{ synchronous_node_count }}
     postgresql:
       use_pg_rewind: true
       remove_data_directory_on_rewind_failure: true
diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py
index 57ddac6dd9..951659ee45 100644
--- a/tests/integration/ha_tests/helpers.py
+++ b/tests/integration/ha_tests/helpers.py
@@ -106,7 +106,7 @@ async def are_writes_increasing(
         with attempt:
             more_writes, _ = await count_writes(
                 ops_test,
-                down_unit=down_unit,
+                down_unit=down_units[0],
                 use_ip_from_inside=use_ip_from_inside,
                 extra_model=extra_model,
             )
diff --git a/tests/integration/ha_tests/test_replication.py b/tests/integration/ha_tests/test_replication.py
index dd924948da..3295bf71ce 100644
--- a/tests/integration/ha_tests/test_replication.py
+++ b/tests/integration/ha_tests/test_replication.py
@@ -63,13 +63,13 @@ async def test_reelection(ops_test: OpsTest, continuous_writes, primary_start_ti
     # Remove the primary unit.
     primary_name = await get_primary(ops_test, app)
-    await ops_test.model.destroy_units(
-        primary_name,
-    )
+    await ops_test.model.destroy_units(primary_name)
 
     # Wait and get the primary again (which can be any unit, including the previous primary).
-    async with ops_test.fast_forward():
-        await ops_test.model.wait_for_idle(apps=[app], status="active")
+    async with ops_test.fast_forward("60s"):
+        await ops_test.model.wait_for_idle(
+            apps=[app, APPLICATION_NAME], status="active", idle_period=30
+        )
 
     await are_writes_increasing(ops_test, primary_name)
diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py
index 2d587b38d5..49b88ee3a9 100644
--- a/tests/integration/helpers.py
+++ b/tests/integration/helpers.py
@@ -1043,7 +1043,6 @@ def switchover(
     )
     assert response.status_code == 200
     app_name = current_primary.split("/")[0]
-    minority_count = len(ops_test.model.applications[app_name].units) // 2
     for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(2), reraise=True):
         with attempt:
             response = requests.get(f"http://{primary_ip}:8008/cluster")
@@ -1051,7 +1050,7 @@
             standbys = len([
                 member for member in response.json()["members"] if member["role"] == "sync_standby"
             ])
-            assert standbys >= minority_count
+            assert standbys == len(ops_test.model.applications[app_name].units) - 1
 
 
 async def wait_for_idle_on_blocked(
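The updated `switchover` assertion above reads Patroni's `GET /cluster` endpoint and now expects every non-primary member to report the `sync_standby` role. A self-contained sketch of that counting logic (the sample payload mimics the endpoint's shape; the member names and roles are made up):

```python
# Count sync standbys in a Patroni GET /cluster response.
cluster_json = {
    "members": [
        {"name": "postgresql-0", "role": "leader"},
        {"name": "postgresql-1", "role": "sync_standby"},
        {"name": "postgresql-2", "role": "sync_standby"},
    ]
}

standbys = len([m for m in cluster_json["members"] if m["role"] == "sync_standby"])

# With synchronous_node_count = units - 1, all replicas must be synchronous:
assert standbys == len(cluster_json["members"]) - 1
```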
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 650dbe689d..4bbe46d23c 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -2189,7 +2189,7 @@ def test_on_peer_relation_departed(harness):
         patch("charm.Patroni.are_all_members_ready") as _are_all_members_ready,
         patch("charm.PostgresqlOperatorCharm._get_ips_to_remove") as _get_ips_to_remove,
         patch(
-            "charm.PostgresqlOperatorCharm._updated_synchronous_node_count"
+            "charm.PostgresqlOperatorCharm.updated_synchronous_node_count"
         ) as _updated_synchronous_node_count,
         patch("charm.Patroni.remove_raft_member") as _remove_raft_member,
         patch("charm.PostgresqlOperatorCharm._unit_ip") as _unit_ip,
diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py
index 9cb737f0bf..0d920bbbc9 100644
--- a/tests/unit/test_cluster.py
+++ b/tests/unit/test_cluster.py
@@ -345,7 +345,7 @@ def test_render_patroni_yml_file(peers_ips, patroni):
         rewind_user=REWIND_USER,
         rewind_password=rewind_password,
         version=postgresql_version,
-        minority_count=patroni.planned_units // 2,
+        synchronous_node_count=patroni.planned_units - 1,
         raft_password=raft_password,
         patroni_password=patroni_password,
     )
diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py
index 7dfbfc521b..97c4b5e560 100644
--- a/tests/unit/test_upgrade.py
+++ b/tests/unit/test_upgrade.py
@@ -106,6 +106,9 @@ def test_on_upgrade_granted(harness):
         patch("charm.Patroni.start_patroni") as _start_patroni,
         patch("charm.PostgresqlOperatorCharm._install_snap_packages") as _install_snap_packages,
         patch("charm.PostgresqlOperatorCharm.update_config") as _update_config,
+        patch(
+            "charm.PostgresqlOperatorCharm.updated_synchronous_node_count"
+        ) as _updated_synchronous_node_count,
     ):
         # Test when the charm fails to start Patroni.
         mock_event = MagicMock()
@@ -174,6 +177,7 @@
         _member_started.reset_mock()
         _cluster_members.reset_mock()
         mock_event.defer.reset_mock()
+        _updated_synchronous_node_count.reset_mock()
         _is_replication_healthy.return_value = True
         with harness.hooks_disabled():
             harness.set_leader(True)
@@ -184,6 +188,7 @@
         _set_unit_completed.assert_called_once()
         _set_unit_failed.assert_not_called()
         _on_upgrade_changed.assert_called_once()
+        _updated_synchronous_node_count.assert_called_once_with(2)
 
 
 def test_pre_upgrade_check(harness):
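Taken together, the diff changes the sizing rule for synchronous replication from a quorum-style `planned_units // 2` to `planned_units - 1`, i.e. every unit except the leader becomes a synchronous standby. A quick sanity check of the two formulas (the helper names below are illustrative, not charm code):

```python
# Compare the old and new synchronous_node_count values per cluster size.

def old_count(planned_units: int) -> int:
    # Previous rule: the "minority count", roughly half the cluster.
    return planned_units // 2

def new_count(planned_units: int) -> int:
    # New rule: all units except the leader.
    return planned_units - 1

for units in (1, 2, 3, 5):
    print(f"{units} units: old={old_count(units)}, new={new_count(units)}")
# 1 units: old=0, new=0
# 2 units: old=1, new=1
# 3 units: old=1, new=2
# 5 units: old=2, new=4
```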