diff --git a/src/cluster.py b/src/cluster.py
index 95b6417312..a4f891ba79 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -541,6 +541,23 @@ def is_member_isolated(self) -> bool:
 
         return len(cluster_status.json()["members"]) == 0
 
+    def are_replicas_up(self) -> dict[str, bool] | None:
+        """Check if cluster members are running or streaming."""
+        try:
+            response = requests.get(
+                f"{self._patroni_url}/cluster",
+                verify=self.verify,
+                auth=self._patroni_auth,
+                timeout=PATRONI_TIMEOUT,
+            )
+            return {
+                member["host"]: member["state"] in ["running", "streaming"]
+                for member in response.json()["members"]
+            }
+        except Exception:
+            logger.exception("Unable to get the state of the cluster")
+            return
+
     def promote_standby_cluster(self) -> None:
         """Promote a standby cluster to be a regular cluster."""
         config_response = requests.get(
diff --git a/src/relations/postgresql_provider.py b/src/relations/postgresql_provider.py
index 6b462124ba..0fa0c75d2f 100644
--- a/src/relations/postgresql_provider.py
+++ b/src/relations/postgresql_provider.py
@@ -184,6 +184,11 @@ def update_endpoints(self, event: DatabaseRequestedEvent = None) -> None:
         # If there are no replicas, remove the read-only endpoint.
         replicas_endpoint = list(self.charm.members_ips - {self.charm.primary_endpoint})
         replicas_endpoint.sort()
+        cluster_state = self.charm._patroni.are_replicas_up()
+        if cluster_state:
+            replicas_endpoint = [
+                replica for replica in replicas_endpoint if cluster_state.get(replica, False)
+            ]
         read_only_endpoints = (
             ",".join(f"{x}:{DATABASE_PORT}" for x in replicas_endpoint)
             if len(replicas_endpoint) > 0
diff --git a/tests/integration/new_relations/test_new_relations.py b/tests/integration/new_relations/test_new_relations.py
index 24307164eb..09cf292805 100644
--- a/tests/integration/new_relations/test_new_relations.py
+++ b/tests/integration/new_relations/test_new_relations.py
@@ -11,9 +11,19 @@
 import pytest
 import yaml
 from pytest_operator.plugin import OpsTest
+from tenacity import Retrying, stop_after_attempt, wait_fixed
 
 from .. import markers
-from ..helpers import CHARM_BASE, assert_sync_standbys, get_leader_unit, scale_application
+from ..helpers import (
+    CHARM_BASE,
+    assert_sync_standbys,
+    get_leader_unit,
+    get_machine_from_unit,
+    get_primary,
+    scale_application,
+    start_machine,
+    stop_machine,
+)
 from ..juju_ import juju_major_version
 from .helpers import (
     build_connection_string,
@@ -184,6 +194,35 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest):
             cursor.execute("DROP TABLE test;")
 
 
+@pytest.mark.group("new_relations_tests")
+@pytest.mark.abort_on_fail
+async def test_filter_out_degraded_replicas(ops_test: OpsTest):
+    primary = await get_primary(ops_test, f"{DATABASE_APP_NAME}/0")
+    replica = next(
+        unit.name
+        for unit in ops_test.model.applications[DATABASE_APP_NAME].units
+        if unit.name != primary
+    )
+    machine = await get_machine_from_unit(ops_test, replica)
+    await stop_machine(ops_test, machine)
+
+    # Topology observer runs every half a minute
+    await asyncio.sleep(60)
+
+    for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(3), reraise=True):
+        with attempt:
+            data = await get_application_relation_data(
+                ops_test,
+                APPLICATION_APP_NAME,
+                FIRST_DATABASE_RELATION_NAME,
+                "read-only-endpoints",
+            )
+            assert data is None
+
+    await start_machine(ops_test, machine)
+    await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=200)
+
+
 @pytest.mark.group("new_relations_tests")
 async def test_user_with_extra_roles(ops_test: OpsTest):
     """Test superuser actions and the request for more permissions."""
diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py
index 6951238597..3eb42242fc 100644
--- a/tests/unit/test_cluster.py
+++ b/tests/unit/test_cluster.py
@@ -831,3 +831,21 @@ def test_reinitialise_raft_data(patroni):
         _restart_patroni.assert_called_once_with()
         assert _psutil.process_iter.call_count == 2
         _psutil.process_iter.assert_any_call(["name"])
+
+
+def test_are_replicas_up(patroni):
+    with (
+        patch("requests.get") as _get,
+    ):
+        _get.return_value.json.return_value = {
+            "members": [
+                {"host": "1.1.1.1", "state": "running"},
+                {"host": "2.2.2.2", "state": "streaming"},
+                {"host": "3.3.3.3", "state": "other state"},
+            ]
+        }
+        assert patroni.are_replicas_up() == {"1.1.1.1": True, "2.2.2.2": True, "3.3.3.3": False}
+
+        # Return None on error
+        _get.side_effect = Exception
+        assert patroni.are_replicas_up() is None
diff --git a/tests/unit/test_postgresql_provider.py b/tests/unit/test_postgresql_provider.py
index ab8cf5d00c..2aaed01083 100644
--- a/tests/unit/test_postgresql_provider.py
+++ b/tests/unit/test_postgresql_provider.py
@@ -216,8 +216,12 @@ def test_update_endpoints_with_event(harness):
         patch(
             "charm.PostgresqlOperatorCharm.members_ips",
             new_callable=PropertyMock,
+            return_value={"1.1.1.1", "2.2.2.2"},
         ) as _members_ips,
         patch("charm.Patroni.get_primary", new_callable=PropertyMock) as _get_primary,
+        patch(
+            "charm.Patroni.are_replicas_up", return_value={"1.1.1.1": True, "2.2.2.2": True}
+        ) as _are_replicas_up,
         patch(
             "relations.postgresql_provider.DatabaseProvides.fetch_my_relation_data"
         ) as _fetch_my_relation_data,
@@ -227,7 +231,6 @@ def test_update_endpoints_with_event(harness):
         # Mock the members_ips list to simulate different scenarios
         # (with and without a replica).
         rel_id = harness.model.get_relation(RELATION_NAME).id
-        _members_ips.side_effect = [{"1.1.1.1", "2.2.2.2"}, {"1.1.1.1"}]
 
         # Add two different relations.
         rel_id = harness.add_relation(RELATION_NAME, "application")
@@ -259,6 +262,7 @@ def test_update_endpoints_with_event(harness):
         _fetch_my_relation_data.assert_called_once_with([2], ["password"])
 
         # Also test with only a primary instance.
+        _members_ips.return_value = {"1.1.1.1"}
         harness.charm.postgresql_client_relation.update_endpoints(mock_event)
         assert harness.get_relation_data(rel_id, harness.charm.app.name) == {
             "endpoints": "1.1.1.1:5432",
@@ -277,15 +281,18 @@ def test_update_endpoints_without_event(harness):
         patch(
             "charm.PostgresqlOperatorCharm.members_ips",
             new_callable=PropertyMock,
+            return_value={"1.1.1.1", "2.2.2.2"},
         ) as _members_ips,
         patch("charm.Patroni.get_primary", new_callable=PropertyMock) as _get_primary,
+        patch(
+            "charm.Patroni.are_replicas_up", return_value={"1.1.1.1": True, "2.2.2.2": True}
+        ) as _are_replicas_up,
         patch(
             "relations.postgresql_provider.DatabaseProvides.fetch_my_relation_data"
         ) as _fetch_my_relation_data,
     ):
         # Mock the members_ips list to simulate different scenarios
         # (with and without a replica).
-        _members_ips.side_effect = [{"1.1.1.1", "2.2.2.2"}, {"1.1.1.1", "2.2.2.2"}, {"1.1.1.1"}]
         rel_id = harness.model.get_relation(RELATION_NAME).id
 
         # Don't set data if no password
@@ -340,7 +347,41 @@ def test_update_endpoints_without_event(harness):
         }
         _fetch_my_relation_data.assert_called_once_with(None, ["password"])
 
+        # Filter out missing replica
+        _members_ips.return_value = {"1.1.1.1", "2.2.2.2", "3.3.3.3"}
+        harness.charm.postgresql_client_relation.update_endpoints()
+        assert harness.get_relation_data(rel_id, harness.charm.app.name) == {
+            "endpoints": "1.1.1.1:5432",
+            "read-only-endpoints": "2.2.2.2:5432",
+            "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db",
+            "tls": "False",
+        }
+        assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == {
+            "endpoints": "1.1.1.1:5432",
+            "read-only-endpoints": "2.2.2.2:5432",
+            "uris": "postgresql://relation-3:test_password@1.1.1.1:5432/test_db2",
+            "tls": "False",
+        }
+
+        # Don't filter if unable to get cluster status
+        _are_replicas_up.return_value = None
+        harness.charm.postgresql_client_relation.update_endpoints()
+        assert harness.get_relation_data(rel_id, harness.charm.app.name) == {
+            "endpoints": "1.1.1.1:5432",
+            "read-only-endpoints": "2.2.2.2:5432,3.3.3.3:5432",
+            "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db",
+            "tls": "False",
+        }
+        assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == {
+            "endpoints": "1.1.1.1:5432",
+            "read-only-endpoints": "2.2.2.2:5432,3.3.3.3:5432",
+            "uris": "postgresql://relation-3:test_password@1.1.1.1:5432/test_db2",
+            "tls": "False",
+        }
+
         # Also test with only a primary instance.
+        _members_ips.return_value = {"1.1.1.1"}
+        _are_replicas_up.return_value = {"1.1.1.1": True}
         harness.charm.postgresql_client_relation.update_endpoints()
         assert harness.get_relation_data(rel_id, harness.charm.app.name) == {
             "endpoints": "1.1.1.1:5432",
diff --git a/tox.ini b/tox.ini
index 39651f44eb..3f468e5999 100644
--- a/tox.ini
+++ b/tox.ini
@@ -6,13 +6,13 @@ no_package = True
 env_list = lint, unit
 
 [vars]
-src_path = {tox_root}/src
-tests_path = {tox_root}/tests
+src_path = "{tox_root}/src"
+tests_path = "{tox_root}/tests"
 all_path = {[vars]src_path} {[vars]tests_path}
 
 [testenv]
 set_env =
-    PYTHONPATH = {tox_root}/lib:{[vars]src_path}
+    PYTHONPATH = {tox_root}/lib:{tox_root}/src
     PY_COLORS = 1
 allowlist_externals =
     poetry
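# Illustrative sketch (not part of the diff above): a minimal, self-contained example of
# how a state map shaped like the return value of Patroni.are_replicas_up() prunes
# degraded members from the read-only endpoints, mirroring the filter added to
# update_endpoints(). The helper name build_read_only_endpoints, the hard-coded hosts,
# and the port value are assumptions for this example only.
DATABASE_PORT = "5432"  # assumed to match the charm's DATABASE_PORT constant


def build_read_only_endpoints(
    members_ips: set[str],
    primary_endpoint: str,
    cluster_state: dict[str, bool] | None,
) -> str:
    """Join the healthy replicas into a comma-separated read-only endpoint string."""
    replicas = sorted(members_ips - {primary_endpoint})
    # Filter only when the cluster state could be fetched; on None, keep every replica.
    if cluster_state:
        replicas = [replica for replica in replicas if cluster_state.get(replica, False)]
    return ",".join(f"{ip}:{DATABASE_PORT}" for ip in replicas)


# One replica streaming, one stopped: only the healthy replica is advertised.
print(
    build_read_only_endpoints(
        {"1.1.1.1", "2.2.2.2", "3.3.3.3"},
        "1.1.1.1",
        {"1.1.1.1": True, "2.2.2.2": True, "3.3.3.3": False},
    )
)  # -> 2.2.2.2:5432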