From bd4b7dede95d0a487cb5dff58733f55a4217a178 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sun, 30 Jun 2024 21:44:03 +0300 Subject: [PATCH 01/10] Increase timeou and log unit that is still up --- tests/integration/ha_tests/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 98e0589b18..daa755f775 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -68,7 +68,7 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: pgrep_cmd = ("pgrep", "-x", process) try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(400), wait=wait_fixed(3)): with attempt: for unit in ops_test.model.applications[app].units: _, processes, _ = await ops_test.juju("ssh", unit.name, *pgrep_cmd) @@ -79,6 +79,7 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: # If something was returned, there is a running process. if len(processes) > 0: + logger.info("Unit %s not yey down" % unit.name) raise ProcessRunningError except RetryError: return False From 32f07a461686596ebdc23516e9edf6b9af4874c2 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 1 Jul 2024 00:27:31 +0300 Subject: [PATCH 02/10] Early fail --- tests/integration/ha_tests/helpers.py | 2 +- tests/integration/ha_tests/test_self_healing.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index daa755f775..3cd3297001 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -79,7 +79,7 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: # If something was returned, there is a running process. if len(processes) > 0: - logger.info("Unit %s not yey down" % unit.name) + logger.info("Unit %s not yet down" % unit.name) raise ProcessRunningError except RetryError: return False diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 63d5b5abaa..e958de09eb 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -95,6 +95,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_storage_re_use(ops_test, continuous_writes): """Verifies that database units with attached storage correctly repurpose storage. @@ -142,6 +143,7 @@ async def test_storage_re_use(ops_test, continuous_writes): @pytest.mark.group(1) +@pytest.mark.abort_on_fail @pytest.mark.parametrize("process", DB_PROCESSES) async def test_kill_db_process( ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout @@ -170,6 +172,7 @@ async def test_kill_db_process( @pytest.mark.group(1) +@pytest.mark.abort_on_fail @pytest.mark.parametrize("process", DB_PROCESSES) async def test_freeze_db_process( ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout @@ -208,6 +211,7 @@ async def test_freeze_db_process( @pytest.mark.group(1) +@pytest.mark.abort_on_fail @pytest.mark.parametrize("process", DB_PROCESSES) async def test_restart_db_process( ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout @@ -236,6 +240,7 @@ async def test_restart_db_process( @pytest.mark.group(1) +@pytest.mark.abort_on_fail @pytest.mark.parametrize("process", DB_PROCESSES) @pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"]) async def test_full_cluster_restart( @@ -304,6 +309,7 @@ async def test_full_cluster_restart( @pytest.mark.group(1) +@pytest.mark.abort_on_fail @pytest.mark.unstable async def test_forceful_restart_without_data_and_transaction_logs( ops_test: OpsTest, @@ -380,6 +386,7 @@ async def test_forceful_restart_without_data_and_transaction_logs( @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_timeout): """Completely cut and restore network.""" # Locate primary unit. @@ -468,6 +475,7 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_network_cut_without_ip_change( ops_test: OpsTest, continuous_writes, primary_start_timeout ): From 87b2415942fef9283913ede15b31c148b9f74f5f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 8 Jul 2024 19:01:57 +0300 Subject: [PATCH 03/10] Bump coverage --- pyproject.toml | 6 ------ tests/unit/test_cluster.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3f937de95c..e1c21292f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,12 +86,6 @@ allure-pytest-collection-report = {git = "https://github.com/canonical/data-plat [tool.coverage.run] branch = true -[tool.coverage.report] -show_missing = true -exclude_lines = [ - "logger\\.debug" -] - [tool.pytest.ini_options] minversion = "6.0" log_cli_level = "INFO" diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index dbe2cb3901..9edbc7022c 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -542,3 +542,16 @@ def test_member_inactive_error(peers_ips, patroni): assert patroni.member_inactive _get.assert_called_once_with("http://1.1.1.1:8008/health", verify=True, timeout=5) + + +def test_patroni_logs(harness, patroni): + with ( + patch("charm.snap.SnapCache") as _snap_cache, + ): + _cache = _snap_cache.return_value + _selected_snap = _cache.__getitem__.return_value + _selected_snap.logs.return_value = sentinel.logs + + assert patroni.patroni_logs() == sentinel.logs + + _selected_snap.logs.assert_called_once_with(services=["patroni"], num_lines=10) From 83419027a6d4e293f9c4d8842c8ed4f762a4e3e6 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 8 Jul 2024 19:12:04 +0300 Subject: [PATCH 04/10] Restore pyproj --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e1c21292f8..3f937de95c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,12 @@ allure-pytest-collection-report = {git = "https://github.com/canonical/data-plat [tool.coverage.run] branch = true +[tool.coverage.report] +show_missing = true +exclude_lines = [ + "logger\\.debug" +] + [tool.pytest.ini_options] minversion = "6.0" log_cli_level = "INFO" From c1d41f8653acc432184e5b57ca6aa838b5cccae9 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 8 Jul 2024 19:44:33 +0300 Subject: [PATCH 05/10] Bump coverage --- tests/unit/test_cluster.py | 93 ++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 9edbc7022c..a690d9a587 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -1,7 +1,6 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. -from unittest import TestCase from unittest.mock import MagicMock, Mock, PropertyMock, mock_open, patch, sentinel import pytest @@ -25,9 +24,6 @@ PATRONI_SERVICE = "patroni" CREATE_CLUSTER_CONF_PATH = "/var/snap/charmed-postgresql/current/etc/postgresql/postgresql.conf" -# used for assert functions -tc = TestCase() - # This method will be used by the mock to replace requests.get def mocked_requests_get(*args, **kwargs): @@ -90,7 +86,7 @@ def test_get_alternative_patroni_url(peers_ips, patroni): # Test the first URL that is returned (it should have the current unit IP). url = patroni._get_alternative_patroni_url(attempt) - tc.assertEqual(url, f"http://{patroni.unit_ip}:8008") + assert url == f"http://{patroni.unit_ip}:8008" # Test returning the other servers URLs. for attempt_number in range(attempt.retry_state.attempt_number + 1, len(peers_ips) + 2): @@ -106,8 +102,11 @@ def test_get_member_ip(peers_ips, patroni): ): # Test error on trying to get the member IP. _get_alternative_patroni_url.side_effect = "http://server2" - with tc.assertRaises(tenacity.RetryError): + try: patroni.get_member_ip(patroni.member_name) + assert False + except tenacity.RetryError: + pass # Test using an alternative Patroni URL. _get_alternative_patroni_url.side_effect = [ @@ -116,17 +115,17 @@ def test_get_member_ip(peers_ips, patroni): "http://server1", ] ip = patroni.get_member_ip(patroni.member_name) - tc.assertEqual(ip, "1.1.1.1") + assert ip == "1.1.1.1" # Test using the current Patroni URL. _get_alternative_patroni_url.side_effect = ["http://server1"] ip = patroni.get_member_ip(patroni.member_name) - tc.assertEqual(ip, "1.1.1.1") + assert ip == "1.1.1.1" # Test when not having that specific member in the cluster. _get_alternative_patroni_url.side_effect = ["http://server1"] ip = patroni.get_member_ip("other-member-name") - tc.assertIsNone(ip) + assert ip is None def test_get_patroni_health(peers_ips, patroni): @@ -143,12 +142,15 @@ def test_get_patroni_health(peers_ips, patroni): _stop_after_delay.assert_called_once_with(60) _wait_fixed.assert_called_once_with(7) - tc.assertEqual(health, {"state": "running"}) + assert health == {"state": "running"} # Test when the Patroni API is not reachable. _patroni_url.return_value = "http://server2" - with tc.assertRaises(tenacity.RetryError): + try: patroni.get_patroni_health() + assert False + except tenacity.RetryError: + pass def test_get_postgresql_version(peers_ips, patroni): @@ -161,7 +163,7 @@ def test_get_postgresql_version(peers_ips, patroni): ] version = patroni.get_postgresql_version() - tc.assertEqual(version, "14.0") + assert version == "14.0" _snap_client.assert_called_once_with() _get_installed_snaps.assert_called_once_with() @@ -173,8 +175,11 @@ def test_get_primary(peers_ips, patroni): ): # Test error on trying to get the member IP. _get_alternative_patroni_url.side_effect = "http://server2" - with tc.assertRaises(tenacity.RetryError): + try: patroni.get_primary(patroni.member_name) + assert False + except tenacity.RetryError: + pass # Test using an alternative Patroni URL. _get_alternative_patroni_url.side_effect = [ @@ -183,17 +188,17 @@ def test_get_primary(peers_ips, patroni): "http://server1", ] primary = patroni.get_primary() - tc.assertEqual(primary, "postgresql-0") + assert primary == "postgresql-0" # Test using the current Patroni URL. _get_alternative_patroni_url.side_effect = ["http://server1"] primary = patroni.get_primary() - tc.assertEqual(primary, "postgresql-0") + assert primary == "postgresql-0" # Test requesting the primary in the unit name pattern. _get_alternative_patroni_url.side_effect = ["http://server1"] primary = patroni.get_primary(unit_name_pattern=True) - tc.assertEqual(primary, "postgresql/0") + assert primary == "postgresql/0" def test_is_creating_backup(peers_ips, patroni): @@ -206,13 +211,13 @@ def test_is_creating_backup(peers_ips, patroni): {"name": "postgresql-1", "tags": {"is_creating_backup": True}}, ] } - tc.assertTrue(patroni.is_creating_backup) + assert patroni.is_creating_backup # Test when no member is creating a backup. response.json.return_value = { "members": [{"name": "postgresql-0"}, {"name": "postgresql-1"}] } - tc.assertFalse(patroni.is_creating_backup) + assert not patroni.is_creating_backup def test_is_replication_healthy(peers_ips, patroni): @@ -223,7 +228,7 @@ def test_is_replication_healthy(peers_ips, patroni): ): # Test when replication is healthy. _get.return_value.status_code = 200 - tc.assertTrue(patroni.is_replication_healthy) + assert patroni.is_replication_healthy # Test when replication is not healthy. _get.side_effect = [ @@ -231,7 +236,7 @@ def test_is_replication_healthy(peers_ips, patroni): MagicMock(status_code=200), MagicMock(status_code=503), ] - tc.assertFalse(patroni.is_replication_healthy) + assert not patroni.is_replication_healthy def test_is_member_isolated(peers_ips, patroni): @@ -243,15 +248,15 @@ def test_is_member_isolated(peers_ips, patroni): ): # Test when it wasn't possible to connect to the Patroni API. _patroni_url.return_value = "http://server3" - tc.assertFalse(patroni.is_member_isolated) + assert not patroni.is_member_isolated # Test when the member isn't isolated from the cluster. _patroni_url.return_value = "http://server1" - tc.assertFalse(patroni.is_member_isolated) + assert not patroni.is_member_isolated # Test when the member is isolated from the cluster. _patroni_url.return_value = "http://server4" - tc.assertTrue(patroni.is_member_isolated) + assert patroni.is_member_isolated def test_render_file(peers_ips, patroni): @@ -275,7 +280,7 @@ def test_render_file(peers_ips, patroni): patroni.render_file(filename, "rendered-content", 0o640) # Check the rendered file is opened with "w+" mode. - tc.assertEqual(mock.call_args_list[0][0], (filename, "w+")) + assert mock.call_args_list[0][0] == (filename, "w+") # Ensure that the correct user is lookup up. _pwnam.assert_called_with("snap_daemon") # Ensure the file is chmod'd correctly. @@ -336,7 +341,7 @@ def test_render_patroni_yml_file(peers_ips, patroni): patroni.render_patroni_yml_file() # Check the template is opened read-only in the call to open. - tc.assertEqual(mock.call_args_list[0][0], ("templates/patroni.yml.j2", "r")) + assert mock.call_args_list[0][0] == ("templates/patroni.yml.j2", "r") # Ensure the correct rendered template is sent to _render_file method. _render_file.assert_called_once_with( "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", @@ -555,3 +560,41 @@ def test_patroni_logs(harness, patroni): assert patroni.patroni_logs() == sentinel.logs _selected_snap.logs.assert_called_once_with(services=["patroni"], num_lines=10) + + # Snap error + _selected_snap.logs.side_effect = snap.SnapError + + assert patroni.patroni_logs() == "" + + +def test_last_postgresql_logs(harness, patroni): + with ( + patch("cluster.glob") as _glob, + patch("builtins.open") as _open, + ): + # Empty string if no globs + _glob.glob.return_value = [] + _open.return_value.__enter__().read.return_value = sentinel.log_contents + + assert patroni.last_postgresql_logs() == "" + + _glob.glob.assert_called_once_with(f"{POSTGRESQL_LOGS_PATH}/*.log") + _glob.glob.reset_mock() + + _glob.glob.return_value = ["2.log", "3.log", "1.log"] + assert patroni.last_postgresql_logs() == sentinel.log_contents + _open.assert_called_once_with("3.log", "r") + + # Empty on OSError + _open.side_effect = OSError + + assert patroni.last_postgresql_logs() == "" + + +def test_primary_changed(harness, patroni): + with ( + patch("cluster.Patroni.get_primary", return_value="new_primary") as _get_primary, + ): + assert patroni.primary_changed("old_primary") + + _get_primary.assert_called_once_with() From cc8b152b85ed09ea7d7a7ae7dd8b0e70d57e95c0 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 8 Jul 2024 19:49:39 +0300 Subject: [PATCH 06/10] Bump libs --- .../data_platform_libs/v0/data_interfaces.py | 25 ++++++++++++++++++- lib/charms/data_platform_libs/v0/upgrade.py | 4 +-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/charms/data_platform_libs/v0/data_interfaces.py b/lib/charms/data_platform_libs/v0/data_interfaces.py index 59a97226a4..a2162aa0ba 100644 --- a/lib/charms/data_platform_libs/v0/data_interfaces.py +++ b/lib/charms/data_platform_libs/v0/data_interfaces.py @@ -331,7 +331,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 37 +LIBPATCH = 38 PYDEPS = ["ops>=2.0.0"] @@ -2606,6 +2606,14 @@ def set_version(self, relation_id: int, version: str) -> None: """ self.update_relation_data(relation_id, {"version": version}) + def set_subordinated(self, relation_id: int) -> None: + """Raises the subordinated flag in the application relation databag. + + Args: + relation_id: the identifier for a particular relation. + """ + self.update_relation_data(relation_id, {"subordinated": "true"}) + class DatabaseProviderEventHandlers(EventHandlers): """Provider-side of the database relation handlers.""" @@ -2842,6 +2850,21 @@ def _on_relation_created_event(self, event: RelationCreatedEvent) -> None: def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: """Event emitted when the database relation has changed.""" + is_subordinate = False + remote_unit_data = None + for key in event.relation.data.keys(): + if isinstance(key, Unit) and not key.name.startswith(self.charm.app.name): + remote_unit_data = event.relation.data[key] + elif isinstance(key, Application) and key.name != self.charm.app.name: + is_subordinate = event.relation.data[key].get("subordinated") == "true" + + if is_subordinate: + if not remote_unit_data: + return + + if remote_unit_data.get("state") != "ready": + return + # Check which data has changed to emit customs events. diff = self._diff(event) diff --git a/lib/charms/data_platform_libs/v0/upgrade.py b/lib/charms/data_platform_libs/v0/upgrade.py index 18a58ff09e..4d909d644d 100644 --- a/lib/charms/data_platform_libs/v0/upgrade.py +++ b/lib/charms/data_platform_libs/v0/upgrade.py @@ -285,7 +285,7 @@ def restart(self, event) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 17 +LIBPATCH = 18 PYDEPS = ["pydantic>=1.10,<2", "poetry-core"] @@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None: self.charm.unit.status = WaitingStatus("other units upgrading first...") self.peer_relation.data[self.charm.unit].update({"state": "ready"}) - if self.charm.app.planned_units() == 1: + if len(self.app_units) == 1: # single unit upgrade, emit upgrade_granted event right away getattr(self.on, "upgrade_granted").emit() From 132146bbec52fa03904416d46181b3e1dbb671a9 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 8 Jul 2024 20:19:01 +0300 Subject: [PATCH 07/10] Bump coverage --- tests/unit/test_cluster.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index a690d9a587..ddc1c6e4f9 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -598,3 +598,23 @@ def test_primary_changed(harness, patroni): assert patroni.primary_changed("old_primary") _get_primary.assert_called_once_with() + + +def test_restart_patroni(harness, patroni): + with ( + patch("charm.snap.SnapCache") as _snap_cache, + ): + _cache = _snap_cache.return_value + _selected_snap = _cache.__getitem__.return_value + _selected_snap.services = { + "patroni": {"active": sentinel.restart}, + } + + assert patroni.restart_patroni() == sentinel.restart + + _selected_snap.restart.assert_called_once_with(services=["patroni"]) + + # Snap error + _selected_snap.restart.side_effect = snap.SnapError + + assert not patroni.restart_patroni() From c7c085df1a06c28068edafe3ae072396df1f560a Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 9 Jul 2024 13:06:38 +0300 Subject: [PATCH 08/10] Revert cluster test --- tests/unit/test_cluster.py | 126 ++++++++----------------------------- 1 file changed, 25 insertions(+), 101 deletions(-) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index ddc1c6e4f9..dbe2cb3901 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -1,6 +1,7 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. +from unittest import TestCase from unittest.mock import MagicMock, Mock, PropertyMock, mock_open, patch, sentinel import pytest @@ -24,6 +25,9 @@ PATRONI_SERVICE = "patroni" CREATE_CLUSTER_CONF_PATH = "/var/snap/charmed-postgresql/current/etc/postgresql/postgresql.conf" +# used for assert functions +tc = TestCase() + # This method will be used by the mock to replace requests.get def mocked_requests_get(*args, **kwargs): @@ -86,7 +90,7 @@ def test_get_alternative_patroni_url(peers_ips, patroni): # Test the first URL that is returned (it should have the current unit IP). url = patroni._get_alternative_patroni_url(attempt) - assert url == f"http://{patroni.unit_ip}:8008" + tc.assertEqual(url, f"http://{patroni.unit_ip}:8008") # Test returning the other servers URLs. for attempt_number in range(attempt.retry_state.attempt_number + 1, len(peers_ips) + 2): @@ -102,11 +106,8 @@ def test_get_member_ip(peers_ips, patroni): ): # Test error on trying to get the member IP. _get_alternative_patroni_url.side_effect = "http://server2" - try: + with tc.assertRaises(tenacity.RetryError): patroni.get_member_ip(patroni.member_name) - assert False - except tenacity.RetryError: - pass # Test using an alternative Patroni URL. _get_alternative_patroni_url.side_effect = [ @@ -115,17 +116,17 @@ def test_get_member_ip(peers_ips, patroni): "http://server1", ] ip = patroni.get_member_ip(patroni.member_name) - assert ip == "1.1.1.1" + tc.assertEqual(ip, "1.1.1.1") # Test using the current Patroni URL. _get_alternative_patroni_url.side_effect = ["http://server1"] ip = patroni.get_member_ip(patroni.member_name) - assert ip == "1.1.1.1" + tc.assertEqual(ip, "1.1.1.1") # Test when not having that specific member in the cluster. _get_alternative_patroni_url.side_effect = ["http://server1"] ip = patroni.get_member_ip("other-member-name") - assert ip is None + tc.assertIsNone(ip) def test_get_patroni_health(peers_ips, patroni): @@ -142,15 +143,12 @@ def test_get_patroni_health(peers_ips, patroni): _stop_after_delay.assert_called_once_with(60) _wait_fixed.assert_called_once_with(7) - assert health == {"state": "running"} + tc.assertEqual(health, {"state": "running"}) # Test when the Patroni API is not reachable. _patroni_url.return_value = "http://server2" - try: + with tc.assertRaises(tenacity.RetryError): patroni.get_patroni_health() - assert False - except tenacity.RetryError: - pass def test_get_postgresql_version(peers_ips, patroni): @@ -163,7 +161,7 @@ def test_get_postgresql_version(peers_ips, patroni): ] version = patroni.get_postgresql_version() - assert version == "14.0" + tc.assertEqual(version, "14.0") _snap_client.assert_called_once_with() _get_installed_snaps.assert_called_once_with() @@ -175,11 +173,8 @@ def test_get_primary(peers_ips, patroni): ): # Test error on trying to get the member IP. _get_alternative_patroni_url.side_effect = "http://server2" - try: + with tc.assertRaises(tenacity.RetryError): patroni.get_primary(patroni.member_name) - assert False - except tenacity.RetryError: - pass # Test using an alternative Patroni URL. _get_alternative_patroni_url.side_effect = [ @@ -188,17 +183,17 @@ def test_get_primary(peers_ips, patroni): "http://server1", ] primary = patroni.get_primary() - assert primary == "postgresql-0" + tc.assertEqual(primary, "postgresql-0") # Test using the current Patroni URL. _get_alternative_patroni_url.side_effect = ["http://server1"] primary = patroni.get_primary() - assert primary == "postgresql-0" + tc.assertEqual(primary, "postgresql-0") # Test requesting the primary in the unit name pattern. _get_alternative_patroni_url.side_effect = ["http://server1"] primary = patroni.get_primary(unit_name_pattern=True) - assert primary == "postgresql/0" + tc.assertEqual(primary, "postgresql/0") def test_is_creating_backup(peers_ips, patroni): @@ -211,13 +206,13 @@ def test_is_creating_backup(peers_ips, patroni): {"name": "postgresql-1", "tags": {"is_creating_backup": True}}, ] } - assert patroni.is_creating_backup + tc.assertTrue(patroni.is_creating_backup) # Test when no member is creating a backup. response.json.return_value = { "members": [{"name": "postgresql-0"}, {"name": "postgresql-1"}] } - assert not patroni.is_creating_backup + tc.assertFalse(patroni.is_creating_backup) def test_is_replication_healthy(peers_ips, patroni): @@ -228,7 +223,7 @@ def test_is_replication_healthy(peers_ips, patroni): ): # Test when replication is healthy. _get.return_value.status_code = 200 - assert patroni.is_replication_healthy + tc.assertTrue(patroni.is_replication_healthy) # Test when replication is not healthy. _get.side_effect = [ @@ -236,7 +231,7 @@ def test_is_replication_healthy(peers_ips, patroni): MagicMock(status_code=200), MagicMock(status_code=503), ] - assert not patroni.is_replication_healthy + tc.assertFalse(patroni.is_replication_healthy) def test_is_member_isolated(peers_ips, patroni): @@ -248,15 +243,15 @@ def test_is_member_isolated(peers_ips, patroni): ): # Test when it wasn't possible to connect to the Patroni API. _patroni_url.return_value = "http://server3" - assert not patroni.is_member_isolated + tc.assertFalse(patroni.is_member_isolated) # Test when the member isn't isolated from the cluster. _patroni_url.return_value = "http://server1" - assert not patroni.is_member_isolated + tc.assertFalse(patroni.is_member_isolated) # Test when the member is isolated from the cluster. _patroni_url.return_value = "http://server4" - assert patroni.is_member_isolated + tc.assertTrue(patroni.is_member_isolated) def test_render_file(peers_ips, patroni): @@ -280,7 +275,7 @@ def test_render_file(peers_ips, patroni): patroni.render_file(filename, "rendered-content", 0o640) # Check the rendered file is opened with "w+" mode. - assert mock.call_args_list[0][0] == (filename, "w+") + tc.assertEqual(mock.call_args_list[0][0], (filename, "w+")) # Ensure that the correct user is lookup up. _pwnam.assert_called_with("snap_daemon") # Ensure the file is chmod'd correctly. @@ -341,7 +336,7 @@ def test_render_patroni_yml_file(peers_ips, patroni): patroni.render_patroni_yml_file() # Check the template is opened read-only in the call to open. - assert mock.call_args_list[0][0] == ("templates/patroni.yml.j2", "r") + tc.assertEqual(mock.call_args_list[0][0], ("templates/patroni.yml.j2", "r")) # Ensure the correct rendered template is sent to _render_file method. _render_file.assert_called_once_with( "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", @@ -547,74 +542,3 @@ def test_member_inactive_error(peers_ips, patroni): assert patroni.member_inactive _get.assert_called_once_with("http://1.1.1.1:8008/health", verify=True, timeout=5) - - -def test_patroni_logs(harness, patroni): - with ( - patch("charm.snap.SnapCache") as _snap_cache, - ): - _cache = _snap_cache.return_value - _selected_snap = _cache.__getitem__.return_value - _selected_snap.logs.return_value = sentinel.logs - - assert patroni.patroni_logs() == sentinel.logs - - _selected_snap.logs.assert_called_once_with(services=["patroni"], num_lines=10) - - # Snap error - _selected_snap.logs.side_effect = snap.SnapError - - assert patroni.patroni_logs() == "" - - -def test_last_postgresql_logs(harness, patroni): - with ( - patch("cluster.glob") as _glob, - patch("builtins.open") as _open, - ): - # Empty string if no globs - _glob.glob.return_value = [] - _open.return_value.__enter__().read.return_value = sentinel.log_contents - - assert patroni.last_postgresql_logs() == "" - - _glob.glob.assert_called_once_with(f"{POSTGRESQL_LOGS_PATH}/*.log") - _glob.glob.reset_mock() - - _glob.glob.return_value = ["2.log", "3.log", "1.log"] - assert patroni.last_postgresql_logs() == sentinel.log_contents - _open.assert_called_once_with("3.log", "r") - - # Empty on OSError - _open.side_effect = OSError - - assert patroni.last_postgresql_logs() == "" - - -def test_primary_changed(harness, patroni): - with ( - patch("cluster.Patroni.get_primary", return_value="new_primary") as _get_primary, - ): - assert patroni.primary_changed("old_primary") - - _get_primary.assert_called_once_with() - - -def test_restart_patroni(harness, patroni): - with ( - patch("charm.snap.SnapCache") as _snap_cache, - ): - _cache = _snap_cache.return_value - _selected_snap = _cache.__getitem__.return_value - _selected_snap.services = { - "patroni": {"active": sentinel.restart}, - } - - assert patroni.restart_patroni() == sentinel.restart - - _selected_snap.restart.assert_called_once_with(services=["patroni"]) - - # Snap error - _selected_snap.restart.side_effect = snap.SnapError - - assert not patroni.restart_patroni() From e759af6e133fe83c71e2e1bcdeb5b81c4eedd04e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 9 Jul 2024 13:25:25 +0300 Subject: [PATCH 09/10] Try to rekill process --- tests/integration/ha_tests/helpers.py | 4 +++- tests/integration/ha_tests/test_self_healing.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 3cd3297001..ca2b2f6a54 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -59,7 +59,7 @@ class ProcessRunningError(Exception): """Raised when a process is running when it is not expected to be.""" -async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: +async def are_all_db_processes_down(ops_test: OpsTest, process: str, signal: str) -> bool: """Verifies that all units of the charm do not have the DB process running.""" app = await app_name(ops_test) if "/" in process: @@ -80,6 +80,8 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: # If something was returned, there is a running process. if len(processes) > 0: logger.info("Unit %s not yet down" % unit.name) + # Try to rekill the unit + await send_signal_to_process(ops_test, unit.name, process, signal) raise ProcessRunningError except RetryError: return False diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index e958de09eb..03c670f192 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -276,9 +276,10 @@ async def test_full_cluster_restart( # they come back online they operate as expected. This check verifies that we meet the criteria # of all replicas being down at the same time. try: - assert await are_all_db_processes_down( - ops_test, process - ), "Not all units down at the same time." + ( + await are_all_db_processes_down(ops_test, process, signal), + "Not all units down at the same time.", + ) finally: if process == PATRONI_PROCESS: awaits = [] From bfc33a6149e1dce9f055ce0d50c8bf92f9ed7eb0 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 10 Jul 2024 01:33:15 +0300 Subject: [PATCH 10/10] Revert removed assert --- tests/integration/ha_tests/test_self_healing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 03c670f192..f23094169b 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -276,10 +276,9 @@ async def test_full_cluster_restart( # they come back online they operate as expected. This check verifies that we meet the criteria # of all replicas being down at the same time. try: - ( - await are_all_db_processes_down(ops_test, process, signal), - "Not all units down at the same time.", - ) + assert await are_all_db_processes_down( + ops_test, process, signal + ), "Not all units down at the same time." finally: if process == PATRONI_PROCESS: awaits = []