From 165d04cec15904dcb7143062118c23273b35e584 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 25 Jun 2024 06:02:46 -0300 Subject: [PATCH] Speed up bootstrap (#413) * Speed up bootstrap * Add unit tests * Sync poetry.lock from main * Add install and active log messages and set primary status message earlier * Enable and fix existing unit test Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 7 +++-- src/cluster.py | 2 +- src/upgrade.py | 6 ++-- tests/unit/test_charm.py | 60 +++++++++++++++++++++++++------------- tests/unit/test_cluster.py | 23 +++++++++++++++ tests/unit/test_upgrade.py | 35 ++++++++++++++++++++++ 6 files changed, 107 insertions(+), 26 deletions(-) diff --git a/src/charm.py b/src/charm.py index ce525bed36..f7f9675d59 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,6 +10,7 @@ import platform import subprocess import sys +from datetime import datetime from pathlib import Path from typing import Dict, List, Literal, Optional, Set, get_args @@ -352,7 +353,7 @@ def primary_endpoint(self) -> Optional[str]: logger.debug("primary endpoint early exit: Peer relation not joined yet.") return None try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(5), wait=wait_fixed(3)): with attempt: primary = self._patroni.get_primary() if primary is None and (standby_leader := self._patroni.get_standby_leader()): @@ -855,6 +856,7 @@ def _on_cluster_topology_change(self, _): def _on_install(self, event: InstallEvent) -> None: """Install prerequisites for the application.""" + logger.debug("Install start time: %s", datetime.now()) if not self._is_storage_attached(): self._reboot_on_detached_storage(event) return @@ -1162,7 +1164,8 @@ def _start_primary(self, event: StartEvent) -> None: # was fully initialised. self.enable_disable_extensions() - self.unit.status = ActiveStatus() + logger.debug("Active workload time: %s", datetime.now()) + self._set_primary_status_message() def _start_replica(self, event) -> None: """Configure the replica if the cluster was already initialised.""" diff --git a/src/cluster.py b/src/cluster.py index 29a42e87b5..14651400e9 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -362,7 +362,7 @@ def are_all_members_ready(self) -> bool: def get_patroni_health(self) -> Dict[str, str]: """Gets, retires and parses the Patroni health endpoint.""" - for attempt in Retrying(stop=stop_after_delay(90), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(7)): with attempt: r = requests.get( f"{self._patroni_url}/health", diff --git a/src/upgrade.py b/src/upgrade.py index f8fce9ab13..7fa435b651 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -115,8 +115,10 @@ def _on_upgrade_charm_check_legacy(self) -> None: peers_state = list(filter(lambda state: state != "", self.unit_states)) - if len(peers_state) == len(self.peer_relation.units) and ( - set(peers_state) == {"ready"} or len(peers_state) == 0 + if ( + len(peers_state) == len(self.peer_relation.units) + and (set(peers_state) == {"ready"} or len(peers_state) == 0) + and self.charm.is_cluster_initialised ): if self.charm._patroni.member_started: # All peers have set the state to ready diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index a0792250d8..e69dad14fd 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -4,6 +4,7 @@ import logging import platform import subprocess +from unittest import TestCase from unittest.mock import MagicMock, Mock, PropertyMock, call, mock_open, patch, sentinel import pytest @@ -37,6 +38,9 @@ CREATE_CLUSTER_CONF_PATH = "/etc/postgresql-common/createcluster.d/pgcharm.conf" +# used for assert functions +tc = TestCase() + @pytest.fixture(autouse=True) def harness(): @@ -165,7 +169,9 @@ def test_patroni_scrape_config_tls(harness): def test_primary_endpoint(harness): - with patch( + with patch("charm.stop_after_delay", new_callable=PropertyMock) as _stop_after_delay, patch( + "charm.wait_fixed", new_callable=PropertyMock + ) as _wait_fixed, patch( "charm.PostgresqlOperatorCharm._units_ips", new_callable=PropertyMock, return_value={"1.1.1.1", "1.1.1.2"}, @@ -174,6 +180,10 @@ def test_primary_endpoint(harness): _patroni.return_value.get_primary.return_value = sentinel.primary assert harness.charm.primary_endpoint == "1.1.1.1" + # Check needed to ensure a fast charm deployment. + _stop_after_delay.assert_called_once_with(5) + _wait_fixed.assert_called_once_with(3) + _patroni.return_value.get_member_ip.assert_called_once_with(sentinel.primary) _patroni.return_value.get_primary.assert_called_once_with() @@ -547,6 +557,9 @@ def test_enable_disable_extensions(harness, caplog): @patch_network_get(private_address="1.1.1.1") def test_on_start(harness): with ( + patch( + "charm.PostgresqlOperatorCharm._set_primary_status_message" + ) as _set_primary_status_message, patch( "charm.PostgresqlOperatorCharm.enable_disable_extensions" ) as _enable_disable_extensions, @@ -622,7 +635,7 @@ def test_on_start(harness): assert _postgresql.create_user.call_count == 4 # Considering the previous failed call. _oversee_users.assert_called_once() _enable_disable_extensions.assert_called_once() - assert isinstance(harness.model.unit.status, ActiveStatus) + _set_primary_status_message.assert_called_once() @patch_network_get(private_address="1.1.1.1") @@ -2256,16 +2269,21 @@ def test_update_new_unit_status(harness): handle_read_only_mode.assert_not_called() assert isinstance(harness.charm.unit.status, WaitingStatus) - @patch("charm.Patroni.member_started", new_callable=PropertyMock) - @patch("charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock) - @patch("charm.Patroni.get_primary") - def test_set_active_status(self, _get_primary, _is_standby_leader, _member_started): + +def test_set_primary_status_message(harness): + with ( + patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch( + "charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock + ) as _is_standby_leader, + patch("charm.Patroni.get_primary") as _get_primary, + ): for values in itertools.product( [ RetryError(last_attempt=1), ConnectionError, - self.charm.unit.name, - f"{self.charm.app.name}/2", + harness.charm.unit.name, + f"{harness.charm.app.name}/2", ], [ RetryError(last_attempt=1), @@ -2275,34 +2293,34 @@ def test_set_active_status(self, _get_primary, _is_standby_leader, _member_start ], [True, False], ): - self.charm.unit.status = MaintenanceStatus("fake status") + harness.charm.unit.status = MaintenanceStatus("fake status") _member_started.return_value = values[2] if isinstance(values[0], str): _get_primary.side_effect = None _get_primary.return_value = values[0] - if values[0] != self.charm.unit.name and not isinstance(values[1], bool): + if values[0] != harness.charm.unit.name and not isinstance(values[1], bool): _is_standby_leader.side_effect = values[1] _is_standby_leader.return_value = None - self.charm._set_active_status() - self.assertIsInstance(self.charm.unit.status, MaintenanceStatus) + harness.charm._set_primary_status_message() + tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) else: _is_standby_leader.side_effect = None _is_standby_leader.return_value = values[1] - self.charm._set_active_status() - self.assertIsInstance( - self.charm.unit.status, + harness.charm._set_primary_status_message() + tc.assertIsInstance( + harness.charm.unit.status, ActiveStatus - if values[0] == self.charm.unit.name or values[1] or values[2] + if values[0] == harness.charm.unit.name or values[1] or values[2] else MaintenanceStatus, ) - self.assertEqual( - self.charm.unit.status.message, + tc.assertEqual( + harness.charm.unit.status.message, "Primary" - if values[0] == self.charm.unit.name + if values[0] == harness.charm.unit.name else ("Standby" if values[1] else ("" if values[2] else "fake status")), ) else: _get_primary.side_effect = values[0] _get_primary.return_value = None - self.charm._set_active_status() - self.assertIsInstance(self.charm.unit.status, MaintenanceStatus) + harness.charm._set_primary_status_message() + tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 2aac95e8b1..dbe2cb3901 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -42,6 +42,7 @@ def json(self): "http://server1/cluster": { "members": [{"name": "postgresql-0", "host": "1.1.1.1", "role": "leader", "lag": "1"}] }, + "http://server1/health": {"state": "running"}, "http://server4/cluster": {"members": []}, } if args[0] in data: @@ -128,6 +129,28 @@ def test_get_member_ip(peers_ips, patroni): tc.assertIsNone(ip) +def test_get_patroni_health(peers_ips, patroni): + with patch("cluster.stop_after_delay", new_callable=PropertyMock) as _stop_after_delay, patch( + "cluster.wait_fixed", new_callable=PropertyMock + ) as _wait_fixed, patch( + "charm.Patroni._patroni_url", new_callable=PropertyMock + ) as _patroni_url, patch("requests.get", side_effect=mocked_requests_get) as _get: + # Test when the Patroni API is reachable. + _patroni_url.return_value = "http://server1" + health = patroni.get_patroni_health() + + # Check needed to ensure a fast charm deployment. + _stop_after_delay.assert_called_once_with(60) + _wait_fixed.assert_called_once_with(7) + + tc.assertEqual(health, {"state": "running"}) + + # Test when the Patroni API is not reachable. + _patroni_url.return_value = "http://server2" + with tc.assertRaises(tenacity.RetryError): + patroni.get_patroni_health() + + def test_get_postgresql_version(peers_ips, patroni): with patch("charm.snap.SnapClient") as _snap_client: # TODO test a real implementation diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py index fe8c74cf85..daad8a7044 100644 --- a/tests/unit/test_upgrade.py +++ b/tests/unit/test_upgrade.py @@ -60,6 +60,41 @@ def test_log_rollback(harness): ) +@pytest.mark.parametrize( + "unit_states,is_cluster_initialised,call", + [ + (["ready"], False, False), + (["ready", "ready"], True, False), + (["idle"], False, False), + (["idle"], True, False), + (["ready"], True, True), + ], +) +def test_on_upgrade_charm_check_legacy(harness, unit_states, is_cluster_initialised, call): + with ( + patch( + "charms.data_platform_libs.v0.upgrade.DataUpgrade.state", + new_callable=PropertyMock(return_value=None), + ) as _state, + patch( + "charms.data_platform_libs.v0.upgrade.DataUpgrade.unit_states", + new_callable=PropertyMock(return_value=unit_states), + ) as _unit_states, + patch( + "charm.PostgresqlOperatorCharm.is_cluster_initialised", + new_callable=PropertyMock(return_value=is_cluster_initialised), + ) as _is_cluster_initialised, + patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch( + "upgrade.PostgreSQLUpgrade._prepare_upgrade_from_legacy" + ) as _prepare_upgrade_from_legacy, + ): + with harness.hooks_disabled(): + harness.set_leader(True) + harness.charm.upgrade._on_upgrade_charm_check_legacy() + _member_started.assert_called_once() if call else _member_started.assert_not_called() + + @patch_network_get(private_address="1.1.1.1") def test_on_upgrade_granted(harness): with (