From d027e5660384b46221649be0e7ea10f09fc9a97d Mon Sep 17 00:00:00 2001
From: Lucas Gameiro
Date: Fri, 26 Jul 2024 10:22:33 -0300
Subject: [PATCH] [DPE-4806] Restart pebble service if it's down (#581)

* restart pebble service

* Update warning message

Co-authored-by: Dragomir Penev <6687393+dragomirp@users.noreply.github.com>

* address comments

---------

Co-authored-by: Dragomir Penev <6687393+dragomirp@users.noreply.github.com>
---
 src/charm.py                                  | 14 ++++++
 .../integration/ha_tests/test_self_healing.py | 50 ++++++++-----------
 tests/integration/test_tls.py                 |  4 +-
 tests/unit/test_charm.py                      | 11 +++-
 4 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/src/charm.py b/src/charm.py
index aaba6a5ddf..4014617ffc 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -1286,6 +1286,20 @@ def _on_update_status(self, _) -> None:
             logger.debug("on_update_status early exit: Service has not been added nor started yet")
             return

+        if (
+            "restoring-backup" not in self.app_peer_data
+            and "stopped" not in self.unit_peer_data
+            and services[0].current != ServiceStatus.ACTIVE
+        ):
+            logger.warning(
+                "%s pebble service inactive, restarting service" % self._postgresql_service
+            )
+            container.restart(self._postgresql_service)
+            # If service doesn't recover fast, exit and wait for next hook run to re-check
+            if not self._patroni.member_started:
+                self.unit.status = MaintenanceStatus("Database service inactive, restarting")
+                return
+
         if "restoring-backup" in self.app_peer_data and not self._was_restore_successful(
             services[0]
         ):
diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py
index f181f8d7a6..d9f9a9594c 100644
--- a/tests/integration/ha_tests/test_self_healing.py
+++ b/tests/integration/ha_tests/test_self_healing.py
@@ -241,24 +241,25 @@ async def test_forceful_restart_without_data_and_transaction_logs(
         "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp"
     )

-    # Stop the systemd service on the primary unit.
-    logger.info(f"stopping database from {primary_name}")
-    await run_command_on_unit(ops_test, primary_name, "/charm/bin/pebble stop postgresql")
-
-    # Data removal runs within a script, so it allows `*` expansion.
-    logger.info(f"removing data from {primary_name}")
-    return_code, _, _ = await ops_test.juju(
-        "ssh",
-        primary_name,
-        "bash",
-        "/tmp/clean-data-dir.sh",
-    )
-    assert return_code == 0, "Failed to remove data directory"
-
-    # Wait some time to elect a new primary.
-    sleep(MEDIAN_ELECTION_TIME * 2)
+    # Halts the execution of the `update-status` hook so the pebble service doesn't get restarted
+    async with ops_test.fast_forward("24h"):
+        # Stop the systemd service on the primary unit.
+        logger.info(f"stopping database from {primary_name}")
+        await run_command_on_unit(ops_test, primary_name, "/charm/bin/pebble stop postgresql")
+
+        # Data removal runs within a script, so it allows `*` expansion.
+        logger.info(f"removing data from {primary_name}")
+        return_code, _, _ = await ops_test.juju(
+            "ssh",
+            primary_name,
+            "bash",
+            "/tmp/clean-data-dir.sh",
+        )
+        assert return_code == 0, "Failed to remove data directory"
+
+        # Wait some time to elect a new primary.
+        sleep(MEDIAN_ELECTION_TIME * 2)

-    async with ops_test.fast_forward():
         logger.info("checking whether writes are increasing")
         await are_writes_increasing(ops_test, primary_name)

@@ -298,17 +299,10 @@ async def test_forceful_restart_without_data_and_transaction_logs(
             new_files
         ), f"WAL segments weren't correctly rotated on {unit_name}"

-        # Start the systemd service in the old primary.
-        logger.info(f"starting database on {primary_name}")
-        for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3), reraise=True):
-            with attempt:
-                await run_command_on_unit(
-                    ops_test, primary_name, "/charm/bin/pebble start postgresql"
-                )
-
-        # Verify that the database service got restarted and is ready in the old primary.
-        logger.info(f"waiting for the database service to restart on {primary_name}")
-        assert await is_postgresql_ready(ops_test, primary_name)
+        # Database pebble service on the old primary should recover after an update-status run.
+        async with ops_test.fast_forward("10s"):
+            logger.info(f"Checking the database service on {primary_name}")
+            assert await is_postgresql_ready(ops_test, primary_name)

     await is_cluster_updated(ops_test, primary_name)

diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py
index be512a9c32..a2e3f249d2 100644
--- a/tests/integration/test_tls.py
+++ b/tests/integration/test_tls.py
@@ -159,9 +159,7 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
     # Check that the primary changed.
     assert await primary_changed(ops_test, primary), "primary not changed"

-    # Restart the initial primary and check the logs to ensure TLS is being used by pg_rewind.
-    logger.info(f"starting database on {primary}")
-    await run_command_on_unit(ops_test, primary, "/charm/bin/pebble start postgresql")
+    # Check the logs to ensure TLS is being used by pg_rewind.
     for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True):
         with attempt:
             await check_tls_rewind(ops_test)
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 10d1ab8c16..16d89ffb04 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -455,6 +455,7 @@ def test_on_update_status(harness):
         patch("charm.Patroni.member_started") as _member_started,
         patch("charm.Patroni.get_primary") as _get_primary,
         patch("ops.model.Container.pebble") as _pebble,
+        patch("ops.model.Container.restart", new_callable=PropertyMock),
         patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
     ):
         # Test before the PostgreSQL service is available.
@@ -464,12 +465,18 @@ def test_on_update_status(harness):
         _get_primary.assert_not_called()

         # Test when a failure need to be handled.
-        _pebble.get_services.return_value = ["service data"]
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
         _handle_processes_failures.return_value = True
         harness.charm.on.update_status.emit()
         _get_primary.assert_not_called()

+        # Test when the pebble service is inactive and needs to be restarted.
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)]
+        harness.charm.on.update_status.emit()
+        _get_primary.assert_not_called()
+
         # Check primary message not being set (current unit is not the primary).
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
         _handle_processes_failures.return_value = False
         _get_primary.side_effect = [
             "postgresql-k8s/1",
@@ -514,7 +521,7 @@ def test_on_update_status_with_error_on_get_primary(harness):
         patch("upgrade.PostgreSQLUpgrade.idle", return_value=True),
     ):
         # Mock the access to the list of Pebble services.
-        _pebble.get_services.return_value = ["service data"]
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]

         _get_primary.side_effect = [RetryError("fake error")]
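Editor's note: the core of this patch is the guard added to _on_update_status in src/charm.py, which restarts the Pebble service when Pebble reports it as inactive. The standalone sketch below shows the same pattern against the ops Pebble API; it is not the charm's literal code. The container/service name "postgresql", the "restoring-backup"/"stopped" peer-data keys, and the member_started flag are stand-ins taken from the diff above, and app_peer_data/unit_peer_data are properties of this charm rather than part of the ops base class.

# Sketch only, assuming a workload container and Pebble service both named "postgresql".
from ops.model import MaintenanceStatus
from ops.pebble import ServiceStatus

SERVICE = "postgresql"


def restart_service_if_inactive(charm, member_started: bool) -> None:
    """Restart the Pebble service when it is reported as not ACTIVE."""
    container = charm.unit.get_container(SERVICE)
    services = container.pebble.get_services(names=[SERVICE])
    if not services:
        # The service has not been added to the Pebble layer yet; nothing to restart.
        return

    if (
        "restoring-backup" not in charm.app_peer_data  # don't interrupt a restore
        and "stopped" not in charm.unit_peer_data  # unit was stopped on purpose
        and services[0].current != ServiceStatus.ACTIVE
    ):
        container.restart(SERVICE)
        if not member_started:
            # The service didn't come back quickly; surface a maintenance status
            # and let the next update-status run re-check.
            charm.unit.status = MaintenanceStatus("Database service inactive, restarting")

The integration test exercises this path by freezing update-status with ops_test.fast_forward("24h") while the service is stopped, then switching to fast_forward("10s") so the hook fires and performs the restart.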