Skip to content

Commit

Permalink
[DPE-4806] Restart pebble service if it's down (#581)
Browse files Browse the repository at this point in the history
* restart pebble service

* Update warning message

Co-authored-by: Dragomir Penev <[email protected]>

* address comments

---------

Co-authored-by: Dragomir Penev <[email protected]>
  • Loading branch information
lucasgameiroborges and dragomirp authored Jul 26, 2024
1 parent 88006ba commit d027e56
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 33 deletions.
14 changes: 14 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,20 @@ def _on_update_status(self, _) -> None:
logger.debug("on_update_status early exit: Service has not been added nor started yet")
return

if (
"restoring-backup" not in self.app_peer_data
and "stopped" not in self.unit_peer_data
and services[0].current != ServiceStatus.ACTIVE
):
logger.warning(
"%s pebble service inactive, restarting service" % self._postgresql_service
)
container.restart(self._postgresql_service)
# If service doesn't recover fast, exit and wait for next hook run to re-check
if not self._patroni.member_started:
self.unit.status = MaintenanceStatus("Database service inactive, restarting")
return

if "restoring-backup" in self.app_peer_data and not self._was_restore_successful(
services[0]
):
Expand Down
50 changes: 22 additions & 28 deletions tests/integration/ha_tests/test_self_healing.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,24 +241,25 @@ async def test_forceful_restart_without_data_and_transaction_logs(
"scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp"
)

# Stop the systemd service on the primary unit.
logger.info(f"stopping database from {primary_name}")
await run_command_on_unit(ops_test, primary_name, "/charm/bin/pebble stop postgresql")

# Data removal runs within a script, so it allows `*` expansion.
logger.info(f"removing data from {primary_name}")
return_code, _, _ = await ops_test.juju(
"ssh",
primary_name,
"bash",
"/tmp/clean-data-dir.sh",
)
assert return_code == 0, "Failed to remove data directory"

# Wait some time to elect a new primary.
sleep(MEDIAN_ELECTION_TIME * 2)
# Halts the execution of `update-status` hook so the pebble service doesn't get restarted
async with ops_test.fast_forward("24h"):
# Stop the systemd service on the primary unit.
logger.info(f"stopping database from {primary_name}")
await run_command_on_unit(ops_test, primary_name, "/charm/bin/pebble stop postgresql")

# Data removal runs within a script, so it allows `*` expansion.
logger.info(f"removing data from {primary_name}")
return_code, _, _ = await ops_test.juju(
"ssh",
primary_name,
"bash",
"/tmp/clean-data-dir.sh",
)
assert return_code == 0, "Failed to remove data directory"

# Wait some time to elect a new primary.
sleep(MEDIAN_ELECTION_TIME * 2)

async with ops_test.fast_forward():
logger.info("checking whether writes are increasing")
await are_writes_increasing(ops_test, primary_name)

Expand Down Expand Up @@ -298,17 +299,10 @@ async def test_forceful_restart_without_data_and_transaction_logs(
new_files
), f"WAL segments weren't correctly rotated on {unit_name}"

# Start the systemd service in the old primary.
logger.info(f"starting database on {primary_name}")
for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3), reraise=True):
with attempt:
await run_command_on_unit(
ops_test, primary_name, "/charm/bin/pebble start postgresql"
)

# Verify that the database service got restarted and is ready in the old primary.
logger.info(f"waiting for the database service to restart on {primary_name}")
assert await is_postgresql_ready(ops_test, primary_name)
# Database pebble service in old primary should recover after update-status run.
async with ops_test.fast_forward("10s"):
logger.info(f"Checking the database service on {primary_name}")
assert await is_postgresql_ready(ops_test, primary_name)

await is_cluster_updated(ops_test, primary_name)

Expand Down
4 changes: 1 addition & 3 deletions tests/integration/test_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,7 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
# Check that the primary changed.
assert await primary_changed(ops_test, primary), "primary not changed"

# Restart the initial primary and check the logs to ensure TLS is being used by pg_rewind.
logger.info(f"starting database on {primary}")
await run_command_on_unit(ops_test, primary, "/charm/bin/pebble start postgresql")
# Check the logs to ensure TLS is being used by pg_rewind.
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True):
with attempt:
await check_tls_rewind(ops_test)
Expand Down
11 changes: 9 additions & 2 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ def test_on_update_status(harness):
patch("charm.Patroni.member_started") as _member_started,
patch("charm.Patroni.get_primary") as _get_primary,
patch("ops.model.Container.pebble") as _pebble,
patch("ops.model.Container.restart", new_callable=PropertyMock),
patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
):
# Test before the PostgreSQL service is available.
Expand All @@ -464,12 +465,18 @@ def test_on_update_status(harness):
_get_primary.assert_not_called()

# Test when a failure need to be handled.
_pebble.get_services.return_value = ["service data"]
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_handle_processes_failures.return_value = True
harness.charm.on.update_status.emit()
_get_primary.assert_not_called()

# Test when a failure need to be handled.
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)]
harness.charm.on.update_status.emit()
_get_primary.assert_not_called()

# Check primary message not being set (current unit is not the primary).
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_handle_processes_failures.return_value = False
_get_primary.side_effect = [
"postgresql-k8s/1",
Expand Down Expand Up @@ -514,7 +521,7 @@ def test_on_update_status_with_error_on_get_primary(harness):
patch("upgrade.PostgreSQLUpgrade.idle", return_value=True),
):
# Mock the access to the list of Pebble services.
_pebble.get_services.return_value = ["service data"]
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]

_get_primary.side_effect = [RetryError("fake error")]

Expand Down

0 comments on commit d027e56

Please sign in to comment.