Skip to content

Commit

Permalink
[DPE-4533] Pause Patroni in the TLS test (#588)
Browse files Browse the repository at this point in the history
* Pause Patroni in the TLS test

* Wrong arg

* Disable verify

* Other wrong arg

* Retry stopping

* Split tls test

* Abort on failed rewind test

* Split off remove TLS test

* Idle in the rewind test

* Catch restart error

* manually start service

* Revert test tweaks

* Check tls relation when checking for certificates

* Update config when restarting leader

* Revert restart changes and juju 2 overkill

* Directly TERM the postgres process

* Ignore volume errors

* Unit tests

* Lint
  • Loading branch information
dragomirp authored Aug 2, 2024
1 parent 3e48255 commit 978d0ec
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 88 deletions.
18 changes: 13 additions & 5 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,15 +1293,20 @@ def _on_stop(self, _):
f"failed to patch k8s {type(resource).__name__} {resource.metadata.name}"
)

def _on_update_status(self, _) -> None:
"""Update the unit status message."""
def _on_update_status_early_exit_checks(self, container) -> bool:
if not self.upgrade.idle:
logger.debug("Early exit on_update_status: upgrade in progress")
return
return False

container = self.unit.get_container("postgresql")
if not container.can_connect():
logger.debug("on_update_status early exit: Cannot connect to container")
return False
return True

def _on_update_status(self, _) -> None:
"""Update the unit status message."""
container = self.unit.get_container("postgresql")
if not self._on_update_status_early_exit_checks(container):
return

if self._has_blocked_status or self._has_non_restore_waiting_status:
Expand All @@ -1325,7 +1330,10 @@ def _on_update_status(self, _) -> None:
logger.warning(
"%s pebble service inactive, restarting service" % self._postgresql_service
)
container.restart(self._postgresql_service)
try:
container.restart(self._postgresql_service)
except ChangeError:
logger.exception("Failed to restart patroni")
# If service doesn't recover fast, exit and wait for next hook run to re-check
if not self._patroni.member_started:
self.unit.status = MaintenanceStatus("Database service inactive, restarting")
Expand Down
21 changes: 17 additions & 4 deletions tests/integration/ha_tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,22 +100,30 @@ def get_patroni_cluster(unit_ip: str) -> Dict[str, str]:
return resp.json()


async def change_patroni_setting(ops_test: OpsTest, setting: str, value: int) -> None:
async def change_patroni_setting(
ops_test: OpsTest, setting: str, value: Union[int, str], tls: bool = False
) -> None:
"""Change the value of one of the Patroni settings.
Args:
ops_test: ops_test instance.
setting: the name of the setting.
value: the value to assign to the setting.
tls: if Patroni is serving using tls.
"""
if tls:
schema = "https"
else:
schema = "http"
for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
with attempt:
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)
unit_ip = await get_unit_address(ops_test, primary_name)
requests.patch(
f"http://{unit_ip}:8008/config",
f"{schema}://{unit_ip}:8008/config",
json={setting: value},
verify=not tls,
)


Expand Down Expand Up @@ -404,22 +412,27 @@ def get_host_ip(host: str) -> str:
return member_ips


async def get_patroni_setting(ops_test: OpsTest, setting: str) -> Optional[int]:
async def get_patroni_setting(ops_test: OpsTest, setting: str, tls: bool = False) -> Optional[int]:
"""Get the value of one of the integer Patroni settings.
Args:
ops_test: ops_test instance.
setting: the name of the setting.
tls: if Patroni is serving using tls.
Returns:
the value of the configuration or None if it's using the default value.
"""
if tls:
schema = "https"
else:
schema = "http"
for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
with attempt:
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)
unit_ip = await get_unit_address(ops_test, primary_name)
configuration_info = requests.get(f"http://{unit_ip}:8008/config")
configuration_info = requests.get(f"{schema}://{unit_ip}:8008/config", verify=not tls)
primary_start_timeout = configuration_info.json().get(setting)
return int(primary_start_timeout) if primary_start_timeout is not None else None

Expand Down
46 changes: 34 additions & 12 deletions tests/integration/test_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from tenacity import Retrying, stop_after_delay, wait_fixed

from . import architecture, markers
from .ha_tests.helpers import (
change_patroni_setting,
)
from .helpers import (
DATABASE_APP_NAME,
build_and_deploy,
Expand Down Expand Up @@ -55,7 +58,6 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
await build_and_deploy(ops_test, DATABASE_UNITS, wait_for_idle=False)


@pytest.mark.group(1)
async def check_tls_rewind(ops_test: OpsTest) -> None:
"""Checks if TLS was used by rewind."""
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
Expand All @@ -76,23 +78,16 @@ async def check_tls_rewind(ops_test: OpsTest) -> None:


@pytest.mark.group(1)
@markers.amd64_only # mattermost-k8s charm not available for arm64
async def test_mattermost_db(ops_test: OpsTest) -> None:
"""Deploy Mattermost to test the 'db' relation.
Mattermost needs TLS enabled on PostgreSQL to correctly connect to it.
Args:
ops_test: The ops test framework
"""
@pytest.mark.abort_on_fail
async def test_tls(ops_test: OpsTest) -> None:
async with ops_test.fast_forward():
# Deploy TLS Certificates operator.
await ops_test.model.deploy(
tls_certificates_app_name, config=tls_config, channel=tls_channel
)
# Relate it to the PostgreSQL to enable TLS.
await ops_test.model.relate(DATABASE_APP_NAME, tls_certificates_app_name)
await ops_test.model.wait_for_idle(status="active", timeout=1000)
await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False)

# Wait for all units enabling TLS.
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
Expand Down Expand Up @@ -122,7 +117,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", idle_period=30
)
# Pause Patroni so it doesn't wipe the custom changes
await change_patroni_setting(ops_test, "pause", True, tls=True)

async with ops_test.fast_forward("24h"):
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(2), reraise=True):
with attempt:
# Promote the replica to primary.
Expand Down Expand Up @@ -154,7 +152,13 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:

# Stop the initial primary.
logger.info(f"stopping database on {primary}")
await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql")
try:
await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql")
except Exception as e:
# pebble stop on juju 2 errors out and leaves dangling PG processes
if juju_major_version > 2:
raise e
await run_command_on_unit(ops_test, primary, "pkill --signal SIGTERM -x postgres")

# Check that the primary changed.
assert await primary_changed(ops_test, primary), "primary not changed"
Expand All @@ -163,12 +167,26 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True):
with attempt:
await check_tls_rewind(ops_test)
await change_patroni_setting(ops_test, "pause", False, tls=True)

async with ops_test.fast_forward():
# Await for postgresql to be stable if not already
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", idle_period=15
)


@pytest.mark.group(1)
@markers.amd64_only # mattermost-k8s charm not available for arm64
async def test_mattermost_db(ops_test: OpsTest) -> None:
"""Deploy Mattermost to test the 'db' relation.
Mattermost needs TLS enabled on PostgreSQL to correctly connect to it.
Args:
ops_test: The ops test framework
"""
async with ops_test.fast_forward():
# Deploy and check Mattermost user and database existence.
relation_id = await deploy_and_relate_application_with_postgresql(
ops_test, "mattermost-k8s", MATTERMOST_APP_NAME, APPLICATION_UNITS, status="waiting"
Expand All @@ -179,6 +197,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:

await check_database_users_existence(ops_test, mattermost_users, [])


@pytest.mark.group(1)
async def test_remove_tls(ops_test: OpsTest) -> None:
async with ops_test.fast_forward():
# Remove the relation.
await ops_test.model.applications[DATABASE_APP_NAME].remove_relation(
f"{DATABASE_APP_NAME}:certificates", f"{tls_certificates_app_name}:certificates"
Expand Down
Loading

0 comments on commit 978d0ec

Please sign in to comment.