Skip to content

Commit

Permalink
[DPE-3591] Fix shared buffers validation (canonical#361)
Browse files Browse the repository at this point in the history
* Fix shared buffers validation

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Consider shared buffers config option value when calculating effective cache size value

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Fix PostgreSQL restart not being called

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Add and update unit test

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Update unit test

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Increase test timeout

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Increase test timeout

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Switch to GH runners

* Increase stanza timeout

* [MISC] Check for peer relation when getting the primary endpoint (canonical#366)

* Reenable landscape tests

* Try GH runners

* Back on self-hosted runner

* Don't check for primary if peer didn't join yet

* Don't recheck missing primary ip when looking for primary endpoint

* Catch list user exception

* Catch Psycopg2 error when trying to set config

* Try to preflight connection on config validation

* Don't retry db connection check

* Set the primary endpoint before starting the primary

* Revert "Set the primary endpoint before starting the primary"

This reverts commit 8184dfa.

* Wait for peer before starting primary

* Switch to GH runners

* Recheck DB connection

* Don't check peer on primary startup

* Try to skip primary IP validation if not all cluster IPs seem to be available

* Comment

* Review comments

* Unit tests

---------

Signed-off-by: Marcelo Henrique Neppel <[email protected]>
Co-authored-by: Dragomir Penev <[email protected]>
Co-authored-by: Dragomir Penev <[email protected]>
  • Loading branch information
3 people authored Feb 27, 2024
1 parent 3e9abaf commit 6d57972
Show file tree
Hide file tree
Showing 17 changed files with 166 additions and 63 deletions.
1 change: 0 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

81 changes: 64 additions & 17 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import os
import platform
import subprocess
import time
from typing import Dict, List, Literal, Optional, Set, get_args

from charms.data_platform_libs.v0.data_interfaces import DataPeer, DataPeerUnit
Expand All @@ -20,6 +19,7 @@
PostgreSQL,
PostgreSQLCreateUserError,
PostgreSQLEnableDisableExtensionError,
PostgreSQLListUsersError,
PostgreSQLUpdateUserPasswordError,
)
from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS
Expand All @@ -44,7 +44,7 @@
Unit,
WaitingStatus,
)
from tenacity import RetryError, Retrying, retry, stop_after_delay, wait_fixed
from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed

from backups import PostgreSQLBackups
from cluster import (
Expand Down Expand Up @@ -300,6 +300,9 @@ def postgresql(self) -> PostgreSQL:
@property
def primary_endpoint(self) -> Optional[str]:
"""Returns the endpoint of the primary instance or None when no primary available."""
if not self._peers:
logger.debug("primary endpoint early exit: Peer relation not joined yet.")
return None
try:
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
Expand All @@ -309,7 +312,20 @@ def primary_endpoint(self) -> Optional[str]:
# returned is not in the list of the current cluster members
# (like when the cluster was not updated yet after a failed switchover).
if not primary_endpoint or primary_endpoint not in self._units_ips:
raise ValueError()
# TODO figure out why peer data is not available
if (
primary_endpoint
and len(self._units_ips) == 1
and len(self._peers.units) > 1
):
logger.warning(
                            "Possibly incomplete peer data: Will not map primary IP to unit IP"
)
return primary_endpoint
logger.debug(
"primary endpoint early exit: Primary IP not in cached peer list."
)
primary_endpoint = None
except RetryError:
return None
else:
Expand Down Expand Up @@ -1037,6 +1053,10 @@ def _start_primary(self, event: StartEvent) -> None:
logger.exception(e)
self.unit.status = BlockedStatus("Failed to create postgres user")
return
except PostgreSQLListUsersError:
            logger.warning("Deferring on_start: Unable to list users")
event.defer()
return

self.postgresql.set_up_database()

Expand Down Expand Up @@ -1382,6 +1402,17 @@ def _is_workload_running(self) -> bool:

return charmed_postgresql_snap.services["patroni"]["active"]

@property
def _can_connect_to_postgresql(self) -> bool:
    """Return whether PostgreSQL is currently reachable.

    Probes the database with a lightweight query (listing the available
    timezones), retrying every 3 seconds for up to 30 seconds before
    giving up.

    Returns:
        True if a connection could be established, False otherwise.
    """
    try:
        for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
            with attempt:
                # Use an explicit raise instead of a bare assert: asserts
                # are stripped when Python runs with -O, which would
                # silently disable this connectivity check. Any exception
                # raised here (including connection errors from the query
                # itself) triggers a tenacity retry, same as before.
                if not self.postgresql.get_postgresql_timezones():
                    raise ValueError("Unable to fetch PostgreSQL timezones")
    except RetryError:
        logger.debug("Cannot connect to database")
        return False
    return True

def update_config(self, is_creating_backup: bool = False) -> bool:
"""Updates Patroni config file based on the existence of the TLS files."""
if (
Expand Down Expand Up @@ -1425,6 +1456,10 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
logger.debug("Early exit update_config: Patroni not started yet")
return False

# Try to connect
if not self._can_connect_to_postgresql:
logger.warning("Early exit update_config: Cannot connect to Postgresql")
return False
self._validate_config_options()

self._patroni.bulk_update_parameters_controller_by_patroni(
Expand All @@ -1434,20 +1469,7 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
}
)

restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
self._patroni.reload_patroni_configuration()
# Sleep the same time as Patroni's loop_wait default value, which tells how much time
# Patroni will wait before checking the configuration file again to reload it.
time.sleep(10)
restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending()
self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""})

# Restart PostgreSQL if TLS configuration has changed
# (so the both old and new connections use the configuration).
if restart_postgresql:
logger.info("PostgreSQL restart required")
self._peers.data[self.unit].pop("postgresql_restarted", None)
self.on[self.restart_manager.name].acquire_lock.emit()
self._handle_postgresql_restart_need(enable_tls)

# Restart the monitoring service if the password was rotated
cache = snap.SnapCache()
Expand Down Expand Up @@ -1487,6 +1509,31 @@ def _validate_config_options(self) -> None:
):
raise Exception("request_time_zone config option has an invalid value")

def _handle_postgresql_restart_need(self, enable_tls: bool) -> None:
    """Handle PostgreSQL restart need based on the TLS configuration and configuration changes.

    Reloads the Patroni configuration, then polls for a pending restart and,
    if one is needed (either because the TLS state changed or because Patroni
    reports a restart pending), emits the restart lock event so the rolling
    restart machinery takes over.

    Args:
        enable_tls: whether TLS should be marked as enabled in this unit's
            peer databag after the configuration reload.
    """
    # A TLS state mismatch between the charm and the running instance always
    # requires a restart (so both old and new connections use the new config).
    restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
    self._patroni.reload_patroni_configuration()
    # Wait for some more time than the Patroni's loop_wait default value (10 seconds),
    # which tells how much time Patroni will wait before checking the configuration
    # file again to reload it.
    try:
        for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)):
            with attempt:
                restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending()
                # Raising makes tenacity retry the pending-restart check
                # until it flips to True or the attempts are exhausted —
                # the exception is control flow, not an error signal.
                if not restart_postgresql:
                    raise Exception
    except RetryError:
        # Ignore the error, as it happens only to indicate that the configuration has not changed.
        pass
    self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""})

    # Restart PostgreSQL if TLS configuration has changed
    # (so the both old and new connections use the configuration).
    if restart_postgresql:
        logger.info("PostgreSQL restart required")
        # Clear the restart marker so the restart manager treats this as a
        # fresh restart request before emitting the lock-acquisition event.
        self._peers.data[self.unit].pop("postgresql_restarted", None)
        self.on[self.restart_manager.name].acquire_lock.emit()

def _update_relation_endpoints(self) -> None:
"""Updates endpoints and read-only endpoint in all relations."""
self.postgresql_client_relation.update_endpoints()
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ async def add_unit_with_storage(ops_test, app, storage):
return_code, _, _ = await ops_test.juju(*add_unit_cmd)
assert return_code == 0, "Failed to add unit with storage"
async with ops_test.fast_forward():
await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1500)
await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=2000)
assert (
len(ops_test.model.applications[app].units) == expected_units
), "New unit not added to model"
Expand Down
1 change: 0 additions & 1 deletion tests/integration/ha_tests/test_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
)


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/ha_tests/test_restore_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
charm = None


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/ha_tests/test_self_healing.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@
DB_PROCESSES = [POSTGRESQL_PROCESS, PATRONI_PROCESS]


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/ha_tests/test_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
TIMEOUT = 600


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_deploy_latest(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/ha_tests/test_upgrade_from_stable.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
TIMEOUT = 600


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_deploy_stable(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/new_relations/test_new_relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_deploy_charms(ops_test: OpsTest, charm):
Expand Down
3 changes: 1 addition & 2 deletions tests/integration/test_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ async def cloud_configs(ops_test: OpsTest, github_secrets) -> None:
bucket_object.delete()


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
async def test_none() -> None:
    """Empty test so that the suite will not fail if all tests are skipped."""
Expand Down Expand Up @@ -133,7 +132,7 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No
await action.wait()
async with ops_test.fast_forward(fast_interval="60s"):
await ops_test.model.wait_for_idle(
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1200
)

primary = await get_primary(ops_test, f"{database_app_name}/0")
Expand Down
3 changes: 1 addition & 2 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
UNIT_IDS = [0, 1, 2]


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
Expand Down Expand Up @@ -313,7 +312,7 @@ async def test_persist_data_through_primary_deletion(ops_test: OpsTest):

# Add the unit again.
await ops_test.model.applications[DATABASE_APP_NAME].add_unit(count=1)
await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1500)
await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=2000)

# Testing write occurred to every postgres instance by reading from them
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
)


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
async def test_mailman3_core_db(ops_test: OpsTest, charm: str) -> None:
"""Deploy Mailman3 Core to test the 'db' relation."""
Expand Down
2 changes: 0 additions & 2 deletions tests/integration/test_db_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@
RELATION_NAME = "db-admin"


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.skip(reason="DB Admin tests are currently broken")
async def test_landscape_scalable_bundle_db(ops_test: OpsTest, charm: str) -> None:
"""Deploy Landscape Scalable Bundle to test the 'db-admin' relation."""
await ops_test.model.deploy(
Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_password_rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
APP_NAME = METADATA["name"]


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@
)


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_plugins(ops_test: OpsTest) -> None:
Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
TLS_CONFIG = {"ca-common-name": "Test CA"}


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
Expand Down
Loading

0 comments on commit 6d57972

Please sign in to comment.