[DPE-4620][MISC] CI stability fixes + slicing tests (#524)
* restructure test

* refactor test for db_process

* split self healing test

* reposition get primary

* revert slicing

* retry connection and reduce check frequency

* add check frequency param

* fix on_stop patching + tls instability

* temporarily remove stop unit test

* Revert "fix on_stop patching + tls instability"

This reverts commit f9ab759.

* Revert "temporarily remove stop unit test"

This reverts commit 12040bd.

* keep TLS check + nits

* fix linting

* split self_healing and fix 409 error

* fix on_stop new logic

* more fixes

* try early exit on push_tls_files

* revert change on tls push

* reduce retry intervals

* revert changes to fix 409 error + increase timeout

* fine tune retries

* slice backups test

* fix slicing

* fix typo in get_primary

* fix typo

* set raise_on_error=False after deploys
lucasgameiroborges authored Jul 11, 2024
1 parent bf297a2 commit f6a4e13
Showing 13 changed files with 336 additions and 319 deletions.
13 changes: 8 additions & 5 deletions lib/charms/postgresql_k8s/v0/postgresql.py
@@ -27,6 +27,7 @@
 from ops.model import Relation
 from psycopg2 import sql
 from psycopg2.sql import Composed
+from tenacity import Retrying, stop_after_attempt, wait_fixed
 
 # The unique Charmhub library identifier, never change it
 LIBID = "24ee217a54e840a598ff21a079c3e678"
@@ -36,7 +37,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 30
+LIBPATCH = 31
 
 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"
 
@@ -128,10 +129,12 @@ def _connect_to_database(
             psycopg2 connection object.
         """
         host = database_host if database_host is not None else self.primary_host
-        connection = psycopg2.connect(
-            f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'"
-            f"password='{self.password}' connect_timeout=1"
-        )
+        for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3), reraise=True):
+            with attempt:
+                connection = psycopg2.connect(
+                    f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'"
+                    f"password='{self.password}' connect_timeout=1"
+                )
         connection.autocommit = True
         return connection

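For context, the library change above wraps the psycopg2 connection attempt in a tenacity retry loop: up to 10 attempts, 3 seconds apart, with reraise=True so the last underlying error propagates if every attempt fails. A minimal standalone sketch of the same pattern follows; the DSN values are placeholders, not the charm's real configuration.

import psycopg2
from tenacity import Retrying, stop_after_attempt, wait_fixed

# Placeholder connection string; the charm assembles its own from unit state.
DSN = "dbname='postgres' user='operator' host='127.0.0.1' password='secret' connect_timeout=1"

for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3), reraise=True):
    with attempt:
        # Any exception raised here (e.g. the server is still starting up) triggers
        # another attempt; after the tenth failure the original error is re-raised.
        connection = psycopg2.connect(DSN)
connection.autocommit = True
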
8 changes: 4 additions & 4 deletions src/patroni.py
@@ -264,7 +264,7 @@ def primary_endpoint_ready(self) -> bool:
         Return whether the primary endpoint is redirecting connections to the primary pod.
         """
         try:
-            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
                 with attempt:
                     r = requests.get(
                         f"{'https' if self._tls_enabled else 'http'}://{self._primary_endpoint}:8008/health",
@@ -281,7 +281,7 @@ def primary_endpoint_ready(self) -> bool:
     def member_replication_lag(self) -> str:
         """Member replication lag."""
         try:
-            for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
                 with attempt:
                     cluster_status = requests.get(
                         f"{self._patroni_url}/cluster",
@@ -306,7 +306,7 @@ def member_started(self) -> bool:
         allow server time to start up.
         """
         try:
-            for attempt in Retrying(stop=stop_after_delay(90), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
                 with attempt:
                     r = requests.get(f"{self._patroni_url}/health", verify=self._verify)
         except RetryError:
@@ -323,7 +323,7 @@ def member_streaming(self) -> bool:
         allow server time to start up.
         """
         try:
-            for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
                 with attempt:
                     r = requests.get(f"{self._patroni_url}/health", verify=self._verify)
         except RetryError:
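
The four patroni.py hunks above tighten the same polling pattern: each health check now retries once per second, and the longer 60-90 second windows are cut to about 10 seconds, so callers learn much sooner whether a member is up. A minimal sketch of that pattern against a plain Patroni REST endpoint; the URL, timeout, and readiness condition here are illustrative, not the charm's exact logic.

import requests
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed


def member_is_running(patroni_url: str = "http://10.1.1.10:8008") -> bool:
    """Poll the Patroni health endpoint for up to ~10 seconds, once per second."""
    try:
        for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)):
            with attempt:
                r = requests.get(f"{patroni_url}/health", timeout=5)
    except RetryError:
        # Every attempt failed (connection refused, timeout, ...).
        return False
    # Illustrative readiness condition; the charm inspects the payload differently per check.
    return r.json().get("state") == "running"
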
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/helpers.py
@@ -487,7 +487,7 @@ async def is_connection_possible(ops_test: OpsTest, unit_name: str) -> bool:
     """Test a connection to a PostgreSQL server."""
     try:
         app = unit_name.split("/")[0]
-        for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+        for attempt in Retrying(stop=stop_after_delay(120), wait=wait_fixed(3)):
             with attempt:
                 password = await asyncio.wait_for(
                     get_password(ops_test, database_app_name=app), 15
2 changes: 2 additions & 0 deletions tests/integration/ha_tests/test_async_replication.py
@@ -121,11 +121,13 @@ async def test_deploy_async_replication_setup(
             apps=[DATABASE_APP_NAME, APPLICATION_NAME],
             status="active",
             timeout=TIMEOUT,
+            raise_on_error=False,
         ),
         second_model.wait_for_idle(
             apps=[DATABASE_APP_NAME, APPLICATION_NAME],
             status="active",
             timeout=TIMEOUT,
+            raise_on_error=False,
         ),
     )

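raise_on_error=False is python-libjuju's wait_for_idle switch that keeps the wait from aborting as soon as a unit reports an error status; with it, the deploy-time waits in these tests ride out transient hook errors and keep waiting until the applications settle or the timeout expires. A minimal usage sketch, with stand-in application names rather than this repo's constants:

import pytest
from pytest_operator.plugin import OpsTest

DATABASE_APP_NAME = "postgresql-k8s"      # stand-in for the repo's constant
APPLICATION_NAME = "postgresql-test-app"  # stand-in for the repo's constant


@pytest.mark.abort_on_fail
async def test_wait_until_active(ops_test: OpsTest) -> None:
    async with ops_test.fast_forward():
        await ops_test.model.wait_for_idle(
            apps=[DATABASE_APP_NAME, APPLICATION_NAME],
            status="active",
            timeout=1000,
            raise_on_error=False,  # keep waiting through transient unit errors
        )
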
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_replication.py
@@ -49,7 +49,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
 
     if wait_for_apps:
         async with ops_test.fast_forward():
-            await ops_test.model.wait_for_idle(status="active", timeout=1000)
+            await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False)
 
 
 @pytest.mark.group(1)
61 changes: 17 additions & 44 deletions tests/integration/ha_tests/test_self_healing.py
@@ -75,14 +75,14 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
 
     if wait_for_apps:
         async with ops_test.fast_forward():
-            await ops_test.model.wait_for_idle(status="active", timeout=1000)
+            await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False)
 
 
 @pytest.mark.group(1)
-@markers.juju2
 @pytest.mark.parametrize("process", DB_PROCESSES)
-async def test_kill_db_process(
-    ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
+@pytest.mark.parametrize("signal", ["SIGTERM", pytest.param("SIGKILL", marks=markers.juju2)])
+async def test_interruption_db_process(
+    ops_test: OpsTest, process: str, signal: str, continuous_writes, primary_start_timeout
 ) -> None:
     # Locate primary unit.
     app = await app_name(ops_test)
@@ -91,23 +91,25 @@ async def test_kill_db_process(
     # Start an application that continuously writes data to the database.
     await start_continuous_writes(ops_test, app)
 
-    # Kill the database process.
-    await send_signal_to_process(ops_test, primary_name, process, "SIGKILL")
+    # Interrupt the database process.
+    await send_signal_to_process(ops_test, primary_name, process, signal)
 
     # Wait some time to elect a new primary.
-    sleep(MEDIAN_ELECTION_TIME * 2)
+    sleep(MEDIAN_ELECTION_TIME * 6)
 
     async with ops_test.fast_forward():
         await are_writes_increasing(ops_test, primary_name)
 
+        # Verify that a new primary gets elected (ie old primary is secondary).
+        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+            with attempt:
+                new_primary_name = await get_primary(ops_test, app, down_unit=primary_name)
+                assert new_primary_name != primary_name
+
         # Verify that the database service got restarted and is ready in the old primary.
         logger.info(f"waiting for the database service to restart on {primary_name}")
         assert await is_postgresql_ready(ops_test, primary_name)
 
-    # Verify that a new primary gets elected (ie old primary is secondary).
-    new_primary_name = await get_primary(ops_test, app, down_unit=primary_name)
-    assert new_primary_name != primary_name
-
     await is_cluster_updated(ops_test, primary_name)
 
 
@@ -154,38 +156,6 @@ async def test_freeze_db_process(
     await is_cluster_updated(ops_test, primary_name)
 
 
-@pytest.mark.group(1)
-@pytest.mark.parametrize("process", DB_PROCESSES)
-async def test_restart_db_process(
-    ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
-) -> None:
-    # Locate primary unit.
-    app = await app_name(ops_test)
-    primary_name = await get_primary(ops_test, app)
-
-    # Start an application that continuously writes data to the database.
-    await start_continuous_writes(ops_test, app)
-
-    # Restart the database process.
-    await send_signal_to_process(ops_test, primary_name, process, "SIGTERM")
-
-    # Wait some time to elect a new primary.
-    sleep(MEDIAN_ELECTION_TIME * 2)
-
-    async with ops_test.fast_forward():
-        await are_writes_increasing(ops_test, primary_name)
-
-        # Verify that the database service got restarted and is ready in the old primary.
-        logger.info(f"waiting for the database service to restart on {primary_name}")
-        assert await is_postgresql_ready(ops_test, primary_name)
-
-    # Verify that a new primary gets elected (ie old primary is secondary).
-    new_primary_name = await get_primary(ops_test, app, down_unit=primary_name)
-    assert new_primary_name != primary_name
-
-    await is_cluster_updated(ops_test, primary_name)
-
-
 @pytest.mark.group(1)
 @pytest.mark.unstable
 @pytest.mark.parametrize("process", DB_PROCESSES)
@@ -408,9 +378,12 @@ async def test_network_cut(
     await is_cluster_updated(ops_test, primary_name)
 
 
-@pytest.mark.group(1)
+@pytest.mark.group(2)
 async def test_scaling_to_zero(ops_test: OpsTest, continuous_writes) -> None:
     """Scale the database to zero units and scale up again."""
+    # Deploy applications
+    await test_build_and_deploy(ops_test)
+
     # Locate primary unit.
     app = await app_name(ops_test)
 
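The former test_kill_db_process and test_restart_db_process differed only in the signal they sent, so they are merged into test_interruption_db_process and the signal becomes a parameter; pytest.param lets the SIGKILL case alone carry the Juju 2 restriction. A minimal sketch of that parametrization pattern, with a generic marker standing in for the repo's markers.juju2:

import pytest

# Stand-in for markers.juju2; any pytest marker can be attached to a single parameter this way.
juju2_only = pytest.mark.skipif(False, reason="restricted to Juju 2.x in the real suite")


@pytest.mark.parametrize("signal", ["SIGTERM", pytest.param("SIGKILL", marks=juju2_only)])
def test_interruption(signal: str) -> None:
    # One body now covers both the graceful (SIGTERM) and abrupt (SIGKILL) cases;
    # only the SIGKILL parametrization carries the extra marker.
    assert signal in ("SIGTERM", "SIGKILL")

The test_scaling_to_zero change above is the other half of the slicing: it moves to its own pytest group and redeploys the applications itself, so it can run as a separate CI slice.
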
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_upgrade.py
@@ -57,7 +57,7 @@ async def test_deploy_latest(ops_test: OpsTest) -> None:
     logger.info("Wait for applications to become active")
     async with ops_test.fast_forward():
         await ops_test.model.wait_for_idle(
-            apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active"
+            apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", raise_on_error=False
         )
     assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3

2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_upgrade_from_stable.py
@@ -52,7 +52,7 @@ async def test_deploy_stable(ops_test: OpsTest) -> None:
     logger.info("Wait for applications to become active")
     async with ops_test.fast_forward():
         await ops_test.model.wait_for_idle(
-            apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active"
+            apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", raise_on_error=False
        )
     assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3

