From f6a4e13a446544994674bb5f12334e0a3735512a Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Date: Thu, 11 Jul 2024 10:54:27 -0300 Subject: [PATCH] [DPE-4620][MISC] CI stability fixes + slicing tests (#524) * restructure test * refactor test for db_process * split self healing test * reposition get primary * revert slicing * retry connection and reduce check frequency * add check frequency param * fix on_stop patching + tls instability * temporarily remove stop unit test * Revert "fix on_stop patching + tls instability" This reverts commit f9ab75970d5ec95ee33eb1616fc69d2068ce4b96. * Revert "temporarily remove stop unit test" This reverts commit 12040bd81459411e8afb76cc9aed12a41b204878. * keep TLS check + nits * fix linting * split self_healing and fix 409 error * fix on_stop new logic * more fixes * try early exit on push_tls_files * revert change on tls push * reduce retry intervals * revery changes to fix 409 error + increase timeout * fine tune retries * slice backups test * fix slicing * fix typo in get_primary * fix typo * set raise_on_error=False after deploys --- lib/charms/postgresql_k8s/v0/postgresql.py | 13 +- src/patroni.py | 8 +- tests/integration/ha_tests/helpers.py | 2 +- .../ha_tests/test_async_replication.py | 2 + .../integration/ha_tests/test_replication.py | 2 +- .../integration/ha_tests/test_self_healing.py | 61 +--- tests/integration/ha_tests/test_upgrade.py | 2 +- .../ha_tests/test_upgrade_from_stable.py | 2 +- tests/integration/helpers.py | 214 ++++++++++++ .../new_relations/test_new_relations.py | 2 +- tests/integration/test_backups.py | 326 +++++------------- tests/integration/test_password_rotation.py | 16 +- tests/integration/test_tls.py | 5 + 13 files changed, 336 insertions(+), 319 deletions(-) diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 071643841f..3990a3913b 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -27,6 +27,7 @@ from ops.model import Relation from psycopg2 import sql from psycopg2.sql import Composed +from tenacity import Retrying, stop_after_attempt, wait_fixed # The unique Charmhub library identifier, never change it LIBID = "24ee217a54e840a598ff21a079c3e678" @@ -36,7 +37,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 30 +LIBPATCH = 31 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -128,10 +129,12 @@ def _connect_to_database( psycopg2 connection object. """ host = database_host if database_host is not None else self.primary_host - connection = psycopg2.connect( - f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'" - f"password='{self.password}' connect_timeout=1" - ) + for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3), reraise=True): + with attempt: + connection = psycopg2.connect( + f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'" + f"password='{self.password}' connect_timeout=1" + ) connection.autocommit = True return connection diff --git a/src/patroni.py b/src/patroni.py index a522ff778e..93e1fd1948 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -264,7 +264,7 @@ def primary_endpoint_ready(self) -> bool: Return whether the primary endpoint is redirecting connections to the primary pod. """ try: - for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)): with attempt: r = requests.get( f"{'https' if self._tls_enabled else 'http'}://{self._primary_endpoint}:8008/health", @@ -281,7 +281,7 @@ def primary_endpoint_ready(self) -> bool: def member_replication_lag(self) -> str: """Member replication lag.""" try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)): with attempt: cluster_status = requests.get( f"{self._patroni_url}/cluster", @@ -306,7 +306,7 @@ def member_started(self) -> bool: allow server time to start up. """ try: - for attempt in Retrying(stop=stop_after_delay(90), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)): with attempt: r = requests.get(f"{self._patroni_url}/health", verify=self._verify) except RetryError: @@ -323,7 +323,7 @@ def member_streaming(self) -> bool: allow server time to start up. """ try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(1)): with attempt: r = requests.get(f"{self._patroni_url}/health", verify=self._verify) except RetryError: diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 768d2cbf05..790ddd3667 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -487,7 +487,7 @@ async def is_connection_possible(ops_test: OpsTest, unit_name: str) -> bool: """Test a connection to a PostgreSQL server.""" try: app = unit_name.split("/")[0] - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(120), wait=wait_fixed(3)): with attempt: password = await asyncio.wait_for( get_password(ops_test, database_app_name=app), 15 diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py index 965c4ddf1a..451f6417c5 100644 --- a/tests/integration/ha_tests/test_async_replication.py +++ b/tests/integration/ha_tests/test_async_replication.py @@ -121,11 +121,13 @@ async def test_deploy_async_replication_setup( apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=TIMEOUT, + raise_on_error=False, ), second_model.wait_for_idle( apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", timeout=TIMEOUT, + raise_on_error=False, ), ) diff --git a/tests/integration/ha_tests/test_replication.py b/tests/integration/ha_tests/test_replication.py index 05fd305d3d..c4a58297db 100644 --- a/tests/integration/ha_tests/test_replication.py +++ b/tests/integration/ha_tests/test_replication.py @@ -49,7 +49,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: if wait_for_apps: async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False) @pytest.mark.group(1) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 9feb8e1887..f181f8d7a6 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -75,14 +75,14 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: if wait_for_apps: async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False) @pytest.mark.group(1) -@markers.juju2 @pytest.mark.parametrize("process", DB_PROCESSES) -async def test_kill_db_process( - ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout +@pytest.mark.parametrize("signal", ["SIGTERM", pytest.param("SIGKILL", marks=markers.juju2)]) +async def test_interruption_db_process( + ops_test: OpsTest, process: str, signal: str, continuous_writes, primary_start_timeout ) -> None: # Locate primary unit. app = await app_name(ops_test) @@ -91,23 +91,25 @@ async def test_kill_db_process( # Start an application that continuously writes data to the database. await start_continuous_writes(ops_test, app) - # Kill the database process. - await send_signal_to_process(ops_test, primary_name, process, "SIGKILL") + # Interrupt the database process. + await send_signal_to_process(ops_test, primary_name, process, signal) # Wait some time to elect a new primary. - sleep(MEDIAN_ELECTION_TIME * 2) + sleep(MEDIAN_ELECTION_TIME * 6) async with ops_test.fast_forward(): await are_writes_increasing(ops_test, primary_name) + # Verify that a new primary gets elected (ie old primary is secondary). + for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + with attempt: + new_primary_name = await get_primary(ops_test, app, down_unit=primary_name) + assert new_primary_name != primary_name + # Verify that the database service got restarted and is ready in the old primary. logger.info(f"waiting for the database service to restart on {primary_name}") assert await is_postgresql_ready(ops_test, primary_name) - # Verify that a new primary gets elected (ie old primary is secondary). - new_primary_name = await get_primary(ops_test, app, down_unit=primary_name) - assert new_primary_name != primary_name - await is_cluster_updated(ops_test, primary_name) @@ -154,38 +156,6 @@ async def test_freeze_db_process( await is_cluster_updated(ops_test, primary_name) -@pytest.mark.group(1) -@pytest.mark.parametrize("process", DB_PROCESSES) -async def test_restart_db_process( - ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout -) -> None: - # Locate primary unit. - app = await app_name(ops_test) - primary_name = await get_primary(ops_test, app) - - # Start an application that continuously writes data to the database. - await start_continuous_writes(ops_test, app) - - # Restart the database process. - await send_signal_to_process(ops_test, primary_name, process, "SIGTERM") - - # Wait some time to elect a new primary. - sleep(MEDIAN_ELECTION_TIME * 2) - - async with ops_test.fast_forward(): - await are_writes_increasing(ops_test, primary_name) - - # Verify that the database service got restarted and is ready in the old primary. - logger.info(f"waiting for the database service to restart on {primary_name}") - assert await is_postgresql_ready(ops_test, primary_name) - - # Verify that a new primary gets elected (ie old primary is secondary). - new_primary_name = await get_primary(ops_test, app, down_unit=primary_name) - assert new_primary_name != primary_name - - await is_cluster_updated(ops_test, primary_name) - - @pytest.mark.group(1) @pytest.mark.unstable @pytest.mark.parametrize("process", DB_PROCESSES) @@ -408,9 +378,12 @@ async def test_network_cut( await is_cluster_updated(ops_test, primary_name) -@pytest.mark.group(1) +@pytest.mark.group(2) async def test_scaling_to_zero(ops_test: OpsTest, continuous_writes) -> None: """Scale the database to zero units and scale up again.""" + # Deploy applications + await test_build_and_deploy(ops_test) + # Locate primary unit. app = await app_name(ops_test) diff --git a/tests/integration/ha_tests/test_upgrade.py b/tests/integration/ha_tests/test_upgrade.py index 58e3123c7a..ccd6cccfd5 100644 --- a/tests/integration/ha_tests/test_upgrade.py +++ b/tests/integration/ha_tests/test_upgrade.py @@ -57,7 +57,7 @@ async def test_deploy_latest(ops_test: OpsTest) -> None: logger.info("Wait for applications to become active") async with ops_test.fast_forward(): await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active" + apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", raise_on_error=False ) assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 diff --git a/tests/integration/ha_tests/test_upgrade_from_stable.py b/tests/integration/ha_tests/test_upgrade_from_stable.py index 06f2dd1b3d..e47453a187 100644 --- a/tests/integration/ha_tests/test_upgrade_from_stable.py +++ b/tests/integration/ha_tests/test_upgrade_from_stable.py @@ -52,7 +52,7 @@ async def test_deploy_stable(ops_test: OpsTest) -> None: logger.info("Wait for applications to become active") async with ops_test.fast_forward(): await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active" + apps=[DATABASE_APP_NAME, APPLICATION_NAME], status="active", raise_on_error=False ) assert len(ops_test.model.applications[DATABASE_APP_NAME].units) == 3 diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3abde75daf..3b1a61b72e 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -3,6 +3,7 @@ # See LICENSE file for licensing details. import asyncio import itertools +import logging from datetime import datetime from multiprocessing import ProcessError from pathlib import Path @@ -39,6 +40,8 @@ charm = None +logger = logging.getLogger(__name__) + async def app_name( ops_test: OpsTest, application_name: str = "postgresql-k8s", model: Model = None @@ -100,6 +103,7 @@ async def build_and_deploy( apps=[database_app_name], status=status, raise_on_blocked=True, + raise_on_error=False, timeout=1000, wait_for_exact_units=num_units, ) @@ -304,6 +308,7 @@ async def deploy_and_relate_application_with_postgresql( apps=[application_name], status="active", raise_on_blocked=False, # Application that needs a relation is blocked initially. + raise_on_error=False, timeout=1000, ) @@ -799,3 +804,212 @@ async def cat_file_from_unit(ops_test: OpsTest, filepath: str, unit_name: str) - "Expected cat command %s to succeed instead it failed: %s", cat_cmd, return_code ) return output + + +async def backup_operations( + ops_test: OpsTest, + s3_integrator_app_name: str, + tls_certificates_app_name: str, + tls_config, + tls_channel, + credentials, + cloud, + config, +) -> None: + """Basic set of operations for backup testing in different cloud providers.""" + # Deploy S3 Integrator and TLS Certificates Operator. + await ops_test.model.deploy(s3_integrator_app_name) + await ops_test.model.deploy(tls_certificates_app_name, config=tls_config, channel=tls_channel) + # Deploy and relate PostgreSQL to S3 integrator (one database app for each cloud for now + # as archivo_mode is disabled after restoring the backup) and to TLS Certificates Operator + # (to be able to create backups from replicas). + database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}" + await build_and_deploy(ops_test, 2, database_app_name=database_app_name, wait_for_idle=False) + + await ops_test.model.relate(database_app_name, tls_certificates_app_name) + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[database_app_name], status="active", timeout=1000, raise_on_error=False + ) + await ops_test.model.relate(database_app_name, s3_integrator_app_name) + + # Configure and set access and secret keys. + logger.info(f"configuring S3 integrator for {cloud}") + await ops_test.model.applications[s3_integrator_app_name].set_config(config) + action = await ops_test.model.units.get(f"{s3_integrator_app_name}/0").run_action( + "sync-s3-credentials", + **credentials, + ) + await action.wait() + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[database_app_name, s3_integrator_app_name], status="active", timeout=1000 + ) + + primary = await get_primary(ops_test, database_app_name) + for unit in ops_test.model.applications[database_app_name].units: + if unit.name != primary: + replica = unit.name + break + + # Write some data. + password = await get_password(ops_test, database_app_name=database_app_name) + address = await get_unit_address(ops_test, primary) + logger.info("creating a table in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute( + "CREATE TABLE IF NOT EXISTS backup_table_1 (test_collumn INT );" + ) + connection.close() + + # With a stable cluster, Run the "create backup" action + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) + logger.info("creating a backup") + action = await ops_test.model.units.get(replica).run_action("create-backup") + await action.wait() + backup_status = action.results.get("backup-status") + assert backup_status, "backup hasn't succeeded" + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Run the "list backups" action. + logger.info("listing the available backups") + action = await ops_test.model.units.get(replica).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + # 5 lines for header output, 1 backup line ==> 6 total lines + assert len(backups.split("\n")) == 6, "full backup is not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Write some data. + logger.info("creating a second table in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("CREATE TABLE backup_table_2 (test_collumn INT );") + connection.close() + + # Run the "create backup" action. + logger.info("creating a backup") + action = await ops_test.model.units.get(replica).run_action( + "create-backup", **{"type": "differential"} + ) + await action.wait() + backup_status = action.results.get("backup-status") + assert backup_status, "backup hasn't succeeded" + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Run the "list backups" action. + logger.info("listing the available backups") + action = await ops_test.model.units.get(replica).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + # 5 lines for header output, 2 backup lines ==> 7 total lines + assert len(backups.split("\n")) == 7, "differential backup is not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Write some data. + logger.info("creating a second table in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("CREATE TABLE backup_table_3 (test_collumn INT );") + connection.close() + # Scale down to be able to restore. + async with ops_test.fast_forward(fast_interval="60s"): + await scale_application(ops_test, database_app_name, 1) + + # Run the "restore backup" action for differential backup. + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + last_diff_backup = backups.split("\n")[-1] + backup_id = last_diff_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" + + # Wait for the restore to complete. + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Check that the backup was correctly restored by having only the first created table. + logger.info("checking that the backup was correctly restored") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_2' doesn't exist" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_3' exists" + connection.close() + + # Run the "restore backup" action for full backup. + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + last_full_backup = backups.split("\n")[-2] + backup_id = last_full_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" + + # Wait for the restore to complete. + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Check that the backup was correctly restored by having only the first created table. + logger.info("checking that the backup was correctly restored") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + with db_connect(host=address, password=password) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_2' exists" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_3' exists" + connection.close() diff --git a/tests/integration/new_relations/test_new_relations.py b/tests/integration/new_relations/test_new_relations.py index 0681d9b8bf..829dfaad42 100644 --- a/tests/integration/new_relations/test_new_relations.py +++ b/tests/integration/new_relations/test_new_relations.py @@ -91,7 +91,7 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME ) await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True + apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000 ) # Check that on juju 3 we have secrets and no username and password in the rel databag diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 9c4141021c..dd8c355fa6 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -15,6 +15,7 @@ from . import architecture from .helpers import ( DATABASE_APP_NAME, + backup_operations, build_and_deploy, cat_file_from_unit, construct_endpoint, @@ -102,138 +103,39 @@ async def cloud_configs(ops_test: OpsTest, github_secrets) -> None: @pytest.mark.group(1) @pytest.mark.abort_on_fail -async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> None: - """Build and deploy two units of PostgreSQL and then test the backup and restore actions.""" - # Deploy S3 Integrator and TLS Certificates Operator. - await ops_test.model.deploy(S3_INTEGRATOR_APP_NAME) - await ops_test.model.deploy(tls_certificates_app_name, config=tls_config, channel=tls_channel) - - for cloud, config in cloud_configs[0].items(): - # Deploy and relate PostgreSQL to S3 integrator (one database app for each cloud for now - # as archivo_mode is disabled after restoring the backup) and to TLS Certificates Operator - # (to be able to create backups from replicas). - database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}" - await build_and_deploy( - ops_test, 2, database_app_name=database_app_name, wait_for_idle=False - ) - - await ops_test.model.relate(database_app_name, tls_certificates_app_name) - async with ops_test.fast_forward(fast_interval="60s"): - await ops_test.model.wait_for_idle( - apps=[database_app_name], status="active", timeout=1000 - ) - await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) - - # Configure and set access and secret keys. - logger.info(f"configuring S3 integrator for {cloud}") - await ops_test.model.applications[S3_INTEGRATOR_APP_NAME].set_config(config) - action = await ops_test.model.units.get(f"{S3_INTEGRATOR_APP_NAME}/0").run_action( - "sync-s3-credentials", - **cloud_configs[1][cloud], - ) - await action.wait() - async with ops_test.fast_forward(fast_interval="60s"): - await ops_test.model.wait_for_idle( - apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 - ) - - primary = await get_primary(ops_test, database_app_name) - for unit in ops_test.model.applications[database_app_name].units: - if unit.name != primary: - replica = unit.name - break +async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> None: + """Build and deploy two units of PostgreSQL in AWS and then test the backup and restore actions.""" + config = cloud_configs[0][AWS] + credentials = cloud_configs[1][AWS] - # Write some data. - password = await get_password(ops_test, database_app_name=database_app_name) - address = await get_unit_address(ops_test, primary) - logger.info("creating a table in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute( - "CREATE TABLE IF NOT EXISTS backup_table_1 (test_collumn INT );" - ) - connection.close() - - # With a stable cluster, Run the "create backup" action - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30) - logger.info("creating a backup") - action = await ops_test.model.units.get(replica).run_action("create-backup") - await action.wait() - backup_status = action.results.get("backup-status") - assert backup_status, "backup hasn't succeeded" - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Run the "list backups" action. - logger.info("listing the available backups") - action = await ops_test.model.units.get(replica).run_action("list-backups") - await action.wait() - backups = action.results.get("backups") - # 5 lines for header output, 1 backup line ==> 6 total lines - assert len(backups.split("\n")) == 6, "full backup is not outputted" - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Write some data. - logger.info("creating a second table in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("CREATE TABLE backup_table_2 (test_collumn INT );") - connection.close() + await backup_operations( + ops_test, + S3_INTEGRATOR_APP_NAME, + tls_certificates_app_name, + tls_config, + tls_channel, + credentials, + AWS, + config, + ) + database_app_name = f"{DATABASE_APP_NAME}-aws" - # Run the "create backup" action. - logger.info("creating a backup") - action = await ops_test.model.units.get(replica).run_action( - "create-backup", **{"type": "differential"} + async with ops_test.fast_forward(): + logger.info("removing the TLS relation") + await ops_test.model.applications[database_app_name].remove_relation( + f"{database_app_name}:certificates", + f"{tls_certificates_app_name}:certificates", ) - await action.wait() - backup_status = action.results.get("backup-status") - assert backup_status, "backup hasn't succeeded" - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Run the "list backups" action. - logger.info("listing the available backups") - action = await ops_test.model.units.get(replica).run_action("list-backups") - await action.wait() - backups = action.results.get("backups") - # 5 lines for header output, 2 backup lines ==> 7 total lines - assert len(backups.split("\n")) == 7, "differential backup is not outputted" - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.wait_for_idle(apps=[database_app_name], status="active", timeout=1000) - # Write some data. - logger.info("creating a second table in the database") - with db_connect(host=address, password=password) as connection: - connection.autocommit = True - connection.cursor().execute("CREATE TABLE backup_table_3 (test_collumn INT );") - connection.close() - # Scale down to be able to restore. + # Scale up to be able to test primary and leader being different. async with ops_test.fast_forward(fast_interval="60s"): - await scale_application(ops_test, database_app_name, 1) + await scale_application(ops_test, database_app_name, 2) - # Run the "restore backup" action for differential backup. - for attempt in Retrying( - stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) - ): - with attempt: - logger.info("restoring the backup") - last_diff_backup = backups.split("\n")[-1] - backup_id = last_diff_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" - - # Wait for the restore to complete. - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Check that the backup was correctly restored by having only the first created table. - logger.info("checking that the backup was correctly restored") - primary = await get_primary(ops_test, database_app_name) - address = await get_unit_address(ops_test, primary) + logger.info("ensuring that the replication is working correctly") + new_unit_name = f"{database_app_name}/1" + address = await get_unit_address(ops_test, new_unit_name) + password = await get_password(ops_test, database_app_name=database_app_name) with db_connect( host=address, password=password ) as connection, connection.cursor() as cursor: @@ -243,138 +145,68 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, ) assert cursor.fetchone()[ 0 - ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" + ], f"replication isn't working correctly: table 'backup_table_1' doesn't exist in {new_unit_name}" cursor.execute( "SELECT EXISTS (SELECT FROM information_schema.tables" " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" ) - assert cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_2' doesn't exist" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" - ) assert not cursor.fetchone()[ 0 - ], "backup wasn't correctly restored: table 'backup_table_3' exists" + ], f"replication isn't working correctly: table 'backup_table_2' exists in {new_unit_name}" connection.close() - # Run the "restore backup" action for full backup. + old_primary = await get_primary(ops_test, database_app_name) + logger.info(f"performing a switchover from {old_primary} to {new_unit_name}") + await switchover(ops_test, old_primary, new_unit_name) + + logger.info("checking that the primary unit has changed") + primary = await get_primary(ops_test, database_app_name) for attempt in Retrying( - stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=30) ): with attempt: - logger.info("restoring the backup") - last_full_backup = backups.split("\n")[-2] - backup_id = last_full_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" - - # Wait for the restore to complete. - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Check that the backup was correctly restored by having only the first created table. - logger.info("checking that the backup was correctly restored") - primary = await get_primary(ops_test, database_app_name) - address = await get_unit_address(ops_test, primary) - with db_connect( - host=address, password=password - ) as connection, connection.cursor() as cursor: - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" - ) - assert cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" - ) - assert not cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_2' exists" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" - ) - assert not cursor.fetchone()[ - 0 - ], "backup wasn't correctly restored: table 'backup_table_3' exists" - connection.close() + assert primary == new_unit_name + + # Ensure stanza is working correctly. + logger.info("listing the available backups to ensure that the stanza is working correctly") + action = await ops_test.model.units.get(new_unit_name).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert backups, "backups not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) - # Run the following steps only in one cloud (it's enough for those checks). - if cloud == list(cloud_configs[0].keys())[0]: - async with ops_test.fast_forward(): - logger.info("removing the TLS relation") - await ops_test.model.applications[database_app_name].remove_relation( - f"{database_app_name}:certificates", - f"{tls_certificates_app_name}:certificates", - ) - await ops_test.model.wait_for_idle( - apps=[database_app_name], status="active", timeout=1000 - ) - - # Scale up to be able to test primary and leader being different. - async with ops_test.fast_forward(fast_interval="60s"): - await scale_application(ops_test, database_app_name, 2) - - logger.info("ensuring that the replication is working correctly") - new_unit_name = f"{database_app_name}/1" - address = await get_unit_address(ops_test, new_unit_name) - with db_connect( - host=address, password=password - ) as connection, connection.cursor() as cursor: - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" - ) - assert cursor.fetchone()[ - 0 - ], f"replication isn't working correctly: table 'backup_table_1' doesn't exist in {new_unit_name}" - cursor.execute( - "SELECT EXISTS (SELECT FROM information_schema.tables" - " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" - ) - assert not cursor.fetchone()[ - 0 - ], f"replication isn't working correctly: table 'backup_table_2' exists in {new_unit_name}" - connection.close() - - logger.info(f"performing a switchover from {primary} to {new_unit_name}") - await switchover(ops_test, primary, new_unit_name) - - logger.info("checking that the primary unit has changed") - primary = await get_primary(ops_test, database_app_name) - for attempt in Retrying( - stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=30) - ): - with attempt: - assert primary == new_unit_name - - # Ensure stanza is working correctly. - logger.info( - "listing the available backups to ensure that the stanza is working correctly" - ) - action = await ops_test.model.units.get(new_unit_name).run_action("list-backups") - await action.wait() - backups = action.results.get("backups") - assert backups, "backups not outputted" - await ops_test.model.wait_for_idle(status="active", timeout=1000) - - # Remove the database app. - await ops_test.model.remove_application(database_app_name, block_until_done=True) + # Remove the database app. + await ops_test.model.remove_application(database_app_name, block_until_done=True) # Remove the TLS operator. await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True) -@pytest.mark.group(1) +@pytest.mark.group(2) +@pytest.mark.abort_on_fail +async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> None: + """Build and deploy two units of PostgreSQL in GCP and then test the backup and restore actions.""" + config = cloud_configs[0][GCP] + credentials = cloud_configs[1][GCP] + + await backup_operations( + ops_test, + S3_INTEGRATOR_APP_NAME, + tls_certificates_app_name, + tls_config, + tls_channel, + credentials, + GCP, + config, + ) + database_app_name = f"{DATABASE_APP_NAME}-gcp" + + # Remove the database app. + await ops_test.model.remove_application(database_app_name, block_until_done=True) + # Remove the TLS operator. + await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True) + + +@pytest.mark.group(2) async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None: """Test that is possible to restore a backup to another PostgreSQL cluster.""" previous_database_app_name = f"{DATABASE_APP_NAME}-gcp" @@ -465,7 +297,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None connection.close() -@pytest.mark.group(1) +@pytest.mark.group(2) async def test_invalid_config_and_recovery_after_fixing_it( ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict] ) -> None: @@ -501,10 +333,10 @@ async def test_invalid_config_and_recovery_after_fixing_it( logger.info( "configuring S3 integrator for a valid cloud, but with the path of another cluster repository" ) - await ops_test.model.applications[S3_INTEGRATOR_APP_NAME].set_config(cloud_configs[0][AWS]) + await ops_test.model.applications[S3_INTEGRATOR_APP_NAME].set_config(cloud_configs[0][GCP]) action = await ops_test.model.units.get(f"{S3_INTEGRATOR_APP_NAME}/0").run_action( "sync-s3-credentials", - **cloud_configs[1][AWS], + **cloud_configs[1][GCP], ) await action.wait() await wait_for_idle_on_blocked( @@ -517,7 +349,7 @@ async def test_invalid_config_and_recovery_after_fixing_it( # Provide valid backup configurations, with another path in the S3 bucket. logger.info("configuring S3 integrator for a valid cloud") - config = cloud_configs[0][AWS].copy() + config = cloud_configs[0][GCP].copy() config["path"] = f"/postgresql-k8s/{uuid.uuid1()}" await ops_test.model.applications[S3_INTEGRATOR_APP_NAME].set_config(config) logger.info("waiting for the database charm to become active") @@ -526,7 +358,7 @@ async def test_invalid_config_and_recovery_after_fixing_it( ) -@pytest.mark.group(1) +@pytest.mark.group(2) async def test_delete_pod(ops_test: OpsTest, github_secrets) -> None: logger.info("Getting original backup config") database_app_name = f"new-{DATABASE_APP_NAME}" diff --git a/tests/integration/test_password_rotation.py b/tests/integration/test_password_rotation.py index 9e84174b95..e99e00da8c 100644 --- a/tests/integration/test_password_rotation.py +++ b/tests/integration/test_password_rotation.py @@ -10,8 +10,8 @@ from . import markers from .helpers import ( - CHARM_SERIES, METADATA, + build_and_deploy, check_patroni, db_connect, get_leader_unit, @@ -31,20 +31,8 @@ @pytest.mark.skip_if_deployed async def test_deploy_active(ops_test: OpsTest): """Build the charm and deploy it.""" - charm = await ops_test.build_charm(".") async with ops_test.fast_forward(): - await ops_test.model.deploy( - charm, - resources={ - "postgresql-image": METADATA["resources"]["postgresql-image"]["upstream-source"] - }, - application_name=APP_NAME, - num_units=3, - series=CHARM_SERIES, - trust=True, - config={"profile": "testing"}, - ) - await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) + await build_and_deploy(ops_test, 3, database_app_name=APP_NAME) @pytest.mark.group(1) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 6cc73cc147..be512a9c32 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -166,6 +166,11 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: with attempt: await check_tls_rewind(ops_test) + # Await for postgresql to be stable if not already + await ops_test.model.wait_for_idle( + apps=[DATABASE_APP_NAME], status="active", idle_period=15 + ) + # Deploy and check Mattermost user and database existence. relation_id = await deploy_and_relate_application_with_postgresql( ops_test, "mattermost-k8s", MATTERMOST_APP_NAME, APPLICATION_UNITS, status="waiting"