From c9ad12a44dad721c1198d22122332c661b4318e1 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 1 Nov 2023 10:58:53 +0000 Subject: [PATCH 01/10] test unpermitted shard relations --- tests/integration/sharding_tests/conftest.py | 21 +++++ .../sharding_tests/test_sharding_relations.py | 84 +++++++++++++++++++ tox.ini | 15 ++++ 3 files changed, 120 insertions(+) create mode 100644 tests/integration/sharding_tests/conftest.py create mode 100644 tests/integration/sharding_tests/test_sharding_relations.py diff --git a/tests/integration/sharding_tests/conftest.py b/tests/integration/sharding_tests/conftest.py new file mode 100644 index 000000000..d91962292 --- /dev/null +++ b/tests/integration/sharding_tests/conftest.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +import pytest +from pytest_operator.plugin import OpsTest + + +@pytest.fixture(scope="module") +async def application_charm(ops_test: OpsTest): + """Build the application charm.""" + charm_path = "tests/integration/relation_tests/new_relations/application-charm" + charm = await ops_test.build_charm(charm_path) + return charm + + +@pytest.fixture(scope="module") +async def database_charm(ops_test: OpsTest): + """Build the database charm.""" + charm = await ops_test.build_charm(".") + return charm diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py new file mode 100644 index 000000000..6215fb34d --- /dev/null +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. +import pytest +from pytest_operator.plugin import OpsTest +from juju.errors import JujuAPIError + +SHARD_ONE_APP_NAME = "shard-one" +CONFIG_SERVER_ONE_APP_NAME = "config-server-one" +CONFIG_SERVER_TWO_APP_NAME = "config-server-two" +APP_CHARM_NAME = "application" + +CONFIG_SERVER_REL_NAME = "config-server" +SHARD_REL_NAME = "sharding" +DATABASE_REL_NAME = "first-database" + +# for now we have a large timeout due to the slow drainage of the `config.system.sessions` +# collection. More info here: +# https://stackoverflow.com/questions/77364840/mongodb-slow-chunk-migration-for-collection-config-system-sessions-with-remov +TIMEOUT = 30 * 60 + + +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest, application_charm, database_charm) -> None: + """Build and deploy a sharded cluster.""" + await ops_test.model.deploy( + database_charm, + config={"role": "config-server"}, + application_name=CONFIG_SERVER_ONE_APP_NAME, + ) + await ops_test.model.deploy( + database_charm, + config={"role": "config-server"}, + application_name=CONFIG_SERVER_TWO_APP_NAME, + ) + await ops_test.model.deploy( + database_charm, num_units=1, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME + ) + await ops_test.model.deploy(application_charm, application_name=APP_CHARM_NAME) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_ONE_APP_NAME, + CONFIG_SERVER_TWO_APP_NAME, + SHARD_ONE_APP_NAME, + ], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + + +async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: + """Verify that a shard can only be related to one config server.""" + await ops_test.model.integrate( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_ONE_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + multiple_config_server_rels_allowed = True + try: + await ops_test.model.integrate( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_TWO_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + except JujuAPIError as e: + if e.error_code == "quota limit exceeded": + multiple_config_server_rels_allowed = False + + assert not multiple_config_server_rels_allowed, "Shard can relate to multiple config servers." + + +async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: + """Verify that a shard cannot use the DB relation.""" + await ops_test.model.integrate( + f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", + SHARD_ONE_APP_NAME, + ) + + shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] + assert ( + shard_unit.workload_status_message == "Sharding roles do not support database interface." + ), "Shard cannot be related using the database relation" diff --git a/tox.ini b/tox.ini index 49f16d8bf..edaae3a4a 100644 --- a/tox.ini +++ b/tox.ini @@ -192,6 +192,21 @@ deps = commands = pytest -v --tb native --log-cli-level=INFO -s --durations=0 {posargs} {[vars]tests_path}/integration/sharding_tests/test_sharding.py +[testenv:sharding-relation-integration] +description = Run sharding integration tests +pass_env = + {[testenv]pass_env} + CI +deps = + pytest + juju==3.2.0.1 + pytest-mock + pytest-operator + git+https://github.com/canonical/data-platform-workflows@v5.1.2\#subdirectory=python/pytest_plugins/pytest_operator_cache + -r {tox_root}/requirements.txt +commands = + pytest -v --tb native --log-cli-level=INFO -s --durations=0 {posargs} {[vars]tests_path}/integration/sharding_tests/test_sharding_relations.py + [testenv:integration] description = Run all integration tests From 8d00835951244ce220a7137315862a2212a4096c Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 1 Nov 2023 17:28:15 +0000 Subject: [PATCH 02/10] add legacy tests --- .../dummy_legacy_app/charmcraft.yaml | 11 +++ .../dummy_legacy_app/metadata.yaml | 14 ++++ .../dummy_legacy_app/requirements.txt | 1 + .../integration/dummy_legacy_app/src/charm.py | 38 +++++++++ tests/integration/sharding_tests/conftest.py | 12 +++ .../sharding_tests/test_sharding_relations.py | 81 ++++++++++++++++--- 6 files changed, 144 insertions(+), 13 deletions(-) create mode 100644 tests/integration/dummy_legacy_app/charmcraft.yaml create mode 100644 tests/integration/dummy_legacy_app/metadata.yaml create mode 100644 tests/integration/dummy_legacy_app/requirements.txt create mode 100755 tests/integration/dummy_legacy_app/src/charm.py diff --git a/tests/integration/dummy_legacy_app/charmcraft.yaml b/tests/integration/dummy_legacy_app/charmcraft.yaml new file mode 100644 index 000000000..d12b960e6 --- /dev/null +++ b/tests/integration/dummy_legacy_app/charmcraft.yaml @@ -0,0 +1,11 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +type: charm +bases: + - build-on: + - name: "ubuntu" + channel: "20.04" + run-on: + - name: "ubuntu" + channel: "20.04" diff --git a/tests/integration/dummy_legacy_app/metadata.yaml b/tests/integration/dummy_legacy_app/metadata.yaml new file mode 100644 index 000000000..0a5ca7db6 --- /dev/null +++ b/tests/integration/dummy_legacy_app/metadata.yaml @@ -0,0 +1,14 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. +name: legacy-application +description: | + Dummy legacy application charm used in integration tests. +summary: | + Dummy legacy application charm meant only to be used in non-legacy integration tests. Legacy + integration tests should use a production legacy application. +series: + - focal + +requires: + obsolete: + interface: mongodb diff --git a/tests/integration/dummy_legacy_app/requirements.txt b/tests/integration/dummy_legacy_app/requirements.txt new file mode 100644 index 000000000..96faf889a --- /dev/null +++ b/tests/integration/dummy_legacy_app/requirements.txt @@ -0,0 +1 @@ +ops >= 1.4.0 diff --git a/tests/integration/dummy_legacy_app/src/charm.py b/tests/integration/dummy_legacy_app/src/charm.py new file mode 100755 index 000000000..2c3e5dca8 --- /dev/null +++ b/tests/integration/dummy_legacy_app/src/charm.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Application charm that connects to database charms. + +This charm is meant to be used only for testing +of the libraries in this repository. +""" + +import logging + +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus + +logger = logging.getLogger(__name__) + +# Extra roles that this application needs when interacting with the database. +EXTRA_USER_ROLES = "admin" + + +class ApplicationCharm(CharmBase): + """Application charm that connects to database charms.""" + + def __init__(self, *args): + super().__init__(*args) + + # Default charm events. + self.framework.observe(self.on.start, self._on_start) + + def _on_start(self, _) -> None: + """Only sets an Active status.""" + self.unit.status = ActiveStatus() + + +if __name__ == "__main__": + main(ApplicationCharm) diff --git a/tests/integration/sharding_tests/conftest.py b/tests/integration/sharding_tests/conftest.py index d91962292..8bef5a710 100644 --- a/tests/integration/sharding_tests/conftest.py +++ b/tests/integration/sharding_tests/conftest.py @@ -6,6 +6,18 @@ from pytest_operator.plugin import OpsTest +@pytest.fixture(scope="module") +async def legacy_charm(ops_test: OpsTest): + """Build an application charm that uses the legacy interface. + + Note: this should only be used outside of the legacy integration tests, as those tests should + test a real legacy application. + """ + charm_path = "tests/integration/dummy_legacy_app" + charm = await ops_test.build_charm(charm_path) + return charm + + @pytest.fixture(scope="module") async def application_charm(ops_test: OpsTest): """Build the application charm.""" diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index 6215fb34d..7b7660adf 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -2,17 +2,19 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. import pytest -from pytest_operator.plugin import OpsTest from juju.errors import JujuAPIError +from pytest_operator.plugin import OpsTest SHARD_ONE_APP_NAME = "shard-one" CONFIG_SERVER_ONE_APP_NAME = "config-server-one" CONFIG_SERVER_TWO_APP_NAME = "config-server-two" APP_CHARM_NAME = "application" +LEGACY_APP_CHARM_NAME = "legacy-application" CONFIG_SERVER_REL_NAME = "config-server" SHARD_REL_NAME = "sharding" DATABASE_REL_NAME = "first-database" +LEGACY_RELATION_NAME = "obsolete" # for now we have a large timeout due to the slow drainage of the `config.system.sessions` # collection. More info here: @@ -21,7 +23,9 @@ @pytest.mark.abort_on_fail -async def test_build_and_deploy(ops_test: OpsTest, application_charm, database_charm) -> None: +async def test_build_and_deploy( + ops_test: OpsTest, application_charm, legacy_charm, database_charm +) -> None: """Build and deploy a sharded cluster.""" await ops_test.model.deploy( database_charm, @@ -37,18 +41,18 @@ async def test_build_and_deploy(ops_test: OpsTest, application_charm, database_c database_charm, num_units=1, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME ) await ops_test.model.deploy(application_charm, application_name=APP_CHARM_NAME) + await ops_test.model.deploy(legacy_charm, application_name=LEGACY_APP_CHARM_NAME) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[ - CONFIG_SERVER_ONE_APP_NAME, - CONFIG_SERVER_TWO_APP_NAME, - SHARD_ONE_APP_NAME, - ], - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_ONE_APP_NAME, + CONFIG_SERVER_TWO_APP_NAME, + SHARD_ONE_APP_NAME, + ], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: @@ -78,7 +82,58 @@ async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: SHARD_ONE_APP_NAME, ) + await ops_test.model.wait_for_idle( + apps=[SHARD_ONE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] assert ( shard_unit.workload_status_message == "Sharding roles do not support database interface." ), "Shard cannot be related using the database relation" + + await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( + f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", + SHARD_ONE_APP_NAME, + ) + + await ops_test.model.wait_for_idle( + apps=[SHARD_ONE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + + +async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: + """Verify that a shard cannot use the DB relation.""" + await ops_test.model.integrate( + LEGACY_APP_CHARM_NAME, + SHARD_ONE_APP_NAME, + ) + + await ops_test.model.wait_for_idle( + apps=[SHARD_ONE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + + shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] + assert ( + shard_unit.workload_status_message == "Sharding roles do not support obsolete interface." + ), "Shard cannot be related using the mongodb relation" + + await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( + f"{SHARD_ONE_APP_NAME}:{LEGACY_RELATION_NAME}", + f"{LEGACY_APP_CHARM_NAME}:{LEGACY_RELATION_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[SHARD_ONE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) From e46b31718b91ccc61ff54780c4a193cce3fba99f Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 1 Nov 2023 17:34:48 +0000 Subject: [PATCH 03/10] update ci --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c0bdab19..e42a302a1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -82,6 +82,7 @@ jobs: - backup-integration - metric-integration - sharding-integration + - sharding-relation-integration name: ${{ matrix.tox-environments }} needs: - lint From 03c0c918f3a7ec2b8c60aea5ac50741a80d5eadf Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 2 Nov 2023 09:11:29 +0000 Subject: [PATCH 04/10] test config-server relations --- .../sharding_tests/test_sharding_relations.py | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index 7b7660adf..5b2971ed6 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -11,6 +11,8 @@ APP_CHARM_NAME = "application" LEGACY_APP_CHARM_NAME = "legacy-application" +SHARDING_COMPONENTS = [SHARD_ONE_APP_NAME, CONFIG_SERVER_ONE_APP_NAME] + CONFIG_SERVER_REL_NAME = "config-server" SHARD_REL_NAME = "sharding" DATABASE_REL_NAME = "first-database" @@ -76,31 +78,32 @@ async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: - """Verify that a shard cannot use the DB relation.""" - await ops_test.model.integrate( - f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", - SHARD_ONE_APP_NAME, - ) + """Verify that a shard components cannot use the DB relation.""" + for sharded_component in SHARDING_COMPONENTS: + await ops_test.model.integrate(f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", sharded_component) await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], + apps=SHARDING_COMPONENTS, idle_period=20, raise_on_blocked=False, timeout=TIMEOUT, ) - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message == "Sharding roles do not support database interface." - ), "Shard cannot be related using the database relation" - - await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( - f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", - SHARD_ONE_APP_NAME, - ) + for sharded_component in SHARDING_COMPONENTS: + sharded_component_unit = ops_test.model.applications[sharded_component].units[0] + assert ( + sharded_component_unit.workload_status_message + == "Sharding roles do not support database interface." + ), f"{sharded_component} cannot be related using the database relation" + + for sharded_component in SHARDING_COMPONENTS: + await ops_test.model.applications[sharded_component].remove_relation( + f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", + sharded_component, + ) await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], + apps=SHARDING_COMPONENTS, idle_period=20, raise_on_blocked=False, timeout=TIMEOUT, @@ -108,31 +111,32 @@ async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: - """Verify that a shard cannot use the DB relation.""" - await ops_test.model.integrate( - LEGACY_APP_CHARM_NAME, - SHARD_ONE_APP_NAME, - ) + """Verify that sharding components cannot use the DB relation.""" + for sharded_component in SHARDING_COMPONENTS: + await ops_test.model.integrate(LEGACY_APP_CHARM_NAME, sharded_component) await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], + apps=SHARDING_COMPONENTS, idle_period=20, raise_on_blocked=False, timeout=TIMEOUT, ) - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message == "Sharding roles do not support obsolete interface." - ), "Shard cannot be related using the mongodb relation" - - await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( - f"{SHARD_ONE_APP_NAME}:{LEGACY_RELATION_NAME}", - f"{LEGACY_APP_CHARM_NAME}:{LEGACY_RELATION_NAME}", - ) + for sharded_component in SHARDING_COMPONENTS: + sharded_component_unit = ops_test.model.applications[sharded_component].units[0] + assert ( + sharded_component_unit.workload_status_message + == "Sharding roles do not support obsolete interface." + ), f"{sharded_component} cannot be related using the mongodb relation" + + for sharded_component in SHARDING_COMPONENTS: + await ops_test.model.applications[sharded_component].remove_relation( + f"{sharded_component}:{LEGACY_RELATION_NAME}", + f"{LEGACY_APP_CHARM_NAME}:{LEGACY_RELATION_NAME}", + ) await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], + apps=SHARDING_COMPONENTS, idle_period=20, raise_on_blocked=False, timeout=TIMEOUT, From 78b3096ecf9e2b5a2dde4e0c0c8c052311d28c76 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 2 Nov 2023 17:25:10 +0000 Subject: [PATCH 05/10] intellegent status reporting for replica misuse --- lib/charms/mongodb/v1/mongos.py | 2 +- lib/charms/mongodb/v1/shards_interface.py | 86 +++++++++++++++---- src/charm.py | 8 +- .../sharding_tests/test_sharding_relations.py | 2 +- 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/lib/charms/mongodb/v1/mongos.py b/lib/charms/mongodb/v1/mongos.py index a8d5455d6..6e7d8f383 100644 --- a/lib/charms/mongodb/v1/mongos.py +++ b/lib/charms/mongodb/v1/mongos.py @@ -204,7 +204,7 @@ def pre_remove_checks(self, shard_name): with attempt: balancer_state = self.client.admin.command("balancerStatus") if balancer_state["mode"] == "off": - raise BalancerNotEnabledError + raise BalancerNotEnabledError("balancer is not enabled.") def remove_shard(self, shard_name: str) -> None: """Removes shard from the cluster. diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index fe19bfa2b..7dcf4670e 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -32,7 +32,6 @@ from ops.model import ( ActiveStatus, BlockedStatus, - ErrorStatus, MaintenanceStatus, StatusBase, WaitingStatus, @@ -57,6 +56,14 @@ HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) FORBIDDEN_REMOVAL_ERR_CODE = 20 +AUTH_FAILED_CODE = 18 + + +class ShardAuthError(Exception): + """Raised when a shard doesn't have the same auth as the config server.""" + + def __init__(self, shard: str): + self.shard = shard class RemoveLastShardError(Exception): @@ -201,11 +208,11 @@ def _on_relation_event(self, event): logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() - except BalancerNotEnabledError: - logger.error("Deferring on _relation_broken_event, balancer is not enabled.") + except ShardAuthError as e: + self.charm.unit.status = WaitingStatus(f"Waiting for {e.shard} to sync credentials.") event.defer() return - except (PyMongoError, NotReadyError) as e: + except (PyMongoError, NotReadyError, BalancerNotEnabledError) as e: logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() return @@ -215,6 +222,7 @@ def add_shards(self, departed_shard_id): raises: PyMongoError """ + failed_to_add_shard = None with MongosConnection(self.charm.mongos_config) as mongo: cluster_shards = mongo.get_shard_members() relation_shards = self._get_shards_from_relations(departed_shard_id) @@ -231,10 +239,24 @@ def add_shards(self, departed_shard_id): logger.info("Adding shard: %s ", shard) mongo.add_shard(shard, shard_hosts) except PyMongoError as e: + # raise exception after trying to add the remaining shards, as to not prevent + # adding other shards logger.error("Failed to add shard %s to the config server, error=%r", shard, e) - raise + failed_to_add_shard = (e, shard) + + if not failed_to_add_shard: + self.charm.unit.status = ActiveStatus("") + return + + (error, shard) = failed_to_add_shard + + # Sometimes it can take up to 20 minutes for the shard to be restarted with the same auth + # as the config server. + if error.code == AUTH_FAILED_CODE: + logger.error(f"{shard} shard does not have the same auth as the config server.") + raise ShardAuthError(shard) - self.charm.unit.status = ActiveStatus("") + logger.error(f"Failed to add {shard} to cluster") def remove_shards(self, departed_shard_id): """Removes shards from cluster. @@ -280,13 +302,14 @@ def update_mongos_hosts(self): def get_config_server_status(self) -> Optional[StatusBase]: """Returns the current status of the config-server.""" - if not self.charm.is_role(Config.Role.CONFIG_SERVER): - logger.info("skipping status check, charm is not running as a shard") + if self.skip_config_server_status(): return None - if not self.charm.db_initialised: - logger.info("No status for shard to report, waiting for db to be initialised.") - return None + if ( + self.charm.is_role(Config.Role.REPLICATION) + and self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] + ): + return BlockedStatus("Sharding interface cannot be used by replicas") if self.model.relations[LEGACY_REL_NAME]: return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.") @@ -295,7 +318,7 @@ def get_config_server_status(self) -> Optional[StatusBase]: return BlockedStatus(f"relation {REL_NAME} to shard not supported.") if not self.is_mongos_running(): - return ErrorStatus("Internal mongos is not running.") + return BlockedStatus("Internal mongos is not running.") shard_draining = self.get_draining_shards() if shard_draining: @@ -308,10 +331,22 @@ def get_config_server_status(self) -> Optional[StatusBase]: unreachable_shards = self.get_unreachable_shards() if unreachable_shards: unreachable_shards = ", ".join(unreachable_shards) - return ErrorStatus(f"Shards {unreachable_shards} are unreachable.") + return BlockedStatus(f"Shards {unreachable_shards} are unreachable.") return ActiveStatus() + def skip_config_server_status(self) -> bool: + """Returns true if the status check should be skipped.""" + if self.charm.is_role(Config.Role.SHARD): + logger.info("skipping config server status check, charm is running as a shard") + return True + + if not self.charm.db_initialised: + logger.info("No status for shard to report, waiting for db to be initialised.") + return True + + return False + def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -565,13 +600,14 @@ def get_shard_status(self) -> Optional[StatusBase]: Note: No need to report if currently draining, since that check block other hooks from executing. """ - if not self.charm.is_role(Config.Role.SHARD): - logger.info("skipping status check, charm is not running as a shard") + if self.skip_shard_status(): return None - if not self.charm.db_initialised: - logger.info("No status for shard to report, waiting for db to be initialised.") - return None + if ( + self.charm.is_role(Config.Role.REPLICATION) + and self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] + ): + return BlockedStatus("Sharding interface cannot be used by replicas") if self.model.get_relation(LEGACY_REL_NAME): return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.") @@ -586,7 +622,7 @@ def get_shard_status(self) -> Optional[StatusBase]: return ActiveStatus("Shard drained from cluster, ready for removal") if not self._is_mongos_reachable(): - return ErrorStatus("Config server unreachable") + return BlockedStatus("Config server unreachable") if not self._is_added_to_cluster(): return MaintenanceStatus("Adding shard to config-server") @@ -596,6 +632,18 @@ def get_shard_status(self) -> Optional[StatusBase]: return ActiveStatus() + def skip_shard_status(self) -> bool: + """Returns true if the status check should be skipped.""" + if self.charm.is_role(Config.Role.CONFIG_SERVER): + logger.info("skipping status check, charm is running as config-server") + return True + + if not self.charm.db_initialised: + logger.info("No status for shard to report, waiting for db to be initialised.") + return True + + return False + def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: """Returns whether a shard has been drained from the cluster. diff --git a/src/charm.py b/src/charm.py index efe94ecf9..f34131a53 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1357,12 +1357,8 @@ def get_status(self) -> StatusBase: """ # retrieve statuses of different services running on Charmed MongoDB mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) - shard_status = self.shard.get_shard_status() if self.is_role(Config.Role.SHARD) else None - config_server_status = ( - self.config_server.get_config_server_status() - if self.is_role(Config.Role.CONFIG_SERVER) - else None - ) + shard_status = self.shard.get_shard_status() + config_server_status = self.config_server.get_config_server_status() pbm_status = ( self.backups.get_pbm_status() if self.model.get_relation(S3_RELATION) else None ) diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index 6215fb34d..6a0d5c0bb 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -2,8 +2,8 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. import pytest -from pytest_operator.plugin import OpsTest from juju.errors import JujuAPIError +from pytest_operator.plugin import OpsTest SHARD_ONE_APP_NAME = "shard-one" CONFIG_SERVER_ONE_APP_NAME = "config-server-one" From 3c6df32a1226e254cf0718e67c7acd97fc4e824f Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 3 Nov 2023 09:18:50 +0000 Subject: [PATCH 06/10] small grammar fix --- tests/integration/sharding_tests/test_sharding_relations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index 3ae14ecf5..f41be2688 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -77,7 +77,7 @@ async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: - """Verify that a shard components cannot use the DB relation.""" + """Verify that sharding components cannot use the DB relation.""" for sharded_component in SHARDING_COMPONENTS: await ops_test.model.integrate(f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", sharded_component) From 9936f50cf5e9c305257f76715e94fd660c9912eb Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 3 Nov 2023 13:13:31 +0000 Subject: [PATCH 07/10] add tests + fixes to tests --- lib/charms/mongodb/v1/shards_interface.py | 23 +++++- src/charm.py | 2 +- .../sharding_tests/test_sharding_relations.py | 82 ++++++++++++++++++- 3 files changed, 100 insertions(+), 7 deletions(-) diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index 7dcf4670e..9d5c39a74 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -309,7 +309,7 @@ def get_config_server_status(self) -> Optional[StatusBase]: self.charm.is_role(Config.Role.REPLICATION) and self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] ): - return BlockedStatus("Sharding interface cannot be used by replicas") + return BlockedStatus("sharding interface cannot be used by replicas") if self.model.relations[LEGACY_REL_NAME]: return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.") @@ -331,7 +331,7 @@ def get_config_server_status(self) -> Optional[StatusBase]: unreachable_shards = self.get_unreachable_shards() if unreachable_shards: unreachable_shards = ", ".join(unreachable_shards) - return BlockedStatus(f"Shards {unreachable_shards} are unreachable.") + return BlockedStatus(f"shards {unreachable_shards} are unreachable.") return ActiveStatus() @@ -345,6 +345,12 @@ def skip_config_server_status(self) -> bool: logger.info("No status for shard to report, waiting for db to be initialised.") return True + if ( + self.charm.is_role(Config.Role.REPLICATION) + and not self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] + ): + return True + return False def _update_relation_data(self, relation_id: int, data: dict) -> None: @@ -527,6 +533,11 @@ def pass_hook_checks(self, event): event.defer() return False + mongos_hosts = event.relation.data[event.relation.app].get(HOSTS_KEY, None) + if isinstance(event, RelationBrokenEvent) and not mongos_hosts: + logger.info("Config-server relation never set up, no need to process broken event.") + return False + return True def _on_relation_broken(self, event: RelationBrokenEvent) -> None: @@ -607,7 +618,7 @@ def get_shard_status(self) -> Optional[StatusBase]: self.charm.is_role(Config.Role.REPLICATION) and self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] ): - return BlockedStatus("Sharding interface cannot be used by replicas") + return BlockedStatus("sharding interface cannot be used by replicas") if self.model.get_relation(LEGACY_REL_NAME): return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.") @@ -642,6 +653,12 @@ def skip_shard_status(self) -> bool: logger.info("No status for shard to report, waiting for db to be initialised.") return True + if ( + self.charm.is_role(Config.Role.REPLICATION) + and not self.model.relations[Config.Relations.CONFIG_SERVER_RELATIONS_NAME] + ): + return True + return False def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: diff --git a/src/charm.py b/src/charm.py index f34131a53..75e605b0f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1396,7 +1396,7 @@ def is_relation_feasible(self, rel_interface) -> bool: not self.is_sharding_component() and rel_interface == Config.Relations.SHARDING_RELATIONS_NAME ): - self.unit.status = BlockedStatus("role replication does not support sharding") + self.unit.status = BlockedStatus("sharding interface cannot be used by replicas") logger.error( "Charm is in sharding role: %s. Does not support %s interface.", self.role, diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index f41be2688..eb1d6dbd9 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -5,9 +5,10 @@ from juju.errors import JujuAPIError from pytest_operator.plugin import OpsTest -SHARD_ONE_APP_NAME = "shard-one" +SHARD_ONE_APP_NAME = "shard" CONFIG_SERVER_ONE_APP_NAME = "config-server-one" CONFIG_SERVER_TWO_APP_NAME = "config-server-two" +REPLICATION_APP_NAME = "replication" APP_CHARM_NAME = "application" LEGACY_APP_CHARM_NAME = "legacy-application" @@ -18,7 +19,7 @@ DATABASE_REL_NAME = "first-database" LEGACY_RELATION_NAME = "obsolete" -RELATION_LIMIT_MESSAGE = 'cannot add relation "shard-one:sharding config-server-two:config-server": establishing a new relation for shard-one:sharding would exceed its maximum relation limit of 1' +RELATION_LIMIT_MESSAGE = 'cannot add relation "shard:sharding config-server-two:config-server": establishing a new relation for shard:sharding would exceed its maximum relation limit of 1' # for now we have a large timeout due to the slow drainage of the `config.system.sessions` # collection. More info here: # https://stackoverflow.com/questions/77364840/mongodb-slow-chunk-migration-for-collection-config-system-sessions-with-remov @@ -41,8 +42,9 @@ async def test_build_and_deploy( application_name=CONFIG_SERVER_TWO_APP_NAME, ) await ops_test.model.deploy( - database_charm, num_units=1, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME + database_charm, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME ) + await ops_test.model.deploy(database_charm, application_name=REPLICATION_APP_NAME) await ops_test.model.deploy(application_charm, application_name=APP_CHARM_NAME) await ops_test.model.deploy(legacy_charm, application_name=LEGACY_APP_CHARM_NAME) @@ -75,6 +77,19 @@ async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: juju_error.value.args[0] == RELATION_LIMIT_MESSAGE ), "Shard can relate to multiple config servers." + # clean up relation + await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_ONE_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[REPLICATION_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: """Verify that sharding components cannot use the DB relation.""" @@ -142,3 +157,64 @@ async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: raise_on_blocked=False, timeout=TIMEOUT, ) + + +async def test_replication_config_server_relation(ops_test: OpsTest): + """Verifies that using a replica as a shard fails.""" + # attempt to add a replication deployment as a shard to the config server. + await ops_test.model.integrate( + f"{REPLICATION_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_ONE_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[REPLICATION_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + + replication_unit = ops_test.model.applications[REPLICATION_APP_NAME].units[0] + assert ( + replication_unit.workload_status_message == "sharding interface cannot be used by replicas" + ), "replication cannot be related to config server." + + # clean up relations + await ops_test.model.applications[REPLICATION_APP_NAME].remove_relation( + f"{REPLICATION_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_ONE_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + +async def test_replication_shard_relation(ops_test: OpsTest): + """Verifies that using a replica as a config-server fails.""" + # attempt to add a shard to a replication deployment as a config server. + await ops_test.model.integrate( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{REPLICATION_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[REPLICATION_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) + + replication_unit = ops_test.model.applications[REPLICATION_APP_NAME].units[0] + assert ( + replication_unit.workload_status_message == "sharding interface cannot be used by replicas" + ), "replication cannot be related to config server." + + # clean up relation + await ops_test.model.applications[REPLICATION_APP_NAME].remove_relation( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{REPLICATION_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[REPLICATION_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + ) From ad56eabdb50f32a7200105c63f262bd40db557db Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 3 Nov 2023 13:33:13 +0000 Subject: [PATCH 08/10] small fixes --- lib/charms/mongodb/v1/mongos.py | 2 +- lib/charms/mongodb/v1/shards_interface.py | 2 +- tests/integration/sharding_tests/test_sharding_relations.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/charms/mongodb/v1/mongos.py b/lib/charms/mongodb/v1/mongos.py index 6e7d8f383..ee2c11678 100644 --- a/lib/charms/mongodb/v1/mongos.py +++ b/lib/charms/mongodb/v1/mongos.py @@ -21,7 +21,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 4 # path to store mongodb ketFile logger = logging.getLogger(__name__) diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index 9d5c39a74..7972f5b48 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -51,7 +51,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 4 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index eb1d6dbd9..bcfa3d2a8 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -126,7 +126,7 @@ async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: - """Verify that sharding components cannot use the DB relation.""" + """Verify that sharding components cannot use the legacy DB relation.""" for sharded_component in SHARDING_COMPONENTS: await ops_test.model.integrate(LEGACY_APP_CHARM_NAME, sharded_component) From fe86e58d66ec334995e92ea089c620a63e793256 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 3 Nov 2023 16:27:20 +0000 Subject: [PATCH 09/10] small fixees --- lib/charms/mongodb/v1/shards_interface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index 7972f5b48..f12373097 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -257,6 +257,7 @@ def add_shards(self, departed_shard_id): raise ShardAuthError(shard) logger.error(f"Failed to add {shard} to cluster") + raise error def remove_shards(self, departed_shard_id): """Removes shards from cluster. From 1c330f50c4f5b28bf70d303c7c370b3f196332ea Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 21 Nov 2023 23:04:32 +0000 Subject: [PATCH 10/10] fmt +lint --- src/charm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index 3dc1ce382..885d8c076 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1359,10 +1359,7 @@ def get_status(self) -> StatusBase: mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) shard_status = self.shard.get_shard_status() config_server_status = self.config_server.get_config_server_status() - pbm_status = ( - self.backups.get_pbm_status() - ) - + pbm_status = self.backups.get_pbm_status() # failure in mongodb takes precedence over sharding and config server if not isinstance(mongodb_status, ActiveStatus):