From ac24eed18b8c32621f11e5de8073d4c9f0c62d2f Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 13 Sep 2023 09:09:24 +0000 Subject: [PATCH 01/42] charm can start as shard or config server --- config.yaml | 6 ++++++ lib/charms/mongodb/v0/helpers.py | 7 +++++++ src/charm.py | 6 +++++- src/machine_helpers.py | 10 ++++++---- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index 0e820d039..fb9659b65 100644 --- a/config.yaml +++ b/config.yaml @@ -8,3 +8,9 @@ options: When a relation is removed, auto-delete ensures that any relevant databases associated with the relation are also removed default: false + role: + description: | + role config option exists to deploy the charmed-mongodb application as a shard, + config-server, or as a replica set. + type: string + default: replication diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index ba33c09d9..58b42258c 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -84,6 +84,7 @@ def get_mongod_args( config: MongoDBConfiguration, auth: bool = True, snap_install: bool = False, + role: str = "replication", ) -> str: """Construct the MongoDB startup command line. @@ -137,6 +138,12 @@ def get_mongod_args( ] ) + if role == "config-server": + cmd.append("--configsvr") + + if role == "shard": + cmd.append("--shardsvr") + cmd.append("\n") return " ".join(cmd) diff --git a/src/charm.py b/src/charm.py index a6c1effcc..e9dc9c742 100755 --- a/src/charm.py +++ b/src/charm.py @@ -253,7 +253,10 @@ def _on_install(self, event: InstallEvent) -> None: # Construct the mongod startup commandline args for systemd and reload the daemon. update_mongod_service( - auth=auth, machine_ip=self._unit_ip(self.unit), config=self.mongodb_config + auth=auth, + machine_ip=self._unit_ip(self.unit), + config=self.mongodb_config, + role=self.model.config["role"], ) # add licenses @@ -947,6 +950,7 @@ def restart_mongod_service(self, auth=None): auth, self._unit_ip(self.unit), config=self.mongodb_config, + role=self.model.config["role"], ) mongodb_snap.start(services=["mongod"]) except snap.SnapError as e: diff --git a/src/machine_helpers.py b/src/machine_helpers.py index eab2050e8..ebb9b2519 100644 --- a/src/machine_helpers.py +++ b/src/machine_helpers.py @@ -18,24 +18,26 @@ MONGO_USER = "snap_daemon" -def update_mongod_service(auth: bool, machine_ip: str, config: MongoDBConfiguration) -> None: +def update_mongod_service( + auth: bool, machine_ip: str, config: MongoDBConfiguration, role: str = "replication" +) -> None: """Updates the mongod service file with the new options for starting.""" with open(Config.ENV_VAR_PATH, "r") as env_var_file: env_vars = env_var_file.readlines() # write our arguments and write them to /etc/environment - the environment variable here is # read in in the charmed-mongob.mongod.service file. 
- mongod_start_args = get_mongod_args(config, auth, snap_install=True) + mongo_start_args = get_mongod_args(config, auth, snap_install=True) args_added = False for index, line in enumerate(env_vars): if "MONGOD_ARGS" in line: args_added = True - env_vars[index] = f"MONGOD_ARGS={mongod_start_args}" + env_vars[index] = f"MONGOD_ARGS={mongo_start_args}" # if it is the first time adding these args to the file - will will need to append them to the # file if not args_added: - env_vars.append(f"MONGOD_ARGS={mongod_start_args}") + env_vars.append(f"MONGOD_ARGS={mongo_start_args}") with open(Config.ENV_VAR_PATH, "w") as service_file: service_file.writelines(env_vars) From f8ff8a1bb71fc74495a6d6df7660b431129c606e Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 14 Sep 2023 16:27:40 +0000 Subject: [PATCH 02/42] mongos, shard, and config server all start without error --- lib/charms/mongodb/v0/helpers.py | 25 ++++++++++++ src/charm.py | 67 +++++++++++++++++++++++++------- src/config.py | 8 ++-- src/machine_helpers.py | 23 +++++++---- 4 files changed, 98 insertions(+), 25 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 58b42258c..789aa1791 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -80,6 +80,31 @@ def get_create_user_cmd( ] +def get_mongos_args(config: MongoDBConfiguration) -> str: + """Returns the arguments used for starting mongos on a config-server side application. + + Returns: + A string representing the arguments to be passed to mongos. + """ + # mongos running on the config server communicates through localhost + config_server_uri = f"{config.replset}/localhost" + + # no need to add TLS since no network calls are used, since mongos is configured to listen + # on local host + cmd = [ + # mongos on config server side only runs on local host + "--bind_ip localhost", + # todo figure out this one + f"--configdb {config_server_uri}", + # config server is already using 27017 + "--port 27018", + # todo followup PR add keyfile and auth + "\n", + ] + + return " ".join(cmd) + + def get_mongod_args( config: MongoDBConfiguration, auth: bool = True, diff --git a/src/charm.py b/src/charm.py index e9dc9c742..ba275c067 100755 --- a/src/charm.py +++ b/src/charm.py @@ -75,7 +75,6 @@ class MongodbOperatorCharm(CharmBase): def __init__(self, *args): super().__init__(*args) self._port = Config.MONGODB_PORT - self.framework.observe(self.on.install, self._on_install) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -213,6 +212,15 @@ def db_initialised(self) -> bool: """Check if MongoDB is initialised.""" return "db_initialised" in self.app_peer_data + @property + def role(self) -> str: + """Returns role of MongoDB deployment.""" + return self.model.config["role"] + + def is_role(self, role_name: str) -> bool: + """Checks if application is running in provided role.""" + return self.role == role_name + @db_initialised.setter def db_initialised(self, value): """Set the db_initialised flag.""" @@ -256,7 +264,7 @@ def _on_install(self, event: InstallEvent) -> None: auth=auth, machine_ip=self._unit_ip(self.unit), config=self.mongodb_config, - role=self.model.config["role"], + role=self.role, ) # add licenses @@ -275,9 +283,7 @@ def _on_start(self, event: StartEvent) -> None: try: logger.debug("starting MongoDB.") self.unit.status = MaintenanceStatus("starting MongoDB") - snap_cache = snap.SnapCache() - mongodb_snap = snap_cache["charmed-mongodb"] 
- mongodb_snap.start(services=["mongod"]) + self.start_mongod_service() self.unit.status = ActiveStatus() except snap.SnapError as e: logger.error("An exception occurred when starting mongod agent, error: %s.", str(e)) @@ -291,7 +297,7 @@ def _on_start(self, event: StartEvent) -> None: return # check if this unit's deployment of MongoDB is ready - with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: logger.debug("mongodb service is not ready yet.") self.unit.status = WaitingStatus("waiting for MongoDB to start") @@ -464,7 +470,7 @@ def _on_update_status(self, event: UpdateStatusEvent): return # Cannot check more advanced MongoDB statuses if mongod hasn't started. - with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: self.unit.status = WaitingStatus("Waiting for MongoDB to start") return @@ -871,19 +877,24 @@ def _initialise_replica_set(self, event: StartEvent) -> None: # can be corrupted. return - with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: try: logger.info("Replica Set initialization") direct_mongo.init_replset() self._peers.data[self.app]["replica_set_hosts"] = json.dumps( [self._unit_ip(self.unit)] ) + logger.info("User initialization") self._init_operator_user() self._init_backup_user() self._init_monitor_user() - logger.info("Manage relations") - self.client_relations.oversee_users(None, None) + + # in sharding, user management is handled by mongos subordinate charm + if self.is_role(Config.REPLICATION): + logger.info("Manage user") + self.client_relations.oversee_users(None, None) + except subprocess.CalledProcessError as e: logger.error( "Deferring on_start: exit code: %i, stderr: %s", e.exit_code, e.stderr @@ -937,22 +948,48 @@ def set_secret(self, scope: str, key: str, value: Optional[str]) -> None: else: raise RuntimeError("Unknown secret scope.") + def start_mongod_service(self): + """Starts the mongod service and if necessary starts mongos. + + Raises: + snap.SnapError + """ + snap_cache = snap.SnapCache() + mongodb_snap = snap_cache["charmed-mongodb"] + mongodb_snap.start(services=["mongod"]) + + # charms running as config server are responsible for maintaing a server side mongos + if self.is_role(Config.CONFIG_SERVER): + mongodb_snap.start(services=["mongos"]) + + def stop_mongod_service(self): + """Stops the mongod service and if necessary stops mongos. 
+ + Raises: + snap.SnapError + """ + snap_cache = snap.SnapCache() + mongodb_snap = snap_cache["charmed-mongodb"] + mongodb_snap.stop(services=["mongod"]) + + # charms running as config server are responsible for maintaing a server side mongos + if self.is_role(Config.CONFIG_SERVER): + mongodb_snap.stop(services=["mongos"]) + def restart_mongod_service(self, auth=None): """Restarts the mongod service with its associated configuration.""" if auth is None: auth = self.auth_enabled() try: - snap_cache = snap.SnapCache() - mongodb_snap = snap_cache["charmed-mongodb"] - mongodb_snap.stop(services=["mongod"]) + self.stop_mongod_service() update_mongod_service( auth, self._unit_ip(self.unit), config=self.mongodb_config, - role=self.model.config["role"], + role=self.role, ) - mongodb_snap.start(services=["mongod"]) + self.start_mongod_service() except snap.SnapError as e: logger.error("An exception occurred when starting mongod agent, error: %s.", str(e)) self.unit.status = BlockedStatus("couldn't start MongoDB") diff --git a/src/config.py b/src/config.py index 8d8279f87..7c13a5c02 100644 --- a/src/config.py +++ b/src/config.py @@ -6,14 +6,16 @@ class Config: """Configuration for MongoDB Charm.""" - SUBSTRATE = "vm" - # We expect the MongoDB container to use the default ports MONGODB_PORT = 27017 + SUBSTRATE = "vm" ENV_VAR_PATH = "/etc/environment" MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current" MONGOD_CONF_DIR = f"{MONGODB_SNAP_DATA_DIR}/etc/mongod" MONGOD_CONF_FILE_PATH = f"{MONGOD_CONF_DIR}/mongod.conf" - SNAP_PACKAGES = [("charmed-mongodb", "5/edge", 82)] + SNAP_PACKAGES = [("charmed-mongodb", "5/edge/mongos", 83)] + CONFIG_SERVER = "config-server" + REPLICATION = "replication" + SHARD = "shard" class Actions: """Actions related config for MongoDB Charm.""" diff --git a/src/machine_helpers.py b/src/machine_helpers.py index ebb9b2519..ddd3f031e 100644 --- a/src/machine_helpers.py +++ b/src/machine_helpers.py @@ -6,7 +6,7 @@ import pwd from pathlib import Path -from charms.mongodb.v0.helpers import get_mongod_args +from charms.mongodb.v0.helpers import get_mongod_args, get_mongos_args from charms.mongodb.v0.mongodb import MongoDBConfiguration from config import Config @@ -22,22 +22,31 @@ def update_mongod_service( auth: bool, machine_ip: str, config: MongoDBConfiguration, role: str = "replication" ) -> None: """Updates the mongod service file with the new options for starting.""" + # write our arguments and write them to /etc/environment - the environment variable here is + # read in in the charmed-mongob.mongod.service file. + mongod_start_args = get_mongod_args(config, auth, snap_install=True) + add_args_to_env("MONGOD_ARGS", mongod_start_args) + + if role == "config-server": + mongos_start_args = get_mongos_args(config) + add_args_to_env("MONGOS_ARGS", mongos_start_args) + + +def add_args_to_env(var: str, args: str): + """Adds the provided arguments to the environment as the provided variable.""" with open(Config.ENV_VAR_PATH, "r") as env_var_file: env_vars = env_var_file.readlines() - # write our arguments and write them to /etc/environment - the environment variable here is - # read in in the charmed-mongob.mongod.service file. 
- mongo_start_args = get_mongod_args(config, auth, snap_install=True) args_added = False for index, line in enumerate(env_vars): - if "MONGOD_ARGS" in line: + if var in line: args_added = True - env_vars[index] = f"MONGOD_ARGS={mongo_start_args}" + env_vars[index] = f"{var}={args}" # if it is the first time adding these args to the file - will will need to append them to the # file if not args_added: - env_vars.append(f"MONGOD_ARGS={mongo_start_args}") + env_vars.append(f"{var}={args}") with open(Config.ENV_VAR_PATH, "w") as service_file: service_file.writelines(env_vars) From 9b64f319b4be0d2ece66889d22422a7afbb36d13 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 15 Sep 2023 08:09:37 +0000 Subject: [PATCH 03/42] use correct snap --- src/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.py b/src/config.py index 7c13a5c02..85a9fe3fa 100644 --- a/src/config.py +++ b/src/config.py @@ -12,7 +12,7 @@ class Config: MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current" MONGOD_CONF_DIR = f"{MONGODB_SNAP_DATA_DIR}/etc/mongod" MONGOD_CONF_FILE_PATH = f"{MONGOD_CONF_DIR}/mongod.conf" - SNAP_PACKAGES = [("charmed-mongodb", "5/edge/mongos", 83)] + SNAP_PACKAGES = [("charmed-mongodb", "5/edge", 84)] CONFIG_SERVER = "config-server" REPLICATION = "replication" SHARD = "shard" From 11f3c6515a20cba37c71be8d2c4e89015e4cbae3 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 15 Sep 2023 08:53:38 +0000 Subject: [PATCH 04/42] fmt + lint --- src/charm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/charm.py b/src/charm.py index ff83fbdc3..d86a0c4ac 100755 --- a/src/charm.py +++ b/src/charm.py @@ -322,7 +322,7 @@ def _on_start(self, event: StartEvent) -> None: return # check if this unit's deployment of MongoDB is ready - with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: logger.debug("mongodb service is not ready yet.") self.unit.status = WaitingStatus("waiting for MongoDB to start") @@ -495,7 +495,7 @@ def _on_update_status(self, event: UpdateStatusEvent): return # Cannot check more advanced MongoDB statuses if mongod hasn't started. - with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: self.unit.status = WaitingStatus("Waiting for MongoDB to start") return @@ -952,7 +952,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: # can be corrupted. 
return - with MongoDBConnection(self.mongodb_config, f"localhost", direct=True) as direct_mongo: + with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: try: logger.info("Replica Set initialization") direct_mongo.init_replset() @@ -1045,7 +1045,7 @@ def start_mongod_service(self): mongodb_snap = snap_cache["charmed-mongodb"] mongodb_snap.start(services=["mongod"]) - # charms running as config server are responsible for maintaing a server side mongos + # charms running as config server are responsible for maintaining a server side mongos if self.is_role(Config.CONFIG_SERVER): mongodb_snap.start(services=["mongos"]) @@ -1059,7 +1059,7 @@ def stop_mongod_service(self): mongodb_snap = snap_cache["charmed-mongodb"] mongodb_snap.stop(services=["mongod"]) - # charms running as config server are responsible for maintaing a server side mongos + # charms running as config server are responsible for maintaining a server side mongos if self.is_role(Config.CONFIG_SERVER): mongodb_snap.stop(services=["mongos"]) From 4bf9d5f9608976cf84d813f521826ea25eb9e04b Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 15 Sep 2023 15:30:16 +0000 Subject: [PATCH 05/42] update error processing --- lib/charms/mongodb/v0/helpers.py | 21 +------ lib/charms/mongodb/v0/mongodb_backups.py | 70 +++++++++++++++++++----- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 789aa1791..cbde28fcf 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -7,7 +7,7 @@ import secrets import string import subprocess -from typing import List, Optional, Union +from typing import List from charms.mongodb.v0.mongodb import MongoDBConfiguration, MongoDBConnection from ops.model import ( @@ -234,25 +234,6 @@ def copy_licenses_to_unit(): ) -_StrOrBytes = Union[str, bytes] - - -def process_pbm_error(error_string: Optional[_StrOrBytes]) -> str: - """Parses pbm error string and returns a user friendly message.""" - message = "couldn't configure s3 backup option" - if not error_string: - return message - if type(error_string) == bytes: - error_string = error_string.decode("utf-8") - if "status code: 403" in error_string: # type: ignore - message = "s3 credentials are incorrect." - elif "status code: 404" in error_string: # type: ignore - message = "s3 configurations are incompatible." - elif "status code: 301" in error_string: # type: ignore - message = "s3 configurations are incompatible." 
- return message - - def current_pbm_op(pbm_status: str) -> str: """Parses pbm status for the operation that pbm is running.""" pbm_status = json.loads(pbm_status) diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py index c65c86f1d..94d661975 100644 --- a/lib/charms/mongodb/v0/mongodb_backups.py +++ b/lib/charms/mongodb/v0/mongodb_backups.py @@ -13,14 +13,10 @@ import re import subprocess import time -from typing import Dict, List +from typing import Dict, List, Optional, Union from charms.data_platform_libs.v0.s3 import CredentialsChangedEvent, S3Requirer -from charms.mongodb.v0.helpers import ( - current_pbm_op, - process_pbm_error, - process_pbm_status, -) +from charms.mongodb.v0.helpers import current_pbm_op, process_pbm_status from charms.operator_libs_linux.v1 import snap from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus @@ -316,7 +312,7 @@ def _configure_pbm_options(self, event) -> None: ), return except ExecError as e: - self.charm.unit.status = BlockedStatus(process_pbm_error(e.stdout)) + self.charm.unit.status = BlockedStatus(self.process_pbm_error(e.stdout)) return except subprocess.CalledProcessError as e: logger.error("Syncing configurations failed: %s", str(e)) @@ -418,7 +414,7 @@ def _wait_pbm_status(self) -> None: ) raise ResyncError except ExecError as e: - self.charm.unit.status = BlockedStatus(process_pbm_error(e.stdout)) + self.charm.unit.status = BlockedStatus(self.process_pbm_error(e.stdout)) def _get_pbm_status(self) -> StatusBase: """Retrieve pbm status.""" @@ -428,15 +424,14 @@ def _get_pbm_status(self) -> StatusBase: try: previous_pbm_status = self.charm.unit.status pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD) + + # pbm errors are outputted in json and do not raise CLI errors + pbm_error = self.process_pbm_error(pbm_status) + if pbm_error: + return BlockedStatus(pbm_error) + self._log_backup_restore_result(pbm_status, previous_pbm_status) return process_pbm_status(pbm_status) - except ExecError as e: - logger.error(f"Failed to get pbm status. {e}") - return BlockedStatus(process_pbm_error(e.stdout)) - except subprocess.CalledProcessError as e: - # pbm pipes a return code of 1, but its output shows the true error code so it is - # necessary to parse the output - return BlockedStatus(process_pbm_error(e.output)) except Exception as e: # pbm pipes a return code of 1, but its output shows the true error code so it is # necessary to parse the output @@ -652,3 +647,48 @@ def _get_backup_restore_operation_result(self, current_pbm_status, previous_pbm_ return f"Backup {backup_id} completed successfully" return "Unknown operation result" + + def retrieve_error_message(self, pbm_status: Dict) -> str: + """Parses pbm status for an error message from the current unit. + + If pbm_agent is in the error state, the command `pbm status` does not raise an error. + Instead, it is in the log messages. pbm_agent also shows all the error messages for other + replicas in the set. 
+ """ + try: + clusters = pbm_status["cluster"] + for cluster in clusters: + if cluster["rs"] == self.charm.app.name: + break + + for host_info in cluster["nodes"]: + replica_info = f"mongodb/{self.charm._unit_ip(self.charm.unit)}:27107" + if host_info["host"] == replica_info: + break + + return str(host_info["errors"]) + except KeyError: + return "" + + _StrOrBytes = Union[str, bytes] + + def process_pbm_error(self, pbm_status: Optional[_StrOrBytes]) -> str: + """Returns errors found in PBM status.""" + if type(pbm_status) == bytes: + pbm_status = pbm_status.decode("utf-8") + + try: + error_message = self.retrieve_error_message(json.loads(pbm_status)) + except json.decoder.JSONDecodeError: + # if pbm status doesn't return a parsable dictionary it is an error message + # represented as a string + error_message = pbm_status + + message = None + if "status code: 403" in error_message: + message = "s3 credentials are incorrect." + elif "status code: 404" in error_message: + message = "s3 configurations are incompatible." + elif "status code: 301" in error_message: + message = "s3 configurations are incompatible." + return message From e924c0067d9732e2ceefc74b6f5947c2bf04ee37 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 15 Sep 2023 15:32:12 +0000 Subject: [PATCH 06/42] bump lib patch --- lib/charms/mongodb/v0/helpers.py | 2 +- lib/charms/mongodb/v0/mongodb_backups.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index cbde28fcf..63784afa0 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -27,7 +27,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 9 # path to store mongodb ketFile diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py index 94d661975..e947aa2f7 100644 --- a/lib/charms/mongodb/v0/mongodb_backups.py +++ b/lib/charms/mongodb/v0/mongodb_backups.py @@ -38,7 +38,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 7 logger = logging.getLogger(__name__) From d1f0a0d429c5d81db4881f459b7a27c5220592e1 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 18 Sep 2023 15:01:33 +0000 Subject: [PATCH 07/42] enable auth --- actions.yaml | 2 +- lib/charms/mongodb/v0/helpers.py | 7 ++++++- lib/charms/mongodb/v0/users.py | 20 +++++++++++++++++++- src/charm.py | 15 +++++++++------ src/machine_helpers.py | 2 +- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/actions.yaml b/actions.yaml index 619383a54..3b32a3b98 100644 --- a/actions.yaml +++ b/actions.yaml @@ -11,7 +11,7 @@ get-password: username: type: string description: The username, the default value 'operator'. - Possible values - operator, backup, monitor. + Possible values - operator, backup, monitor, mongos. set-password: description: Change the admin user's password, which is used by charm. 
diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index ec7eaeaf4..9855a865c 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -79,7 +79,10 @@ def get_create_user_cmd( ] -def get_mongos_args(config: MongoDBConfiguration) -> str: +def get_mongos_args( + config: MongoDBConfiguration, + snap_install: bool = False, +) -> str: """Returns the arguments used for starting mongos on a config-server side application. Returns: @@ -90,6 +93,7 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: # no need to add TLS since no network calls are used, since mongos is configured to listen # on local host + full_conf_dir = f"{MONGODB_SNAP_DATA_DIR}{CONF_DIR}" if snap_install else CONF_DIR cmd = [ # mongos on config server side only runs on local host "--bind_ip localhost", @@ -98,6 +102,7 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: # config server is already using 27017 "--port 27018", # todo followup PR add keyfile and auth + f"--keyFile={full_conf_dir}/{KEY_FILE}", "\n", ] diff --git a/lib/charms/mongodb/v0/users.py b/lib/charms/mongodb/v0/users.py index 1ca9962c6..d48ff8300 100644 --- a/lib/charms/mongodb/v0/users.py +++ b/lib/charms/mongodb/v0/users.py @@ -58,6 +58,8 @@ def get_password_key_name_for_user(username: str) -> str: """Returns the key name for the password of the user.""" if username == OperatorUser.get_username(): return OperatorUser.get_password_key_name() + elif username == MongosUser.get_username(): + return MongosUser.get_password_key_name() elif username == MonitorUser.get_username(): return MonitorUser.get_password_key_name() elif username == BackupUser.get_username(): @@ -76,6 +78,16 @@ class _OperatorUser(MongoDBUser): _hosts = [] +class _MongosUser(MongoDBUser): + """Operator user for MongoDB.""" + + _username = "mongos" + _password_key_name = f"{_username}-password" + _database_name = "admin" + _roles = ["default"] + _hosts = ["127.0.0.1"] + + class _MonitorUser(MongoDBUser): """Monitor user for MongoDB.""" @@ -108,6 +120,12 @@ class _BackupUser(MongoDBUser): OperatorUser = _OperatorUser() MonitorUser = _MonitorUser() BackupUser = _BackupUser() +MongosUser = _MongosUser() # List of system usernames needed for correct work on the charm. -CHARM_USERS = [OperatorUser.get_username(), BackupUser.get_username(), MonitorUser.get_username()] +CHARM_USERS = [ + OperatorUser.get_username(), + BackupUser.get_username(), + MonitorUser.get_username(), + MongosUser.get_username(), +] diff --git a/src/charm.py b/src/charm.py index d86a0c4ac..77a8fb940 100755 --- a/src/charm.py +++ b/src/charm.py @@ -38,6 +38,7 @@ MongoDBUser, MonitorUser, OperatorUser, + MongosUser, ) from charms.operator_libs_linux.v1 import snap from ops import JujuVersion @@ -346,6 +347,7 @@ def _on_start(self, event: StartEvent) -> None: return self._initialise_replica_set(event) + self._init_inital_user(MongosUser) def _on_relation_joined(self, event: RelationJoinedEvent) -> None: """Add peer to replica set. @@ -623,8 +625,8 @@ def _on_secret_changed(self, event: SecretChangedEvent): reraise=True, before=before_log(logger, logging.DEBUG), ) - def _init_operator_user(self) -> None: - """Creates initial admin user for MongoDB. + def _init_inital_user(self, user) -> None: + """Creates initial specified admin user. Initial admin user can be created only through localhost connection. 
see https://www.mongodb.com/docs/manual/core/localhost-exception/ @@ -634,7 +636,7 @@ def _init_operator_user(self) -> None: It is needed to install mongodb-clients inside charm container to make this function work correctly. """ - if self._is_user_created(OperatorUser) or not self.unit.is_leader(): + if self._is_user_created(user) or not self.unit.is_leader(): return out = subprocess.run( @@ -644,8 +646,8 @@ def _init_operator_user(self) -> None: if out.returncode == 0: raise AdminUserCreationError - logger.debug(f"{OperatorUser.get_username()} user created") - self._set_user_created(OperatorUser) + logger.debug(f"{user.get_username()} user created") + self._set_user_created(user) @retry( stop=stop_after_attempt(3), @@ -742,6 +744,7 @@ def _generate_secrets(self) -> None: share between members via the app data. """ self._check_or_set_user_password(OperatorUser) + self._check_or_set_user_password(MongosUser) self._check_or_set_user_password(MonitorUser) if not self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME): @@ -961,7 +964,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: ) logger.info("User initialization") - self._init_operator_user() + self._init_inital_user(OperatorUser) self._init_backup_user() self._init_monitor_user() diff --git a/src/machine_helpers.py b/src/machine_helpers.py index 37512266a..b792c21ef 100644 --- a/src/machine_helpers.py +++ b/src/machine_helpers.py @@ -28,7 +28,7 @@ def update_mongod_service( add_args_to_env("MONGOD_ARGS", mongod_start_args) if role == "config-server": - mongos_start_args = get_mongos_args(config) + mongos_start_args = get_mongos_args(config, snap_install=True) add_args_to_env("MONGOS_ARGS", mongos_start_args) From 06af49a06725860a41cf9ee8eb6c5489ff8381e0 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 08:20:51 +0000 Subject: [PATCH 08/42] mongos should be run on 0.0.0.0 --- lib/charms/mongodb/v0/helpers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index ec7eaeaf4..d4c820aaa 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -91,8 +91,9 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: # no need to add TLS since no network calls are used, since mongos is configured to listen # on local host cmd = [ - # mongos on config server side only runs on local host - "--bind_ip localhost", + # mongos on config server side should run on 0.0.0.0 so it can be accessed by other units + # in the sharded cluster + "--bind_ip 0.0.0.0", # todo figure out this one f"--configdb {config_server_uri}", # config server is already using 27017 From 363db5f01a2dcf049dda9c0011c64d9c890da74c Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 08:51:38 +0000 Subject: [PATCH 09/42] addressing PR comments --- lib/charms/mongodb/v0/helpers.py | 4 +- lib/charms/mongodb/v0/mongodb_backups.py | 11 +- src/charm.py | 6 +- src/config.py | 11 +- .../integration/backup_tests/test_backups.py | 606 +++++++++--------- 5 files changed, 325 insertions(+), 313 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index d4c820aaa..c95d99393 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -19,6 +19,8 @@ ) from pymongo.errors import AutoReconnect, ServerSelectionTimeoutError +from config import Config + # The unique Charmhub library identifier, never change it LIBID = "b9a7fe0c38d8486a9d1ce94c27d4758e" @@ -97,7 
+99,7 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: # todo figure out this one f"--configdb {config_server_uri}", # config server is already using 27017 - "--port 27018", + f"--port {Config.MONGOS_PORT}", # todo followup PR add keyfile and auth "\n", ] diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py index e947aa2f7..f0fde7286 100644 --- a/lib/charms/mongodb/v0/mongodb_backups.py +++ b/lib/charms/mongodb/v0/mongodb_backups.py @@ -30,6 +30,8 @@ wait_fixed, ) +from config import Config + # The unique Charmhub library identifier, never change it LIBID = "9f2b91c6128d48d6ba22724bf365da3b" @@ -59,6 +61,9 @@ BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15 +_StrOrBytes = Union[str, bytes] + + class ResyncError(Exception): """Raised when pbm is resyncing configurations and is not ready to be used.""" @@ -662,7 +667,9 @@ def retrieve_error_message(self, pbm_status: Dict) -> str: break for host_info in cluster["nodes"]: - replica_info = f"mongodb/{self.charm._unit_ip(self.charm.unit)}:27107" + replica_info = ( + f"mongodb/{self.charm._unit_ip(self.charm.unit)}:{Config.MONGOS_PORT}" + ) if host_info["host"] == replica_info: break @@ -670,8 +677,6 @@ def retrieve_error_message(self, pbm_status: Dict) -> str: except KeyError: return "" - _StrOrBytes = Union[str, bytes] - def process_pbm_error(self, pbm_status: Optional[_StrOrBytes]) -> str: """Returns errors found in PBM status.""" if type(pbm_status) == bytes: diff --git a/src/charm.py b/src/charm.py index d86a0c4ac..bcad1165b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -966,7 +966,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: self._init_monitor_user() # in sharding, user management is handled by mongos subordinate charm - if self.is_role(Config.REPLICATION): + if self.is_role(Config.Role.REPLICATION): logger.info("Manage user") self.client_relations.oversee_users(None, None) @@ -1046,7 +1046,7 @@ def start_mongod_service(self): mongodb_snap.start(services=["mongod"]) # charms running as config server are responsible for maintaining a server side mongos - if self.is_role(Config.CONFIG_SERVER): + if self.is_role(Config.Role.CONFIG_SERVER): mongodb_snap.start(services=["mongos"]) def stop_mongod_service(self): @@ -1060,7 +1060,7 @@ def stop_mongod_service(self): mongodb_snap.stop(services=["mongod"]) # charms running as config server are responsible for maintaining a server side mongos - if self.is_role(Config.CONFIG_SERVER): + if self.is_role(Config.Role.CONFIG_SERVER): mongodb_snap.stop(services=["mongos"]) def restart_mongod_service(self, auth=None): diff --git a/src/config.py b/src/config.py index 283b13f19..da3261864 100644 --- a/src/config.py +++ b/src/config.py @@ -9,6 +9,7 @@ class Config: """Configuration for MongoDB Charm.""" + MONGOS_PORT = 27018 MONGODB_PORT = 27017 SUBSTRATE = "vm" ENV_VAR_PATH = "/etc/environment" @@ -16,9 +17,13 @@ class Config: MONGOD_CONF_DIR = f"{MONGODB_SNAP_DATA_DIR}/etc/mongod" MONGOD_CONF_FILE_PATH = f"{MONGOD_CONF_DIR}/mongod.conf" SNAP_PACKAGES = [("charmed-mongodb", "5/edge", 84)] - CONFIG_SERVER = "config-server" - REPLICATION = "replication" - SHARD = "shard" + + class Role: + """Role config names for MongoDB Charm.""" + + CONFIG_SERVER = "config-server" + REPLICATION = "replication" + SHARD = "shard" class Actions: """Actions related config for MongoDB Charm.""" diff --git a/tests/integration/backup_tests/test_backups.py b/tests/integration/backup_tests/test_backups.py index b7cb64d48..cc78957a2 100644 --- 
a/tests/integration/backup_tests/test_backups.py +++ b/tests/integration/backup_tests/test_backups.py @@ -99,306 +99,306 @@ async def test_blocked_incorrect_conf(ops_test: OpsTest) -> None: assert db_unit.workload_status_message == "s3 configurations are incompatible." -@pytest.mark.abort_on_fail -async def test_ready_correct_conf(ops_test: OpsTest) -> None: - """Verifies charm goes into active status when s3 config and creds options are correct.""" - db_app_name = await helpers.app_name(ops_test) - choices = string.ascii_letters + string.digits - unique_path = "".join([secrets.choice(choices) for _ in range(4)]) - configuration_parameters = { - "bucket": "data-charms-testing", - "path": f"mongodb-vm/test-{unique_path}", - "endpoint": "https://s3.amazonaws.com", - "region": "us-east-1", - } - - # apply new configuration options - await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - - # after applying correct config options and creds the applications should both be active - await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT) - await ops_test.model.wait_for_idle( - apps=[db_app_name], status="active", timeout=TIMEOUT, idle_period=60 - ) - - -@pytest.mark.abort_on_fail -async def test_create_and_list_backups(ops_test: OpsTest) -> None: - db_unit = await helpers.get_leader_unit(ops_test) - - # verify backup list works - action = await db_unit.run_action(action_name="list-backups") - list_result = await action.wait() - backups = list_result.results["backups"] - assert backups, "backups not outputted" - - # verify backup is started - action = await db_unit.run_action(action_name="create-backup") - backup_result = await action.wait() - assert "backup started" in backup_result.results["backup-status"], "backup didn't start" - - # verify backup is present in the list of backups - # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a - # backup can take a lot of time so this function returns once the command was successfully - # sent to pbm. Therefore we should retry listing the backup several times - try: - for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)): - with attempt: - backups = await helpers.count_logical_backups(db_unit) - assert backups == 1 - except RetryError: - assert backups == 1, "Backup not created." - - -@pytest.mark.abort_on_fail -async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db) -> None: - """With writes in the DB test creating a backup while another one is running. - - Note that before creating the second backup we change the bucket and change the s3 storage - from AWS to GCP. This test verifies that the first backup in AWS is made, the second backup - in GCP is made, and that before the second backup is made that pbm correctly resyncs. - """ - db_app_name = await helpers.app_name(ops_test) - db_unit = await helpers.get_leader_unit(ops_test) - - # create first backup once ready - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - action = await db_unit.run_action(action_name="create-backup") - first_backup = await action.wait() - assert first_backup.status == "completed", "First backup not started." 
- - # while first backup is running change access key, secret keys, and bucket name - # for GCP - await helpers.set_credentials(ops_test, cloud="GCP") - - # change to GCP configs and wait for PBM to resync - configuration_parameters = { - "bucket": "data-charms-testing", - "endpoint": "https://storage.googleapis.com", - "region": "", - } - await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # create a backup as soon as possible. might not be immediately possible since only one backup - # can happen at a time. - try: - for attempt in Retrying(stop=stop_after_delay(40), wait=wait_fixed(5)): - with attempt: - action = await db_unit.run_action(action_name="create-backup") - second_backup = await action.wait() - assert second_backup.status == "completed" - except RetryError: - assert second_backup.status == "completed", "Second backup not started." - - # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a - # backup can take a lot of time so this function returns once the command was successfully - # sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the - # backup - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # verify that backups was made in GCP bucket - try: - for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): - with attempt: - backups = await helpers.count_logical_backups(db_unit) - assert backups == 1, "Backup not created in bucket on GCP." - except RetryError: - assert backups == 1, "Backup not created in first bucket on GCP." - - # set AWS credentials, set configs for s3 storage, and wait to resync - await helpers.set_credentials(ops_test, cloud="AWS") - configuration_parameters = { - "bucket": "data-charms-testing", - "region": "us-east-1", - "endpoint": "https://s3.amazonaws.com", - } - await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # verify that backups was made on the AWS bucket - try: - for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): - with attempt: - backups = await helpers.count_logical_backups(db_unit) - assert backups == 2, "Backup not created in bucket on AWS." - except RetryError: - assert backups == 2, "Backup not created in bucket on AWS." - - -@pytest.mark.abort_on_fail -async def test_restore(ops_test: OpsTest, add_writes_to_db) -> None: - """Simple backup tests that verifies that writes are correctly restored.""" - # count total writes - number_writes = await ha_helpers.count_writes(ops_test) - assert number_writes > 0, "no writes to backup" - - # create a backup in the AWS bucket - db_app_name = await helpers.app_name(ops_test) - db_unit = await helpers.get_leader_unit(ops_test) - prev_backups = await helpers.count_logical_backups(db_unit) - action = await db_unit.run_action(action_name="create-backup") - first_backup = await action.wait() - assert first_backup.status == "completed", "First backup not started." - - # verify that backup was made on the bucket - try: - for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): - with attempt: - backups = await helpers.count_logical_backups(db_unit) - assert backups == prev_backups + 1, "Backup not created." 
- except RetryError: - assert backups == prev_backups + 1, "Backup not created." - - # add writes to be cleared after restoring the backup. Note these are written to the same - # collection that was backed up. - await helpers.insert_unwanted_data(ops_test) - new_number_of_writes = await ha_helpers.count_writes(ops_test) - assert new_number_of_writes > number_writes, "No writes to be cleared after restoring." - - # find most recent backup id and restore - action = await db_unit.run_action(action_name="list-backups") - list_result = await action.wait() - list_result = list_result.results["backups"] - most_recent_backup = list_result.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) - restore = await action.wait() - assert restore.results["restore-status"] == "restore started", "restore not successful" - - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # verify all writes are present - try: - for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): - with attempt: - number_writes_restored = await ha_helpers.count_writes(ops_test) - assert number_writes == number_writes_restored, "writes not correctly restored" - except RetryError: - assert number_writes == number_writes_restored, "writes not correctly restored" - - -@pytest.mark.parametrize("cloud_provider", ["AWS", "GCP"]) -async def test_restore_new_cluster(ops_test: OpsTest, add_writes_to_db, cloud_provider): - # configure test for the cloud provider - db_app_name = await helpers.app_name(ops_test) - await helpers.set_credentials(ops_test, cloud=cloud_provider) - if cloud_provider == "AWS": - configuration_parameters = { - "bucket": "data-charms-testing", - "region": "us-east-1", - "endpoint": "https://s3.amazonaws.com", - } - else: - configuration_parameters = { - "bucket": "data-charms-testing", - "endpoint": "https://storage.googleapis.com", - "region": "", - } - - await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # create a backup - writes_in_old_cluster = await ha_helpers.count_writes(ops_test, db_app_name) - assert writes_in_old_cluster > 0, "old cluster has no writes." - await helpers.create_and_verify_backup(ops_test) - - # save old password, since after restoring we will need this password to authenticate. 
- old_password = await ha_helpers.get_password(ops_test, db_app_name) - - # deploy a new cluster with a different name - db_charm = await ops_test.build_charm(".") - await ops_test.model.deploy(db_charm, num_units=3, application_name=NEW_CLUSTER) - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), - ) - - db_unit = await helpers.get_leader_unit(ops_test, db_app_name=NEW_CLUSTER) - action = await db_unit.run_action("set-password", **{"password": old_password}) - action = await action.wait() - assert action.status == "completed" - - # relate to s3 - s3 has the necessary configurations - await ops_test.model.add_relation(S3_APP_NAME, NEW_CLUSTER) - await ops_test.model.block_until( - lambda: helpers.is_relation_joined(ops_test, ENDPOINT, ENDPOINT) is True, - timeout=TIMEOUT, - ) - - # wait for new cluster to sync - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), - ) - - # verify that the listed backups from the old cluster are not listed as failed. - assert ( - await helpers.count_failed_backups(db_unit) == 0 - ), "Backups from old cluster are listed as failed" - - # find most recent backup id and restore - action = await db_unit.run_action(action_name="list-backups") - list_result = await action.wait() - list_result = list_result.results["backups"] - most_recent_backup = list_result.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) - restore = await action.wait() - assert restore.results["restore-status"] == "restore started", "restore not successful" - - # verify all writes are present - try: - for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): - with attempt: - writes_in_new_cluster = await ha_helpers.count_writes(ops_test, NEW_CLUSTER) - assert ( - writes_in_new_cluster == writes_in_old_cluster - ), "new cluster writes do not match old cluster writes after restore" - except RetryError: - assert ( - writes_in_new_cluster == writes_in_old_cluster - ), "new cluster writes do not match old cluster writes after restore" - - await helpers.destroy_cluster(ops_test, cluster_name=NEW_CLUSTER) - - -@pytest.mark.abort_on_fail -async def test_update_backup_password(ops_test: OpsTest) -> None: - """Verifies that after changing the backup password the pbm tool is updated and functional.""" - db_app_name = await helpers.app_name(ops_test) - db_unit = await helpers.get_leader_unit(ops_test) - - # wait for charm to be idle before setting password - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - parameters = {"username": "backup"} - action = await db_unit.run_action("set-password", **parameters) - action = await action.wait() - assert action.status == "completed", "failed to set backup password" - - # wait for charm to be idle after setting password - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) - - # verify we still have connection to pbm via creating a backup - action = await db_unit.run_action(action_name="create-backup") - backup_result = await action.wait() - assert "backup started" in backup_result.results["backup-status"], "backup didn't start" +# @pytest.mark.abort_on_fail +# async def test_ready_correct_conf(ops_test: OpsTest) -> None: +# """Verifies charm goes into active status when s3 config and creds options are correct.""" +# 
db_app_name = await helpers.app_name(ops_test) +# choices = string.ascii_letters + string.digits +# unique_path = "".join([secrets.choice(choices) for _ in range(4)]) +# configuration_parameters = { +# "bucket": "data-charms-testing", +# "path": f"mongodb-vm/test-{unique_path}", +# "endpoint": "https://s3.amazonaws.com", +# "region": "us-east-1", +# } + +# # apply new configuration options +# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + +# # after applying correct config options and creds the applications should both be active +# await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT) +# await ops_test.model.wait_for_idle( +# apps=[db_app_name], status="active", timeout=TIMEOUT, idle_period=60 +# ) + + +# @pytest.mark.abort_on_fail +# async def test_create_and_list_backups(ops_test: OpsTest) -> None: +# db_unit = await helpers.get_leader_unit(ops_test) + +# # verify backup list works +# action = await db_unit.run_action(action_name="list-backups") +# list_result = await action.wait() +# backups = list_result.results["backups"] +# assert backups, "backups not outputted" + +# # verify backup is started +# action = await db_unit.run_action(action_name="create-backup") +# backup_result = await action.wait() +# assert "backup started" in backup_result.results["backup-status"], "backup didn't start" + +# # verify backup is present in the list of backups +# # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a +# # backup can take a lot of time so this function returns once the command was successfully +# # sent to pbm. Therefore we should retry listing the backup several times +# try: +# for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)): +# with attempt: +# backups = await helpers.count_logical_backups(db_unit) +# assert backups == 1 +# except RetryError: +# assert backups == 1, "Backup not created." + + +# @pytest.mark.abort_on_fail +# async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db) -> None: +# """With writes in the DB test creating a backup while another one is running. + +# Note that before creating the second backup we change the bucket and change the s3 storage +# from AWS to GCP. This test verifies that the first backup in AWS is made, the second backup +# in GCP is made, and that before the second backup is made that pbm correctly resyncs. +# """ +# db_app_name = await helpers.app_name(ops_test) +# db_unit = await helpers.get_leader_unit(ops_test) + +# # create first backup once ready +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# action = await db_unit.run_action(action_name="create-backup") +# first_backup = await action.wait() +# assert first_backup.status == "completed", "First backup not started." + +# # while first backup is running change access key, secret keys, and bucket name +# # for GCP +# await helpers.set_credentials(ops_test, cloud="GCP") + +# # change to GCP configs and wait for PBM to resync +# configuration_parameters = { +# "bucket": "data-charms-testing", +# "endpoint": "https://storage.googleapis.com", +# "region": "", +# } +# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # create a backup as soon as possible. 
might not be immediately possible since only one backup +# # can happen at a time. +# try: +# for attempt in Retrying(stop=stop_after_delay(40), wait=wait_fixed(5)): +# with attempt: +# action = await db_unit.run_action(action_name="create-backup") +# second_backup = await action.wait() +# assert second_backup.status == "completed" +# except RetryError: +# assert second_backup.status == "completed", "Second backup not started." + +# # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a +# # backup can take a lot of time so this function returns once the command was successfully +# # sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the +# # backup +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # verify that backups was made in GCP bucket +# try: +# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): +# with attempt: +# backups = await helpers.count_logical_backups(db_unit) +# assert backups == 1, "Backup not created in bucket on GCP." +# except RetryError: +# assert backups == 1, "Backup not created in first bucket on GCP." + +# # set AWS credentials, set configs for s3 storage, and wait to resync +# await helpers.set_credentials(ops_test, cloud="AWS") +# configuration_parameters = { +# "bucket": "data-charms-testing", +# "region": "us-east-1", +# "endpoint": "https://s3.amazonaws.com", +# } +# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # verify that backups was made on the AWS bucket +# try: +# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): +# with attempt: +# backups = await helpers.count_logical_backups(db_unit) +# assert backups == 2, "Backup not created in bucket on AWS." +# except RetryError: +# assert backups == 2, "Backup not created in bucket on AWS." + + +# @pytest.mark.abort_on_fail +# async def test_restore(ops_test: OpsTest, add_writes_to_db) -> None: +# """Simple backup tests that verifies that writes are correctly restored.""" +# # count total writes +# number_writes = await ha_helpers.count_writes(ops_test) +# assert number_writes > 0, "no writes to backup" + +# # create a backup in the AWS bucket +# db_app_name = await helpers.app_name(ops_test) +# db_unit = await helpers.get_leader_unit(ops_test) +# prev_backups = await helpers.count_logical_backups(db_unit) +# action = await db_unit.run_action(action_name="create-backup") +# first_backup = await action.wait() +# assert first_backup.status == "completed", "First backup not started." + +# # verify that backup was made on the bucket +# try: +# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): +# with attempt: +# backups = await helpers.count_logical_backups(db_unit) +# assert backups == prev_backups + 1, "Backup not created." +# except RetryError: +# assert backups == prev_backups + 1, "Backup not created." + +# # add writes to be cleared after restoring the backup. Note these are written to the same +# # collection that was backed up. +# await helpers.insert_unwanted_data(ops_test) +# new_number_of_writes = await ha_helpers.count_writes(ops_test) +# assert new_number_of_writes > number_writes, "No writes to be cleared after restoring." 
+ +# # find most recent backup id and restore +# action = await db_unit.run_action(action_name="list-backups") +# list_result = await action.wait() +# list_result = list_result.results["backups"] +# most_recent_backup = list_result.split("\n")[-1] +# backup_id = most_recent_backup.split()[0] +# action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) +# restore = await action.wait() +# assert restore.results["restore-status"] == "restore started", "restore not successful" + +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # verify all writes are present +# try: +# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): +# with attempt: +# number_writes_restored = await ha_helpers.count_writes(ops_test) +# assert number_writes == number_writes_restored, "writes not correctly restored" +# except RetryError: +# assert number_writes == number_writes_restored, "writes not correctly restored" + + +# @pytest.mark.parametrize("cloud_provider", ["AWS", "GCP"]) +# async def test_restore_new_cluster(ops_test: OpsTest, add_writes_to_db, cloud_provider): +# # configure test for the cloud provider +# db_app_name = await helpers.app_name(ops_test) +# await helpers.set_credentials(ops_test, cloud=cloud_provider) +# if cloud_provider == "AWS": +# configuration_parameters = { +# "bucket": "data-charms-testing", +# "region": "us-east-1", +# "endpoint": "https://s3.amazonaws.com", +# } +# else: +# configuration_parameters = { +# "bucket": "data-charms-testing", +# "endpoint": "https://storage.googleapis.com", +# "region": "", +# } + +# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # create a backup +# writes_in_old_cluster = await ha_helpers.count_writes(ops_test, db_app_name) +# assert writes_in_old_cluster > 0, "old cluster has no writes." +# await helpers.create_and_verify_backup(ops_test) + +# # save old password, since after restoring we will need this password to authenticate. +# old_password = await ha_helpers.get_password(ops_test, db_app_name) + +# # deploy a new cluster with a different name +# db_charm = await ops_test.build_charm(".") +# await ops_test.model.deploy(db_charm, num_units=3, application_name=NEW_CLUSTER) +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), +# ) + +# db_unit = await helpers.get_leader_unit(ops_test, db_app_name=NEW_CLUSTER) +# action = await db_unit.run_action("set-password", **{"password": old_password}) +# action = await action.wait() +# assert action.status == "completed" + +# # relate to s3 - s3 has the necessary configurations +# await ops_test.model.add_relation(S3_APP_NAME, NEW_CLUSTER) +# await ops_test.model.block_until( +# lambda: helpers.is_relation_joined(ops_test, ENDPOINT, ENDPOINT) is True, +# timeout=TIMEOUT, +# ) + +# # wait for new cluster to sync +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), +# ) + +# # verify that the listed backups from the old cluster are not listed as failed. 
+# assert ( +# await helpers.count_failed_backups(db_unit) == 0 +# ), "Backups from old cluster are listed as failed" + +# # find most recent backup id and restore +# action = await db_unit.run_action(action_name="list-backups") +# list_result = await action.wait() +# list_result = list_result.results["backups"] +# most_recent_backup = list_result.split("\n")[-1] +# backup_id = most_recent_backup.split()[0] +# action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) +# restore = await action.wait() +# assert restore.results["restore-status"] == "restore started", "restore not successful" + +# # verify all writes are present +# try: +# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): +# with attempt: +# writes_in_new_cluster = await ha_helpers.count_writes(ops_test, NEW_CLUSTER) +# assert ( +# writes_in_new_cluster == writes_in_old_cluster +# ), "new cluster writes do not match old cluster writes after restore" +# except RetryError: +# assert ( +# writes_in_new_cluster == writes_in_old_cluster +# ), "new cluster writes do not match old cluster writes after restore" + +# await helpers.destroy_cluster(ops_test, cluster_name=NEW_CLUSTER) + + +# @pytest.mark.abort_on_fail +# async def test_update_backup_password(ops_test: OpsTest) -> None: +# """Verifies that after changing the backup password the pbm tool is updated and functional.""" +# db_app_name = await helpers.app_name(ops_test) +# db_unit = await helpers.get_leader_unit(ops_test) + +# # wait for charm to be idle before setting password +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# parameters = {"username": "backup"} +# action = await db_unit.run_action("set-password", **parameters) +# action = await action.wait() +# assert action.status == "completed", "failed to set backup password" + +# # wait for charm to be idle after setting password +# await asyncio.gather( +# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), +# ) + +# # verify we still have connection to pbm via creating a backup +# action = await db_unit.run_action(action_name="create-backup") +# backup_result = await action.wait() +# assert "backup started" in backup_result.results["backup-status"], "backup didn't start" From ac78ed38c3a58b73998a95d3d00c7fe178ab502f Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 09:20:37 +0000 Subject: [PATCH 10/42] PR comments --- lib/charms/mongodb/v0/helpers.py | 6 +- .../integration/backup_tests/test_backups.py | 606 +++++++++--------- 2 files changed, 305 insertions(+), 307 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index c95d99393..7f9085250 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -90,13 +90,11 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: # mongos running on the config server communicates through localhost config_server_uri = f"{config.replset}/localhost" - # no need to add TLS since no network calls are used, since mongos is configured to listen - # on local host + # todo follow up PR add TLS cmd = [ # mongos on config server side should run on 0.0.0.0 so it can be accessed by other units # in the sharded cluster - "--bind_ip 0.0.0.0", - # todo figure out this one + "--bind_ip", f"--configdb {config_server_uri}", # config server is already using 27017 f"--port {Config.MONGOS_PORT}", diff --git a/tests/integration/backup_tests/test_backups.py 
b/tests/integration/backup_tests/test_backups.py index cc78957a2..b7cb64d48 100644 --- a/tests/integration/backup_tests/test_backups.py +++ b/tests/integration/backup_tests/test_backups.py @@ -99,306 +99,306 @@ async def test_blocked_incorrect_conf(ops_test: OpsTest) -> None: assert db_unit.workload_status_message == "s3 configurations are incompatible." -# @pytest.mark.abort_on_fail -# async def test_ready_correct_conf(ops_test: OpsTest) -> None: -# """Verifies charm goes into active status when s3 config and creds options are correct.""" -# db_app_name = await helpers.app_name(ops_test) -# choices = string.ascii_letters + string.digits -# unique_path = "".join([secrets.choice(choices) for _ in range(4)]) -# configuration_parameters = { -# "bucket": "data-charms-testing", -# "path": f"mongodb-vm/test-{unique_path}", -# "endpoint": "https://s3.amazonaws.com", -# "region": "us-east-1", -# } - -# # apply new configuration options -# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - -# # after applying correct config options and creds the applications should both be active -# await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT) -# await ops_test.model.wait_for_idle( -# apps=[db_app_name], status="active", timeout=TIMEOUT, idle_period=60 -# ) - - -# @pytest.mark.abort_on_fail -# async def test_create_and_list_backups(ops_test: OpsTest) -> None: -# db_unit = await helpers.get_leader_unit(ops_test) - -# # verify backup list works -# action = await db_unit.run_action(action_name="list-backups") -# list_result = await action.wait() -# backups = list_result.results["backups"] -# assert backups, "backups not outputted" - -# # verify backup is started -# action = await db_unit.run_action(action_name="create-backup") -# backup_result = await action.wait() -# assert "backup started" in backup_result.results["backup-status"], "backup didn't start" - -# # verify backup is present in the list of backups -# # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a -# # backup can take a lot of time so this function returns once the command was successfully -# # sent to pbm. Therefore we should retry listing the backup several times -# try: -# for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)): -# with attempt: -# backups = await helpers.count_logical_backups(db_unit) -# assert backups == 1 -# except RetryError: -# assert backups == 1, "Backup not created." - - -# @pytest.mark.abort_on_fail -# async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db) -> None: -# """With writes in the DB test creating a backup while another one is running. - -# Note that before creating the second backup we change the bucket and change the s3 storage -# from AWS to GCP. This test verifies that the first backup in AWS is made, the second backup -# in GCP is made, and that before the second backup is made that pbm correctly resyncs. -# """ -# db_app_name = await helpers.app_name(ops_test) -# db_unit = await helpers.get_leader_unit(ops_test) - -# # create first backup once ready -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# action = await db_unit.run_action(action_name="create-backup") -# first_backup = await action.wait() -# assert first_backup.status == "completed", "First backup not started." 
- -# # while first backup is running change access key, secret keys, and bucket name -# # for GCP -# await helpers.set_credentials(ops_test, cloud="GCP") - -# # change to GCP configs and wait for PBM to resync -# configuration_parameters = { -# "bucket": "data-charms-testing", -# "endpoint": "https://storage.googleapis.com", -# "region": "", -# } -# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # create a backup as soon as possible. might not be immediately possible since only one backup -# # can happen at a time. -# try: -# for attempt in Retrying(stop=stop_after_delay(40), wait=wait_fixed(5)): -# with attempt: -# action = await db_unit.run_action(action_name="create-backup") -# second_backup = await action.wait() -# assert second_backup.status == "completed" -# except RetryError: -# assert second_backup.status == "completed", "Second backup not started." - -# # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a -# # backup can take a lot of time so this function returns once the command was successfully -# # sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the -# # backup -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # verify that backups was made in GCP bucket -# try: -# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): -# with attempt: -# backups = await helpers.count_logical_backups(db_unit) -# assert backups == 1, "Backup not created in bucket on GCP." -# except RetryError: -# assert backups == 1, "Backup not created in first bucket on GCP." - -# # set AWS credentials, set configs for s3 storage, and wait to resync -# await helpers.set_credentials(ops_test, cloud="AWS") -# configuration_parameters = { -# "bucket": "data-charms-testing", -# "region": "us-east-1", -# "endpoint": "https://s3.amazonaws.com", -# } -# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # verify that backups was made on the AWS bucket -# try: -# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): -# with attempt: -# backups = await helpers.count_logical_backups(db_unit) -# assert backups == 2, "Backup not created in bucket on AWS." -# except RetryError: -# assert backups == 2, "Backup not created in bucket on AWS." - - -# @pytest.mark.abort_on_fail -# async def test_restore(ops_test: OpsTest, add_writes_to_db) -> None: -# """Simple backup tests that verifies that writes are correctly restored.""" -# # count total writes -# number_writes = await ha_helpers.count_writes(ops_test) -# assert number_writes > 0, "no writes to backup" - -# # create a backup in the AWS bucket -# db_app_name = await helpers.app_name(ops_test) -# db_unit = await helpers.get_leader_unit(ops_test) -# prev_backups = await helpers.count_logical_backups(db_unit) -# action = await db_unit.run_action(action_name="create-backup") -# first_backup = await action.wait() -# assert first_backup.status == "completed", "First backup not started." 
- -# # verify that backup was made on the bucket -# try: -# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): -# with attempt: -# backups = await helpers.count_logical_backups(db_unit) -# assert backups == prev_backups + 1, "Backup not created." -# except RetryError: -# assert backups == prev_backups + 1, "Backup not created." - -# # add writes to be cleared after restoring the backup. Note these are written to the same -# # collection that was backed up. -# await helpers.insert_unwanted_data(ops_test) -# new_number_of_writes = await ha_helpers.count_writes(ops_test) -# assert new_number_of_writes > number_writes, "No writes to be cleared after restoring." - -# # find most recent backup id and restore -# action = await db_unit.run_action(action_name="list-backups") -# list_result = await action.wait() -# list_result = list_result.results["backups"] -# most_recent_backup = list_result.split("\n")[-1] -# backup_id = most_recent_backup.split()[0] -# action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) -# restore = await action.wait() -# assert restore.results["restore-status"] == "restore started", "restore not successful" - -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # verify all writes are present -# try: -# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): -# with attempt: -# number_writes_restored = await ha_helpers.count_writes(ops_test) -# assert number_writes == number_writes_restored, "writes not correctly restored" -# except RetryError: -# assert number_writes == number_writes_restored, "writes not correctly restored" - - -# @pytest.mark.parametrize("cloud_provider", ["AWS", "GCP"]) -# async def test_restore_new_cluster(ops_test: OpsTest, add_writes_to_db, cloud_provider): -# # configure test for the cloud provider -# db_app_name = await helpers.app_name(ops_test) -# await helpers.set_credentials(ops_test, cloud=cloud_provider) -# if cloud_provider == "AWS": -# configuration_parameters = { -# "bucket": "data-charms-testing", -# "region": "us-east-1", -# "endpoint": "https://s3.amazonaws.com", -# } -# else: -# configuration_parameters = { -# "bucket": "data-charms-testing", -# "endpoint": "https://storage.googleapis.com", -# "region": "", -# } - -# await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # create a backup -# writes_in_old_cluster = await ha_helpers.count_writes(ops_test, db_app_name) -# assert writes_in_old_cluster > 0, "old cluster has no writes." -# await helpers.create_and_verify_backup(ops_test) - -# # save old password, since after restoring we will need this password to authenticate. 
-# old_password = await ha_helpers.get_password(ops_test, db_app_name) - -# # deploy a new cluster with a different name -# db_charm = await ops_test.build_charm(".") -# await ops_test.model.deploy(db_charm, num_units=3, application_name=NEW_CLUSTER) -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), -# ) - -# db_unit = await helpers.get_leader_unit(ops_test, db_app_name=NEW_CLUSTER) -# action = await db_unit.run_action("set-password", **{"password": old_password}) -# action = await action.wait() -# assert action.status == "completed" - -# # relate to s3 - s3 has the necessary configurations -# await ops_test.model.add_relation(S3_APP_NAME, NEW_CLUSTER) -# await ops_test.model.block_until( -# lambda: helpers.is_relation_joined(ops_test, ENDPOINT, ENDPOINT) is True, -# timeout=TIMEOUT, -# ) - -# # wait for new cluster to sync -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), -# ) - -# # verify that the listed backups from the old cluster are not listed as failed. -# assert ( -# await helpers.count_failed_backups(db_unit) == 0 -# ), "Backups from old cluster are listed as failed" - -# # find most recent backup id and restore -# action = await db_unit.run_action(action_name="list-backups") -# list_result = await action.wait() -# list_result = list_result.results["backups"] -# most_recent_backup = list_result.split("\n")[-1] -# backup_id = most_recent_backup.split()[0] -# action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) -# restore = await action.wait() -# assert restore.results["restore-status"] == "restore started", "restore not successful" - -# # verify all writes are present -# try: -# for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): -# with attempt: -# writes_in_new_cluster = await ha_helpers.count_writes(ops_test, NEW_CLUSTER) -# assert ( -# writes_in_new_cluster == writes_in_old_cluster -# ), "new cluster writes do not match old cluster writes after restore" -# except RetryError: -# assert ( -# writes_in_new_cluster == writes_in_old_cluster -# ), "new cluster writes do not match old cluster writes after restore" - -# await helpers.destroy_cluster(ops_test, cluster_name=NEW_CLUSTER) - - -# @pytest.mark.abort_on_fail -# async def test_update_backup_password(ops_test: OpsTest) -> None: -# """Verifies that after changing the backup password the pbm tool is updated and functional.""" -# db_app_name = await helpers.app_name(ops_test) -# db_unit = await helpers.get_leader_unit(ops_test) - -# # wait for charm to be idle before setting password -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# parameters = {"username": "backup"} -# action = await db_unit.run_action("set-password", **parameters) -# action = await action.wait() -# assert action.status == "completed", "failed to set backup password" - -# # wait for charm to be idle after setting password -# await asyncio.gather( -# ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), -# ) - -# # verify we still have connection to pbm via creating a backup -# action = await db_unit.run_action(action_name="create-backup") -# backup_result = await action.wait() -# assert "backup started" in backup_result.results["backup-status"], "backup didn't start" +@pytest.mark.abort_on_fail +async def test_ready_correct_conf(ops_test: OpsTest) -> None: + """Verifies charm goes into active 
status when s3 config and creds options are correct.""" + db_app_name = await helpers.app_name(ops_test) + choices = string.ascii_letters + string.digits + unique_path = "".join([secrets.choice(choices) for _ in range(4)]) + configuration_parameters = { + "bucket": "data-charms-testing", + "path": f"mongodb-vm/test-{unique_path}", + "endpoint": "https://s3.amazonaws.com", + "region": "us-east-1", + } + + # apply new configuration options + await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + + # after applying correct config options and creds the applications should both be active + await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT) + await ops_test.model.wait_for_idle( + apps=[db_app_name], status="active", timeout=TIMEOUT, idle_period=60 + ) + + +@pytest.mark.abort_on_fail +async def test_create_and_list_backups(ops_test: OpsTest) -> None: + db_unit = await helpers.get_leader_unit(ops_test) + + # verify backup list works + action = await db_unit.run_action(action_name="list-backups") + list_result = await action.wait() + backups = list_result.results["backups"] + assert backups, "backups not outputted" + + # verify backup is started + action = await db_unit.run_action(action_name="create-backup") + backup_result = await action.wait() + assert "backup started" in backup_result.results["backup-status"], "backup didn't start" + + # verify backup is present in the list of backups + # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a + # backup can take a lot of time so this function returns once the command was successfully + # sent to pbm. Therefore we should retry listing the backup several times + try: + for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)): + with attempt: + backups = await helpers.count_logical_backups(db_unit) + assert backups == 1 + except RetryError: + assert backups == 1, "Backup not created." + + +@pytest.mark.abort_on_fail +async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db) -> None: + """With writes in the DB test creating a backup while another one is running. + + Note that before creating the second backup we change the bucket and change the s3 storage + from AWS to GCP. This test verifies that the first backup in AWS is made, the second backup + in GCP is made, and that before the second backup is made that pbm correctly resyncs. + """ + db_app_name = await helpers.app_name(ops_test) + db_unit = await helpers.get_leader_unit(ops_test) + + # create first backup once ready + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + action = await db_unit.run_action(action_name="create-backup") + first_backup = await action.wait() + assert first_backup.status == "completed", "First backup not started." + + # while first backup is running change access key, secret keys, and bucket name + # for GCP + await helpers.set_credentials(ops_test, cloud="GCP") + + # change to GCP configs and wait for PBM to resync + configuration_parameters = { + "bucket": "data-charms-testing", + "endpoint": "https://storage.googleapis.com", + "region": "", + } + await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # create a backup as soon as possible. might not be immediately possible since only one backup + # can happen at a time. 
+ try: + for attempt in Retrying(stop=stop_after_delay(40), wait=wait_fixed(5)): + with attempt: + action = await db_unit.run_action(action_name="create-backup") + second_backup = await action.wait() + assert second_backup.status == "completed" + except RetryError: + assert second_backup.status == "completed", "Second backup not started." + + # the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a + # backup can take a lot of time so this function returns once the command was successfully + # sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the + # backup + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # verify that backups was made in GCP bucket + try: + for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): + with attempt: + backups = await helpers.count_logical_backups(db_unit) + assert backups == 1, "Backup not created in bucket on GCP." + except RetryError: + assert backups == 1, "Backup not created in first bucket on GCP." + + # set AWS credentials, set configs for s3 storage, and wait to resync + await helpers.set_credentials(ops_test, cloud="AWS") + configuration_parameters = { + "bucket": "data-charms-testing", + "region": "us-east-1", + "endpoint": "https://s3.amazonaws.com", + } + await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # verify that backups was made on the AWS bucket + try: + for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): + with attempt: + backups = await helpers.count_logical_backups(db_unit) + assert backups == 2, "Backup not created in bucket on AWS." + except RetryError: + assert backups == 2, "Backup not created in bucket on AWS." + + +@pytest.mark.abort_on_fail +async def test_restore(ops_test: OpsTest, add_writes_to_db) -> None: + """Simple backup tests that verifies that writes are correctly restored.""" + # count total writes + number_writes = await ha_helpers.count_writes(ops_test) + assert number_writes > 0, "no writes to backup" + + # create a backup in the AWS bucket + db_app_name = await helpers.app_name(ops_test) + db_unit = await helpers.get_leader_unit(ops_test) + prev_backups = await helpers.count_logical_backups(db_unit) + action = await db_unit.run_action(action_name="create-backup") + first_backup = await action.wait() + assert first_backup.status == "completed", "First backup not started." + + # verify that backup was made on the bucket + try: + for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(5)): + with attempt: + backups = await helpers.count_logical_backups(db_unit) + assert backups == prev_backups + 1, "Backup not created." + except RetryError: + assert backups == prev_backups + 1, "Backup not created." + + # add writes to be cleared after restoring the backup. Note these are written to the same + # collection that was backed up. + await helpers.insert_unwanted_data(ops_test) + new_number_of_writes = await ha_helpers.count_writes(ops_test) + assert new_number_of_writes > number_writes, "No writes to be cleared after restoring." 
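
The backup tests above all lean on the same retry-until-assert idiom from tenacity: re-run a block until its assertion passes or a time budget runs out, then fall back to one final assert with a readable message. A minimal, self-contained sketch of that idiom follows; it is synchronous and uses a hypothetical count_backups() in place of the async helpers (such as helpers.count_logical_backups) used in the real suite.

    # Minimal sketch of the retry-until-assert idiom used in these tests.
    # count_backups() is a hypothetical stand-in for the real async helpers.
    from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

    def count_backups() -> int:
        return 1  # placeholder; the real helper queries pbm on the leader unit

    expected = 1
    try:
        # retry for up to 20 seconds, waiting 3 seconds between attempts
        for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)):
            with attempt:
                backups = count_backups()
                assert backups == expected
    except RetryError:
        # all attempts exhausted; fail with a readable message
        assert backups == expected, "Backup not created."
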
+ + # find most recent backup id and restore + action = await db_unit.run_action(action_name="list-backups") + list_result = await action.wait() + list_result = list_result.results["backups"] + most_recent_backup = list_result.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) + restore = await action.wait() + assert restore.results["restore-status"] == "restore started", "restore not successful" + + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # verify all writes are present + try: + for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): + with attempt: + number_writes_restored = await ha_helpers.count_writes(ops_test) + assert number_writes == number_writes_restored, "writes not correctly restored" + except RetryError: + assert number_writes == number_writes_restored, "writes not correctly restored" + + +@pytest.mark.parametrize("cloud_provider", ["AWS", "GCP"]) +async def test_restore_new_cluster(ops_test: OpsTest, add_writes_to_db, cloud_provider): + # configure test for the cloud provider + db_app_name = await helpers.app_name(ops_test) + await helpers.set_credentials(ops_test, cloud=cloud_provider) + if cloud_provider == "AWS": + configuration_parameters = { + "bucket": "data-charms-testing", + "region": "us-east-1", + "endpoint": "https://s3.amazonaws.com", + } + else: + configuration_parameters = { + "bucket": "data-charms-testing", + "endpoint": "https://storage.googleapis.com", + "region": "", + } + + await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # create a backup + writes_in_old_cluster = await ha_helpers.count_writes(ops_test, db_app_name) + assert writes_in_old_cluster > 0, "old cluster has no writes." + await helpers.create_and_verify_backup(ops_test) + + # save old password, since after restoring we will need this password to authenticate. + old_password = await ha_helpers.get_password(ops_test, db_app_name) + + # deploy a new cluster with a different name + db_charm = await ops_test.build_charm(".") + await ops_test.model.deploy(db_charm, num_units=3, application_name=NEW_CLUSTER) + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), + ) + + db_unit = await helpers.get_leader_unit(ops_test, db_app_name=NEW_CLUSTER) + action = await db_unit.run_action("set-password", **{"password": old_password}) + action = await action.wait() + assert action.status == "completed" + + # relate to s3 - s3 has the necessary configurations + await ops_test.model.add_relation(S3_APP_NAME, NEW_CLUSTER) + await ops_test.model.block_until( + lambda: helpers.is_relation_joined(ops_test, ENDPOINT, ENDPOINT) is True, + timeout=TIMEOUT, + ) + + # wait for new cluster to sync + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), + ) + + # verify that the listed backups from the old cluster are not listed as failed. 
+ assert ( + await helpers.count_failed_backups(db_unit) == 0 + ), "Backups from old cluster are listed as failed" + + # find most recent backup id and restore + action = await db_unit.run_action(action_name="list-backups") + list_result = await action.wait() + list_result = list_result.results["backups"] + most_recent_backup = list_result.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + action = await db_unit.run_action(action_name="restore", **{"backup-id": backup_id}) + restore = await action.wait() + assert restore.results["restore-status"] == "restore started", "restore not successful" + + # verify all writes are present + try: + for attempt in Retrying(stop=stop_after_delay(4), wait=wait_fixed(20)): + with attempt: + writes_in_new_cluster = await ha_helpers.count_writes(ops_test, NEW_CLUSTER) + assert ( + writes_in_new_cluster == writes_in_old_cluster + ), "new cluster writes do not match old cluster writes after restore" + except RetryError: + assert ( + writes_in_new_cluster == writes_in_old_cluster + ), "new cluster writes do not match old cluster writes after restore" + + await helpers.destroy_cluster(ops_test, cluster_name=NEW_CLUSTER) + + +@pytest.mark.abort_on_fail +async def test_update_backup_password(ops_test: OpsTest) -> None: + """Verifies that after changing the backup password the pbm tool is updated and functional.""" + db_app_name = await helpers.app_name(ops_test) + db_unit = await helpers.get_leader_unit(ops_test) + + # wait for charm to be idle before setting password + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + parameters = {"username": "backup"} + action = await db_unit.run_action("set-password", **parameters) + action = await action.wait() + assert action.status == "completed", "failed to set backup password" + + # wait for charm to be idle after setting password + await asyncio.gather( + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ) + + # verify we still have connection to pbm via creating a backup + action = await db_unit.run_action(action_name="create-backup") + backup_result = await action.wait() + assert "backup started" in backup_result.results["backup-status"], "backup didn't start" From 563f0495a6288219439f2d37998d3178ac4bf731 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 09:26:16 +0000 Subject: [PATCH 11/42] correct ip binding --- lib/charms/mongodb/v0/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 7f9085250..f63366e42 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -94,7 +94,7 @@ def get_mongos_args(config: MongoDBConfiguration) -> str: cmd = [ # mongos on config server side should run on 0.0.0.0 so it can be accessed by other units # in the sharded cluster - "--bind_ip", + "--bind_ip_all", f"--configdb {config_server_uri}", # config server is already using 27017 f"--port {Config.MONGOS_PORT}", From e8aaeb5708fb323c94cbe130ef39f099c9cf8183 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 15:08:27 +0000 Subject: [PATCH 12/42] mongos and config server now start correctly, and mongos has auth enabled --- lib/charms/mongodb/v0/helpers.py | 13 +++++++------ lib/charms/mongodb/v0/users.py | 14 -------------- src/charm.py | 28 +++++++++++++++++----------- src/machine_helpers.py | 2 +- 4 files changed, 25 insertions(+), 32 deletions(-) diff --git 
a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 0d99fe595..6a4afbff1 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -18,7 +18,6 @@ WaitingStatus, ) from pymongo.errors import AutoReconnect, ServerSelectionTimeoutError - from config import Config # The unique Charmhub library identifier, never change it @@ -64,7 +63,7 @@ def get_create_user_cmd( """ return [ mongo_path, - "mongodb://localhost/admin", + f"mongodb://localhost/admin", "--quiet", "--eval", "db.createUser({" @@ -91,12 +90,10 @@ def get_mongos_args( A string representing the arguments to be passed to mongos. """ # mongos running on the config server communicates through localhost - config_server_uri = f"{config.replset}/localhost" + # use constant for port + config_server_uri = f"{config.replset}/localhost:27017" - # no need to add TLS since no network calls are used, since mongos is configured to listen - # on local host full_conf_dir = f"{MONGODB_SNAP_DATA_DIR}{CONF_DIR}" if snap_install else CONF_DIR - # todo follow up PR add TLS cmd = [ # mongos on config server side should run on 0.0.0.0 so it can be accessed by other units # in the sharded cluster @@ -109,6 +106,8 @@ def get_mongos_args( "\n", ] + # todo add TLS - maybe we can generalise the TLS options code from below + return " ".join(cmd) @@ -135,6 +134,7 @@ def get_mongod_args( f"--replSet={config.replset}", # db must be located within the snap common directory since the snap is strictly confined f"--dbpath={full_data_dir}", + f"--port {Config.MONGODB_PORT}", logging_options, ] if auth: @@ -170,6 +170,7 @@ def get_mongod_args( ] ) + # todo use constants here plz if role == "config-server": cmd.append("--configsvr") diff --git a/lib/charms/mongodb/v0/users.py b/lib/charms/mongodb/v0/users.py index d48ff8300..249fcbaf2 100644 --- a/lib/charms/mongodb/v0/users.py +++ b/lib/charms/mongodb/v0/users.py @@ -58,8 +58,6 @@ def get_password_key_name_for_user(username: str) -> str: """Returns the key name for the password of the user.""" if username == OperatorUser.get_username(): return OperatorUser.get_password_key_name() - elif username == MongosUser.get_username(): - return MongosUser.get_password_key_name() elif username == MonitorUser.get_username(): return MonitorUser.get_password_key_name() elif username == BackupUser.get_username(): @@ -78,16 +76,6 @@ class _OperatorUser(MongoDBUser): _hosts = [] -class _MongosUser(MongoDBUser): - """Operator user for MongoDB.""" - - _username = "mongos" - _password_key_name = f"{_username}-password" - _database_name = "admin" - _roles = ["default"] - _hosts = ["127.0.0.1"] - - class _MonitorUser(MongoDBUser): """Monitor user for MongoDB.""" @@ -120,12 +108,10 @@ class _BackupUser(MongoDBUser): OperatorUser = _OperatorUser() MonitorUser = _MonitorUser() BackupUser = _BackupUser() -MongosUser = _MongosUser() # List of system usernames needed for correct work on the charm. 
CHARM_USERS = [ OperatorUser.get_username(), BackupUser.get_username(), MonitorUser.get_username(), - MongosUser.get_username(), ] diff --git a/src/charm.py b/src/charm.py index 0c25fd73c..457804a8f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -38,7 +38,6 @@ MongoDBUser, MonitorUser, OperatorUser, - MongosUser, ) from charms.operator_libs_linux.v1 import snap from ops import JujuVersion @@ -317,7 +316,11 @@ def _on_start(self, event: StartEvent) -> None: return try: - self._open_port_tcp(self._port) + ports = [self._port] + if self.is_role(Config.Role.CONFIG_SERVER): + ports.append(Config.MONGOS_PORT) + + self._open_ports_tcp(ports) except subprocess.CalledProcessError: self.unit.status = BlockedStatus("failed to open TCP port for MongoDB") return @@ -347,7 +350,6 @@ def _on_start(self, event: StartEvent) -> None: return self._initialise_replica_set(event) - self._init_inital_user(MongosUser) def _on_relation_joined(self, event: RelationJoinedEvent) -> None: """Add peer to replica set. @@ -635,6 +637,10 @@ def _init_inital_user(self, user) -> None: As a result, where are only hackish ways to create initial user. It is needed to install mongodb-clients inside charm container to make this function work correctly. + + Args: + user: User to create + mongos: whether or not the user should be created on mongos router """ if self._is_user_created(user) or not self.unit.is_leader(): return @@ -744,7 +750,6 @@ def _generate_secrets(self) -> None: share between members via the app data. """ self._check_or_set_user_password(OperatorUser) - self._check_or_set_user_password(MongosUser) self._check_or_set_user_password(MonitorUser) if not self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME): @@ -799,18 +804,19 @@ def _handle_reconfigure(self, event: UpdateStatusEvent): event.defer() return - def _open_port_tcp(self, port: int) -> None: + def _open_ports_tcp(self, ports: int) -> None: """Open the given port. Args: port: The port to open. """ - try: - logger.debug("opening tcp port") - subprocess.check_call(["open-port", "{}/TCP".format(port)]) - except subprocess.CalledProcessError as e: - logger.exception("failed opening port: %s", str(e)) - raise + for port in ports: + try: + logger.debug("opening tcp port") + subprocess.check_call(["open-port", "{}/TCP".format(port)]) + except subprocess.CalledProcessError as e: + logger.exception("failed opening port: %s", str(e)) + raise def _install_snap_packages(self, packages: List[str]) -> None: """Installs package(s) to container. diff --git a/src/machine_helpers.py b/src/machine_helpers.py index b792c21ef..85f815a87 100644 --- a/src/machine_helpers.py +++ b/src/machine_helpers.py @@ -24,7 +24,7 @@ def update_mongod_service( """Updates the mongod service file with the new options for starting.""" # write our arguments and write them to /etc/environment - the environment variable here is # read in in the charmed-mongob.mongod.service file. 
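
For orientation, the arguments built by get_mongod_args and get_mongos_args are written into the environment file read by the snap's systemd units, so for a unit running with role=config-server the entries end up looking roughly like the sketch below. The values are illustrative only: the replica set name is a placeholder, the mongos port assumes Config.MONGOS_PORT resolves to 27018, and the auth/keyFile flags appear only when auth is enabled.

    # illustrative environment entries consumed by the charmed-mongodb snap services
    MONGOD_ARGS=--bind_ip_all --replSet=mongodb --dbpath=/var/snap/charmed-mongodb/common/var/lib/mongodb --port=27017 --auth --clusterAuthMode=keyFile --keyFile=/var/snap/charmed-mongodb/current/etc/mongod/keyFile --configsvr
    MONGOS_ARGS=--bind_ip_all --configdb mongodb/localhost:27017 --port 27018 --keyFile=/var/snap/charmed-mongodb/current/etc/mongod/keyFile
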
- mongod_start_args = get_mongod_args(config, auth, snap_install=True) + mongod_start_args = get_mongod_args(config, auth, role=role, snap_install=True) add_args_to_env("MONGOD_ARGS", mongod_start_args) if role == "config-server": From 0598dde9c60f441666e9dc579d7e6bbceb3f1741 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 15:26:01 +0000 Subject: [PATCH 13/42] cleaning up code --- actions.yaml | 2 +- lib/charms/mongodb/v0/helpers.py | 13 +++++++------ src/charm.py | 12 ++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/actions.yaml b/actions.yaml index 3b32a3b98..619383a54 100644 --- a/actions.yaml +++ b/actions.yaml @@ -11,7 +11,7 @@ get-password: username: type: string description: The username, the default value 'operator'. - Possible values - operator, backup, monitor, mongos. + Possible values - operator, backup, monitor. set-password: description: Change the admin user's password, which is used by charm. diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 6a4afbff1..70a6384fb 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -18,6 +18,7 @@ WaitingStatus, ) from pymongo.errors import AutoReconnect, ServerSelectionTimeoutError + from config import Config # The unique Charmhub library identifier, never change it @@ -63,7 +64,7 @@ def get_create_user_cmd( """ return [ mongo_path, - f"mongodb://localhost/admin", + "mongodb://localhost/admin", "--quiet", "--eval", "db.createUser({" @@ -101,12 +102,11 @@ def get_mongos_args( f"--configdb {config_server_uri}", # config server is already using 27017 f"--port {Config.MONGOS_PORT}", - # todo followup PR add keyfile and auth f"--keyFile={full_conf_dir}/{KEY_FILE}", "\n", ] - # todo add TLS - maybe we can generalise the TLS options code from below + # TODO Future PR: support TLS on mongos return " ".join(cmd) @@ -134,6 +134,8 @@ def get_mongod_args( f"--replSet={config.replset}", # db must be located within the snap common directory since the snap is strictly confined f"--dbpath={full_data_dir}", + # for simplicity we run the mongod daemon on shards, configsvrs, and replicas on the same + # port f"--port {Config.MONGODB_PORT}", logging_options, ] @@ -170,11 +172,10 @@ def get_mongod_args( ] ) - # todo use constants here plz - if role == "config-server": + if role == Config.Role.CONFIG_SERVER: cmd.append("--configsvr") - if role == "shard": + if role == Config.Role.SHARD: cmd.append("--shardsvr") cmd.append("\n") diff --git a/src/charm.py b/src/charm.py index 457804a8f..483b46e94 100755 --- a/src/charm.py +++ b/src/charm.py @@ -627,7 +627,7 @@ def _on_secret_changed(self, event: SecretChangedEvent): reraise=True, before=before_log(logger, logging.DEBUG), ) - def _init_inital_user(self, user) -> None: + def _init_operator_user(self) -> None: """Creates initial specified admin user. Initial admin user can be created only through localhost connection. 
@@ -642,7 +642,7 @@ def _init_inital_user(self, user) -> None: user: User to create mongos: whether or not the user should be created on mongos router """ - if self._is_user_created(user) or not self.unit.is_leader(): + if self._is_user_created(OperatorUser) or not self.unit.is_leader(): return out = subprocess.run( @@ -652,8 +652,8 @@ def _init_inital_user(self, user) -> None: if out.returncode == 0: raise AdminUserCreationError - logger.debug(f"{user.get_username()} user created") - self._set_user_created(user) + logger.debug(f"{OperatorUser.get_username()} user created") + self._set_user_created(OperatorUser) @retry( stop=stop_after_attempt(3), @@ -808,7 +808,7 @@ def _open_ports_tcp(self, ports: int) -> None: """Open the given port. Args: - port: The port to open. + ports: The port to open. """ for port in ports: try: @@ -970,7 +970,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: ) logger.info("User initialization") - self._init_inital_user(OperatorUser) + self._init_operator_user() self._init_backup_user() self._init_monitor_user() From 7c64ce0e5dee3861abfc7274a56761803f9eba1f Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Tue, 19 Sep 2023 15:42:10 +0000 Subject: [PATCH 14/42] fix unit tests --- lib/charms/mongodb/v0/helpers.py | 2 +- tests/unit/test_charm.py | 28 ++++++++++++++-------------- tests/unit/test_mongodb_helpers.py | 3 +++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py index 70a6384fb..c57357403 100644 --- a/lib/charms/mongodb/v0/helpers.py +++ b/lib/charms/mongodb/v0/helpers.py @@ -136,7 +136,7 @@ def get_mongod_args( f"--dbpath={full_data_dir}", # for simplicity we run the mongod daemon on shards, configsvrs, and replicas on the same # port - f"--port {Config.MONGODB_PORT}", + f"--port={Config.MONGODB_PORT}", logging_options, ] if auth: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index f11486c7e..9736b0041 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -42,12 +42,12 @@ def setUp(self, *unused): @patch_network_get(private_address="1.1.1.1") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") - @patch("charm.MongodbOperatorCharm._open_port_tcp") + @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") @patch("charm.push_file_to_unit") @patch("builtins.open") def test_on_start_not_leader_doesnt_initialise_replica_set( - self, open, path, snap, _open_port_tcp, init_admin, connection, get_secret + self, open, path, snap, _open_ports_tcp, init_admin, connection, get_secret ): """Tests that a non leader unit does not initialise the replica set.""" # set snap data @@ -62,7 +62,7 @@ def test_on_start_not_leader_doesnt_initialise_replica_set( self.harness.charm.on.start.emit() mock_mongodb_snap.start.assert_called() - _open_port_tcp.assert_called() + _open_ports_tcp.assert_called() self.assertEqual(self.harness.charm.unit.status, ActiveStatus()) connection.return_value.__enter__.return_value.init_replset.assert_not_called() init_admin.assert_not_called() @@ -70,14 +70,14 @@ def test_on_start_not_leader_doesnt_initialise_replica_set( @patch_network_get(private_address="1.1.1.1") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") - @patch("charm.MongodbOperatorCharm._open_port_tcp") + @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.push_file_to_unit") @patch("builtins.open") def 
test_on_start_snap_failure_leads_to_blocked_status( self, open, path, - _open_port_tcp, + _open_ports_tcp, init_admin, connection, ): @@ -85,13 +85,13 @@ def test_on_start_snap_failure_leads_to_blocked_status( self.harness.set_leader(True) self.harness.charm.on.start.emit() self.assertTrue(isinstance(self.harness.charm.unit.status, BlockedStatus)) - _open_port_tcp.assert_not_called() + _open_ports_tcp.assert_not_called() connection.return_value.__enter__.return_value.init_replset.assert_not_called() init_admin.assert_not_called() @patch_network_get(private_address="1.1.1.1") - @patch("charm.MongodbOperatorCharm._open_port_tcp") + @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.MongodbOperatorCharm._initialise_replica_set") @patch("charm.snap.SnapCache") @patch("charm.push_file_to_unit") @@ -106,7 +106,7 @@ def test_on_start_mongod_not_ready_defer( path, snap, initialise_replica_set, - _open_port_tcp, + _open_ports_tcp, ): """Test verifies that we wait to initialise replica set when mongod is not running.""" # set snap data @@ -124,11 +124,11 @@ def test_on_start_mongod_not_ready_defer( init_admin.assert_not_called() @patch_network_get(private_address="1.1.1.1") - @patch("charm.MongodbOperatorCharm._open_port_tcp") + @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") @patch("charm.push_file_to_unit") @patch("builtins.open") - def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open_port_tcp): + def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open_ports_tcp): """Test verifies that if TCP port cannot be opened we go to the blocked state.""" # set snap data mock_mongodb_snap = mock.Mock() @@ -137,7 +137,7 @@ def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open snap.return_value = {"charmed-mongodb": mock_mongodb_snap} self.harness.set_leader(True) - _open_port_tcp.side_effect = subprocess.CalledProcessError( + _open_ports_tcp.side_effect = subprocess.CalledProcessError( cmd="open-port 27017/TCP", returncode=1 ) self.harness.charm.on.start.emit() @@ -149,7 +149,7 @@ def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open @patch("subprocess.check_call") def test_set_port(self, _call): """Test verifies operation of set port.""" - self.harness.charm._open_port_tcp(27017) + self.harness.charm._open_ports_tcp([27017]) # Make sure the port is opened and the service is started self.assertEqual(_call.call_args_list, [call(["open-port", "27017/TCP"])]) @@ -160,7 +160,7 @@ def test_set_port_failure(self, _call): with self.assertRaises(subprocess.CalledProcessError): with self.assertLogs("charm", "ERROR") as logs: - self.harness.charm._open_port_tcp(27017) + self.harness.charm._open_ports_tcp([27017]) self.assertIn("failed opening port 27017", "".join(logs.output)) @patch_network_get(private_address="1.1.1.1") @@ -286,7 +286,7 @@ def test_reconfigure_add_member_failure(self, _, connection, defer): defer.assert_called() @patch_network_get(private_address="1.1.1.1") - @patch("charm.MongodbOperatorCharm._open_port_tcp") + @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") @patch("charm.push_file_to_unit") @patch("builtins.open") diff --git a/tests/unit/test_mongodb_helpers.py b/tests/unit/test_mongodb_helpers.py index 0880fc059..5692a7ae1 100644 --- a/tests/unit/test_mongodb_helpers.py +++ b/tests/unit/test_mongodb_helpers.py @@ -13,6 +13,7 @@ def test_get_mongod_args(self): "--bind_ip_all", "--replSet=my_repl_set", 
"--dbpath=/var/snap/charmed-mongodb/common/var/lib/mongodb", + "--port=27017", "--auth", "--clusterAuthMode=keyFile", "--keyFile=/var/snap/charmed-mongodb/current/etc/mongod/keyFile", @@ -34,6 +35,7 @@ def test_get_mongod_args(self): # part of replicaset "--replSet=my_repl_set", "--dbpath=/var/snap/charmed-mongodb/common/var/lib/mongodb", + "--port=27017", ] self.assertEqual( @@ -47,6 +49,7 @@ def test_get_mongod_args(self): # part of replicaset "--replSet=my_repl_set", "--dbpath=/var/lib/mongodb", + "--port=27017", "--logpath=/var/lib/mongodb/mongodb.log", ] From 58d91fa68994b953ede7929f3df63a1a807b2b54 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 20 Sep 2023 07:35:19 +0000 Subject: [PATCH 15/42] don't publish 6/edge changes to 5/edge --- .github/workflows/release.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5a3199ed9..86182da37 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,9 +31,11 @@ jobs: with: credentials: "${{ secrets.CHARMHUB_TOKEN }}" github-token: "${{ secrets.GITHUB_TOKEN }}" - - name: Upload charm to charmhub - uses: canonical/charming-actions/upload-charm@2.2.2 - with: - credentials: "${{ secrets.CHARMHUB_TOKEN }}" - github-token: "${{ secrets.GITHUB_TOKEN }}" - channel: "5/edge" + # TODO - wait until track request is approved: + # https://discourse.charmhub.io/t/request-track-6-for-charmed-mongodb-operator/11900 + # - name: Upload charm to charmhub + # uses: canonical/charming-actions/upload-charm@2.2.2 + # with: + # credentials: "${{ secrets.CHARMHUB_TOKEN }}" + # github-token: "${{ secrets.GITHUB_TOKEN }}" + # channel: "6/edge" From 80c8bf94aad873dfeed449539990ff139bcdaa5e Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 20 Sep 2023 07:39:50 +0000 Subject: [PATCH 16/42] revert changes on init admin user --- src/charm.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/charm.py b/src/charm.py index 483b46e94..6908864a3 100755 --- a/src/charm.py +++ b/src/charm.py @@ -628,7 +628,7 @@ def _on_secret_changed(self, event: SecretChangedEvent): before=before_log(logger, logging.DEBUG), ) def _init_operator_user(self) -> None: - """Creates initial specified admin user. + """Creates initial admin user for MongoDB. Initial admin user can be created only through localhost connection. see https://www.mongodb.com/docs/manual/core/localhost-exception/ @@ -637,10 +637,6 @@ def _init_operator_user(self) -> None: As a result, where are only hackish ways to create initial user. It is needed to install mongodb-clients inside charm container to make this function work correctly. - - Args: - user: User to create - mongos: whether or not the user should be created on mongos router """ if self._is_user_created(OperatorUser) or not self.unit.is_leader(): return @@ -808,7 +804,7 @@ def _open_ports_tcp(self, ports: int) -> None: """Open the given port. Args: - ports: The port to open. + ports: The ports to open. 
""" for port in ports: try: From 02fcf7cb5c3f80d362213904e1b6a9c4004c6dce Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 20 Sep 2023 11:04:10 +0200 Subject: [PATCH 17/42] add new lib --- lib/charms/mongodb/v0/shards_interface.py | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 lib/charms/mongodb/v0/shards_interface.py diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py new file mode 100644 index 000000000..54c3e1851 --- /dev/null +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -0,0 +1,32 @@ +"""TODO: Add a proper docstring here. + +This is a placeholder docstring for this charm library. Docstrings are +presented on Charmhub and updated whenever you push a new version of the +library. + +Complete documentation about creating and documenting libraries can be found +in the SDK docs at https://juju.is/docs/sdk/libraries. + +See `charmcraft publish-lib` and `charmcraft fetch-lib` for details of how to +share and consume charm libraries. They serve to enhance collaboration +between charmers. Use a charmer's libraries for classes that handle +integration with their charm. + +Bear in mind that new revisions of the different major API versions (v0, v1, +v2 etc) are maintained independently. You can continue to update v0 and v1 +after you have pushed v3. + +Markdown is supported, following the CommonMark specification. +""" + +# The unique Charmhub library identifier, never change it +LIBID = "55fee8fa73364fb0a2dc16a954b2fd4a" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 1 + +# TODO: add your code here! Happy coding! From ff8378954f43799b8b9c7c868455393d4a578e8a Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 20 Sep 2023 12:04:41 +0000 Subject: [PATCH 18/42] set up basic relation structure --- lib/charms/mongodb/v0/shards_interface.py | 130 +++++++++++++++++++--- metadata.yaml | 8 +- src/charm.py | 12 +- src/config.py | 1 + 4 files changed, 133 insertions(+), 18 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 54c3e1851..6a17d0ddf 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -1,23 +1,22 @@ """TODO: Add a proper docstring here. +""" +from ops.charm import CharmBase +from config import Config +from ops.framework import Object +import logging -This is a placeholder docstring for this charm library. Docstrings are -presented on Charmhub and updated whenever you push a new version of the -library. - -Complete documentation about creating and documenting libraries can be found -in the SDK docs at https://juju.is/docs/sdk/libraries. - -See `charmcraft publish-lib` and `charmcraft fetch-lib` for details of how to -share and consume charm libraries. They serve to enhance collaboration -between charmers. Use a charmer's libraries for classes that handle -integration with their charm. +from ops.model import ( + ActiveStatus, + BlockedStatus, + MaintenanceStatus, + Relation, + SecretNotFoundError, + Unit, + WaitingStatus, +) -Bear in mind that new revisions of the different major API versions (v0, v1, -v2 etc) are maintained independently. You can continue to update v0 and v1 -after you have pushed v3. +logger = logging.getLogger(__name__) -Markdown is supported, following the CommonMark specification. 
-""" # The unique Charmhub library identifier, never change it LIBID = "55fee8fa73364fb0a2dc16a954b2fd4a" @@ -30,3 +29,102 @@ LIBPATCH = 1 # TODO: add your code here! Happy coding! + + +class ShardingRequirer(Object): + """TODO docstring - something about how this is used on config server side.""" + + def __init__( + self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME + ) -> None: + """Constructor for MongoDBProvider object. + + Args: + relation_name: the name of the relation + """ + self.relation_name = relation_name + self.charm = charm + + super().__init__(charm, self.relation_name) + self.framework.observe( + charm.on[self.relation_name].relation_joined, self._on_relation_joined + ) + # TODO Future PR, enable shard drainage by listening for relation departed events + + def _on_relation_joined(self, event): + """TODO doc string""" + if not self.charm.is_role(Config.Role.REPLICATION): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.CONFIG_SERVER): + logger.info( + "skipping relation joined event ShardingRequirer should only be executed by config-server" + ) + return + + if not self.charm.unit.is_leader(): + return + + # todo verify it is related to a shard and not a replica + + # todo send out secrets + + +class ShardingProvider(Object): + """TODO docstring - something about how this is used on shard side.""" + + def __init__( + self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME + ) -> None: + """Constructor for ShardingProvider object. + + Args: + relation_name: the name of the relation + """ + self.relation_name = relation_name + self.charm = charm + + super().__init__(charm, self.relation_name) + self.framework.observe( + charm.on[self.relation_name].relation_joined, self._on_relation_joined + ) + self.framework.observe( + charm.on[self.relation_name].relation_changed, self._on_relation_changed + ) + + # TODO Future PR, enable shard drainage by listening for relation departed events + + def _on_relation_joined(self, event): + """TODO doc string""" + if not self.charm.is_role(Config.Role.REPLICATION): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.SHARD): + logger.info( + "skipping relation joined event ShardingProvider should only be executed by shards" + ) + return + + if not self.charm.unit.is_leader(): + return + + # todo verify it is related to a config server and not a replica + + def _on_relation_changed(self, event): + """TODO doc string""" + if not self.charm.is_role(Config.Role.REPLICATION): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.SHARD): + logger.info( + "skipping relation changed event ShardingProvider should only be executed by shards" + ) + return + + # TODO update all secrets and restart diff --git a/metadata.yaml b/metadata.yaml index fe03ed10a..8f8e91bcf 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -25,6 +25,10 @@ provides: interface: mongodb cos-agent: interface: cos_agent + sharding: + interface: shards + # shards can only relate to one replica set + limit: 1 storage: mongodb: @@ -39,6 +43,8 @@ 
requires: certificates: interface: tls-certificates limit: 1 - s3-credentials: interface: s3 + limit: 1 + config-server: + interface: shards diff --git a/src/charm.py b/src/charm.py index 6908864a3..591fcb543 100755 --- a/src/charm.py +++ b/src/charm.py @@ -32,6 +32,7 @@ from charms.mongodb.v0.mongodb_provider import MongoDBProvider from charms.mongodb.v0.mongodb_tls import MongoDBTLS from charms.mongodb.v0.mongodb_vm_legacy_provider import MongoDBLegacyProvider +from charms.mongodb.v0.shards_interface import ShardingProvider, ShardingRequirer from charms.mongodb.v0.users import ( CHARM_USERS, BackupUser, @@ -123,6 +124,8 @@ def __init__(self, *args): self.legacy_client_relations = MongoDBLegacyProvider(self) self.tls = MongoDBTLS(self, Config.Relations.PEERS, substrate=Config.SUBSTRATE) self.backups = MongoDBBackups(self) + self.shard_relations = ShardingRequirer(self) + self.config_server_relations = ShardingProvider(self) # relation events for Prometheus metrics are handled in the MetricsEndpointProvider self._grafana_agent = COSAgentProvider( @@ -494,10 +497,14 @@ def _on_update_status(self, event: UpdateStatusEvent): self.unit.status = BlockedStatus("cannot have both legacy and new relations") return + # shard cannot relate to multiple config servers + if len(self.model.relations[Config.Relations.SHARDING_RELATIONS_NAME]) > 1: + self.charm.unit.status = BlockedStatus("Shards can only relate to one config-server") + return + # no need to report on replica set status until initialised if not self.db_initialised: return - # Cannot check more advanced MongoDB statuses if mongod hasn't started. with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: @@ -542,6 +549,9 @@ def _on_get_password(self, event: ActionEvent) -> None: def _on_set_password(self, event: ActionEvent) -> None: """Set the password for the admin user.""" + if self.is_role(Config.Role.SHARD): + event.fail("Cannot set password on shard, please set password on config-server.") + return # changing the backup password while a backup/restore is in progress can be disastrous pbm_status = self.backups._get_pbm_status() if isinstance(pbm_status, MaintenanceStatus): diff --git a/src/config.py b/src/config.py index da3261864..acd428e0b 100644 --- a/src/config.py +++ b/src/config.py @@ -72,6 +72,7 @@ class Relations: NAME = "database" PEERS = "database-peers" OBSOLETE_RELATIONS_NAME = "obsolete" + SHARDING_RELATIONS_NAME = "sharding" APP_SCOPE = "app" UNIT_SCOPE = "unit" Scopes = Literal[APP_SCOPE, UNIT_SCOPE] From 426cff89103a4628af35f2c3aa4d2668ac3de040 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 08:37:31 +0000 Subject: [PATCH 19/42] operator password and keyfile now shared from config server --- lib/charms/mongodb/v0/shards_interface.py | 112 +++++++++++++++++----- src/charm.py | 44 ++++++--- src/config.py | 1 + src/machine_helpers.py | 26 ----- 4 files changed, 120 insertions(+), 63 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 6a17d0ddf..6cd496028 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -15,6 +15,15 @@ WaitingStatus, ) +from charms.mongodb.v0.mongodb import ( + MongoDBConnection, + NotReadyError, + PyMongoError, +) + +from charms.mongodb.v0.helpers import KEY_FILE +from charms.mongodb.v0.users import MongoDBUser, OperatorUser + logger = logging.getLogger(__name__) @@ -35,7 +44,7 @@ class ShardingRequirer(Object): 
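
Concretely, after this patch the config server's leader is expected to publish the operator password and the cluster keyfile in the application databag of the sharding relation, and each shard consumes them. A rough sketch of that databag shape, with key names taken from the handlers below and placeholder values:

    # Rough sketch: app-level relation data written by the config server's leader
    # and read by each shard in _on_relation_changed (values are placeholders).
    relation_data = {
        "operator-password": "<operator password from the config server>",
        "key-file": "<contents of the keyFile shared across the cluster>",
    }
    # On the shard side the keyfile contents are written to disk with the usual
    # snap permissions, and the local operator password is reset to the config
    # server's value so both sides authenticate with the same credentials.
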
"""TODO docstring - something about how this is used on config server side.""" def __init__( - self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME + self, charm: CharmBase, relation_name: str = Config.Relations.CONFIG_SERVER_RELATIONS_NAME ) -> None: """Constructor for MongoDBProvider object. @@ -53,23 +62,53 @@ def __init__( def _on_relation_joined(self, event): """TODO doc string""" - if not self.charm.is_role(Config.Role.REPLICATION): + if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") return if not self.charm.is_role(Config.Role.CONFIG_SERVER): logger.info( - "skipping relation joined event ShardingRequirer should only be executed by config-server" + "skipping relation joined event ShardingRequirer is only be executed by config-server" ) return if not self.charm.unit.is_leader(): return - # todo verify it is related to a shard and not a replica + # TODO Future PR, sync tls secrets and PBM password + self._update_relation_data( + event.relation.id, + { + "operator-password": self.charm.get_secret( + Config.Relations.APP_SCOPE, + MongoDBUser.get_password_key_name_for_user("operator"), + ), + "key-file": self.charm.get_secret( + Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME + ), + }, + ) + + # TODO Future PR, add shard to config server + + # todo handle rotating passwords + + def _update_relation_data(self, relation_id: int, data: dict) -> None: + """Updates a set of key-value pairs in the relation. - # todo send out secrets + This function writes in the application data bag, therefore, + only the leader unit can call it. + + Args: + relation_id: the identifier for a particular relation. + data: dict containing the key-value pairs + that should be updated in the relation. 
+ """ + if self.charm.unit.is_leader(): + relation = self.charm.model.get_relation(self.relation_name, relation_id) + if relation: + relation.data[self.charm.model.app].update(data) class ShardingProvider(Object): @@ -87,44 +126,65 @@ def __init__( self.charm = charm super().__init__(charm, self.relation_name) - self.framework.observe( - charm.on[self.relation_name].relation_joined, self._on_relation_joined - ) self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_changed ) # TODO Future PR, enable shard drainage by listening for relation departed events - def _on_relation_joined(self, event): + def _on_relation_changed(self, event): """TODO doc string""" - if not self.charm.is_role(Config.Role.REPLICATION): + if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") return if not self.charm.is_role(Config.Role.SHARD): logger.info( - "skipping relation joined event ShardingProvider should only be executed by shards" + "skipping relation changed event ShardingProvider is only be executed by shards" ) return - if not self.charm.unit.is_leader(): - return + relation_data = event.relation.data[event.app] - # todo verify it is related to a config server and not a replica - - def _on_relation_changed(self, event): - """TODO doc string""" - if not self.charm.is_role(Config.Role.REPLICATION): - self.unit.status = BlockedStatus("role replication does not support sharding") - logger.error("sharding interface not supported with config role=replication") - return + # put keyfile on the machine with appropriate permissions + key_file_contents = relation_data.get("key-file") + self.charm.push_file_to_unit( + parent_dir=Config.MONGOD_CONF_DIR, + file_name=KEY_FILE, + file_contents=self.charm.get_secret( + Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME + ), + ) - if not self.charm.is_role(Config.Role.SHARD): - logger.info( - "skipping relation changed event ShardingProvider should only be executed by shards" + if self.charm.unit.is_leader(): + new_password = relation_data.get("operator-password") + with MongoDBConnection(self.charm.mongodb_config) as mongo: + try: + mongo.set_user_password(OperatorUser.get_username(), new_password) + except NotReadyError: + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + logger.error( + "Failed changing the password: Not all members healthy or finished initial sync." 
+ ) + return + except PyMongoError as e: + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + logger.error(f"Failed changing the password: {e}") + return + + self.charm.set_secret( + Config.Relations.APP_SCOPE, + MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()), + new_password, ) - return + self.charm.set_secret( + Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME, key_file_contents + ) + + self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + self.charm.restart_mongod_service() - # TODO update all secrets and restart + # TODO future PR, leader unit verifies shard was added to cluster + if not self.charm.unit.is_leader(): + return diff --git a/src/charm.py b/src/charm.py index 591fcb543..cda9e090e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,6 +8,9 @@ import subprocess import time from typing import Dict, List, Optional, Set +import os +import pwd +from pathlib import Path from charms.grafana_agent.v0.cos_agent import COSAgentProvider from charms.mongodb.v0.helpers import ( @@ -74,11 +77,7 @@ ApplicationHostNotFoundError, SecretNotAddedError, ) -from machine_helpers import ( - push_file_to_unit, - remove_file_from_unit, - update_mongod_service, -) +from machine_helpers import update_mongod_service, MONGO_USER, ROOT_USER_GID logger = logging.getLogger(__name__) @@ -552,6 +551,7 @@ def _on_set_password(self, event: ActionEvent) -> None: if self.is_role(Config.Role.SHARD): event.fail("Cannot set password on shard, please set password on config-server.") return + # changing the backup password while a backup/restore is in progress can be disastrous pbm_status = self.backups._get_pbm_status() if isinstance(pbm_status, MaintenanceStatus): @@ -575,6 +575,7 @@ def _on_set_password(self, event: ActionEvent) -> None: f"Password cannot be longer than {Config.Secrets.MAX_PASSWORD_LENGTH} characters." 
) return + with MongoDBConnection(self.mongodb_config) as mongo: try: mongo.set_user_password(username, new_password) @@ -854,24 +855,45 @@ def _instatiate_keyfile(self, event: StartEvent) -> None: return # put keyfile on the machine with appropriate permissions - push_file_to_unit( + self.push_file_to_unit( parent_dir=Config.MONGOD_CONF_DIR, file_name=KEY_FILE, file_contents=self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME), ) + def push_file_to_unit(self, parent_dir, file_name, file_contents) -> None: + """K8s charms can push files to their containers easily, this is the vm charm workaround.""" + Path(parent_dir).mkdir(parents=True, exist_ok=True) + file_name = f"{parent_dir}/{file_name}" + with open(file_name, "w") as write_file: + write_file.write(file_contents) + + # MongoDB limitation; it is needed 400 rights for keyfile and we need 440 rights on tls certs + # to be able to connect via MongoDB shell + if Config.TLS.KEY_FILE_NAME in file_name: + os.chmod(file_name, 0o400) + else: + os.chmod(file_name, 0o440) + mongodb_user = pwd.getpwnam(MONGO_USER) + os.chown(file_name, mongodb_user.pw_uid, ROOT_USER_GID) + + def remove_file_from_unit(parent_dir, file_name) -> None: + """Remove file from vm unit.""" + if os.path.exists(f"{parent_dir}/{file_name}"): + os.remove(f"{parent_dir}/{file_name}") + def push_tls_certificate_to_workload(self) -> None: """Uploads certificate to the workload container.""" external_ca, external_pem = self.tls.get_tls_files(UNIT_SCOPE) if external_ca is not None: - push_file_to_unit( + self.push_file_to_unit( parent_dir=Config.MONGOD_CONF_DIR, file_name=TLS_EXT_CA_FILE, file_contents=external_ca, ) if external_pem is not None: - push_file_to_unit( + self.push_file_to_unit( parent_dir=Config.MONGOD_CONF_DIR, file_name=TLS_EXT_PEM_FILE, file_contents=external_pem, @@ -879,14 +901,14 @@ def push_tls_certificate_to_workload(self) -> None: internal_ca, internal_pem = self.tls.get_tls_files(APP_SCOPE) if internal_ca is not None: - push_file_to_unit( + self.push_file_to_unit( parent_dir=Config.MONGOD_CONF_DIR, file_name=TLS_INT_CA_FILE, file_contents=internal_ca, ) if internal_pem is not None: - push_file_to_unit( + self.push_file_to_unit( parent_dir=Config.MONGOD_CONF_DIR, file_name=TLS_INT_PEM_FILE, file_contents=internal_pem, @@ -903,7 +925,7 @@ def delete_tls_certificate_from_workload() -> None: Config.TLS.INT_CA_FILE, Config.TLS.INT_PEM_FILE, ]: - remove_file_from_unit(Config.MONGOD_CONF_DIR, file) + self.remove_file_from_unit(Config.MONGOD_CONF_DIR, file) def _connect_mongodb_exporter(self) -> None: """Exposes the endpoint to mongodb_exporter.""" diff --git a/src/config.py b/src/config.py index acd428e0b..9ae79a8f9 100644 --- a/src/config.py +++ b/src/config.py @@ -73,6 +73,7 @@ class Relations: PEERS = "database-peers" OBSOLETE_RELATIONS_NAME = "obsolete" SHARDING_RELATIONS_NAME = "sharding" + CONFIG_SERVER_RELATIONS_NAME = "config-server" APP_SCOPE = "app" UNIT_SCOPE = "unit" Scopes = Literal[APP_SCOPE, UNIT_SCOPE] diff --git a/src/machine_helpers.py b/src/machine_helpers.py index 424bdb56d..19e2daa4d 100644 --- a/src/machine_helpers.py +++ b/src/machine_helpers.py @@ -2,9 +2,6 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. 
import logging -import os -import pwd -from pathlib import Path from charms.mongodb.v0.helpers import get_mongod_args, get_mongos_args from charms.mongodb.v0.mongodb import MongoDBConfiguration @@ -50,26 +47,3 @@ def add_args_to_env(var: str, args: str): with open(Config.ENV_VAR_PATH, "w") as service_file: service_file.writelines(env_vars) - - -def push_file_to_unit(parent_dir, file_name, file_contents) -> None: - """K8s charms can push files to their containers easily, this is the vm charm workaround.""" - Path(parent_dir).mkdir(parents=True, exist_ok=True) - file_name = f"{parent_dir}/{file_name}" - with open(file_name, "w") as write_file: - write_file.write(file_contents) - - # MongoDB limitation; it is needed 400 rights for keyfile and we need 440 rights on tls certs - # to be able to connect via MongoDB shell - if Config.TLS.KEY_FILE_NAME in file_name: - os.chmod(file_name, 0o400) - else: - os.chmod(file_name, 0o440) - mongodb_user = pwd.getpwnam(MONGO_USER) - os.chown(file_name, mongodb_user.pw_uid, ROOT_USER_GID) - - -def remove_file_from_unit(parent_dir, file_name) -> None: - """Remove file from vm unit.""" - if os.path.exists(f"{parent_dir}/{file_name}"): - os.remove(f"{parent_dir}/{file_name}") From af993220529c7938efa75b495637d4fb298bc872 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 10:26:54 +0000 Subject: [PATCH 20/42] fixes + working with replicas now --- lib/charms/mongodb/v0/shards_interface.py | 100 ++++++++++++++-------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 6cd496028..9e483dc57 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -76,6 +76,9 @@ def _on_relation_joined(self, event): if not self.charm.unit.is_leader(): return + if "db_initialised" not in self.charm.app_peer_data: + event.defer() + # TODO Future PR, sync tls secrets and PBM password self._update_relation_data( event.relation.id, @@ -91,8 +94,7 @@ def _on_relation_joined(self, event): ) # TODO Future PR, add shard to config server - - # todo handle rotating passwords + # TODO Follow up PR, handle rotating passwords def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. 
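For context, once the config-server leader has handled relation-joined above, the sharding relation's application data bag carries two keys for the shard side to read; an illustrative shape is sketched below (values are placeholders, not real secrets):

    {
        "operator-password": "<operator password taken from the config server's app secrets>",
        "key-file": "<full contents of the config server's MongoDB keyFile>",
    }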
@@ -145,46 +147,74 @@ def _on_relation_changed(self, event): ) return + if "db_initialised" not in self.charm.app_peer_data: + event.defer() + + # shards rely on the config server for secrets relation_data = event.relation.data[event.app] + try: + self.update_operator_password(new_password=relation_data.get("operator-password")) + except (PyMongoError, NotReadyError): + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + return - # put keyfile on the machine with appropriate permissions - key_file_contents = relation_data.get("key-file") - self.charm.push_file_to_unit( - parent_dir=Config.MONGOD_CONF_DIR, - file_name=KEY_FILE, - file_contents=self.charm.get_secret( - Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME + self.update_keyfile(key_file_contents=relation_data.get("key-file")) + + self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + + # TODO future PR, leader unit verifies shard was added to cluster + if not self.charm.unit.is_leader(): + return + + def update_operator_password(self, new_password: str) -> None: + """TODO docstring + + Raises: + PyMongoError, NotReadyError + """ + current_password = ( + self.charm.get_secret( + Config.Relations.APP_SCOPE, MongoDBUser.get_password_key_name_for_user("operator") ), ) - if self.charm.unit.is_leader(): - new_password = relation_data.get("operator-password") - with MongoDBConnection(self.charm.mongodb_config) as mongo: - try: - mongo.set_user_password(OperatorUser.get_username(), new_password) - except NotReadyError: - self.charm.unit.status = BlockedStatus("Shard not added to config-server") - logger.error( - "Failed changing the password: Not all members healthy or finished initial sync." - ) - return - except PyMongoError as e: - self.charm.unit.status = BlockedStatus("Shard not added to config-server") - logger.error(f"Failed changing the password: {e}") - return - - self.charm.set_secret( - Config.Relations.APP_SCOPE, - MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()), - new_password, - ) - self.charm.set_secret( - Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME, key_file_contents - ) + if not new_password or new_password == current_password or not self.charm.unit.is_leader(): + return - self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + with MongoDBConnection(self.charm.mongodb_config) as mongo: + try: + mongo.set_user_password(OperatorUser.get_username(), new_password) + except NotReadyError: + logger.error( + "Failed changing the password: Not all members healthy or finished initial sync." 
+ ) + raise + except PyMongoError as e: + logger.error(f"Failed changing the password: {e}") + raise + + self.charm.set_secret( + Config.Relations.APP_SCOPE, + MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()), + new_password, + ) + + def update_keyfile(self, key_file_contents: str) -> None: + """TODO docstring""" + if not key_file_contents: + return + + # put keyfile on the machine with appropriate permissions + self.charm.push_file_to_unit( + parent_dir=Config.MONGOD_CONF_DIR, file_name=KEY_FILE, file_contents=key_file_contents + ) + + # when the contents of the keyfile change, we must restart the service self.charm.restart_mongod_service() - # TODO future PR, leader unit verifies shard was added to cluster if not self.charm.unit.is_leader(): return + + self.charm.set_secret( + Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME, key_file_contents + ) From caef63ddf70babb2e37c50b686144591be40a7ef Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 10:35:21 +0000 Subject: [PATCH 21/42] add docstrings --- lib/charms/mongodb/v0/shards_interface.py | 27 +++++++++-------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 9e483dc57..ed9843c04 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -6,13 +6,8 @@ import logging from ops.model import ( - ActiveStatus, BlockedStatus, MaintenanceStatus, - Relation, - SecretNotFoundError, - Unit, - WaitingStatus, ) from charms.mongodb.v0.mongodb import ( @@ -37,16 +32,14 @@ # to 0 if you are raising the major API version LIBPATCH = 1 -# TODO: add your code here! Happy coding! - class ShardingRequirer(Object): - """TODO docstring - something about how this is used on config server side.""" + """Manage relations between the config server and the shard, on the config-server's side.""" def __init__( self, charm: CharmBase, relation_name: str = Config.Relations.CONFIG_SERVER_RELATIONS_NAME ) -> None: - """Constructor for MongoDBProvider object. + """Constructor for ShardingRequirer object. Args: relation_name: the name of the relation @@ -61,7 +54,7 @@ def __init__( # TODO Future PR, enable shard drainage by listening for relation departed events def _on_relation_joined(self, event): - """TODO doc string""" + """Handles providing shards with secrets and adding shards to the config server.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") @@ -99,8 +92,8 @@ def _on_relation_joined(self, event): def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. - This function writes in the application data bag, therefore, - only the leader unit can call it. + This function writes in the application data bag, therefore, only the leader unit can call + it. Args: relation_id: the identifier for a particular relation. 
@@ -114,7 +107,7 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: class ShardingProvider(Object): - """TODO docstring - something about how this is used on shard side.""" + """Manage relations between the config server and the shard, on the shard's side.""" def __init__( self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME @@ -132,10 +125,10 @@ def __init__( charm.on[self.relation_name].relation_changed, self._on_relation_changed ) - # TODO Future PR, enable shard drainage by listening for relation departed events + # TODO Future PR, enable shard drainage by observing relation departed events def _on_relation_changed(self, event): - """TODO doc string""" + """Retrieves secrets from config-server and updates them within the shard.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") @@ -167,7 +160,7 @@ def _on_relation_changed(self, event): return def update_operator_password(self, new_password: str) -> None: - """TODO docstring + """Updates the password for the operator user. Raises: PyMongoError, NotReadyError @@ -200,7 +193,7 @@ def update_operator_password(self, new_password: str) -> None: ) def update_keyfile(self, key_file_contents: str) -> None: - """TODO docstring""" + """Updates keyfile on all units.""" if not key_file_contents: return From 38b7b8f3181819679e3c05e3b461b4a2dcf1dcad Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 11:19:03 +0000 Subject: [PATCH 22/42] unit, lint, fmt --- lib/charms/mongodb/v0/shards_interface.py | 40 +++++++++-------------- src/charm.py | 19 +++++------ tests/unit/test_charm.py | 10 +++--- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index ed9843c04..622c478b3 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -1,23 +1,21 @@ -"""TODO: Add a proper docstring here. -""" -from ops.charm import CharmBase -from config import Config -from ops.framework import Object -import logging +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. -from ops.model import ( - BlockedStatus, - MaintenanceStatus, -) +"""In this class, we manage relations between config-servers and shards. -from charms.mongodb.v0.mongodb import ( - MongoDBConnection, - NotReadyError, - PyMongoError, -) +This class handles the sharing of secrets between sharded components, adding shards, and removing +shards. +""" +import logging from charms.mongodb.v0.helpers import KEY_FILE +from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError from charms.mongodb.v0.users import MongoDBUser, OperatorUser +from ops.charm import CharmBase +from ops.framework import Object +from ops.model import BlockedStatus, MaintenanceStatus + +from config import Config logger = logging.getLogger(__name__) @@ -39,11 +37,7 @@ class ShardingRequirer(Object): def __init__( self, charm: CharmBase, relation_name: str = Config.Relations.CONFIG_SERVER_RELATIONS_NAME ) -> None: - """Constructor for ShardingRequirer object. 
- - Args: - relation_name: the name of the relation - """ + """Constructor for ShardingRequirer object.""" self.relation_name = relation_name self.charm = charm @@ -112,11 +106,7 @@ class ShardingProvider(Object): def __init__( self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME ) -> None: - """Constructor for ShardingProvider object. - - Args: - relation_name: the name of the relation - """ + """Constructor for ShardingProvider object.""" self.relation_name = relation_name self.charm = charm diff --git a/src/charm.py b/src/charm.py index cda9e090e..7317ff827 100755 --- a/src/charm.py +++ b/src/charm.py @@ -4,13 +4,13 @@ # See LICENSE file for licensing details. import json import logging +import os +import pwd import re import subprocess import time -from typing import Dict, List, Optional, Set -import os -import pwd from pathlib import Path +from typing import Dict, List, Optional, Set from charms.grafana_agent.v0.cos_agent import COSAgentProvider from charms.mongodb.v0.helpers import ( @@ -77,7 +77,7 @@ ApplicationHostNotFoundError, SecretNotAddedError, ) -from machine_helpers import update_mongod_service, MONGO_USER, ROOT_USER_GID +from machine_helpers import MONGO_USER, ROOT_USER_GID, update_mongod_service logger = logging.getLogger(__name__) @@ -862,14 +862,14 @@ def _instatiate_keyfile(self, event: StartEvent) -> None: ) def push_file_to_unit(self, parent_dir, file_name, file_contents) -> None: - """K8s charms can push files to their containers easily, this is the vm charm workaround.""" + """K8s charms can push files to their containers easily, this is a vm charm workaround.""" Path(parent_dir).mkdir(parents=True, exist_ok=True) file_name = f"{parent_dir}/{file_name}" with open(file_name, "w") as write_file: write_file.write(file_contents) - # MongoDB limitation; it is needed 400 rights for keyfile and we need 440 rights on tls certs - # to be able to connect via MongoDB shell + # MongoDB limitation; it is needed 400 rights for keyfile and we need 440 rights on tls + # certs to be able to connect via MongoDB shell if Config.TLS.KEY_FILE_NAME in file_name: os.chmod(file_name, 0o400) else: @@ -877,7 +877,7 @@ def push_file_to_unit(self, parent_dir, file_name, file_contents) -> None: mongodb_user = pwd.getpwnam(MONGO_USER) os.chown(file_name, mongodb_user.pw_uid, ROOT_USER_GID) - def remove_file_from_unit(parent_dir, file_name) -> None: + def remove_file_from_unit(self, parent_dir, file_name) -> None: """Remove file from vm unit.""" if os.path.exists(f"{parent_dir}/{file_name}"): os.remove(f"{parent_dir}/{file_name}") @@ -914,8 +914,7 @@ def push_tls_certificate_to_workload(self) -> None: file_contents=internal_pem, ) - @staticmethod - def delete_tls_certificate_from_workload() -> None: + def delete_tls_certificate_from_workload(self) -> None: """Deletes certificate from VM.""" logger.info("Deleting TLS certificate from VM") diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9736b0041..b0bc63457 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -44,7 +44,7 @@ def setUp(self, *unused): @patch("charm.MongodbOperatorCharm._init_operator_user") @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") def test_on_start_not_leader_doesnt_initialise_replica_set( self, open, path, snap, _open_ports_tcp, init_admin, connection, get_secret @@ -71,7 +71,7 @@ def 
test_on_start_not_leader_doesnt_initialise_replica_set( @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") @patch("charm.MongodbOperatorCharm._open_ports_tcp") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") def test_on_start_snap_failure_leads_to_blocked_status( self, @@ -94,7 +94,7 @@ def test_on_start_snap_failure_leads_to_blocked_status( @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.MongodbOperatorCharm._initialise_replica_set") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") @@ -126,7 +126,7 @@ def test_on_start_mongod_not_ready_defer( @patch_network_get(private_address="1.1.1.1") @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open_ports_tcp): """Test verifies that if TCP port cannot be opened we go to the blocked state.""" @@ -288,7 +288,7 @@ def test_reconfigure_add_member_failure(self, _, connection, defer): @patch_network_get(private_address="1.1.1.1") @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") From 766a59b633f9606029fcaf092da186e4e71292aa Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 12:32:14 +0000 Subject: [PATCH 23/42] simplify function for tox --- src/charm.py | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/src/charm.py b/src/charm.py index 7317ff827..0b8b56a33 100755 --- a/src/charm.py +++ b/src/charm.py @@ -576,21 +576,11 @@ def _on_set_password(self, event: ActionEvent) -> None: ) return - with MongoDBConnection(self.mongodb_config) as mongo: - try: - mongo.set_user_password(username, new_password) - except NotReadyError: - event.fail( - "Failed changing the password: Not all members healthy or finished initial sync." - ) - return - except PyMongoError as e: - event.fail(f"Failed changing the password: {e}") - return - - secret_id = self.set_secret( - APP_SCOPE, MongoDBUser.get_password_key_name_for_user(username), new_password - ) + try: + secret_id = self.set_password(username, new_password) + except SetPasswordError as e: + event.fail(e) + return if username == BackupUser.get_username(): self._connect_pbm_agent() @@ -602,6 +592,26 @@ def _on_set_password(self, event: ActionEvent) -> None: {Config.Actions.PASSWORD_PARAM_NAME: new_password, "secret-id": secret_id} ) + def set_password(self, username, password) -> int: + """Sets the password for a given username and return the secret id. + + Raises: + SetPasswordError + """ + with MongoDBConnection(self.mongodb_config) as mongo: + try: + mongo.set_user_password(username, password) + except NotReadyError: + raise SetPasswordError( + "Failed changing the password: Not all members healthy or finished initial sync." 
+ ) + except PyMongoError as e: + raise SetPasswordError(f"Failed changing the password: {e}") + + return self.set_secret( + APP_SCOPE, MongoDBUser.get_password_key_name_for_user(username), password + ) + def _on_secret_remove(self, event: SecretRemoveEvent): # We are keeping this function empty on purpose until the issue with secrets # is not fixed. The issue is: https://bugs.launchpad.net/juju/+bug/2023364 @@ -1319,5 +1329,9 @@ def _juju_secret_remove(self, scope: Scopes, key: str) -> None: # END: helper functions +class SetPasswordError(Exception): + """Raised on failure to set password for MongoDB user.""" + + if __name__ == "__main__": main(MongodbOperatorCharm) From a9f6cd68667c44f2a7b37fd9722b1edd2656cc4c Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 21 Sep 2023 12:39:20 +0000 Subject: [PATCH 24/42] personal nits --- lib/charms/mongodb/v0/shards_interface.py | 2 ++ src/charm.py | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 622c478b3..23886fcaa 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -164,6 +164,8 @@ def update_operator_password(self, new_password: str) -> None: if not new_password or new_password == current_password or not self.charm.unit.is_leader(): return + # TODO, in the future use set_password from src/charm.py - this will require adding a + # library, for exceptions used in both charm code and lib code. with MongoDBConnection(self.charm.mongodb_config) as mongo: try: mongo.set_user_password(OperatorUser.get_username(), new_password) diff --git a/src/charm.py b/src/charm.py index 0b8b56a33..763e711b3 100755 --- a/src/charm.py +++ b/src/charm.py @@ -496,14 +496,10 @@ def _on_update_status(self, event: UpdateStatusEvent): self.unit.status = BlockedStatus("cannot have both legacy and new relations") return - # shard cannot relate to multiple config servers - if len(self.model.relations[Config.Relations.SHARDING_RELATIONS_NAME]) > 1: - self.charm.unit.status = BlockedStatus("Shards can only relate to one config-server") - return - # no need to report on replica set status until initialised if not self.db_initialised: return + # Cannot check more advanced MongoDB statuses if mongod hasn't started. 
with MongoDBConnection(self.mongodb_config, "localhost", direct=True) as direct_mongo: if not direct_mongo.is_ready: From 0a432eae9832c796f10501af91ea3f45f0c22b1d Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 22 Sep 2023 11:21:20 +0000 Subject: [PATCH 25/42] PR comments --- lib/charms/mongodb/v0/shards_interface.py | 55 +++++++++++++---------- metadata.yaml | 8 ++-- src/charm.py | 42 +++++++++++++++-- 3 files changed, 73 insertions(+), 32 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 23886fcaa..19375b2b4 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -14,6 +14,7 @@ from ops.charm import CharmBase from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus +from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -29,9 +30,11 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version LIBPATCH = 1 +KEYFILE_KEY = "key-file" +OPERATOR_PASSWORD_KEY = "operator-password" -class ShardingRequirer(Object): +class ShardingProvider(Object): """Manage relations between the config server and the shard, on the config-server's side.""" def __init__( @@ -63,18 +66,18 @@ def _on_relation_joined(self, event): if not self.charm.unit.is_leader(): return - if "db_initialised" not in self.charm.app_peer_data: + if not self.charm.db_initialised: event.defer() # TODO Future PR, sync tls secrets and PBM password self._update_relation_data( event.relation.id, { - "operator-password": self.charm.get_secret( + OPERATOR_PASSWORD_KEY: self.charm.get_secret( Config.Relations.APP_SCOPE, MongoDBUser.get_password_key_name_for_user("operator"), ), - "key-file": self.charm.get_secret( + KEYFILE_KEY: self.charm.get_secret( Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME ), }, @@ -100,7 +103,7 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: relation.data[self.charm.model.app].update(data) -class ShardingProvider(Object): +class ConfigServerRequirer(Object): """Manage relations between the config server and the shard, on the shard's side.""" def __init__( @@ -130,19 +133,19 @@ def _on_relation_changed(self, event): ) return - if "db_initialised" not in self.charm.app_peer_data: + if not self.charm.db_initialised: event.defer() # shards rely on the config server for secrets relation_data = event.relation.data[event.app] + self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) + try: - self.update_operator_password(new_password=relation_data.get("operator-password")) - except (PyMongoError, NotReadyError): + self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) + except RetryError: self.charm.unit.status = BlockedStatus("Shard not added to config-server") return - self.update_keyfile(key_file_contents=relation_data.get("key-file")) - self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") # TODO future PR, leader unit verifies shard was added to cluster @@ -153,7 +156,7 @@ def update_operator_password(self, new_password: str) -> None: """Updates the password for the operator user. 
Raises: - PyMongoError, NotReadyError + RetryError """ current_password = ( self.charm.get_secret( @@ -164,19 +167,23 @@ def update_operator_password(self, new_password: str) -> None: if not new_password or new_password == current_password or not self.charm.unit.is_leader(): return - # TODO, in the future use set_password from src/charm.py - this will require adding a - # library, for exceptions used in both charm code and lib code. - with MongoDBConnection(self.charm.mongodb_config) as mongo: - try: - mongo.set_user_password(OperatorUser.get_username(), new_password) - except NotReadyError: - logger.error( - "Failed changing the password: Not all members healthy or finished initial sync." - ) - raise - except PyMongoError as e: - logger.error(f"Failed changing the password: {e}") - raise + # updating operator password, usually comes after keyfile was updated, hence, the mongodb + # service was restarted. Sometimes this requires units getting insync again. + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + # TODO, in the future use set_password from src/charm.py - this will require adding + # a library, for exceptions used in both charm code and lib code. + with MongoDBConnection(self.charm.mongodb_config) as mongo: + try: + mongo.set_user_password(OperatorUser.get_username(), new_password) + except NotReadyError: + logger.error( + "Failed changing the password: Not all members healthy or finished initial sync." + ) + raise + except PyMongoError as e: + logger.error(f"Failed changing the password: {e}") + raise self.charm.set_secret( Config.Relations.APP_SCOPE, diff --git a/metadata.yaml b/metadata.yaml index 8f8e91bcf..0cf7618da 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -25,10 +25,8 @@ provides: interface: mongodb cos-agent: interface: cos_agent - sharding: + config-server: interface: shards - # shards can only relate to one replica set - limit: 1 storage: mongodb: @@ -46,5 +44,7 @@ requires: s3-credentials: interface: s3 limit: 1 - config-server: + sharding: interface: shards + # shards can only relate to one config-server + limit: 1 diff --git a/src/charm.py b/src/charm.py index 763e711b3..6719d8d6e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -35,7 +35,7 @@ from charms.mongodb.v0.mongodb_provider import MongoDBProvider from charms.mongodb.v0.mongodb_tls import MongoDBTLS from charms.mongodb.v0.mongodb_vm_legacy_provider import MongoDBLegacyProvider -from charms.mongodb.v0.shards_interface import ShardingProvider, ShardingRequirer +from charms.mongodb.v0.shards_interface import ConfigServerRequirer, ShardingProvider from charms.mongodb.v0.users import ( CHARM_USERS, BackupUser, @@ -48,6 +48,7 @@ from ops.charm import ( ActionEvent, CharmBase, + ConfigChangedEvent, InstallEvent, LeaderElectedEvent, RelationDepartedEvent, @@ -92,6 +93,9 @@ class MongodbOperatorCharm(CharmBase): def __init__(self, *args): super().__init__(*args) self._port = Config.MONGODB_PORT + + # lifecycle events + self.framework.observe(self.on.config_changed, self._on_config_changed) self.framework.observe(self.on.install, self._on_install) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -123,8 +127,8 @@ def __init__(self, *args): self.legacy_client_relations = MongoDBLegacyProvider(self) self.tls = MongoDBTLS(self, Config.Relations.PEERS, substrate=Config.SUBSTRATE) self.backups = MongoDBBackups(self) - self.shard_relations = ShardingRequirer(self) - self.config_server_relations = 
ShardingProvider(self) + self.shard_relations = ShardingProvider(self) + self.config_server_relations = ConfigServerRequirer(self) # relation events for Prometheus metrics are handled in the MetricsEndpointProvider self._grafana_agent = COSAgentProvider( @@ -238,7 +242,17 @@ def db_initialised(self) -> bool: @property def role(self) -> str: """Returns role of MongoDB deployment.""" - return self.model.config["role"] + if "role" not in self.app_peer_data and self.unit.is_leader(): + self.app_peer_data["role"] = self.model.config["role"] + else: + # if leader hasn't set the role yet, use the one set by model + self.model.config["role"] + + return self.app_peer_data.get("role") + + def is_role_changed(self) -> bool: + """Checks if application is running in provided role.""" + return self.role != self.model.config["role"] def is_role(self, role_name: str) -> bool: """Checks if application is running in provided role.""" @@ -297,6 +311,22 @@ def _on_install(self, event: InstallEvent) -> None: # add licenses copy_licenses_to_unit() + def _on_config_changed(self, event: ConfigChangedEvent) -> None: + """Listen to changes in application configuration. + + To prevent a user from migrating a cluster, and causing the component to become + unresponsive therefore causing a cluster failure, error the component. This prevents it + from executing other hooks with a new role. + """ + # TODO in the future (24.04) support migration of components + if self.is_role_changed(): + logger.error( + f"cluster migration currently not supported, cannot change from { self.model.config['role']} to {self.role}" + ) + raise ShardingMigrationError( + f"Migration of sharding components not permitted, revert config role to {self.role}" + ) + def _on_start(self, event: StartEvent) -> None: """Enables MongoDB service and initialises replica set. 
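As a quick illustration, the `for attempt in Retrying(...)` / `with attempt:` loop added to update_operator_password above is tenacity's retrying-context pattern: the body is re-run on every exception until it succeeds or the stop condition is reached, at which point a RetryError propagates to the caller. A minimal standalone sketch of the same pattern (the function name is illustrative):

    from tenacity import Retrying, RetryError, stop_after_delay, wait_fixed

    def set_password_once():
        ...  # stand-in for the MongoDB call that can fail while members are still syncing

    try:
        for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
            with attempt:
                set_password_once()  # retried every 3 seconds, for up to 60 seconds
    except RetryError:
        pass  # every attempt failed; the charm surfaces this as BlockedStatus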
@@ -1325,6 +1355,10 @@ def _juju_secret_remove(self, scope: Scopes, key: str) -> None: # END: helper functions +class ShardingMigrationError(Exception): + """Raised when there is an attempt to change the role of a sharding component.""" + + class SetPasswordError(Exception): """Raised on failure to set password for MongoDB user.""" From 059356fabb8d512c05ec63a3291ecdfe238caf2b Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 22 Sep 2023 12:51:05 +0000 Subject: [PATCH 26/42] propogating passwords of internal db users happens automatically --- lib/charms/mongodb/v0/shards_interface.py | 27 +---------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 19375b2b4..7c633bfdf 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -9,12 +9,10 @@ import logging from charms.mongodb.v0.helpers import KEY_FILE -from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError from charms.mongodb.v0.users import MongoDBUser, OperatorUser from ops.charm import CharmBase from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus -from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -139,12 +137,7 @@ def _on_relation_changed(self, event): # shards rely on the config server for secrets relation_data = event.relation.data[event.app] self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) - - try: - self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) - except RetryError: - self.charm.unit.status = BlockedStatus("Shard not added to config-server") - return + self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") @@ -167,24 +160,6 @@ def update_operator_password(self, new_password: str) -> None: if not new_password or new_password == current_password or not self.charm.unit.is_leader(): return - # updating operator password, usually comes after keyfile was updated, hence, the mongodb - # service was restarted. Sometimes this requires units getting insync again. - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): - with attempt: - # TODO, in the future use set_password from src/charm.py - this will require adding - # a library, for exceptions used in both charm code and lib code. - with MongoDBConnection(self.charm.mongodb_config) as mongo: - try: - mongo.set_user_password(OperatorUser.get_username(), new_password) - except NotReadyError: - logger.error( - "Failed changing the password: Not all members healthy or finished initial sync." 
- ) - raise - except PyMongoError as e: - logger.error(f"Failed changing the password: {e}") - raise - self.charm.set_secret( Config.Relations.APP_SCOPE, MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()), From bed71cef34b97b9b2d26304a75ae139493c70bae Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 25 Sep 2023 15:34:14 +0000 Subject: [PATCH 27/42] fix bug in role retrieval --- src/charm.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/charm.py b/src/charm.py index 6719d8d6e..5991e4c68 100755 --- a/src/charm.py +++ b/src/charm.py @@ -242,11 +242,17 @@ def db_initialised(self) -> bool: @property def role(self) -> str: """Returns role of MongoDB deployment.""" - if "role" not in self.app_peer_data and self.unit.is_leader(): + if ( + "role" not in self.app_peer_data + and self.unit.is_leader() + and self.model.config["role"] + ): self.app_peer_data["role"] = self.model.config["role"] + # app data bag isn't set until function completes + return self.model.config["role"] else: # if leader hasn't set the role yet, use the one set by model - self.model.config["role"] + return self.model.config["role"] return self.app_peer_data.get("role") @@ -762,7 +768,7 @@ def _get_mongodb_config_for_user( return MongoDBConfiguration( replset=self.app.name, database=user.get_database_name(), - username=user.get_username(), + username=f"{user.get_username()}-{self.role}", password=self.get_secret(APP_SCOPE, user.get_password_key_name()), hosts=hosts, roles=user.get_roles(), From b86259fb54346faf5b34db541e34bea250a0626c Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 27 Sep 2023 09:27:09 +0000 Subject: [PATCH 28/42] revert changes to set operator password --- lib/charms/mongodb/v0/shards_interface.py | 26 ++++++++++++++++++++++- src/charm.py | 4 ++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 7c633bfdf..42dc82851 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -10,9 +10,11 @@ from charms.mongodb.v0.helpers import KEY_FILE from charms.mongodb.v0.users import MongoDBUser, OperatorUser +from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError from ops.charm import CharmBase from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus +from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -137,7 +139,11 @@ def _on_relation_changed(self, event): # shards rely on the config server for secrets relation_data = event.relation.data[event.app] self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) - self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) + try: + self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) + except RetryError: + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + return self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") @@ -160,6 +166,24 @@ def update_operator_password(self, new_password: str) -> None: if not new_password or new_password == current_password or not self.charm.unit.is_leader(): return + # updating operator password, usually comes after keyfile was updated, hence, the mongodb + # service was restarted. Sometimes this requires units getting insync again. 
+ for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + # TODO, in the future use set_password from src/charm.py - this will require adding + # a library, for exceptions used in both charm code and lib code. + with MongoDBConnection(self.charm.mongodb_config) as mongo: + try: + mongo.set_user_password(OperatorUser.get_username(), new_password) + except NotReadyError: + logger.error( + "Failed changing the password: Not all members healthy or finished initial sync." + ) + raise + except PyMongoError as e: + logger.error(f"Failed changing the password: {e}") + raise + self.charm.set_secret( Config.Relations.APP_SCOPE, MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()), diff --git a/src/charm.py b/src/charm.py index 5991e4c68..39f31b166 100755 --- a/src/charm.py +++ b/src/charm.py @@ -250,7 +250,7 @@ def role(self) -> str: self.app_peer_data["role"] = self.model.config["role"] # app data bag isn't set until function completes return self.model.config["role"] - else: + elif "role" not in self.app_peer_data: # if leader hasn't set the role yet, use the one set by model return self.model.config["role"] @@ -768,7 +768,7 @@ def _get_mongodb_config_for_user( return MongoDBConfiguration( replset=self.app.name, database=user.get_database_name(), - username=f"{user.get_username()}-{self.role}", + username=user.get_username(), password=self.get_secret(APP_SCOPE, user.get_password_key_name()), hosts=hosts, roles=user.get_roles(), From 3f5d3d2d3b197821202ddd9eab737645201856de Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 27 Sep 2023 09:27:56 +0000 Subject: [PATCH 29/42] lint + fmt --- lib/charms/mongodb/v0/shards_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 42dc82851..ae932e3d0 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -9,8 +9,8 @@ import logging from charms.mongodb.v0.helpers import KEY_FILE -from charms.mongodb.v0.users import MongoDBUser, OperatorUser from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError +from charms.mongodb.v0.users import MongoDBUser, OperatorUser from ops.charm import CharmBase from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus From b8ca419bb648071c1579e6cb1a6ec05ce7eb1b21 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 28 Sep 2023 08:04:19 +0000 Subject: [PATCH 30/42] add shard works --- lib/charms/mongodb/v0/mongos.py | 173 ++++++++++++++++++++++ lib/charms/mongodb/v0/shards_interface.py | 114 +++++++++++++- src/charm.py | 23 +++ 3 files changed, 304 insertions(+), 6 deletions(-) create mode 100644 lib/charms/mongodb/v0/mongos.py diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py new file mode 100644 index 000000000..370aa76d9 --- /dev/null +++ b/lib/charms/mongodb/v0/mongos.py @@ -0,0 +1,173 @@ +"""Code for interactions with MongoDB.""" +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+
+import logging
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set
+from urllib.parse import quote_plus
+
+from bson.json_util import dumps
+from pymongo import MongoClient
+from pymongo.errors import OperationFailure, PyMongoError
+from tenacity import (
+    RetryError,
+    Retrying,
+    before_log,
+    retry,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_fixed,
+)
+
+# The unique Charmhub library identifier, never change it
+LIBID = "49c69d9977574dd7942eb7b54f43355b"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 7
+
+# path to store mongodb keyFile
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class MongosConfiguration:
+    """Class for mongos configuration.
+
+    - database: database name.
+    - username: username.
+    - password: password.
+    - hosts: full list of hosts to connect to, needed for the URI.
+    - port: integer for the port to connect to mongodb.
+    - tls_external: indicator for use of external TLS connection.
+    - tls_internal: indicator for use of internal TLS connection.
+    """
+
+    database: Optional[str]
+    username: str
+    password: str
+    hosts: Set[str]
+    port: int
+    roles: Set[str]
+    tls_external: bool
+    tls_internal: bool
+
+    @property
+    def uri(self):
+        """Return URI concatenated from fields."""
+        hosts = [f"{host}:{self.port}" for host in self.hosts]
+        hosts = ",".join(hosts)
+        # Auth DB should be specified while user connects to application DB.
+        auth_source = ""
+        if self.database != "admin":
+            auth_source = "&authSource=admin"
+        return (
+            f"mongodb://{quote_plus(self.username)}:"
+            f"{quote_plus(self.password)}@"
+            f"{hosts}/{quote_plus(self.database)}?"
+            f"{auth_source}"
+        )
+
+
+class NotReadyError(PyMongoError):
+    """Raised when not all replica set members healthy or finished initial sync."""
+
+
+class MongosConnection:
+    """In this class we create connection object to Mongos.
+
+    Real connection is created on the first call to Mongos.
+    Delayed connectivity allows to firstly check database readiness
+    and reuse the same connection for an actual query later in the code.
+
+    Connection is automatically closed when object destroyed.
+    Automatic close allows to have more clean code.
+
+    Note that connection when used may lead to the following pymongo errors: ConfigurationError,
+    OperationFailure. It is suggested that the following pattern be adopted
+    when using MongosConnection:
+
+        with MongosConnection(self._mongos_config) as mongo:
+            try:
+                mongo.<command>
+            except ConfigurationError, OperationFailure:
+                <error handling as needed>
+    """
+
+    def __init__(self, config: MongosConfiguration, uri=None, direct=False):
+        """A MongoDB client interface.
+
+        Args:
+            config: MongoDB Configuration object.
+            uri: allow using custom MongoDB URI, needed for replSet init.
+            direct: force a direct connection to a specific host, avoiding
+                reading replica set configuration and reconnection.
+        """
+        self.mongodb_config = config
+
+        if uri is None:
+            uri = config.uri
+
+        self.client = MongoClient(
+            uri,
+            directConnection=direct,
+            connect=False,
+            serverSelectionTimeoutMS=1000,
+            connectTimeoutMS=2000,
+        )
+        return
+
+    def __enter__(self):
+        """Return a reference to the new connection."""
+        return self
+
+    def __exit__(self, object_type, value, traceback):
+        """Disconnect from MongoDB client."""
+        self.client.close()
+        self.client = None
+
+    def get_shard_members(self) -> Set[str]:
+        """Gets shard members.
+
+        Returns:
+            A set of the shard members as reported by mongos.
+
+        Raises:
+            ConfigurationError, OperationFailure
+        """
+        shard_list = self.client.admin.command("listShards")
+        curr_members = [
+            self._hostname_from_hostport(member["host"]) for member in shard_list["shards"]
+        ]
+        return set(curr_members)
+
+    def add_shard(self, shard_name, shard_hosts, shard_port=27017):
+        """TODO"""
+        shard_hosts = [f"{host}:{shard_port}" for host in shard_hosts]
+        shard_hosts = ",".join(shard_hosts)
+        shard_url = f"{shard_name}/{shard_hosts}"
+        # TODO Future PR raise error when the number of shards currently being added is higher
+        # than the number of secondaries on the primary shard
+
+        if shard_name in self.get_shard_members():
+            logger.info("Skipping adding shard %s, shard is already in cluster", shard_name)
+            return
+
+        logger.info("Adding shard %s", shard_name)
+        self.client.admin.command("addShard", shard_url)
+
+    @staticmethod
+    def _hostname_from_hostport(hostname: str) -> str:
+        """Return the shard name from a mongos member entry.
+
+        mongos reports each shard as a single value containing the shard name, hosts, and ports,
+        e.g. input: shard03/host7:27018,host8:27018,host9:27018
+        and only the shard name part is returned,
+        e.g. output: shard03
+        """
+        return hostname.split("/")[0]
diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py
index ae932e3d0..099448b6f 100644
--- a/lib/charms/mongodb/v0/shards_interface.py
+++ b/lib/charms/mongodb/v0/shards_interface.py
@@ -7,15 +7,17 @@ shards.
""" import logging - +import json +from charms.mongodb.v0.mongos import MongosConnection, PyMongoError from charms.mongodb.v0.helpers import KEY_FILE from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError from charms.mongodb.v0.users import MongoDBUser, OperatorUser -from ops.charm import CharmBase +from ops.charm import CharmBase, RelationBrokenEvent from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed +from typing import Optional, List from config import Config logger = logging.getLogger(__name__) @@ -32,6 +34,7 @@ LIBPATCH = 1 KEYFILE_KEY = "key-file" OPERATOR_PASSWORD_KEY = "operator-password" +HOSTS_KEY = "host" class ShardingProvider(Object): @@ -48,7 +51,11 @@ def __init__( self.framework.observe( charm.on[self.relation_name].relation_joined, self._on_relation_joined ) - # TODO Future PR, enable shard drainage by listening for relation departed events + self.framework.observe( + charm.on[self.relation_name].relation_changed, self._on_relation_event + ) + # TODO Future PR, add shard to config server + # TODO Follow up PR, handle rotating passwords def _on_relation_joined(self, event): """Handles providing shards with secrets and adding shards to the config server.""" @@ -83,8 +90,58 @@ def _on_relation_joined(self, event): }, ) - # TODO Future PR, add shard to config server - # TODO Follow up PR, handle rotating passwords + def _on_relation_event(self, event): + """TODO""" + if self.charm.is_role(Config.Role.REPLICATION): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.CONFIG_SERVER): + logger.info( + "skipping relation joined event ShardingRequirer is only be executed by config-server" + ) + return + + if not self.charm.unit.is_leader(): + return + + if not self.charm.db_initialised: + event.defer() + + departed_relation_id = None + if type(event) is RelationBrokenEvent: + departed_relation_id = event.relation.id + + try: + logger.info("Adding shards not present in cluster.") + self.add_shards(departed_relation_id, event) + # TODO Future PR, enable self healing by listening for relation changed events + # TODO Future PR, enable shard drainage by listening for relation departed events + except PyMongoError as e: + logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) + event.defer() + return + + def add_shards(self, departed_shard_id, event): + """TODO docstring + + raise: PyMongoError + """ + with MongosConnection(self.charm.mongos_config) as mongo: + cluster_shards = mongo.get_shard_members() # todo make this function + relation_shards = self._get_shards_from_relations(departed_shard_id) + + # TODO Future PR, limit number of shards add at a time, based on the number of + # replicas in the primary shard + for shard in relation_shards - cluster_shards: + shard_hosts = self._get_shard_hosts(shard) + if not len(shard_hosts): + logger.info("host info for shard %s not yet added, skipping", shard) + continue + + logger.info("Adding shard: %s ", shard) + mongo.add_shard(shard, shard_hosts) # todo make this function def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. 
@@ -102,6 +159,28 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: if relation: relation.data[self.charm.model.app].update(data) + def _get_shards_from_relations(self, departed_shard_id: Optional[int]): + """TODO docstring""" + relations = self.model.relations[self.relation_name] + return set( + [ + self._get_shard_name_from_relation(relation) + for relation in relations + if relation.id != departed_shard_id + ] + ) + + def _get_shard_hosts(self, shard_name) -> str: + """TODO docstring""" + relations = self.model.relations[self.relation_name] + for relation in relations: + if self._get_shard_name_from_relation(relation) == shard_name: + return json.loads(relation.data[relation.app].get(HOSTS_KEY, "[]")) + + def _get_shard_name_from_relation(self, relation): + """TODO docstring""" + return relation.app.name + class ConfigServerRequirer(Object): """Manage relations between the config server and the shard, on the shard's side.""" @@ -147,10 +226,17 @@ def _on_relation_changed(self, event): self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") - # TODO future PR, leader unit verifies shard was added to cluster if not self.charm.unit.is_leader(): return + # send hosts to mongos to be added to the cluster + self._update_relation_data( + event.relation.id, + {HOSTS_KEY: json.dumps(self.charm._unit_ips)}, + ) + + # TODO future PR, leader unit verifies shard was added to cluster (update-status hook) + def update_operator_password(self, new_password: str) -> None: """Updates the password for the operator user. @@ -209,3 +295,19 @@ def update_keyfile(self, key_file_contents: str) -> None: self.charm.set_secret( Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME, key_file_contents ) + + def _update_relation_data(self, relation_id: int, data: dict) -> None: + """Updates a set of key-value pairs in the relation. + + This function writes in the application data bag, therefore, only the leader unit can call + it. + + Args: + relation_id: the identifier for a particular relation. + data: dict containing the key-value pairs + that should be updated in the relation. 
+ """ + if self.charm.unit.is_leader(): + relation = self.charm.model.get_relation(self.relation_name, relation_id) + if relation: + relation.data[self.charm.model.app].update(data) diff --git a/src/charm.py b/src/charm.py index fba47bd19..99369a10d 100755 --- a/src/charm.py +++ b/src/charm.py @@ -31,6 +31,7 @@ NotReadyError, PyMongoError, ) +from charms.mongodb.v0.mongos import MongosConfiguration from charms.mongodb.v0.mongodb_backups import S3_RELATION, MongoDBBackups from charms.mongodb.v0.mongodb_provider import MongoDBProvider from charms.mongodb.v0.mongodb_tls import MongoDBTLS @@ -192,6 +193,11 @@ def _replica_set_hosts(self): """ return json.loads(self.app_peer_data.get("replica_set_hosts", "[]")) + @property + def mongos_config(self) -> MongoDBConfiguration: + """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" + return self._get_mongos_config_for_user(OperatorUser, set(self._unit_ips)) + @property def mongodb_config(self) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for this deployment of MongoDB.""" @@ -754,6 +760,23 @@ def _is_user_created(self, user: MongoDBUser) -> bool: def _set_user_created(self, user: MongoDBUser) -> None: self.app_peer_data[f"{user.get_username()}-user-created"] = "True" + def _get_mongos_config_for_user( + self, user: MongoDBUser, hosts: Set[str] + ) -> MongosConfiguration: + external_ca, _ = self.tls.get_tls_files(UNIT_SCOPE) + internal_ca, _ = self.tls.get_tls_files(APP_SCOPE) + + return MongosConfiguration( + database=user.get_database_name(), + username=user.get_username(), + password=self.get_secret(APP_SCOPE, user.get_password_key_name()), + hosts=hosts, + port=27018, + roles=user.get_roles(), + tls_external=external_ca is not None, + tls_internal=internal_ca is not None, + ) + def _get_mongodb_config_for_user( self, user: MongoDBUser, hosts: Set[str] ) -> MongoDBConfiguration: From 7f4da0701beab5a0db925b6afd420ce9247fab32 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 28 Sep 2023 08:22:36 +0000 Subject: [PATCH 31/42] cleaning --- lib/charms/mongodb/v0/mongos.py | 23 ++++++----------- lib/charms/mongodb/v0/shards_interface.py | 31 ++++++++++++----------- src/charm.py | 2 +- 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index 370aa76d9..adc871e7c 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -3,23 +3,12 @@ # See LICENSE file for licensing details. import logging -import re from dataclasses import dataclass -from typing import Dict, List, Optional, Set +from typing import Optional, Set from urllib.parse import quote_plus -from bson.json_util import dumps from pymongo import MongoClient -from pymongo.errors import OperationFailure, PyMongoError -from tenacity import ( - RetryError, - Retrying, - before_log, - retry, - stop_after_attempt, - stop_after_delay, - wait_fixed, -) +from pymongo.errors import PyMongoError # The unique Charmhub library identifier, never change it LIBID = "49c69d9977574dd7942eb7b54f43355b" @@ -147,7 +136,11 @@ def get_shard_members(self) -> Set[str]: return set(curr_members) def add_shard(self, shard_name, shard_hosts, shard_port=27017): - """TODO""" + """Adds shard to the cluster. 
+ + Raises: + ConfigurationError, ConfigurationError, OperationFailure + """ shard_hosts = [f"{host}:{shard_port}" for host in shard_hosts] shard_hosts = ",".join(shard_hosts) shard_url = f"{shard_name}/{shard_hosts}" @@ -155,7 +148,7 @@ def add_shard(self, shard_name, shard_hosts, shard_port=27017): # number of secondaries on the primary shard if shard_name in self.get_shard_members(): - logger.info("Skipping adding shard %s, shard is already in cluser", shard_name) + logger.info("Skipping adding shard %s, shard is already in cluster", shard_name) return logger.info("Adding shard %s", shard_name) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 099448b6f..56f868fc5 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -6,18 +6,19 @@ This class handles the sharing of secrets between sharded components, adding shards, and removing shards. """ -import logging import json -from charms.mongodb.v0.mongos import MongosConnection, PyMongoError +import logging +from typing import Optional + from charms.mongodb.v0.helpers import KEY_FILE from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError +from charms.mongodb.v0.mongos import MongosConnection from charms.mongodb.v0.users import MongoDBUser, OperatorUser from ops.charm import CharmBase, RelationBrokenEvent from ops.framework import Object from ops.model import BlockedStatus, MaintenanceStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed -from typing import Optional, List from config import Config logger = logging.getLogger(__name__) @@ -54,7 +55,7 @@ def __init__( self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_event ) - # TODO Future PR, add shard to config server + # TODO Follow up PR, handle rotating passwords def _on_relation_joined(self, event): @@ -91,7 +92,7 @@ def _on_relation_joined(self, event): ) def _on_relation_event(self, event): - """TODO""" + """Handles adding, removing, and updating of shards.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") @@ -115,21 +116,21 @@ def _on_relation_event(self, event): try: logger.info("Adding shards not present in cluster.") - self.add_shards(departed_relation_id, event) - # TODO Future PR, enable self healing by listening for relation changed events + self.add_shards(departed_relation_id) + # TODO Future PR, enable updating shards by listening for relation changed events # TODO Future PR, enable shard drainage by listening for relation departed events except PyMongoError as e: logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() return - def add_shards(self, departed_shard_id, event): - """TODO docstring + def add_shards(self, departed_shard_id): + """Adds shards to cluster. 
- raise: PyMongoError + raises: PyMongoError """ with MongosConnection(self.charm.mongos_config) as mongo: - cluster_shards = mongo.get_shard_members() # todo make this function + cluster_shards = mongo.get_shard_members() relation_shards = self._get_shards_from_relations(departed_shard_id) # TODO Future PR, limit number of shards add at a time, based on the number of @@ -141,7 +142,7 @@ def add_shards(self, departed_shard_id, event): continue logger.info("Adding shard: %s ", shard) - mongo.add_shard(shard, shard_hosts) # todo make this function + mongo.add_shard(shard, shard_hosts) def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -160,7 +161,7 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: relation.data[self.charm.model.app].update(data) def _get_shards_from_relations(self, departed_shard_id: Optional[int]): - """TODO docstring""" + """Returns a list of the shards related to the config-server.""" relations = self.model.relations[self.relation_name] return set( [ @@ -171,14 +172,14 @@ def _get_shards_from_relations(self, departed_shard_id: Optional[int]): ) def _get_shard_hosts(self, shard_name) -> str: - """TODO docstring""" + """Retrieves the hosts for a specified shard.""" relations = self.model.relations[self.relation_name] for relation in relations: if self._get_shard_name_from_relation(relation) == shard_name: return json.loads(relation.data[relation.app].get(HOSTS_KEY, "[]")) def _get_shard_name_from_relation(self, relation): - """TODO docstring""" + """Returns the name of a shard for a specified relation.""" return relation.app.name diff --git a/src/charm.py b/src/charm.py index 99369a10d..dbac27403 100755 --- a/src/charm.py +++ b/src/charm.py @@ -31,11 +31,11 @@ NotReadyError, PyMongoError, ) -from charms.mongodb.v0.mongos import MongosConfiguration from charms.mongodb.v0.mongodb_backups import S3_RELATION, MongoDBBackups from charms.mongodb.v0.mongodb_provider import MongoDBProvider from charms.mongodb.v0.mongodb_tls import MongoDBTLS from charms.mongodb.v0.mongodb_vm_legacy_provider import MongoDBLegacyProvider +from charms.mongodb.v0.mongos import MongosConfiguration from charms.mongodb.v0.shards_interface import ConfigServerRequirer, ShardingProvider from charms.mongodb.v0.users import ( CHARM_USERS, From a94041224d8862c46746af8122761ef4ca1485d7 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 28 Sep 2023 14:40:09 +0200 Subject: [PATCH 32/42] create lib file in charmcraft --- lib/charms/mongodb/v0/mongos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index adc871e7c..9043c3d35 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -11,14 +11,14 @@ from pymongo.errors import PyMongoError # The unique Charmhub library identifier, never change it -LIBID = "49c69d9977574dd7942eb7b54f43355b" +LIBID = "e20d5b19670d4c55a4934a21d3f3b29a" # Increment this major API version when introducing breaking changes LIBAPI = 0 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 1 # path to store mongodb ketFile logger = logging.getLogger(__name__) From 34b904e8212de3c05e9fa8cf6b424c5bc5568ecc Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 29 Sep 2023 14:33:42 +0000 Subject: [PATCH 33/42] shard removed from cluster --- lib/charms/mongodb/v0/mongos.py | 79 
++++++++++++++- lib/charms/mongodb/v0/shards_interface.py | 118 ++++++++++++++++++++-- src/charm.py | 20 ++++ 3 files changed, 206 insertions(+), 11 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index 9043c3d35..4e9da2e79 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -4,11 +4,12 @@ import logging from dataclasses import dataclass -from typing import Optional, Set +from typing import Optional, Set, Dict from urllib.parse import quote_plus from pymongo import MongoClient from pymongo.errors import PyMongoError +from charms.mongodb.v0.mongodb import NotReadyError # The unique Charmhub library identifier, never change it LIBID = "e20d5b19670d4c55a4934a21d3f3b29a" @@ -63,8 +64,16 @@ def uri(self): ) -class NotReadyError(PyMongoError): - """Raised when not all replica set members healthy or finished initial sync.""" +class RemovePrimaryShardException(Exception): + """Raised when there is an attempt to remove the primary shard.""" + + +class ShardNotInClusterException(Exception): + """Raised when shard is not present in cluster, but it is expected to be.""" + + +class ShardNotPlannedForRemoval(Exception): + """Raised when it is expected that a shard is planned for removal, but it is not.""" class MongosConnection: @@ -127,7 +136,7 @@ def get_shard_members(self) -> Set[str]: A set of the shard members as reported by mongos. Raises: - ConfigurationError, ConfigurationError, OperationFailure + ConfigurationError, OperationFailure """ shard_list = self.client.admin.command("listShards") curr_members = [ @@ -139,7 +148,7 @@ def add_shard(self, shard_name, shard_hosts, shard_port=27017): """Adds shard to the cluster. Raises: - ConfigurationError, ConfigurationError, OperationFailure + ConfigurationError, OperationFailure """ shard_hosts = [f"{host}:{shard_port}" for host in shard_hosts] shard_hosts = ",".join(shard_hosts) @@ -154,6 +163,66 @@ def add_shard(self, shard_name, shard_hosts, shard_port=27017): logger.info("Adding shard %s", shard_name) self.client.admin.command("addShard", shard_url) + def remove_shard(self, shard_name): + """Removes shard from the cluster. + + Raises: + ConfigurationError, OperationFailure, NotReadyError, + RemovePrimaryShardException + """ + sc_status = self.client.admin.command("listShards") + if self._is_any_draining(sc_status): + cannot_remove_shard = ( + f"cannot remove shard {shard_name} from cluster, another shard is draining" + ) + logger.info(cannot_remove_shard) + raise NotReadyError(cannot_remove_shard) + + # TODO Follow up PR, there is no MongoDB command to retrieve primary shard, this is + # possible with mongosh + primary_shard = False + if primary_shard: + cannot_remove_primary_shard = ( + f"Shard {shard_name} is the primary shard, cannot remove." 
+ ) + logger.error(cannot_remove_primary_shard) + raise RemovePrimaryShardException(cannot_remove_primary_shard) + + logger.info("Attemping to remove shard %s", shard_name) + self.client.admin.command("removeShard", shard_name) + logger.info("Shard %s, now draining", shard_name) + + def _is_shard_draining(self, shard_name) -> bool: + """todo + + Raises: + ConfigurationError, OperationFailure, ShardNotInClusterException, + ShardNotPlannedForRemoval + """ + sc_status = self.client.admin.command("listShards") + for shard in sc_status["shards"]: + if shard["_id"] == shard_name: + if "draining" not in shard: + raise ShardNotPlannedForRemoval( + f"Shard {shard_name} has not been marked for removal", + ) + return shard["draining"] + + raise ShardNotInClusterException( + f"Shard {shard_name} not in cluster, could not retrieve draining status" + ) + + @staticmethod + def _is_any_draining(sc_status: Dict) -> bool: + """Returns true if any shard members is draining. + + Checks if any members in sharded cluster are draining data. + + Args: + rs_status: current state of shard cluster status as reported by mongos. + """ + return any(shard.get("draining", False) for shard in sc_status["shards"]) + @staticmethod def _hostname_from_hostport(hostname: str) -> str: """Return hostname part from MongoDB returned. diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 56f868fc5..39493736f 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -8,15 +8,25 @@ """ import json import logging +import time from typing import Optional from charms.mongodb.v0.helpers import KEY_FILE -from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError -from charms.mongodb.v0.mongos import MongosConnection +from charms.mongodb.v0.mongodb import ( + MongoDBConnection, + NotReadyError, + PyMongoError, + OperationFailure, +) +from charms.mongodb.v0.mongos import ( + MongosConnection, + ShardNotInClusterException, + ShardNotPlannedForRemoval, +) from charms.mongodb.v0.users import MongoDBUser, OperatorUser -from ops.charm import CharmBase, RelationBrokenEvent +from ops.charm import CharmBase, RelationDepartedEvent from ops.framework import Object -from ops.model import BlockedStatus, MaintenanceStatus +from ops.model import BlockedStatus, MaintenanceStatus, ActiveStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -38,6 +48,10 @@ HOSTS_KEY = "host" +class RemoveLastShardException(Exception): + """Raised when there is an attempt to remove the last shard in the cluster.""" + + class ShardingProvider(Object): """Manage relations between the config server and the shard, on the config-server's side.""" @@ -55,6 +69,9 @@ def __init__( self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_event ) + self.framework.observe( + charm.on[self.relation_name].relation_departed, self._on_relation_event + ) # TODO Follow up PR, handle rotating passwords @@ -88,6 +105,7 @@ def _on_relation_joined(self, event): KEYFILE_KEY: self.charm.get_secret( Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME ), + HOSTS_KEY: json.dumps(self.charm._unit_ips), }, ) @@ -111,15 +129,23 @@ def _on_relation_event(self, event): event.defer() departed_relation_id = None - if type(event) is RelationBrokenEvent: + if type(event) is RelationDepartedEvent: departed_relation_id = event.relation.id try: logger.info("Adding shards not present in 
cluster.") self.add_shards(departed_relation_id) + self.remove_shards(departed_relation_id) # TODO Future PR, enable updating shards by listening for relation changed events # TODO Future PR, enable shard drainage by listening for relation departed events - except PyMongoError as e: + except OperationFailure as e: + if e.code == 20: + logger.error( + "Cannot not remove the last shard from cluster, this is forbidden by mongos." + ) + # we should not lose connection with the shard, prevent other hooks from executing. + raise RemoveLastShardException() + except (PyMongoError, NotReadyError) as e: logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() return @@ -144,6 +170,21 @@ def add_shards(self, departed_shard_id): logger.info("Adding shard: %s ", shard) mongo.add_shard(shard, shard_hosts) + def remove_shards(self, departed_shard_id): + """Removes shards from cluster. + + raises: PyMongoError, NotReadyError + """ + with MongosConnection(self.charm.mongos_config) as mongo: + cluster_shards = mongo.get_shard_members() + relation_shards = self._get_shards_from_relations(departed_shard_id) + + for shard in cluster_shards - relation_shards: + self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}") + logger.info("Attemping to removing shard: %s", shard) + mongo.remove_shard(shard) + logger.info("Attemping to removing shard: %s", shard) + def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -197,6 +238,9 @@ def __init__( self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_changed ) + self.framework.observe( + charm.on[self.relation_name].relation_departed, self._on_relation_departed + ) # TODO Future PR, enable shard drainage by observing relation departed events @@ -238,6 +282,68 @@ def _on_relation_changed(self, event): # TODO future PR, leader unit verifies shard was added to cluster (update-status hook) + def _on_relation_departed(self, event): + """todo""" + if self.charm.is_role(Config.Role.REPLICATION): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.SHARD): + logger.info( + "skipping relation departed event ShardingProvider is only be executed by shards" + ) + return + + if not self.charm.db_initialised: + event.defer() + return + + self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") + mongos_hosts = json.loads(event.relation.data[event.relation.app].get(HOSTS_KEY)) + drained = False + while not drained: + try: + drained = self.drained(mongos_hosts, self.charm.app.name) + logger.debug("Shards draining state: %s", drained) + # no need to continously check and abuse resouces while shard is draining + time.sleep(10) + except PyMongoError as e: + logger.error("Error occured while draining shard: %s", e) + self.charm.unit.status = BlockedStatus("Failed to drain shard from cluster") + except ShardNotPlannedForRemoval: + logger.info( + "Shard %s has not been identifies for removal. Must wait for mongos cluster-admin to remove shard." 
+ ) + except ShardNotInClusterException: + logger.info("Shard to remove, is not in sharded cluster.") + break + + self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal") + # TODO future PR, leader unit displays this message in update-status hook + # TODO future PR, check for shard drainage when removing application + + def drained(self, mongos_hosts, shard_name): + """todo + + Raises: + ConfigurationError, OperationFailure, ShardNotInClusterException, + ShardNotPlannedForRemoval + """ + if not self.charm.is_role(Config.Role.SHARD): + logger.info( + "Component %s is not a shard, cannot check draining status.", self.charm.role + ) + return False + + if "draining" not in self.charm.app_peer_data and not self.charm.unit.is_leader(): + return False + + with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo: + draining = mongo._is_shard_draining(shard_name) + self.charm.app_peer_data["draining"] = json.dumps(draining) + return self.charm.app_peer_data.get("draining") + def update_operator_password(self, new_password: str) -> None: """Updates the password for the operator user. diff --git a/src/charm.py b/src/charm.py index dbac27403..b74905e50 100755 --- a/src/charm.py +++ b/src/charm.py @@ -164,6 +164,19 @@ def _primary(self) -> str: return None + @property + def drained(self) -> bool: + """Returns whether the shard has been drained.""" + if not self.is_role(Config.Role.SHARD): + logger.info("Component %s is not a shard, cannot check draining status.", self.role) + return False + + if "draining" not in self.app_peer_data: + logger.info("Draining status has not been set. It is set on relation_departed") + return False + + return self.app_peer_data.get("drain") + @property def _unit_ips(self) -> List[str]: """Retrieve IP addresses associated with MongoDB application. 
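Stripped of the charm plumbing, the drain-wait loop above reduces to polling `listShards` until the departing shard either loses its `draining` flag or disappears from the cluster metadata. A sketch with hypothetical connection details, assuming the config-server side keeps re-issuing `removeShard` until mongos reports completion:

```python
import time

from pymongo import MongoClient

MONGOS_URI = "mongodb://10.0.0.2:27018/admin"  # hypothetical mongos address
SHARD_NAME = "shard-one"                       # hypothetical departing shard

client = MongoClient(MONGOS_URI)

while True:
    shards = {s["_id"]: s for s in client.admin.command("listShards")["shards"]}
    entry = shards.get(SHARD_NAME)
    if entry is None:
        # the final removeShard call completed and the shard was dropped from the metadata
        print("shard fully drained and removed")
        break
    if entry.get("draining", False):
        print("shard still draining, checking again shortly")
    # avoid hammering mongos while chunk migrations are in flight
    time.sleep(10)
```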
@@ -198,6 +211,13 @@ def mongos_config(self) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" return self._get_mongos_config_for_user(OperatorUser, set(self._unit_ips)) + def remote_mongos_config(self, hosts) -> MongoDBConfiguration: + """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" + + # mongos that are part of the cluster have the same username and password, but different + # hosts + return self._get_mongos_config_for_user(OperatorUser, hosts) + @property def mongodb_config(self) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for this deployment of MongoDB.""" From 15348edad2fc1f3dca853c41038068820fb6bf90 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 29 Sep 2023 15:02:39 +0000 Subject: [PATCH 34/42] format + lint --- lib/charms/mongodb/v0/mongos.py | 33 ++++++++++---------- lib/charms/mongodb/v0/shards_interface.py | 38 +++++++++++------------ src/charm.py | 1 - 3 files changed, 35 insertions(+), 37 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index 4e9da2e79..74a4d9e60 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -4,12 +4,11 @@ import logging from dataclasses import dataclass -from typing import Optional, Set, Dict +from typing import Dict, Optional, Set from urllib.parse import quote_plus -from pymongo import MongoClient -from pymongo.errors import PyMongoError from charms.mongodb.v0.mongodb import NotReadyError +from pymongo import MongoClient # The unique Charmhub library identifier, never change it LIBID = "e20d5b19670d4c55a4934a21d3f3b29a" @@ -64,15 +63,15 @@ def uri(self): ) -class RemovePrimaryShardException(Exception): +class RemovePrimaryShardError(Exception): """Raised when there is an attempt to remove the primary shard.""" -class ShardNotInClusterException(Exception): +class ShardNotInClusterError(Exception): """Raised when shard is not present in cluster, but it is expected to be.""" -class ShardNotPlannedForRemoval(Exception): +class ShardNotPlannedForRemovalError(Exception): """Raised when it is expected that a shard is planned for removal, but it is not.""" @@ -163,12 +162,12 @@ def add_shard(self, shard_name, shard_hosts, shard_port=27017): logger.info("Adding shard %s", shard_name) self.client.admin.command("addShard", shard_url) - def remove_shard(self, shard_name): + def remove_shard(self, shard_name: str) -> None: """Removes shard from the cluster. Raises: ConfigurationError, OperationFailure, NotReadyError, - RemovePrimaryShardException + RemovePrimaryShardError """ sc_status = self.client.admin.command("listShards") if self._is_any_draining(sc_status): @@ -186,29 +185,29 @@ def remove_shard(self, shard_name): f"Shard {shard_name} is the primary shard, cannot remove." ) logger.error(cannot_remove_primary_shard) - raise RemovePrimaryShardException(cannot_remove_primary_shard) + raise RemovePrimaryShardError(cannot_remove_primary_shard) - logger.info("Attemping to remove shard %s", shard_name) + logger.info("Attempting to remove shard %s", shard_name) self.client.admin.command("removeShard", shard_name) logger.info("Shard %s, now draining", shard_name) - def _is_shard_draining(self, shard_name) -> bool: - """todo + def _is_shard_draining(self, shard_name: str) -> bool: + """Reports if a given shard is currently in the draining state. 
Raises: - ConfigurationError, OperationFailure, ShardNotInClusterException, - ShardNotPlannedForRemoval + ConfigurationError, OperationFailure, ShardNotInClusterError, + ShardNotPlannedForRemovalError """ sc_status = self.client.admin.command("listShards") for shard in sc_status["shards"]: if shard["_id"] == shard_name: if "draining" not in shard: - raise ShardNotPlannedForRemoval( + raise ShardNotPlannedForRemovalError( f"Shard {shard_name} has not been marked for removal", ) return shard["draining"] - raise ShardNotInClusterException( + raise ShardNotInClusterError( f"Shard {shard_name} not in cluster, could not retrieve draining status" ) @@ -219,7 +218,7 @@ def _is_any_draining(sc_status: Dict) -> bool: Checks if any members in sharded cluster are draining data. Args: - rs_status: current state of shard cluster status as reported by mongos. + sc_status: current state of shard cluster status as reported by mongos. """ return any(shard.get("draining", False) for shard in sc_status["shards"]) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 39493736f..e737eec80 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -9,24 +9,24 @@ import json import logging import time -from typing import Optional +from typing import Optional, Set from charms.mongodb.v0.helpers import KEY_FILE from charms.mongodb.v0.mongodb import ( MongoDBConnection, NotReadyError, - PyMongoError, OperationFailure, + PyMongoError, ) from charms.mongodb.v0.mongos import ( MongosConnection, - ShardNotInClusterException, - ShardNotPlannedForRemoval, + ShardNotInClusterError, + ShardNotPlannedForRemovalError, ) from charms.mongodb.v0.users import MongoDBUser, OperatorUser from ops.charm import CharmBase, RelationDepartedEvent from ops.framework import Object -from ops.model import BlockedStatus, MaintenanceStatus, ActiveStatus +from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -48,7 +48,7 @@ HOSTS_KEY = "host" -class RemoveLastShardException(Exception): +class RemoveLastShardError(Exception): """Raised when there is an attempt to remove the last shard in the cluster.""" @@ -144,7 +144,7 @@ def _on_relation_event(self, event): "Cannot not remove the last shard from cluster, this is forbidden by mongos." ) # we should not lose connection with the shard, prevent other hooks from executing. - raise RemoveLastShardException() + raise RemoveLastShardError() except (PyMongoError, NotReadyError) as e: logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() @@ -181,9 +181,9 @@ def remove_shards(self, departed_shard_id): for shard in cluster_shards - relation_shards: self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}") - logger.info("Attemping to removing shard: %s", shard) + logger.info("Attempting to removing shard: %s", shard) mongo.remove_shard(shard) - logger.info("Attemping to removing shard: %s", shard) + logger.info("Attempting to removing shard: %s", shard) def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. 
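The draining check above has three possible outcomes: the shard reports a draining flag, the shard exists but was never asked to drain, or the shard is no longer in the cluster. A self-contained sketch driven by a hand-written `listShards`-style document rather than a live cluster (names and hosts invented):

```python
class ShardNotInClusterError(Exception): ...
class ShardNotPlannedForRemovalError(Exception): ...

def shard_draining_state(sc_status: dict, shard_name: str) -> bool:
    """Return the draining flag for shard_name, or raise if it cannot be determined."""
    for shard in sc_status["shards"]:
        if shard["_id"] == shard_name:
            if "draining" not in shard:
                raise ShardNotPlannedForRemovalError(shard_name)
            return shard["draining"]
    raise ShardNotInClusterError(shard_name)

sc_status = {"shards": [
    {"_id": "shard-one", "host": "shard-one/10.0.0.11:27017", "draining": True},
    {"_id": "shard-two", "host": "shard-two/10.0.0.21:27017"},
]}

assert shard_draining_state(sc_status, "shard-one") is True      # mid-drain
try:
    shard_draining_state(sc_status, "shard-two")                 # never asked to drain
except ShardNotPlannedForRemovalError:
    pass
try:
    shard_draining_state(sc_status, "shard-three")               # already removed
except ShardNotInClusterError:
    pass
```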
@@ -282,8 +282,8 @@ def _on_relation_changed(self, event): # TODO future PR, leader unit verifies shard was added to cluster (update-status hook) - def _on_relation_departed(self, event): - """todo""" + def _on_relation_departed(self, event) -> None: + """Waits for the shard to be fully drained from the cluster.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error("sharding interface not supported with config role=replication") @@ -306,16 +306,16 @@ def _on_relation_departed(self, event): try: drained = self.drained(mongos_hosts, self.charm.app.name) logger.debug("Shards draining state: %s", drained) - # no need to continously check and abuse resouces while shard is draining + # no need to continuously check and abuse resources while shard is draining time.sleep(10) except PyMongoError as e: - logger.error("Error occured while draining shard: %s", e) + logger.error("Error occurred while draining shard: %s", e) self.charm.unit.status = BlockedStatus("Failed to drain shard from cluster") - except ShardNotPlannedForRemoval: + except ShardNotPlannedForRemovalError: logger.info( "Shard %s has not been identifies for removal. Must wait for mongos cluster-admin to remove shard." ) - except ShardNotInClusterException: + except ShardNotInClusterError: logger.info("Shard to remove, is not in sharded cluster.") break @@ -323,12 +323,12 @@ def _on_relation_departed(self, event): # TODO future PR, leader unit displays this message in update-status hook # TODO future PR, check for shard drainage when removing application - def drained(self, mongos_hosts, shard_name): - """todo + def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: + """Returns whether a shard has been drained from the cluster. Raises: - ConfigurationError, OperationFailure, ShardNotInClusterException, - ShardNotPlannedForRemoval + ConfigurationError, OperationFailure, ShardNotInClusterError, + ShardNotPlannedForRemovalError """ if not self.charm.is_role(Config.Role.SHARD): logger.info( diff --git a/src/charm.py b/src/charm.py index b74905e50..8a0956223 100755 --- a/src/charm.py +++ b/src/charm.py @@ -213,7 +213,6 @@ def mongos_config(self) -> MongoDBConfiguration: def remote_mongos_config(self, hosts) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" - # mongos that are part of the cluster have the same username and password, but different # hosts return self._get_mongos_config_for_user(OperatorUser, hosts) From 01f6b35a93d27dc4022dea9bb9ee9dad1ee0ffb6 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 4 Oct 2023 08:19:28 +0000 Subject: [PATCH 35/42] updates for new linter --- CONTRIBUTING.md | 2 +- lib/charms/mongodb/v0/mongodb.py | 2 +- lib/charms/mongodb/v0/mongodb_backups.py | 4 ++-- src/charm.py | 2 +- tests/integration/ha_tests/test_ha.py | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7f360d8c3..9007809d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,7 +70,7 @@ Testing high availability on a production cluster can be done with: tox run -e ha-integration -- --model= ``` -Note if you'd like to test storage re-use in ha-testing, your storage must not be of the type `rootfs`. `rootfs` storage is tied to the machine lifecycle and does not stick around after unit removal. `rootfs` storage is used by default with `tox run -e ha-integration`. 
To test ha-testing for storage re-use: +Note if you'd like to test storage reuse in ha-testing, your storage must not be of the type `rootfs`. `rootfs` storage is tied to the machine lifecycle and does not stick around after unit removal. `rootfs` storage is used by default with `tox run -e ha-integration`. To test ha-testing for storage reuse: ```shell juju create-storage-pool mongodb-ebs ebs volume-type=standard # create a storage pool juju deploy ./*charm --storage mongodb=mongodb-ebs,7G,1 # deploy 1 or more units of application with said storage pool diff --git a/lib/charms/mongodb/v0/mongodb.py b/lib/charms/mongodb/v0/mongodb.py index 25a65d7c0..1ff8bcc8e 100644 --- a/lib/charms/mongodb/v0/mongodb.py +++ b/lib/charms/mongodb/v0/mongodb.py @@ -308,7 +308,7 @@ def create_role(self, role_name: str, privileges: dict, roles: dict = []): Args: role_name: name of the role to be added. - privileges: privledges to be associated with the role. + privileges: privileges to be associated with the role. roles: List of roles from which this role inherits privileges. """ try: diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py index bd30b8d49..d152beb26 100644 --- a/lib/charms/mongodb/v0/mongodb_backups.py +++ b/lib/charms/mongodb/v0/mongodb_backups.py @@ -505,7 +505,7 @@ def _try_to_restore(self, backup_id: str) -> None: If PBM is resyncing, the function will retry to create backup (up to BACKUP_RESTORE_MAX_ATTEMPTS times) with BACKUP_RESTORE_ATTEMPT_COOLDOWN - time between attepts. + time between attempts. If PMB returen any other error, the function will raise RestoreError. """ @@ -541,7 +541,7 @@ def _try_to_backup(self): If PBM is resyncing, the function will retry to create backup (up to BACKUP_RESTORE_MAX_ATTEMPTS times) - with BACKUP_RESTORE_ATTEMPT_COOLDOWN time between attepts. + with BACKUP_RESTORE_ATTEMPT_COOLDOWN time between attempts. If PMB returen any other error, the function will raise BackupError. """ diff --git a/src/charm.py b/src/charm.py index 8a0956223..2e21e003e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1266,7 +1266,7 @@ def _juju_secret_set(self, scope: Scopes, key: str, value: str) -> str: secret = self.secrets[scope].get(Config.Secrets.SECRET_LABEL) - # It's not the first secret for the scope, we can re-use the existing one + # It's not the first secret for the scope, we can reuse the existing one # that was fetched in the previous call, as fetching secrets from juju is # slow if secret: diff --git a/tests/integration/ha_tests/test_ha.py b/tests/integration/ha_tests/test_ha.py index 13d0d84a2..36c2d937b 100644 --- a/tests/integration/ha_tests/test_ha.py +++ b/tests/integration/ha_tests/test_ha.py @@ -96,7 +96,7 @@ async def test_storage_re_use(ops_test, continuous_writes): app = await helpers.app_name(ops_test) if helpers.storage_type(ops_test, app) == "rootfs": pytest.skip( - "re-use of storage can only be used on deployments with persistent storage not on rootfs deployments" + "reuse of storage can only be used on deployments with persistent storage not on rootfs deployments" ) # removing the only replica can be disastrous @@ -117,7 +117,7 @@ async def test_storage_re_use(ops_test, continuous_writes): assert await helpers.reused_storage( ops_test, new_unit.public_address, removal_time - ), "attached storage not properly re-used by MongoDB." + ), "attached storage not properly reused by MongoDB." 
# verify that the no writes were skipped total_expected_writes = await helpers.stop_continous_writes(ops_test) @@ -501,7 +501,7 @@ async def test_full_cluster_crash(ops_test: OpsTest, continuous_writes, reset_re ) # This test serves to verify behavior when all replicas are down at the same time that when - # they come back online they operate as expected. This check verifies that we meet the criterea + # they come back online they operate as expected. This check verifies that we meet the criteria # of all replicas being down at the same time. assert await helpers.all_db_processes_down(ops_test), "Not all units down at the same time." @@ -549,7 +549,7 @@ async def test_full_cluster_restart(ops_test: OpsTest, continuous_writes, reset_ ) # This test serves to verify behavior when all replicas are down at the same time that when - # they come back online they operate as expected. This check verifies that we meet the criterea + # they come back online they operate as expected. This check verifies that we meet the criteria # of all replicas being down at the same time. assert await helpers.all_db_processes_down(ops_test), "Not all units down at the same time." From 359a890c5a3434dcfd9dee04076ba4aa714266f6 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 4 Oct 2023 08:51:00 +0000 Subject: [PATCH 36/42] add context for future work --- lib/charms/mongodb/v0/mongos.py | 4 +++- lib/charms/mongodb/v0/shards_interface.py | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index 74a4d9e60..d2ca23da3 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -178,9 +178,11 @@ def remove_shard(self, shard_name: str) -> None: raise NotReadyError(cannot_remove_shard) # TODO Follow up PR, there is no MongoDB command to retrieve primary shard, this is - # possible with mongosh + # possible with mongosh. primary_shard = False if primary_shard: + # TODO Future PR, support removing Primary Shard if there are no unsharded collections + # on it. All sharded collections should perform `MovePrimary` cannot_remove_primary_shard = ( f"Shard {shard_name} is the primary shard, cannot remove." ) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index e737eec80..ebaafeefe 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -136,10 +136,10 @@ def _on_relation_event(self, event): logger.info("Adding shards not present in cluster.") self.add_shards(departed_relation_id) self.remove_shards(departed_relation_id) - # TODO Future PR, enable updating shards by listening for relation changed events - # TODO Future PR, enable shard drainage by listening for relation departed events except OperationFailure as e: if e.code == 20: + # TODO Future PR, allow removal of last shards that have no data. This will be + # tricky since we are not allowed to update the mongos config in this way. logger.error( "Cannot not remove the last shard from cluster, this is forbidden by mongos." 
) @@ -242,8 +242,6 @@ def __init__( charm.on[self.relation_name].relation_departed, self._on_relation_departed ) - # TODO Future PR, enable shard drainage by observing relation departed events - def _on_relation_changed(self, event): """Retrieves secrets from config-server and updates them within the shard.""" if self.charm.is_role(Config.Role.REPLICATION): From 7757e3f27cc40cd2c9036c8b5fee0de64664e008 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 4 Oct 2023 15:03:57 +0000 Subject: [PATCH 37/42] correct drained fucntion --- lib/charms/mongodb/v0/shards_interface.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index ebaafeefe..25001d449 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -334,9 +334,13 @@ def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: ) return False - if "draining" not in self.charm.app_peer_data and not self.charm.unit.is_leader(): + # leader has not checked for draining status, so we assume not yet drained + if "draining" not in self.charm.app_peer_data: return False + if not self.charm.unit.is_leader(): + return self.charm.app_peer_data.get("draining") + with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo: draining = mongo._is_shard_draining(shard_name) self.charm.app_peer_data["draining"] = json.dumps(draining) From 1dd7952f60e75e7b3e5b21faa4b389458cfc95ab Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 5 Oct 2023 11:48:39 +0000 Subject: [PATCH 38/42] fixing shard-removal --- lib/charms/mongodb/v0/mongos.py | 13 +++- lib/charms/mongodb/v0/shards_interface.py | 93 ++++++++++++++--------- 2 files changed, 68 insertions(+), 38 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index d2ca23da3..2e8c2b953 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -170,7 +170,9 @@ def remove_shard(self, shard_name: str) -> None: RemovePrimaryShardError """ sc_status = self.client.admin.command("listShards") - if self._is_any_draining(sc_status): + # It is necessary to call removeShard multiple times on a shard to guarantee removal. + # Allow re-removal of shards that are currently draining. + if self._is_any_draining(sc_status, ignore_shard=shard_name): cannot_remove_shard = ( f"cannot remove shard {shard_name} from cluster, another shard is draining" ) @@ -214,15 +216,20 @@ def _is_shard_draining(self, shard_name: str) -> bool: ) @staticmethod - def _is_any_draining(sc_status: Dict) -> bool: + def _is_any_draining(sc_status: Dict, ignore_shard: str = "") -> bool: """Returns true if any shard members is draining. Checks if any members in sharded cluster are draining data. Args: sc_status: current state of shard cluster status as reported by mongos. + ignore_shard: shard to ignore """ - return any(shard.get("draining", False) for shard in sc_status["shards"]) + return any( + # check draining status of all shards except the one to be ignored. 
+ shard.get("draining", False) if shard["_id"] != ignore_shard else False + for shard in sc_status["shards"] + ) @staticmethod def _hostname_from_hostport(hostname: str) -> str: diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 25001d449..70a9c0c22 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -24,7 +24,7 @@ ShardNotPlannedForRemovalError, ) from charms.mongodb.v0.users import MongoDBUser, OperatorUser -from ops.charm import CharmBase, RelationDepartedEvent +from ops.charm import CharmBase, EventBase, RelationDepartedEvent from ops.framework import Object from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed @@ -52,6 +52,10 @@ class RemoveLastShardError(Exception): """Raised when there is an attempt to remove the last shard in the cluster.""" +class NotDrainedError(Exception): + """Raised when a shard is still in the cluster after removal.""" + + class ShardingProvider(Object): """Manage relations between the config server and the shard, on the config-server's side.""" @@ -77,23 +81,9 @@ def __init__( def _on_relation_joined(self, event): """Handles providing shards with secrets and adding shards to the config server.""" - if self.charm.is_role(Config.Role.REPLICATION): - self.unit.status = BlockedStatus("role replication does not support sharding") - logger.error("sharding interface not supported with config role=replication") - return - - if not self.charm.is_role(Config.Role.CONFIG_SERVER): - logger.info( - "skipping relation joined event ShardingRequirer is only be executed by config-server" - ) + if not self.pass_hook_checks: return - if not self.charm.unit.is_leader(): - return - - if not self.charm.db_initialised: - event.defer() - # TODO Future PR, sync tls secrets and PBM password self._update_relation_data( event.relation.id, @@ -109,24 +99,39 @@ def _on_relation_joined(self, event): }, ) - def _on_relation_event(self, event): - """Handles adding, removing, and updating of shards.""" + def run_hook_checks(self, event: EventBase) -> bool: + """Runs the pre-hooks checks for ShardingRequirer, returns True if all pass.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") - logger.error("sharding interface not supported with config role=replication") - return + logger.error( + "Skipping %s. Sharding interface not supported with config role=replication.", + type(event), + ) + return False if not self.charm.is_role(Config.Role.CONFIG_SERVER): logger.info( - "skipping relation joined event ShardingRequirer is only be executed by config-server" + "Skipping %s. ShardingRequirer is only be executed by config-server", type(event) ) - return + return False if not self.charm.unit.is_leader(): - return + return False if not self.charm.db_initialised: + logger.info("Deferring %s. db is not initialised.", type(event)) event.defer() + return False + + return True + + def _on_relation_event(self, event): + """Handles adding and removing of shards. + + Updating of shards is done automatically via MongoDB change-streams. 
+ """ + if not self.pass_hook_checks: + return departed_relation_id = None if type(event) is RelationDepartedEvent: @@ -136,6 +141,13 @@ def _on_relation_event(self, event): logger.info("Adding shards not present in cluster.") self.add_shards(departed_relation_id) self.remove_shards(departed_relation_id) + except NotDrainedError: + # it is necessary to removeShard multiple times for the shard to be removed. + logger.info( + "Shard is still present in the cluster after removal, will defer and remove again." + ) + event.defer() + return except OperationFailure as e: if e.code == 20: # TODO Future PR, allow removal of last shards that have no data. This will be @@ -185,6 +197,11 @@ def remove_shards(self, departed_shard_id): mongo.remove_shard(shard) logger.info("Attempting to removing shard: %s", shard) + if shard in mongo.get_shard_members(): + shard_draining_message = f"shard {shard} still exists in cluster after removal, it is still draining." + logger.info(shard_draining_message) + raise NotDrainedError(shard_draining_message) + def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -303,7 +320,10 @@ def _on_relation_departed(self, event) -> None: while not drained: try: drained = self.drained(mongos_hosts, self.charm.app.name) - logger.debug("Shards draining state: %s", drained) + draining_status = ( + "Shard is still draining" if not drained else "Shard is fully drained." + ) + logger.debug(draining_status) # no need to continuously check and abuse resources while shard is draining time.sleep(10) except PyMongoError as e: @@ -314,7 +334,12 @@ def _on_relation_departed(self, event) -> None: "Shard %s has not been identifies for removal. Must wait for mongos cluster-admin to remove shard." ) except ShardNotInClusterError: - logger.info("Shard to remove, is not in sharded cluster.") + logger.info( + "Shard to remove is not in sharded cluster. It has been successfully removed." + ) + if self.charm.unit.is_leader(): + self.charm.app_peer_data["drained"] = json.dumps(True) + break self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal") @@ -329,22 +354,20 @@ def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: ShardNotPlannedForRemovalError """ if not self.charm.is_role(Config.Role.SHARD): - logger.info( - "Component %s is not a shard, cannot check draining status.", self.charm.role - ) - return False - - # leader has not checked for draining status, so we assume not yet drained - if "draining" not in self.charm.app_peer_data: + logger.info("Component %s is not a shard, has no draining status.", self.charm.role) return False if not self.charm.unit.is_leader(): - return self.charm.app_peer_data.get("draining") + # if "drained" hasn't been set by leader, then assume it hasn't be drained. + return json.dumps(self.charm.app_peer_data.get("drained", False)) with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo: + # a shard is "drained" if it is NO LONGER draining. draining = mongo._is_shard_draining(shard_name) - self.charm.app_peer_data["draining"] = json.dumps(draining) - return self.charm.app_peer_data.get("draining") + drained = not draining + + self.charm.app_peer_data["drained"] = json.dumps(drained) + return drained def update_operator_password(self, new_password: str) -> None: """Updates the password for the operator user. 
From f05d51594613f09b18f8c35692a1c03ad3ded6dc Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 5 Oct 2023 15:33:15 +0000 Subject: [PATCH 39/42] departed cannot access relation.units --- lib/charms/mongodb/v0/mongos.py | 2 -- lib/charms/mongodb/v0/shards_interface.py | 40 +++++++++++++---------- src/charm.py | 4 +++ 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py index 15dc8a349..105ba3d72 100644 --- a/lib/charms/mongodb/v0/mongos.py +++ b/lib/charms/mongodb/v0/mongos.py @@ -9,11 +9,9 @@ from charms.mongodb.v0.mongodb import NotReadyError from pymongo import MongoClient -from pymongo.errors import PyMongoError from config import Config - # The unique Charmhub library identifier, never change it LIBID = "e20d5b19670d4c55a4934a21d3f3b29a" diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index fbb7bc695..5522ec24c 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -44,6 +44,7 @@ # to 0 if you are raising the major API version LIBPATCH = 2 KEYFILE_KEY = "key-file" +HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) @@ -61,7 +62,7 @@ class ShardingProvider(Object): def __init__( self, charm: CharmBase, relation_name: str = Config.Relations.CONFIG_SERVER_RELATIONS_NAME ) -> None: - """Constructor for ShardingRequirer object.""" + """Constructor for ShardingProvider object.""" self.relation_name = relation_name self.charm = charm @@ -80,7 +81,7 @@ def __init__( def _on_relation_joined(self, event): """Handles providing shards with secrets and adding shards to the config server.""" - if not self.pass_hook_checks: + if not self.pass_hook_checks(event): return # TODO Future PR, sync tls secrets and PBM password @@ -94,11 +95,12 @@ def _on_relation_joined(self, event): KEYFILE_KEY: self.charm.get_secret( Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME ), + HOSTS_KEY: json.dumps(self.charm._unit_ips), }, ) def pass_hook_checks(self, event: EventBase) -> bool: - """Runs the pre-hooks checks for ShardingRequirer, returns True if all pass.""" + """Runs the pre-hooks checks for ShardingProvider, returns True if all pass.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") logger.error( @@ -109,7 +111,7 @@ def pass_hook_checks(self, event: EventBase) -> bool: if not self.charm.is_role(Config.Role.CONFIG_SERVER): logger.info( - "Skipping %s. ShardingRequirer is only be executed by config-server", type(event) + "Skipping %s. ShardingProvider is only be executed by config-server", type(event) ) return False @@ -128,7 +130,7 @@ def _on_relation_event(self, event): Updating of shards is done automatically via MongoDB change-streams. """ - if not self.pass_hook_checks: + if not self.pass_hook_checks(event): return departed_relation_id = None @@ -155,6 +157,9 @@ def _on_relation_event(self, event): ) # we should not lose connection with the shard, prevent other hooks from executing. 
raise RemoveLastShardError() + + logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) + event.defer() except (PyMongoError, NotReadyError) as e: logger.error("Deferring _on_relation_event for shards interface since: error=%r", e) event.defer() @@ -214,6 +219,15 @@ def update_credentials(self, key: str, value: str) -> None: for relation in self.charm.model.relations[self.relation_name]: self._update_relation_data(relation.id, {key: value}) + def _update_mongos_hosts(self): + """Updates the hosts for mongos on the relation data.""" + if not self.charm.is_role(Config.Role.CONFIG_SERVER): + logger.info("Skipping, ShardingProvider is only be executed by config-server") + return + + for relation in self.charm.model.relations[self.relation_name]: + self._update_relation_data(relation.id, {HOSTS_KEY: json.dumps(self.charm._unit_ips)}) + def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -340,17 +354,19 @@ def _on_relation_departed(self, event) -> None: return self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") - mongos_hosts = self._get_mongos_hosts() + # mongos hosts must be retrieved via relation data, as relation.units are not available in + # departed + mongos_hosts = json.loads(event.relation.data[event.relation.app].get(HOSTS_KEY)) drained = False while not drained: try: + time.sleep(10) drained = self.drained(mongos_hosts, self.charm.app.name) draining_status = ( "Shard is still draining" if not drained else "Shard is fully drained." ) logger.debug(draining_status) # no need to continuously check and abuse resources while shard is draining - time.sleep(10) except PyMongoError as e: logger.error("Error occurred while draining shard: %s", e) self.charm.unit.status = BlockedStatus("Failed to drain shard from cluster") @@ -475,13 +491,3 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: relation = self.charm.model.get_relation(self.relation_name, relation_id) if relation: relation.data[self.charm.model.app].update(data) - - def _get_mongos_hosts(self) -> List[str]: - """Retrieves the hosts for mongos router.""" - # metadata limits us to only one config-server relation - config_server_relations = self.model.relations[self.relation_name][0] - hosts = [] - for unit in config_server_relations.units: - hosts.append(config_server_relations.data[unit].get("private-address")) - - return hosts diff --git a/src/charm.py b/src/charm.py index ab792e959..4d621ba96 100755 --- a/src/charm.py +++ b/src/charm.py @@ -423,6 +423,7 @@ def _on_relation_joined(self, event: RelationJoinedEvent) -> None: # app relations should be made aware of the new set of hosts try: self.client_relations.update_app_relation_data() + self.shard_relations._update_mongos_hosts() except PyMongoError as e: logger.error("Deferring on updating app relation data since: error: %r", e) event.defer() @@ -485,6 +486,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: # app relations should be made aware of the new set of hosts try: self.client_relations.update_app_relation_data() + self.shard_relations._update_mongos_hosts() except PyMongoError as e: logger.error("Deferring on updating app relation data since: error: %r", e) event.defer() @@ -505,6 +507,7 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None: # app relations should be made aware of the new set of hosts try: self.client_relations.update_app_relation_data() + 
self.shard_relations._update_mongos_hosts() except PyMongoError as e: logger.error("Deferring on updating app relation data since: error: %r", e) event.defer() @@ -901,6 +904,7 @@ def _handle_reconfigure(self, event: UpdateStatusEvent): # app relations should be made aware of the new set of hosts try: self.client_relations.update_app_relation_data() + self.shard_relations._update_mongos_hosts() except PyMongoError as e: logger.error("Deferring on updating app relation data since: error: %r", e) event.defer() From 8f9176c4542f7f62ab195d3f7879e63e33c9e4a7 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 5 Oct 2023 16:00:41 +0000 Subject: [PATCH 40/42] relation departed events will occur when units are removed, instead listen to broken --- lib/charms/mongodb/v0/shards_interface.py | 17 +++++++++-------- src/charm.py | 6 +----- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py index 5522ec24c..a9693a330 100644 --- a/lib/charms/mongodb/v0/shards_interface.py +++ b/lib/charms/mongodb/v0/shards_interface.py @@ -24,7 +24,7 @@ ShardNotPlannedForRemovalError, ) from charms.mongodb.v0.users import MongoDBUser, OperatorUser -from ops.charm import CharmBase, EventBase, RelationDepartedEvent +from ops.charm import CharmBase, EventBase, RelationBrokenEvent from ops.framework import Object from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed @@ -74,10 +74,11 @@ def __init__( charm.on[self.relation_name].relation_changed, self._on_relation_event ) self.framework.observe( - charm.on[self.relation_name].relation_departed, self._on_relation_event + charm.on[self.relation_name].relation_broken, self._on_relation_event ) - # TODO Follow up PR, handle rotating passwords + # TODO Future PR: handle self healing when all IP addresses of a shard changes and we have + # to manually update mongos def _on_relation_joined(self, event): """Handles providing shards with secrets and adding shards to the config server.""" @@ -134,7 +135,7 @@ def _on_relation_event(self, event): return departed_relation_id = None - if type(event) is RelationDepartedEvent: + if type(event) is RelationBrokenEvent: departed_relation_id = event.relation.id try: @@ -286,7 +287,7 @@ def __init__( charm.on[self.relation_name].relation_changed, self._on_relation_changed ) self.framework.observe( - charm.on[self.relation_name].relation_departed, self._on_relation_departed + charm.on[self.relation_name].relation_broken, self._on_relation_broken ) def _on_relation_changed(self, event): @@ -336,7 +337,7 @@ def _on_relation_changed(self, event): # TODO future PR, leader unit verifies shard was added to cluster (update-status hook) - def _on_relation_departed(self, event) -> None: + def _on_relation_broken(self, event) -> None: """Waits for the shard to be fully drained from the cluster.""" if self.charm.is_role(Config.Role.REPLICATION): self.unit.status = BlockedStatus("role replication does not support sharding") @@ -345,7 +346,7 @@ def _on_relation_departed(self, event) -> None: if not self.charm.is_role(Config.Role.SHARD): logger.info( - "skipping relation departed event ShardingProvider is only be executed by shards" + "skipping relation broken event ShardingProvider is only be executed by shards" ) return @@ -355,7 +356,7 @@ def _on_relation_departed(self, event) -> None: self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") 
         # mongos hosts must be retrieved via relation data, as relation.units are not available in
-        # departed
+        # broken
         mongos_hosts = json.loads(event.relation.data[event.relation.app].get(HOSTS_KEY))
         drained = False
         while not drained:
diff --git a/src/charm.py b/src/charm.py
index 4d621ba96..fba4b6c4b 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -171,11 +171,7 @@ def drained(self) -> bool:
             logger.info("Component %s is not a shard, cannot check draining status.", self.role)
             return False
 
-        if "draining" not in self.app_peer_data:
-            logger.info("Draining status has not been set. It is set on relation_departed")
-            return False
-
-        return self.app_peer_data.get("drain")
+        return self.app_peer_data.get("drained", False)
 
     @property
     def _unit_ips(self) -> List[str]:

From e8d7167762a262f6bcaf3a8c3c5460a46b720c86 Mon Sep 17 00:00:00 2001
From: Mia Altieri
Date: Fri, 6 Oct 2023 08:57:32 +0000
Subject: [PATCH 41/42] small fixes

---
 lib/charms/mongodb/v0/mongos.py           | 2 +-
 lib/charms/mongodb/v0/shards_interface.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py
index 105ba3d72..06a007d31 100644
--- a/lib/charms/mongodb/v0/mongos.py
+++ b/lib/charms/mongodb/v0/mongos.py
@@ -20,7 +20,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 1
+LIBPATCH = 2
 
 # path to store mongodb ketFile
 logger = logging.getLogger(__name__)
diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py
index a9693a330..80bd22245 100644
--- a/lib/charms/mongodb/v0/shards_interface.py
+++ b/lib/charms/mongodb/v0/shards_interface.py
@@ -42,7 +42,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 2
+LIBPATCH = 3
 KEYFILE_KEY = "key-file"
 HOSTS_KEY = "host"
 OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
@@ -361,13 +361,13 @@
         drained = False
         while not drained:
             try:
+                # no need to continuously check and abuse resources while shard is draining
                 time.sleep(10)
                 drained = self.drained(mongos_hosts, self.charm.app.name)
                 draining_status = (
                     "Shard is still draining" if not drained else "Shard is fully drained."
                 )
                 logger.debug(draining_status)
-                # no need to continuously check and abuse resources while shard is draining
             except PyMongoError as e:
                 logger.error("Error occurred while draining shard: %s", e)
                 self.charm.unit.status = BlockedStatus("Failed to drain shard from cluster")

From c5d49c2871771ddc725d8807688e710dde108477 Mon Sep 17 00:00:00 2001
From: Mia Altieri
Date: Fri, 6 Oct 2023 13:40:29 +0000
Subject: [PATCH 42/42] pr comments

---
 lib/charms/mongodb/v0/mongos.py           | 30 ++++++++++++++++++++---
 lib/charms/mongodb/v0/shards_interface.py | 13 ++++++++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/lib/charms/mongodb/v0/mongos.py b/lib/charms/mongodb/v0/mongos.py
index 06a007d31..f51ee4097 100644
--- a/lib/charms/mongodb/v0/mongos.py
+++ b/lib/charms/mongodb/v0/mongos.py
@@ -180,12 +180,12 @@ def remove_shard(self, shard_name: str) -> None:
             cannot_remove_shard = (
                 f"cannot remove shard {shard_name} from cluster, another shard is draining"
             )
-            logger.info(cannot_remove_shard)
+            logger.error(cannot_remove_shard)
             raise NotReadyError(cannot_remove_shard)
 
         # TODO Follow up PR, there is no MongoDB command to retrieve primary shard, this is
         # possible with mongosh.
-        primary_shard = False
+        primary_shard = self.get_primary_shard()
         if primary_shard:
             # TODO Future PR, support removing Primary Shard if there are no unsharded collections
             # on it. All sharded collections should perform `MovePrimary`
@@ -196,8 +196,24 @@ def remove_shard(self, shard_name: str) -> None:
             raise RemovePrimaryShardError(cannot_remove_primary_shard)
 
         logger.info("Attempting to remove shard %s", shard_name)
-        self.client.admin.command("removeShard", shard_name)
-        logger.info("Shard %s, now draining", shard_name)
+        removal_info = self.client.admin.command("removeShard", shard_name)
+
+        # process removal status
+        remaining_chunks = (
+            removal_info["remaining"]["chunks"] if "remaining" in removal_info else "None"
+        )
+        dbs_to_move = (
+            removal_info["dbsToMove"]
+            if "dbsToMove" in removal_info and removal_info["dbsToMove"] != []
+            else ["None"]
+        )
+        logger.info(
+            "Shard %s draining status is: %s. Remaining chunks: %s. DBs to move: %s.",
+            shard_name,
+            removal_info["state"],
+            str(remaining_chunks),
+            ",".join(dbs_to_move),
+        )
 
     def _is_shard_draining(self, shard_name: str) -> bool:
         """Reports if a given shard is currently in the draining state.
@@ -219,6 +235,12 @@ def _is_shard_draining(self, shard_name: str) -> bool:
                 f"Shard {shard_name} not in cluster, could not retrieve draining status"
             )
 
+    def get_primary_shard(self) -> str:
+        """Processes sc_status and identifies the primary shard."""
+        # TODO Follow up PR: implement this function; there is no MongoDB command to retrieve
+        # the primary shard, but it is possible with mongosh.
+        return False
+
     @staticmethod
     def _is_any_draining(sc_status: Dict, ignore_shard: str = "") -> bool:
         """Returns true if any shard members is draining.
diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py
index 80bd22245..868c5b5de 100644
--- a/lib/charms/mongodb/v0/shards_interface.py
+++ b/lib/charms/mongodb/v0/shards_interface.py
@@ -46,6 +46,7 @@
 KEYFILE_KEY = "key-file"
 HOSTS_KEY = "host"
 OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
+FORBIDDEN_REMOVAL_ERR_CODE = 20
 
 
 class RemoveLastShardError(Exception):
@@ -83,6 +84,7 @@ def __init__(
     def _on_relation_joined(self, event):
         """Handles providing shards with secrets and adding shards to the config server."""
         if not self.pass_hook_checks(event):
+            logger.info("Skipping relation joined event: hook checks did not pass")
             return
 
         # TODO Future PR, sync tls secrets and PBM password
@@ -132,6 +134,13 @@ def _on_relation_event(self, event):
         Updating of shards is done automatically via MongoDB change-streams.
         """
         if not self.pass_hook_checks(event):
+            logger.info("Skipping relation event: hook checks did not pass")
+            return
+
+        # adding/removing shards while a backup/restore is in progress can be disastrous
+        pbm_status = self.charm.backups._get_pbm_status()
+        if isinstance(pbm_status, MaintenanceStatus):
+            event.defer()  # retry once the backup/restore has completed
             return
 
         departed_relation_id = None
@@ -150,7 +159,7 @@
             event.defer()
             return
         except OperationFailure as e:
-            if e.code == 20:
+            if e.code == FORBIDDEN_REMOVAL_ERR_CODE:
                 # TODO Future PR, allow removal of last shards that have no data. This will be
                 # tricky since we are not allowed to update the mongos config in this way.
                 logger.error(
@@ -208,7 +217,7 @@ def remove_shards(self, departed_shard_id):
            self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}")
            logger.info("Attempting to removing shard: %s", shard)
            mongo.remove_shard(shard)
-           logger.info("Attempting to removing shard: %s", shard)
+           logger.info("Shard %s is now draining", shard)
            if shard in mongo.get_shard_members():
                shard_draining_message = f"shard {shard} still exists in cluster after removal, it is still draining."
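
The drain handling in PATCH 40-42 hinges on the fact that MongoDB's `removeShard` admin command is idempotent: the first call starts draining ("state": "started"), later calls report progress via "remaining.chunks" and "dbsToMove", and the final call returns "state": "completed". The following standalone pymongo sketch shows that polling pattern outside the charm; it is illustrative only, the mongos URI, shard name, and the `wait_for_drain` helper are assumptions and not part of the charm code above.

    # Sketch: poll removeShard until a shard has finished draining.
    import time

    from pymongo import MongoClient

    MONGOS_URI = "mongodb://localhost:27017"  # assumption: any reachable mongos


    def wait_for_drain(client: MongoClient, shard_name: str, poll_seconds: int = 10) -> None:
        """Re-issue removeShard until the balancer reports the shard fully drained."""
        while True:
            status = client.admin.command("removeShard", shard_name)
            state = status["state"]
            remaining = status.get("remaining", {}).get("chunks", 0)
            dbs_to_move = status.get("dbsToMove", [])
            print(f"{shard_name}: state={state}, remaining chunks={remaining}, dbsToMove={dbs_to_move}")
            if state == "completed":
                return
            # avoid hammering the cluster while chunks are being migrated
            time.sleep(poll_seconds)


    if __name__ == "__main__":
        wait_for_drain(MongoClient(MONGOS_URI), "shard-one")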
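The backup/restore guard added to `_on_relation_event` relies on the ops framework's deferral mechanism, and `EventBase.defer()` takes no arguments, which is why the guard above logs the reason separately and then defers. Below is a minimal, hedged sketch of that guard pattern in an ops charm; `BackupGuardedCharm`, the "sharding" relation name, and the `_backup_in_progress` helper are hypothetical stand-ins for the charm's `self.backups._get_pbm_status()` check, not the charm's real code.

    # Sketch: defer a relation hook while a long-running operation is in flight.
    import logging

    from ops.charm import CharmBase, RelationChangedEvent
    from ops.main import main
    from ops.model import MaintenanceStatus

    logger = logging.getLogger(__name__)


    class BackupGuardedCharm(CharmBase):
        def __init__(self, *args):
            super().__init__(*args)
            # assumption: a relation named "sharding" exists in metadata.yaml
            self.framework.observe(self.on["sharding"].relation_changed, self._on_relation_changed)

        def _backup_in_progress(self) -> bool:
            # assumption: stand-in for the MongoDB charm's backups._get_pbm_status() check
            return isinstance(self.unit.status, MaintenanceStatus)

        def _on_relation_changed(self, event: RelationChangedEvent) -> None:
            if self._backup_in_progress():
                logger.info("Backup/restore in progress, deferring shard reconfiguration.")
                event.defer()  # re-emitted on the next dispatch of this unit
                return
            # ... add/remove shards here ...


    if __name__ == "__main__":
        main(BackupGuardedCharm)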
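`get_primary_shard` is still a stub because there is no dedicated server command that lists primary shards. One possible direction for the follow-up work, not what this patch implements, is to read the `config.databases` collection through mongos: each database document names the shard holding its unsharded collections in the `primary` field. A hedged sketch, with an assumed local mongos URI and illustrative names:

    # Sketch: find databases whose primary shard is a given shard, via config.databases.
    from typing import List

    from pymongo import MongoClient


    def databases_with_primary(client: MongoClient, shard_name: str) -> List[str]:
        """Return database names whose primary shard is `shard_name`."""
        config_db = client["config"]
        return [doc["_id"] for doc in config_db["databases"].find({"primary": shard_name})]


    if __name__ == "__main__":
        client = MongoClient("mongodb://localhost:27017")  # assumption: local mongos
        print(databases_with_primary(client, "shard-one"))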