diff --git a/lib/charms/mongodb/v0/shards_interface.py b/lib/charms/mongodb/v0/shards_interface.py
new file mode 100644
index 000000000..be991a7c9
--- /dev/null
+++ b/lib/charms/mongodb/v0/shards_interface.py
@@ -0,0 +1,235 @@
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""This module manages relations between config-servers and shards.
+
+It handles the sharing of secrets between sharded components, adding shards, and removing
+shards.
+"""
+import logging
+
+from charms.mongodb.v0.helpers import KEY_FILE
+from charms.mongodb.v0.mongodb import MongoDBConnection, NotReadyError, PyMongoError
+from charms.mongodb.v0.users import MongoDBUser, OperatorUser
+from ops.charm import CharmBase
+from ops.framework import Object
+from ops.model import BlockedStatus, MaintenanceStatus, WaitingStatus
+from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed
+
+from config import Config
+
+logger = logging.getLogger(__name__)
+
+
+# The unique Charmhub library identifier, never change it
+LIBID = "55fee8fa73364fb0a2dc16a954b2fd4a"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 1
+KEYFILE_KEY = "key-file"
+OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
+
+
+class ShardingProvider(Object):
+    """Manage relations between the config server and the shard, on the config-server's side."""
+
+    def __init__(
+        self, charm: CharmBase, relation_name: str = Config.Relations.CONFIG_SERVER_RELATIONS_NAME
+    ) -> None:
+        """Constructor for ShardingProvider object."""
+        self.relation_name = relation_name
+        self.charm = charm
+
+        super().__init__(charm, self.relation_name)
+        self.framework.observe(
+            charm.on[self.relation_name].relation_joined, self._on_relation_joined
+        )
+        # TODO Future PR, enable shard drainage by listening for relation departed events
+
+    def _on_relation_joined(self, event):
+        """Handles providing shards with secrets and adding shards to the config server."""
+        if self.charm.is_role(Config.Role.REPLICATION):
+            self.charm.unit.status = BlockedStatus("role replication does not support sharding")
+            logger.error("sharding interface not supported with config role=replication")
+            return
+
+        if not self.charm.is_role(Config.Role.CONFIG_SERVER):
+            logger.info(
+                "skipping relation joined event, ShardingProvider is only executed by the config-server"
+            )
+            return
+
+        if not self.charm.unit.is_leader():
+            return
+
+        if not self.charm.db_initialised:
+            event.defer()
+            return
+
+        # TODO Future PR, sync tls secrets and PBM password
+        self._update_relation_data(
+            event.relation.id,
+            {
+                OPERATOR_PASSWORD_KEY: self.charm.get_secret(
+                    Config.Relations.APP_SCOPE,
+                    OPERATOR_PASSWORD_KEY,
+                ),
+                KEYFILE_KEY: self.charm.get_secret(
+                    Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME
+                ),
+            },
+        )
+
+        # TODO Future PR, add shard to config server
+        # TODO Follow up PR, handle rotating passwords
+
+    def _update_relation_data(self, relation_id: int, data: dict) -> None:
+        """Updates a set of key-value pairs in the relation.
+
+        This function writes in the application data bag; therefore, only the leader unit can call
+        it.
+
+        Args:
+            relation_id: the identifier for a particular relation.
+            data: dict containing the key-value pairs
+                that should be updated in the relation.
+ """ + if self.charm.unit.is_leader(): + relation = self.charm.model.get_relation(self.relation_name, relation_id) + if relation: + relation.data[self.charm.model.app].update(data) + + +class ConfigServerRequirer(Object): + """Manage relations between the config server and the shard, on the shard's side.""" + + def __init__( + self, charm: CharmBase, relation_name: str = Config.Relations.SHARDING_RELATIONS_NAME + ) -> None: + """Constructor for ShardingProvider object.""" + self.relation_name = relation_name + self.charm = charm + + super().__init__(charm, self.relation_name) + self.framework.observe( + charm.on[self.relation_name].relation_changed, self._on_relation_changed + ) + + # TODO Future PR, enable shard drainage by observing relation departed events + + def _on_relation_changed(self, event): + """Retrieves secrets from config-server and updates them within the shard.""" + if self.charm.is_role(Config.Role.REPLICATION): + self.charm.unit.status = BlockedStatus("role replication does not support sharding") + logger.error("sharding interface not supported with config role=replication") + return + + if not self.charm.is_role(Config.Role.SHARD): + logger.info( + "skipping relation changed event ShardingProvider is only be executed by shards" + ) + return + + if not self.charm.db_initialised: + event.defer() + return + + # shards rely on the config server for secrets + relation_data = event.relation.data[event.app] + self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) + + # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). + with MongoDBConnection(self.charm.mongodb_config) as mongo: + if not mongo.is_ready: + logger.info("shard has not started yet, deferfing") + self.charm.unit.status = WaitingStatus("Waiting for MongoDB to start") + event.defer() + return + + self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + + if not self.charm.unit.is_leader(): + return + + # TODO Future work, see if needed to check for all units restarted / primary elected + + try: + self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) + except RetryError: + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + logger.error( + "Shard could not be added to config server, failed to set operator password." + ) + return + + # TODO future PR, leader unit verifies shard was added to cluster + + def update_operator_password(self, new_password: str) -> None: + """Updates the password for the operator user. + + Raises: + RetryError + """ + if not new_password or not self.charm.unit.is_leader(): + return + + current_password = ( + self.charm.get_secret( + Config.Relations.APP_SCOPE, + OPERATOR_PASSWORD_KEY, + ), + ) + + if new_password == current_password: + return + + # updating operator password, usually comes after keyfile was updated, hence, the mongodb + # service was restarted. Sometimes this requires units getting insync again. + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + # TODO, in the future use set_password from src/charm.py - this will require adding + # a library, for exceptions used in both charm code and lib code. + with MongoDBConnection(self.charm.mongodb_config) as mongo: + try: + mongo.set_user_password(OperatorUser.get_username(), new_password) + except NotReadyError: + logger.error( + "Failed changing the password: Not all members healthy or finished initial sync." 
+                        )
+                        raise
+                    except PyMongoError as e:
+                        logger.error(f"Failed changing the password: {e}")
+                        raise
+
+        self.charm.set_secret(
+            Config.Relations.APP_SCOPE,
+            OPERATOR_PASSWORD_KEY,
+            new_password,
+        )
+
+    def update_keyfile(self, key_file_contents: str) -> None:
+        """Updates keyfile on all units."""
+        # the keyfile is set by the leader in application data; application data does not
+        # necessarily match what is on the machine.
+        current_key_file = self.charm.get_keyfile_contents()
+        if not key_file_contents or key_file_contents == current_key_file:
+            return
+
+        # put keyfile on the machine with appropriate permissions
+        self.charm.push_file_to_unit(
+            parent_dir=Config.MONGOD_CONF_DIR, file_name=KEY_FILE, file_contents=key_file_contents
+        )
+
+        # when the contents of the keyfile change, we must restart the service
+        self.charm.restart_mongod_service()
+
+        if not self.charm.unit.is_leader():
+            return
+
+        self.charm.set_secret(
+            Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME, key_file_contents
+        )
diff --git a/metadata.yaml b/metadata.yaml
index fe03ed10a..0cf7618da 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -25,6 +25,8 @@ provides:
     interface: mongodb
   cos-agent:
     interface: cos_agent
+  config-server:
+    interface: shards
 
 storage:
   mongodb:
@@ -39,6 +41,10 @@ requires:
   certificates:
     interface: tls-certificates
     limit: 1
-
   s3-credentials:
     interface: s3
+    limit: 1
+  sharding:
+    interface: shards
+    # shards can only relate to one config-server
+    limit: 1
diff --git a/src/charm.py b/src/charm.py
index 4501f943a..d5c34e40e 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -4,9 +4,12 @@
 # See LICENSE file for licensing details.
 import json
 import logging
+import os
+import pwd
 import re
 import subprocess
 import time
+from pathlib import Path
 from typing import Dict, List, Optional, Set
 
 from charms.grafana_agent.v0.cos_agent import COSAgentProvider
@@ -32,6 +35,7 @@
 from charms.mongodb.v0.mongodb_provider import MongoDBProvider
 from charms.mongodb.v0.mongodb_tls import MongoDBTLS
 from charms.mongodb.v0.mongodb_vm_legacy_provider import MongoDBLegacyProvider
+from charms.mongodb.v0.shards_interface import ConfigServerRequirer, ShardingProvider
 from charms.mongodb.v0.users import (
     CHARM_USERS,
     BackupUser,
@@ -43,6 +47,7 @@
 from ops.charm import (
     ActionEvent,
     CharmBase,
+    ConfigChangedEvent,
     InstallEvent,
     LeaderElectedEvent,
     RelationDepartedEvent,
@@ -72,11 +77,7 @@
     ApplicationHostNotFoundError,
     SecretNotAddedError,
 )
-from machine_helpers import (
-    push_file_to_unit,
-    remove_file_from_unit,
-    update_mongod_service,
-)
+from machine_helpers import MONGO_USER, ROOT_USER_GID, update_mongod_service
 
 logger = logging.getLogger(__name__)
 
@@ -91,6 +92,9 @@ class MongodbOperatorCharm(CharmBase):
     def __init__(self, *args):
         super().__init__(*args)
         self._port = Config.MONGODB_PORT
+
+        # lifecycle events
+        self.framework.observe(self.on.config_changed, self._on_config_changed)
         self.framework.observe(self.on.install, self._on_install)
         self.framework.observe(self.on.start, self._on_start)
         self.framework.observe(self.on.update_status, self._on_update_status)
@@ -122,6 +126,8 @@ def __init__(self, *args):
         self.legacy_client_relations = MongoDBLegacyProvider(self)
         self.tls = MongoDBTLS(self, Config.Relations.PEERS, substrate=Config.SUBSTRATE)
         self.backups = MongoDBBackups(self)
+        self.shard_relations = ShardingProvider(self)
+        self.config_server_relations = ConfigServerRequirer(self)
 
         # relation events for Prometheus metrics are handled in the MetricsEndpointProvider
         self._grafana_agent = COSAgentProvider(
@@ -235,7 +241,23 @@ def db_initialised(self) -> bool:
     @property
     def role(self) -> str:
         """Returns role of MongoDB deployment."""
-        return self.model.config["role"]
+        if (
+            "role" not in self.app_peer_data
+            and self.unit.is_leader()
+            and self.model.config["role"]
+        ):
+            self.app_peer_data["role"] = self.model.config["role"]
+            # app data bag isn't set until function completes
+            return self.model.config["role"]
+        elif "role" not in self.app_peer_data:
+            # if leader hasn't set the role yet, use the one set by model
+            return self.model.config["role"]
+
+        return self.app_peer_data.get("role")
+
+    def is_role_changed(self) -> bool:
+        """Checks if the application's role has changed from the configured role."""
+        return self.role != self.model.config["role"]
 
     def is_role(self, role_name: str) -> bool:
         """Checks if application is running in provided role."""
@@ -290,6 +312,22 @@ def _on_install(self, event: InstallEvent) -> None:
 
         # add licenses
         copy_licenses_to_unit()
 
+    def _on_config_changed(self, event: ConfigChangedEvent) -> None:
+        """Listens to changes in application configuration.
+
+        To prevent a user from migrating a cluster, causing the component to become
+        unresponsive and therefore the cluster to fail, raise an error. This prevents the
+        charm from executing other hooks with a new role.
+        """
+        # TODO in the future (24.04) support migration of components
+        if self.is_role_changed():
+            logger.error(
+                f"cluster migration currently not supported, cannot change from {self.role} to {self.model.config['role']}"
+            )
+            raise ShardingMigrationError(
+                f"Migration of sharding components not permitted, revert config role to {self.role}"
+            )
+
     def _on_start(self, event: StartEvent) -> None:
         """Enables MongoDB service and initialises replica set.
@@ -537,6 +575,10 @@ def _on_get_password(self, event: ActionEvent) -> None:
 
     def _on_set_password(self, event: ActionEvent) -> None:
         """Set the password for the admin user."""
+        if self.is_role(Config.Role.SHARD):
+            event.fail("Cannot set password on shard, please set password on config-server.")
+            return
+
         # changing the backup password while a backup/restore is in progress can be disastrous
         pbm_status = self.backups._get_pbm_status()
         if isinstance(pbm_status, MaintenanceStatus):
@@ -560,21 +602,12 @@ def _on_set_password(self, event: ActionEvent) -> None:
                 f"Password cannot be longer than {Config.Secrets.MAX_PASSWORD_LENGTH} characters."
             )
             return
-        with MongoDBConnection(self.mongodb_config) as mongo:
-            try:
-                mongo.set_user_password(username, new_password)
-            except NotReadyError:
-                event.fail(
-                    "Failed changing the password: Not all members healthy or finished initial sync."
-                )
-                return
-            except PyMongoError as e:
-                event.fail(f"Failed changing the password: {e}")
-                return
-
-        secret_id = self.set_secret(
-            APP_SCOPE, MongoDBUser.get_password_key_name_for_user(username), new_password
-        )
+
+        try:
+            secret_id = self.set_password(username, new_password)
+        except SetPasswordError as e:
+            event.fail(e)
+            return
 
         if username == BackupUser.get_username():
             self._connect_pbm_agent()
@@ -586,6 +619,26 @@ def _on_set_password(self, event: ActionEvent) -> None:
             {Config.Actions.PASSWORD_PARAM_NAME: new_password, "secret-id": secret_id}
         )
 
+    def set_password(self, username, password) -> int:
+        """Sets the password for a given username and returns the secret id.
+
+        Raises:
+            SetPasswordError
+        """
+        with MongoDBConnection(self.mongodb_config) as mongo:
+            try:
+                mongo.set_user_password(username, password)
+            except NotReadyError:
+                raise SetPasswordError(
+                    "Failed changing the password: Not all members healthy or finished initial sync."
+                )
+            except PyMongoError as e:
+                raise SetPasswordError(f"Failed changing the password: {e}")
+
+        return self.set_secret(
+            APP_SCOPE, MongoDBUser.get_password_key_name_for_user(username), password
+        )
+
     def _on_secret_remove(self, event: SecretRemoveEvent):
         # We are keeping this function empty on purpose until the issue with secrets
         # is not fixed. The issue is: https://bugs.launchpad.net/juju/+bug/2023364
@@ -839,24 +892,63 @@ def _instatiate_keyfile(self, event: StartEvent) -> None:
             return
 
         # put keyfile on the machine with appropriate permissions
-        push_file_to_unit(
+        self.push_file_to_unit(
             parent_dir=Config.MONGOD_CONF_DIR,
             file_name=KEY_FILE,
             file_contents=self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME),
         )
 
+    def get_keyfile_contents(self) -> str:
+        """Retrieves the contents of the keyfile on the host machine."""
+        # wait for keyFile to be created by leader unit
+        if not self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME):
+            logger.debug("waiting for leader unit to generate keyfile contents")
+            return
+
+        key_file_path = f"{Config.MONGOD_CONF_DIR}/{KEY_FILE}"
+        key_file = Path(key_file_path)
+        if not key_file.is_file():
+            logger.info("no keyfile present")
+            return
+
+        with open(key_file_path, "r") as file:
+            key = file.read()
+
+        return key
+
+    def push_file_to_unit(self, parent_dir, file_name, file_contents) -> None:
+        """K8s charms can push files to their containers easily; this is a workaround for VM charms."""
+        Path(parent_dir).mkdir(parents=True, exist_ok=True)
+        file_name = f"{parent_dir}/{file_name}"
+        with open(file_name, "w") as write_file:
+            write_file.write(file_contents)
+
+        # MongoDB limitation; the keyfile needs 400 permissions and tls certs need 440 permissions
+        # to be able to connect via the MongoDB shell
+        if Config.TLS.KEY_FILE_NAME in file_name:
+            os.chmod(file_name, 0o400)
+        else:
+            os.chmod(file_name, 0o440)
+        mongodb_user = pwd.getpwnam(MONGO_USER)
+        os.chown(file_name, mongodb_user.pw_uid, ROOT_USER_GID)
+
+    def remove_file_from_unit(self, parent_dir, file_name) -> None:
+        """Removes a file from the VM unit."""
+        if os.path.exists(f"{parent_dir}/{file_name}"):
+            os.remove(f"{parent_dir}/{file_name}")
+
     def push_tls_certificate_to_workload(self) -> None:
         """Uploads certificate to the workload container."""
         external_ca, external_pem = self.tls.get_tls_files(UNIT_SCOPE)
         if external_ca is not None:
-            push_file_to_unit(
+            self.push_file_to_unit(
                 parent_dir=Config.MONGOD_CONF_DIR,
                 file_name=TLS_EXT_CA_FILE,
                 file_contents=external_ca,
             )
 
         if external_pem is not None:
-            push_file_to_unit(
+            self.push_file_to_unit(
                 parent_dir=Config.MONGOD_CONF_DIR,
                 file_name=TLS_EXT_PEM_FILE,
                 file_contents=external_pem,
@@ -864,21 +956,20 @@ def push_tls_certificate_to_workload(self) -> None:
 
         internal_ca, internal_pem = self.tls.get_tls_files(APP_SCOPE)
         if internal_ca is not None:
-            push_file_to_unit(
+            self.push_file_to_unit(
                 parent_dir=Config.MONGOD_CONF_DIR,
                 file_name=TLS_INT_CA_FILE,
                 file_contents=internal_ca,
             )
 
         if internal_pem is not None:
-            push_file_to_unit(
+            self.push_file_to_unit(
                 parent_dir=Config.MONGOD_CONF_DIR,
                 file_name=TLS_INT_PEM_FILE,
                 file_contents=internal_pem,
             )
 
-    @staticmethod
-    def delete_tls_certificate_from_workload() -> None:
+    def delete_tls_certificate_from_workload(self) -> None:
         """Deletes certificate from VM."""
 
         logger.info("Deleting TLS certificate from VM")
@@ -888,7 +979,7 @@ def delete_tls_certificate_from_workload() -> None:
             Config.TLS.INT_CA_FILE,
             Config.TLS.INT_PEM_FILE,
         ]:
-            remove_file_from_unit(Config.MONGOD_CONF_DIR, file)
+            self.remove_file_from_unit(Config.MONGOD_CONF_DIR, file)
 
     def _connect_mongodb_exporter(self) -> None:
         """Exposes the endpoint to mongodb_exporter."""
@@ -1261,5 +1352,13 @@ def _juju_secret_remove(self, scope: Scopes, key: str) -> None:
 
 # END: helper functions
 
+class ShardingMigrationError(Exception):
+    """Raised when there is an attempt to change the role of a sharding component."""
+
+
+class SetPasswordError(Exception):
+    """Raised on failure to set password for MongoDB user."""
+
+
 if __name__ == "__main__":
     main(MongodbOperatorCharm)
diff --git a/src/config.py b/src/config.py
index 6da269ec1..073065bee 100644
--- a/src/config.py
+++ b/src/config.py
@@ -72,6 +72,8 @@ class Relations:
         NAME = "database"
         PEERS = "database-peers"
         OBSOLETE_RELATIONS_NAME = "obsolete"
+        SHARDING_RELATIONS_NAME = "sharding"
+        CONFIG_SERVER_RELATIONS_NAME = "config-server"
         APP_SCOPE = "app"
         UNIT_SCOPE = "unit"
         Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
diff --git a/src/machine_helpers.py b/src/machine_helpers.py
index 424bdb56d..19e2daa4d 100644
--- a/src/machine_helpers.py
+++ b/src/machine_helpers.py
@@ -2,9 +2,6 @@
 # Copyright 2023 Canonical Ltd.
 # See LICENSE file for licensing details.
 import logging
-import os
-import pwd
-from pathlib import Path
 
 from charms.mongodb.v0.helpers import get_mongod_args, get_mongos_args
 from charms.mongodb.v0.mongodb import MongoDBConfiguration
@@ -50,26 +47,3 @@ def add_args_to_env(var: str, args: str):
 
     with open(Config.ENV_VAR_PATH, "w") as service_file:
         service_file.writelines(env_vars)
-
-
-def push_file_to_unit(parent_dir, file_name, file_contents) -> None:
-    """K8s charms can push files to their containers easily, this is the vm charm workaround."""
-    Path(parent_dir).mkdir(parents=True, exist_ok=True)
-    file_name = f"{parent_dir}/{file_name}"
-    with open(file_name, "w") as write_file:
-        write_file.write(file_contents)
-
-    # MongoDB limitation; it is needed 400 rights for keyfile and we need 440 rights on tls certs
-    # to be able to connect via MongoDB shell
-    if Config.TLS.KEY_FILE_NAME in file_name:
-        os.chmod(file_name, 0o400)
-    else:
-        os.chmod(file_name, 0o440)
-    mongodb_user = pwd.getpwnam(MONGO_USER)
-    os.chown(file_name, mongodb_user.pw_uid, ROOT_USER_GID)
-
-
-def remove_file_from_unit(parent_dir, file_name) -> None:
-    """Remove file from vm unit."""
-    if os.path.exists(f"{parent_dir}/{file_name}"):
-        os.remove(f"{parent_dir}/{file_name}")
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 9736b0041..b0bc63457 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -44,7 +44,7 @@ def setUp(self, *unused):
     @patch("charm.MongodbOperatorCharm._init_operator_user")
     @patch("charm.MongodbOperatorCharm._open_ports_tcp")
     @patch("charm.snap.SnapCache")
-    @patch("charm.push_file_to_unit")
+    @patch("charm.MongodbOperatorCharm.push_file_to_unit")
     @patch("builtins.open")
     def test_on_start_not_leader_doesnt_initialise_replica_set(
         self, open, path, snap, _open_ports_tcp, init_admin, connection, get_secret
@@ -71,7 +71,7 @@ def test_on_start_not_leader_doesnt_initialise_replica_set(
     @patch("charm.MongoDBConnection")
     @patch("charm.MongodbOperatorCharm._init_operator_user")
     @patch("charm.MongodbOperatorCharm._open_ports_tcp")
-    @patch("charm.push_file_to_unit")
+    @patch("charm.MongodbOperatorCharm.push_file_to_unit")
@patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") def test_on_start_snap_failure_leads_to_blocked_status( self, @@ -94,7 +94,7 @@ def test_on_start_snap_failure_leads_to_blocked_status( @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.MongodbOperatorCharm._initialise_replica_set") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user") @@ -126,7 +126,7 @@ def test_on_start_mongod_not_ready_defer( @patch_network_get(private_address="1.1.1.1") @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") def test_start_unable_to_open_tcp_moves_to_blocked(self, open, path, snap, _open_ports_tcp): """Test verifies that if TCP port cannot be opened we go to the blocked state.""" @@ -288,7 +288,7 @@ def test_reconfigure_add_member_failure(self, _, connection, defer): @patch_network_get(private_address="1.1.1.1") @patch("charm.MongodbOperatorCharm._open_ports_tcp") @patch("charm.snap.SnapCache") - @patch("charm.push_file_to_unit") + @patch("charm.MongodbOperatorCharm.push_file_to_unit") @patch("builtins.open") @patch("charm.MongoDBConnection") @patch("charm.MongodbOperatorCharm._init_operator_user")