Skip to content

Commit

Permalink
[DPE-3651] fix race condition with cluster auth (#363)
Browse files Browse the repository at this point in the history
## Issue
Flakey sharding tests would show that a cluster component would
*temporarily* go into error due to unauthorised access in update_status,
only for the for the unit to get resolved in the next hook execution

## Reasoning
this is due to improper sync of passwords across cluster

Fixes #348 

## Solution
add a check in update status to notify the user why the cluster
component is not yet connected
  • Loading branch information
MiaAltieri authored Feb 29, 2024
1 parent f7f8f85 commit 55b000e
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 1 deletion.
56 changes: 55 additions & 1 deletion lib/charms/mongodb/v1/shards_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 7
LIBPATCH = 8
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
Expand Down Expand Up @@ -442,6 +442,27 @@ def get_draining_shards(self) -> List[str]:

return draining_shards

def cluster_password_synced(self) -> bool:
"""Returns True if the cluster password is synced."""
# base case: not config-server
if not self.charm.is_role(Config.Role.CONFIG_SERVER):
return True

# base case: no cluster relation
if not self.model.relations[self.relation_name]:
return True

try:
# check our ability to use connect to cluster
with MongosConnection(self.charm.mongos_config) as mongos:
mongos.get_shard_members()
except OperationFailure as e:
if e.code == 18: # Unauthorized Error - i.e. password is not in sync
return False
raise

return True


class ConfigServerRequirer(Object):
"""Manage relations between the config server and the shard, on the shard's side."""
Expand Down Expand Up @@ -483,6 +504,8 @@ def _on_relation_changed(self, event):

# if re-using an old shard, re-set drained flag.
self.charm.unit_peer_data["drained"] = json.dumps(False)

# TODO: Future PR better status message behavior
self.charm.unit.status = MaintenanceStatus("Adding shard to config-server")

# shards rely on the config server for secrets
Expand Down Expand Up @@ -809,6 +832,37 @@ def _is_added_to_cluster(self) -> bool:

raise

def cluster_password_synced(self) -> bool:
"""Returns True if the cluster password is synced for the shard."""
# base case: not a shard
if not self.charm.is_role(Config.Role.SHARD):
return True

# base case: no cluster relation
if not self.model.get_relation(self.relation_name):
return True

try:
# check our ability to use connect to both mongos and our current replica set.
mongos_reachable = self._is_mongos_reachable()
with MongoDBConnection(self.charm.mongodb_config) as mongo:
mongod_reachable = mongo.is_ready
except OperationFailure as e:
if e.code == 18: # Unauthorized Error - i.e. password is not in sync
return False
raise

return mongos_reachable and mongod_reachable

def get_shard_members(self) -> List[str]:
"""Returns a list of shard members.
Raises: PyMongoError
"""
mongos_hosts = self.get_mongos_hosts()
with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo:
return mongo.get_shard_members()

def _is_shard_aware(self) -> bool:
"""Returns True if shard is in cluster and shard aware."""
if not self.model.get_relation(self.relation_name):
Expand Down
11 changes: 11 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,17 @@ def _on_update_status(self, event: UpdateStatusEvent):
self.unit.status = WaitingStatus("Waiting for MongoDB to start")
return

# Cannot check more advanced MongoDB statuses if the cluster doesn't have passwords synced
# this can occur in two cases:
# 1. password rotation
# 2. race conditions when a new shard is addeded.
if (
not self.shard.cluster_password_synced()
or not self.config_server.cluster_password_synced()
):
self.unit.status = WaitingStatus("Waiting to sync passwords across the cluster")
return

# leader should periodically handle configuring the replica set. Incidents such as network
# cuts can lead to new IP addresses and therefore will require a reconfigure. Especially
# in the case that the leader a change in IP address it will not receive a relation event.
Expand Down

0 comments on commit 55b000e

Please sign in to comment.