Skip to content

Commit

Permalink
fix bugs (#299)
Browse files Browse the repository at this point in the history
## Issue
We have several bugs in the sharding implementation, fix them
#297 
#301 
#302
  • Loading branch information
MiaAltieri authored Nov 6, 2023
1 parent 3bb5a44 commit 1d35616
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 21 deletions.
35 changes: 15 additions & 20 deletions lib/charms/mongodb/v1/shards_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 3
LIBPATCH = 4
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
Expand Down Expand Up @@ -434,8 +434,7 @@ def _on_relation_changed(self, event):
return

# if re-using an old shard, re-set drained flag.
if self.charm.unit.is_leader():
self.charm.app_peer_data["drained"] = json.dumps(False)
self.charm.unit_peer_data["drained"] = json.dumps(False)

self.charm.unit.status = MaintenanceStatus("Adding shard to config-server")

Expand Down Expand Up @@ -475,7 +474,7 @@ def _on_relation_changed(self, event):
event.defer()
return

self.charm.app_peer_data["added_to_cluster"] = json.dumps(True)
self.charm.app_peer_data["mongos_hosts"] = json.dumps(self.get_mongos_hosts())

def pass_hook_checks(self, event):
"""Runs the pre-hooks checks for ConfigServerRequirer, returns True if all pass."""
Expand Down Expand Up @@ -517,16 +516,11 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
return

self.charm.unit.status = MaintenanceStatus("Draining shard from cluster")
# mongos hosts must be retrieved via relation data, as relation.units are not available in
# broken
mongos_hosts = json.loads(event.relation.data[event.relation.app].get(HOSTS_KEY))
mongos_hosts = json.loads(self.charm.app_peer_data["mongos_hosts"])
self.wait_for_draining(mongos_hosts)

self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal")

if self.charm.unit.is_leader():
self.charm.app_peer_data["added_to_cluster"] = json.dumps(False)

def wait_for_draining(self, mongos_hosts: List[str]):
"""Waits for shards to be drained from sharded cluster."""
drained = False
Expand Down Expand Up @@ -554,8 +548,7 @@ def wait_for_draining(self, mongos_hosts: List[str]):
logger.info(
"Shard to remove is not in sharded cluster. It has been successfully removed."
)
if self.charm.unit.is_leader():
self.charm.app_peer_data["drained"] = json.dumps(True)
self.charm.unit_peer_data["drained"] = json.dumps(True)

break

Expand Down Expand Up @@ -607,16 +600,12 @@ def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool:
logger.info("Component %s is not a shard, has no draining status.", self.charm.role)
return False

if not self.charm.unit.is_leader():
# if "drained" hasn't been set by leader, then assume it hasn't be drained.
return json.dumps(self.charm.app_peer_data.get("drained", False))

with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo:
# a shard is "drained" if it is NO LONGER draining.
draining = mongo._is_shard_draining(shard_name)
drained = not draining

self.charm.app_peer_data["drained"] = json.dumps(drained)
self.charm.unit_peer_data["drained"] = json.dumps(drained)
return drained

def update_operator_password(self, new_password: str) -> None:
Expand Down Expand Up @@ -721,10 +710,16 @@ def _is_mongos_reachable(self) -> bool:

def _is_added_to_cluster(self) -> bool:
"""Returns True if the shard has been added to the cluster."""
if "added_to_cluster" not in self.charm.app_peer_data:
return False
try:
mongos_hosts = self.get_mongos_hosts()
with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo:
cluster_shards = mongo.get_shard_members()
return self.charm.app.name in cluster_shards
except OperationFailure as e:
if e.code == 13: # Unauthorized, we are not yet connected to mongos
return False

return json.loads(self.charm.app_peer_data.get("added_to_cluster"))
raise

def _is_shard_aware(self) -> bool:
"""Returns True if shard is in cluster and shard aware."""
Expand Down
2 changes: 1 addition & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def drained(self) -> bool:
logger.info("Component %s is not a shard, cannot check draining status.", self.role)
return False

return self.app_peer_data.get("drained", False)
return self.unit_peer_data.get("drained", False)

@property
def _unit_ips(self) -> List[str]:
Expand Down
8 changes: 8 additions & 0 deletions tests/integration/sharding_tests/test_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,14 @@ async def test_unconventual_shard_removal(ops_test: OpsTest):
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)

await ops_test.model.wait_for_idle(
apps=[SHARD_TWO_APP_NAME],
idle_period=20,
status="active",
timeout=TIMEOUT,
raise_on_error=False,
)

await ops_test.model.applications[SHARD_TWO_APP_NAME].destroy_units(f"{SHARD_TWO_APP_NAME}/0")
await ops_test.model.wait_for_idle(
apps=[SHARD_TWO_APP_NAME],
Expand Down

0 comments on commit 1d35616

Please sign in to comment.