[DPE-2763] POC Status reporting shard side (#275)
## Issue
Statuses are not properly reported for shards in the update-status hook.

## Solution
Report shard-side statuses.
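
The shard-side status is computed by the new `get_shard_status()` helper on the shard's sharding interface (`ConfigServerRequirer` in `shards_interface.py`, see the diff below). A minimal sketch of how a charm could surface that status from its update-status handler is shown here; the charm class name, the handler name, and the `self.shard` attribute are illustrative assumptions rather than part of this change:

```
# Sketch only: reporting the shard status from update-status.
# `MongoDBCharm`, `_on_update_status`, and `self.shard` are assumed names;
# get_shard_status() is the helper added in this PR (shards_interface.py).
from ops.charm import CharmBase
from ops.model import ActiveStatus


class MongoDBCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        # self.shard = ConfigServerRequirer(self)  # created as in shards_interface.py
        self.framework.observe(self.on.update_status, self._on_update_status)

    def _on_update_status(self, event):
        shard_status = self.shard.get_shard_status()
        if shard_status is not None:
            # e.g. BlockedStatus("missing relation to config server") or
            # ActiveStatus("Shard connected to config-server: ...")
            self.unit.status = shard_status
        else:
            self.unit.status = ActiveStatus()
```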

## Testing
```
# deploy charms 
juju deploy ./*charm --config role="config-server" config-server-one 
juju deploy ./*charm --config role="shard" shard-one 
juju deploy ./*charm --config role="shard" shard-two 

# speed up frequency of status checks
juju model-config update-status-hook-interval=10s

# monitor `juju status --watch 1s`
Unit                  Workload  Agent  Machine  Public address  Ports            Message
config-server-one/0*  active    idle   2        10.61.64.75     27017-27018/tcp  Primary
shard-one/0*          blocked   idle   0        10.61.64.126    27017/tcp        missing relation to config server
shard-two/0*          blocked   idle   1        10.61.64.216    27017/tcp        missing relation to config server

# relate application
juju integrate config-server-one:config-server shard-one:sharding
juju integrate config-server-one:config-server shard-two:sharding

# monitor `juju status --watch 1s`
Unit                  Workload  Agent  Machine  Public address  Ports            Message
config-server-one/0*  active    idle   2        10.61.64.75     27017-27018/tcp  Primary
shard-one/0*          active    idle   0        10.61.64.126    27017/tcp        Shard connected to config-server: config-server-one
shard-two/0*          active    idle   1        10.61.64.216    27017/tcp        Shard connected to config-server: config-server-one

# remove a relation to shard
juju remove-relation config-server-one:config-server shard-two:sharding

# monitor `juju status --watch 1s`
config-server-one/0*  active    idle   0        10.61.64.50     27017-27018/tcp  Primary
shard-one/0*          active    idle   1        10.61.64.235    27017/tcp        Shard connected to config-server: config-server-one
shard-two/0*          active    idle   2        10.61.64.128    27017/tcp        Shard drained from cluster, ready for removal

# add an unsupported relation
cd mongodb-operator/tests/integration/relation_tests/new_relations/application-charm
charmcraft pack
juju deploy ./*charm
juju integrate shard-one application

# monitor `juju status --watch 1s`
application/0*        active    idle   3        10.61.64.203
config-server-one/0*  active    idle   0        10.61.64.43     27017-27018/tcp
shard-one/0*          blocked   idle   1        10.61.64.31     27017/tcp        Sharding roles do not support mongodb_client interface.
shard-two/0*          active    idle   2        10.61.64.239    27017/tcp
```
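
As an independent cross-check outside of Juju, the shard-awareness test performed by the new `is_shard_aware()` helper (see `mongos.py` below) can be approximated directly with pymongo against the mongos router; the URI is a placeholder and depends on the deployment's host, port, and credentials:

```
from pymongo import MongoClient

# Placeholder URI: point at the mongos router with valid admin credentials.
client = MongoClient("mongodb://<user>:<password>@<mongos-host>:<port>/admin")

# listShards is the same command is_shard_aware() runs; a shard whose
# "state" field equals 1 is shard aware.
for shard in client.admin.command("listShards")["shards"]:
    state = "shard aware" if shard.get("state") == 1 else "not yet shard aware"
    print(shard["_id"], state)
```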
MiaAltieri authored Oct 27, 2023
1 parent 3dcab70 commit cc52ce1
Showing 6 changed files with 249 additions and 49 deletions.
37 changes: 25 additions & 12 deletions lib/charms/mongodb/v1/mongodb_provider.py
@@ -29,7 +29,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1
LIBPATCH = 2

logger = logging.getLogger(__name__)
REL_NAME = "database"
@@ -82,6 +82,28 @@ def __init__(self, charm: CharmBase, substrate="k8s", relation_name: str = "data
self.database_provides.on.database_requested, self._on_relation_event
)

def pass_hook_checks(self) -> bool:
"""Runs the pre-hooks checks for MongoDBProvider, returns True if all pass."""
if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping code for relations.")
return False

# legacy relations have auth disabled, which new relations require
if self.model.get_relation(LEGACY_REL_NAME):
self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations")
logger.error("Auth disabled due to existing connections to legacy relations")
return False

if not self.charm.unit.is_leader():
return False

# We shouldn't try to create or update users if the database is not
# initialised. We will create users as part of initialisation.
if not self.charm.db_initialised:
return False

return True

def _on_relation_event(self, event):
"""Handle relation joined events.
@@ -90,17 +112,8 @@ def _on_relation_event(self, event):
data. As a result, related charm gets credentials for accessing the
MongoDB database.
"""
if not self.charm.unit.is_leader():
return
# We shouldn't try to create or update users if the database is not
# initialised. We will create users as part of initialisation.
if "db_initialised" not in self.charm.app_peer_data:
return

# legacy relations have auth disabled, which new relations require
if self.model.get_relation(LEGACY_REL_NAME):
self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations")
logger.error("Auth disabled due to existing connections to legacy relations")
if not self.pass_hook_checks():
logger.info("Skipping %s: hook checks did not pass", type(event))
return

# If auth is disabled but there are no legacy relation users, this means that legacy
@@ -17,11 +17,11 @@
LIBID = "896a48bc89b84d30839335bb37170509"

# Increment this major API version when introducing breaking changes
LIBAPI = 0
LIBAPI = 1

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 0
logger = logging.getLogger(__name__)
REL_NAME = "database"

@@ -41,6 +41,7 @@ def __init__(self, charm):
"""Manager of MongoDB client relations."""
super().__init__(charm, "client-relations")
self.charm = charm
self.relation_name = LEGACY_REL_NAME
self.framework.observe(
self.charm.on[LEGACY_REL_NAME].relation_created, self._on_legacy_relation_created
)
@@ -64,6 +65,10 @@ def _on_legacy_relation_created(self, event):
)
return

if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping code for legacy relations.")
return

# If auth is already disabled its likely it has a connection with another legacy relation
# user. Shutting down and restarting mongod would lead to downtime for the other legacy
# relation user and hence shouldn't be done. Not to mention there is no need to disable
34 changes: 32 additions & 2 deletions lib/charms/mongodb/v1/mongos.py
@@ -9,7 +9,7 @@

from charms.mongodb.v0.mongodb import NotReadyError
from pymongo import MongoClient, collection
from tenacity import Retrying, stop_after_delay, wait_fixed
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

from config import Config

@@ -21,7 +21,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1
LIBPATCH = 2

# path to store mongodb keyFile
logger = logging.getLogger(__name__)
@@ -334,6 +334,36 @@ def _log_removal_info(self, removal_info, shard_name):
",".join(dbs_to_move),
)

@property
def is_ready(self) -> bool:
"""Is mongos ready for services requests.
Returns:
True if services is ready False otherwise. Retries over a period of 60 seconds times to
allow server time to start up.
Raises:
ConfigurationError, ConfigurationError, OperationFailure
"""
try:
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
# The ping command is cheap and does not require auth.
self.client.admin.command("ping")
except RetryError:
return False

return True

def is_shard_aware(self, shard_name: str) -> bool:
"""Returns True if provided shard is shard aware."""
sc_status = self.client.admin.command("listShards")
for shard in sc_status["shards"]:
if shard["_id"] == shard_name:
return shard["state"] == 1

return False

def _retrieve_remaining_chunks(self, removal_info) -> int:
"""Parses the remaining chunks to remove from removeShard command."""
return removal_info["remaining"]["chunks"] if "remaining" in removal_info else 0
128 changes: 110 additions & 18 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -18,6 +18,7 @@
PyMongoError,
)
from charms.mongodb.v1.helpers import KEY_FILE
from charms.mongodb.v1.mongodb_provider import LEGACY_REL_NAME, REL_NAME
from charms.mongodb.v1.mongos import (
BalancerNotEnabledError,
MongosConnection,
@@ -28,7 +29,13 @@
from charms.mongodb.v1.users import MongoDBUser, OperatorUser
from ops.charm import CharmBase, EventBase, RelationBrokenEvent
from ops.framework import Object
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus
from ops.model import (
ActiveStatus,
BlockedStatus,
MaintenanceStatus,
StatusBase,
WaitingStatus,
)
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

from config import Config
@@ -44,7 +51,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1
LIBPATCH = 2
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
@@ -106,12 +113,8 @@ def _on_relation_joined(self, event):

def pass_hook_checks(self, event: EventBase) -> bool:
"""Runs the pre-hooks checks for ShardingProvider, returns True if all pass."""
if self.charm.is_role(Config.Role.REPLICATION):
self.charm.unit.status = BlockedStatus("role replication does not support sharding")
logger.error(
"Skipping %s. Sharding interface not supported with config role=replication.",
type(event),
)
if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping event %s , relation not feasible.", type(event))
return False

if not self.charm.is_role(Config.Role.CONFIG_SERVER):
Expand Down Expand Up @@ -268,6 +271,10 @@ def update_mongos_hosts(self):
for relation in self.charm.model.relations[self.relation_name]:
self._update_relation_data(relation.id, {HOSTS_KEY: json.dumps(self.charm._unit_ips)})

def get_config_server_status(self):
"""TODO: Implement this function in a separate PR."""
return None

def _update_relation_data(self, relation_id: int, data: dict) -> None:
"""Updates a set of key-value pairs in the relation.
@@ -349,6 +356,12 @@ def _on_relation_changed(self, event):
logger.info("Skipping relation joined event: hook checks re not passed")
return

# if re-using an old shard, re-set drained flag.
if self.charm.unit.is_leader():
self.charm.app_peer_data["drained"] = json.dumps(False)

self.charm.unit.status = MaintenanceStatus("Adding shard to config-server")

# shards rely on the config server for secrets
relation_data = event.relation.data[event.app]
self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY))
Expand All @@ -361,8 +374,6 @@ def _on_relation_changed(self, event):
event.defer()
return

self.charm.unit.status = MaintenanceStatus("Adding shard to config-server")

if not self.charm.unit.is_leader():
return

@@ -377,13 +388,12 @@
)
return

# TODO future PR, leader unit verifies shard was added to cluster (update-status hook)
self.charm.app_peer_data["added_to_cluster"] = json.dumps(True)

def pass_hook_checks(self, event):
"""Runs the pre-hooks checks for ConfigServerRequirer, returns True if all pass."""
if self.charm.is_role(Config.Role.REPLICATION):
self.charm.unit.status = BlockedStatus("role replication does not support sharding")
logger.error("sharding interface not supported with config role=replication")
if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping event %s , relation not feasible.", type(event))
return False

if not self.charm.is_role(Config.Role.SHARD):
@@ -426,8 +436,9 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
self.wait_for_draining(mongos_hosts)

self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal")
# TODO future PR, leader unit displays this message in update-status hook
# TODO future PR, check for shard drainage when removing application

if self.charm.unit.is_leader():
self.charm.app_peer_data["added_to_cluster"] = json.dumps(False)

def wait_for_draining(self, mongos_hosts: List[str]):
"""Waits for shards to be drained from sharded cluster."""
@@ -438,6 +449,7 @@ def wait_for_draining(self, mongos_hosts: List[str]):
# no need to continuously check and abuse resources while shard is draining
time.sleep(10)
drained = self.drained(mongos_hosts, self.charm.app.name)
self.charm.unit.status = MaintenanceStatus("Draining shard from cluster")
draining_status = (
"Shard is still draining" if not drained else "Shard is fully drained."
)
@@ -459,6 +471,44 @@

break

def get_shard_status(self) -> Optional[StatusBase]:
"""Returns the current status of the shard.
Note: No need to report if currently draining, since that check blocks other hooks from
executing.
"""
if not self.charm.is_role(Config.Role.SHARD):
logger.info("skipping status check, charm is not running as a shard")
return None

if not self.charm.db_initialised:
logger.info("No status for shard to report, waiting for db to be initialised.")
return None

if self.model.get_relation(LEGACY_REL_NAME):
return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.")

if self.model.get_relation(REL_NAME):
return BlockedStatus(f"relation {REL_NAME} to shard not supported.")

if not self.model.get_relation(self.relation_name) and not self.charm.drained:
return BlockedStatus("missing relation to config server")

if not self.model.get_relation(self.relation_name) and self.charm.drained:
return ActiveStatus("Shard drained from cluster, ready for removal")

if not self._is_mongos_reachable():
return BlockedStatus("Config server unreachable")

if not self._is_added_to_cluster():
return MaintenanceStatus("Adding shard to config-server")

if not self._is_shard_aware():
return BlockedStatus("Shard is not yet shard aware")

config_server_name = self.get_related_config_server()
return ActiveStatus(f"Shard connected to config-server: {config_server_name}")

def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool:
"""Returns whether a shard has been drained from the cluster.
@@ -564,16 +614,58 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None:
if relation:
relation.data[self.charm.model.app].update(data)

def _is_mongos_reachable(self) -> bool:
"""Returns True if mongos is reachable."""
if not self.model.get_relation(self.relation_name):
logger.info("Mongos is not reachable, no relation to config-sever")
return False

mongos_hosts = self.get_mongos_hosts()
if not mongos_hosts:
return False

config = self.charm.remote_mongos_config(set(mongos_hosts))

# use a URI that is not dependent on the operator password, as we are not guaranteed that
# the shard has received the password yet.
uri = f"mongodb://{','.join(mongos_hosts)}"
with MongosConnection(config, uri) as mongo:
return mongo.is_ready

def _is_added_to_cluster(self) -> bool:
"""Returns True if the shard has been added to the cluster."""
return json.loads(self.charm.app_peer_data.get("added_to_cluster", "False"))

def _is_shard_aware(self) -> bool:
"""Returns True if shard is in cluster and shard aware."""
if not self.model.get_relation(self.relation_name):
logger.info(
"Mongos is not reachable, no relation to config-sever, cannot check shard status."
)
return False

mongos_hosts = self.get_mongos_hosts()
with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo:
return mongo.is_shard_aware(shard_name=self.charm.app.name)

def has_config_server(self) -> bool:
"""Returns True if currently related to config server."""
return len(self.charm.model.relations[self.relation_name]) > 0

def get_related_config_server(self) -> List[str]:
def get_related_config_server(self) -> Optional[str]:
"""Returns the related config server."""
return [rel.app.name for rel in self.charm.model.relations[self.relation_name]]
if self.relation_name not in self.charm.model.relations:
return None

# metadata.yaml prevents having multiple config servers
return self.charm.model.relations[self.relation_name][0].app.name

def get_mongos_hosts(self) -> List[str]:
"""Returns a list of IP addresses for the mongos hosts."""
# only one related config-server is possible
config_server_relation = self.charm.model.relations[self.relation_name][0]
if HOSTS_KEY not in config_server_relation.data[config_server_relation.app]:
return

return json.loads(config_server_relation.data[config_server_relation.app].get(HOSTS_KEY))