Improve node removal by validating departed|broken relations
addyess committed Apr 25, 2024
1 parent 81caac3 commit 3bdae50
Showing 2 changed files with 75 additions and 23 deletions.
86 changes: 69 additions & 17 deletions charms/worker/k8s/src/charm.py
@@ -24,7 +24,7 @@
from functools import cached_property
from pathlib import Path
from time import sleep
from typing import Dict, Optional
from typing import Dict, Optional, Union
from urllib.parse import urlparse

import charms.contextual_status as status
@@ -78,6 +78,22 @@ def _get_public_address() -> str:
return subprocess.check_output(cmd).decode("UTF-8").strip()


def _cluster_departing_unit(event: ops.EventBase) -> Union[bool, ops.Unit]:
"""Determine if the given event signals the end of the cluster for this unit.
Args:
event (ops.EventBase): event to consider.
Returns:
Optional[ops.Unit] Unit leaving the cluster
"""
return (
isinstance(event, ops.RelationDepartedEvent)
and event.relation.name in ["k8s-cluster", "cluster"]
and event.departing_unit
)
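
Because the checks in _cluster_departing_unit are chained with `and`, the expression short-circuits to False as soon as any check fails and otherwise evaluates to its last operand, the departing unit; that is why the return type is Union[bool, ops.Unit]. A minimal standalone sketch of the same pattern, with plain strings standing in for the real ops types (illustrative only):

def _departing(kind: str, relation: str, unit: str):
    # `and` short-circuits to False on the first failed check;
    # otherwise the expression evaluates to its last operand, the unit.
    return (
        kind == "relation-departed"
        and relation in ["k8s-cluster", "cluster"]
        and unit
    )

assert _departing("relation-departed", "cluster", "k8s-worker/0") == "k8s-worker/0"
assert _departing("config-changed", "cluster", "k8s-worker/0") is False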


class K8sCharm(ops.CharmBase):
"""A charm for managing a K8s cluster via the k8s snap.
@@ -311,13 +327,22 @@ def _configure_cloud_provider(self, config: BootstrapConfig):
log.info("Using external as cloud-provider")
config.cloud_provider = "external"

def _revoke_cluster_tokens(self):
def _revoke_cluster_tokens(self, event: ops.EventBase):
"""Revoke tokens for the units in the cluster and k8s-cluster relations.
        If this unit is dying, revoke only its own token.
        If the event is a RelationDepartedEvent, also revoke the departing unit's token.
        Args:
            event (ops.EventBase): event triggering token revocation
"""
log.info("Garbage collect cluster tokens")
to_remove = {self.unit} if self.is_dying else None
to_remove = set()
if self.is_dying:
to_remove |= {self.unit}
if unit := _cluster_departing_unit(event):
to_remove |= {unit}

if peer := self.model.get_relation("cluster"):
self.distributor.revoke_tokens(
@@ -511,18 +536,20 @@ def _join_cluster(self):
self.api_manager.join_cluster(request)
log.info("Success")

def _reconcile(self, event):
def _reconcile(self, event: ops.EventBase):
"""Reconcile state change events.
Args:
            event (ops.EventBase): event that triggered the reconciliation
"""
log.info("Reconcile event=%s", event)

self._evaluate_removal(event)
if self.is_dying and self.lead_control_plane:
self._revoke_cluster_tokens()
self._revoke_cluster_tokens(event)
if self.is_dying:
self._update_status()
self._last_gasp(event)
self._last_gasp()
return

self._apply_proxy_environment()
@@ -535,7 +562,7 @@ def _reconcile(self, event):
self._create_cluster_tokens()
self._create_cos_tokens()
self._apply_cos_requirements()
self._revoke_cluster_tokens()
self._revoke_cluster_tokens(event)
self._ensure_cluster_config()
self._join_cluster()
self._configure_cos_integration()
@@ -567,6 +594,10 @@ def _update_status(self):
if version := self._get_snap_version():
self.unit.set_workload_version(version)

if not self._is_node_ready():
status.add(ops.WaitingStatus("Node not yet Ready"))
return

def _evaluate_removal(self, event: ops.EventBase):
"""Determine if my unit is being removed.
@@ -575,25 +606,46 @@
"""
if self.is_dying:
return
if isinstance(event, ops.RelationDepartedEvent):
# if a unit is departing, and it's me?
self._stored.removing = event.departing_unit == self.unit
if unit := _cluster_departing_unit(event):
# Juju says I am being removed
self._stored.removing = unit == self.unit
elif isinstance(event, ops.RelationBrokenEvent) and event.relation.name == "cluster":
            # Control-plane units never experience RelationBroken on "cluster"; it's a peer relation
# Worker units experience RelationBroken on "cluster" when the relation is removed
# or this unit is being removed.
self._stored.removing = self.is_worker
elif isinstance(event, (ops.RemoveEvent, ops.StopEvent)):
            # If I myself am dying, it's me!
self._stored.removing = True
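
Read as a whole, _evaluate_removal marks the unit as dying in three situations: a cluster unit departs and it is this unit, the "cluster" peer relation breaks on a worker, or the unit receives stop/remove. A hedged standalone sketch of that decision table, with plain values in place of the real event objects (names hypothetical):

def _is_removing(kind, relation="", departing="", me="k8s/0", is_worker=False):
    # relation-departed on a cluster relation: removing only if I am the departing unit
    if kind == "relation-departed" and relation in ["k8s-cluster", "cluster"]:
        return departing == me
    # relation-broken on the "cluster" peer: only workers treat this as removal
    if kind == "relation-broken" and relation == "cluster":
        return is_worker
    # stop/remove events always mean this unit is going away
    return kind in ["stop", "remove"]

assert _is_removing("relation-departed", "cluster", departing="k8s/0")
assert not _is_removing("relation-departed", "cluster", departing="k8s/1")
assert _is_removing("relation-broken", "cluster", is_worker=True)
assert _is_removing("stop")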

def _last_gasp(self, event):
"""Busy wait on stop event until the unit isn't clustered anymore.
def _is_node_ready(self, node: str = "") -> bool:
"""Determine if node is in the kubernetes cluster.
Args:
event: ops.EventBase - event that triggered charm hook
node (str): name of node
Returns:
            bool: True when the node is marked as Ready
"""
if not isinstance(event, ops.StopEvent):
return
busy_wait = 30
node = node or self.get_node_name()
cmd = shlex.split(f"k8s kubectl get nodes {node}")
cmd += ['-o=jsonpath={.status.conditions[?(@.type=="Ready")].status}']
try:
return subprocess.check_output(cmd) == b"True"
except subprocess.CalledProcessError:
return False
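
The jsonpath filter above selects the condition whose type is "Ready" from the node's status.conditions and prints its status string. A rough Python equivalent of that extraction over a hand-written sample payload (illustrative only, not the charm's code path):

conditions = [
    {"type": "MemoryPressure", "status": "False"},
    {"type": "DiskPressure", "status": "False"},
    {"type": "Ready", "status": "True"},
]
# mirrors jsonpath {.status.conditions[?(@.type=="Ready")].status}
ready = next((c["status"] for c in conditions if c["type"] == "Ready"), "")
assert ready == "True"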

def _last_gasp(self):
"""Busy wait on stop event until the unit isn't clustered anymore."""
busy_wait, reported_down = 30, 0
status.add(ops.MaintenanceStatus("Ensuring cluster removal"))
while busy_wait and self.api_manager.is_cluster_bootstrapped():
        while busy_wait and reported_down != 3:
log.info("Waiting for this unit to uncluster")
if self._is_node_ready():
log.info("Cluster Node is still ready")
reported_down = 0
else:
reported_down += 1
sleep(1)
busy_wait -= 1
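
The busy-wait exits once the node has been observed NotReady three times in a row, or after roughly thirty one-second polls, whichever comes first. A self-contained sketch of that countdown, with readiness stubbed by a canned sequence (ticks stand in for the one-second sleeps):

def wait_for_uncluster(is_ready, ticks=30):
    reported_down = 0
    while ticks and reported_down != 3:
        # three consecutive NotReady observations count as removed
        reported_down = 0 if is_ready() else reported_down + 1
        ticks -= 1
    return reported_down == 3

readings = iter([True, False, False, False])
assert wait_for_uncluster(lambda: next(readings)) is True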

12 changes: 6 additions & 6 deletions charms/worker/k8s/templates/snap_installation.yaml
@@ -2,10 +2,10 @@
# See LICENSE file for licensing details.

amd64:
- name: k8s
install-type: store
channel: edge
- name: k8s
install-type: store
channel: edge
arm64:
- name: k8s
install-type: store
channel: edge
- name: k8s
install-type: store
channel: edge
