From 32c2da00e4afc8af673041bbc78a14ed8720ba8a Mon Sep 17 00:00:00 2001 From: Mateo Florido <32885896+mateoflorido@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:56:14 -0500 Subject: [PATCH] Implement `pre-upgrade-check` action (#168) Implement the pre-upgrade-check action in the charm. --- charms/worker/charmcraft.yaml | 8 + charms/worker/k8s/charmcraft.yaml | 5 + .../charms/data_platform_libs/v0/upgrade.py | 1102 +++++++++++++++++ charms/worker/k8s/requirements.txt | 2 + charms/worker/k8s/src/charm.py | 14 +- charms/worker/k8s/src/inspector.py | 94 ++ charms/worker/k8s/src/literals.py | 21 + charms/worker/k8s/src/token_distributor.py | 17 +- charms/worker/k8s/src/upgrade.py | 99 ++ .../worker/k8s/tests/unit/test_inspector.py | 101 ++ charms/worker/k8s/tests/unit/test_upgrade.py | 119 ++ 11 files changed, 1578 insertions(+), 4 deletions(-) create mode 100644 charms/worker/k8s/lib/charms/data_platform_libs/v0/upgrade.py create mode 100644 charms/worker/k8s/src/inspector.py create mode 100644 charms/worker/k8s/src/literals.py create mode 100644 charms/worker/k8s/src/upgrade.py create mode 100644 charms/worker/k8s/tests/unit/test_inspector.py create mode 100644 charms/worker/k8s/tests/unit/test_upgrade.py diff --git a/charms/worker/charmcraft.yaml b/charms/worker/charmcraft.yaml index b567c149..26d308d8 100644 --- a/charms/worker/charmcraft.yaml +++ b/charms/worker/charmcraft.yaml @@ -85,6 +85,14 @@ parts: rm -rf $CRAFT_PRIME/lib $CRAFT_PRIME/templates mv $CRAFT_PRIME/k8s/lib $CRAFT_PRIME/lib mv $CRAFT_PRIME/k8s/templates $CRAFT_PRIME/templates +actions: + pre-upgrade-check: + description: Run necessary pre-upgrade checks before executing a charm upgrade. + +peers: + upgrade: + interface: upgrade + provides: cos-agent: diff --git a/charms/worker/k8s/charmcraft.yaml b/charms/worker/k8s/charmcraft.yaml index b68cb8a3..70ce0156 100644 --- a/charms/worker/k8s/charmcraft.yaml +++ b/charms/worker/k8s/charmcraft.yaml @@ -169,6 +169,9 @@ actions: server: description: Override the server endpoint with this field type: string + pre-upgrade-check: + description: Run necessary pre-upgrade checks before executing a charm upgrade. + parts: charm: @@ -180,6 +183,8 @@ peers: interface: k8s-cluster cos-tokens: interface: cos-k8s-tokens + upgrade: + interface: upgrade provides: cos-agent: diff --git a/charms/worker/k8s/lib/charms/data_platform_libs/v0/upgrade.py b/charms/worker/k8s/lib/charms/data_platform_libs/v0/upgrade.py new file mode 100644 index 00000000..4d909d64 --- /dev/null +++ b/charms/worker/k8s/lib/charms/data_platform_libs/v0/upgrade.py @@ -0,0 +1,1102 @@ +# Copyright 2023 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Library to manage in-place upgrades for charms running on VMs and K8s. + +This library contains handlers for `upgrade` relation events used to coordinate +between units in an application during a `juju refresh`, as well as `Pydantic` models +for instantiating, validating and comparing dependencies. 
+ +An upgrade on VMs is initiated with the command `juju refresh`. Once executed, the following +events are emitted to each unit at random: + - `upgrade-charm` + - `config-changed` + - `leader-settings-changed` - Non-leader only + +Charm authors can implement the classes defined in this library to streamline the process of +coordinating which unit updates when, achieved through updating of unit-data `state` throughout. + +At a high-level, the upgrade steps are as follows: + - Run pre-checks on the cluster to confirm it is safe to upgrade + - Create stack of unit.ids, to serve as the upgrade order (generally workload leader is last) + - Start the upgrade by issuing a Juju CLI command + - The unit at the top of the stack gets permission to upgrade + - The unit handles the upgrade and restarts their service + - Repeat, until all units have restarted + +### Usage by charm authors + +#### `upgrade` relation + +Charm authors must implement an additional peer-relation. + +As this library uses relation data exchanged between units to coordinate, charm authors +need to add a new relation interface. The relation name does not matter. + +`metadata.yaml` +```yaml +peers: + upgrade: + interface: upgrade +``` + +#### Dependencies JSON/Dict + +Charm authors must implement a dict object tracking current charm versions, requirements + upgradability. + +Many workload versions may be incompatible with older/newer versions. This same idea also can apply to +charm or snap versions. Workloads with required related applications (e.g Kafka + ZooKeeper) also need to +ensure their versions are compatible during an upgrade, to avoid cluster failure. + +As such, it is necessasry to freeze any dependencies within each published charm. An example of this could +be creating a `DEPENDENCIES` dict within the charm code, with the following structure: + +`src/literals.py` +```python +DEPENDENCIES = { + "kafka_charm": { + "dependencies": {"zookeeper": ">50"}, + "name": "kafka", + "upgrade_supported": ">90", + "version": "100", + }, + "kafka_service": { + "dependencies": {"zookeeper": "^3"}, + "name": "kafka", + "upgrade_supported": ">=0.8", + "version": "3.3.2", + }, +} +``` + +The first-level key names are arbitrary labels for tracking what those versions+dependencies are for. +The `dependencies` second-level values are a key-value map of any required external applications, + and the versions this packaged charm can support. +The `upgrade_suppported` second-level values are requirements from which an in-place upgrade can be + supported by the charm. +The `version` second-level values correspond to the current version of this packaged charm. + +Any requirements comply with [`poetry`'s dependency specifications](https://python-poetry.org/docs/dependency-specification/#caret-requirements). + +### Dependency Model + +Charm authors must implement their own class inheriting from `DependencyModel`. + +Using a `Pydantic` model to instantiate the aforementioned `DEPENDENCIES` dict gives stronger type safety and additional +layers of validation. + +Implementation just needs to ensure that the top-level key names from `DEPENDENCIES` are defined as attributed in the model. + +`src/upgrade.py` +```python +from pydantic import BaseModel + +class KafkaDependenciesModel(BaseModel): + kafka_charm: DependencyModel + kafka_service: DependencyModel +``` + +### Overrides for `DataUpgrade` + +Charm authors must define their own class, inheriting from `DataUpgrade`, overriding all required `abstractmethod`s. 
+ +```python +class ZooKeeperUpgrade(DataUpgrade): + def __init__(self, charm: "ZooKeeperUpgrade", **kwargs): + super().__init__(charm, **kwargs) + self.charm = charm +``` + +#### Implementation of `pre_upgrade_check()` + +Before upgrading a cluster, it's a good idea to check that it is stable and healthy before permitting it. +Here, charm authors can validate upgrade safety through API calls, relation-data checks, etc. +If any of these checks fail, raise `ClusterNotReadyError`. + +```python + @override + def pre_upgrade_check(self) -> None: + default_message = "Pre-upgrade check failed and cannot safely upgrade" + try: + if not self.client.members_broadcasting or not len(self.client.server_members) == len( + self.charm.cluster.peer_units + ): + raise ClusterNotReadyError( + message=default_message, + cause="Not all application units are connected and broadcasting in the quorum", + ) + + if self.client.members_syncing: + raise ClusterNotReadyError( + message=default_message, cause="Some quorum members are syncing data" + ) + + if not self.charm.cluster.stable: + raise ClusterNotReadyError( + message=default_message, cause="Charm has not finished initialising" + ) + + except QuorumLeaderNotFoundError: + raise ClusterNotReadyError(message=default_message, cause="Quorum leader not found") + except ConnectionClosedError: + raise ClusterNotReadyError( + message=default_message, cause="Unable to connect to the cluster" + ) +``` + +#### Implementation of `build_upgrade_stack()` - VM ONLY + +Oftentimes, it is necessary to ensure that the workload leader is the last unit to upgrade, +to ensure high-availability during the upgrade process. +Here, charm authors can create a LIFO stack of unit.ids, represented as a list of unit.id strings, +with the leader unit being at i[0]. + +```python +@override +def build_upgrade_stack(self) -> list[int]: + upgrade_stack = [] + for unit in self.charm.cluster.peer_units: + config = self.charm.cluster.unit_config(unit=unit) + + # upgrade quorum leader last + if config["host"] == self.client.leader: + upgrade_stack.insert(0, int(config["unit_id"])) + else: + upgrade_stack.append(int(config["unit_id"])) + + return upgrade_stack +``` + +#### Implementation of `_on_upgrade_granted()` + +On relation-changed events, each unit will check the current upgrade-stack persisted to relation data. +If that unit is at the top of the stack, it will emit an `upgrade-granted` event, which must be handled. +Here, workloads can be re-installed with new versions, checks can be made, data synced etc. +If the new unit successfully rejoined the cluster, call `set_unit_completed()`. +If the new unit failed to rejoin the cluster, call `set_unit_failed()`. + +NOTE - It is essential here to manually call `on_upgrade_changed` if the unit is the current leader. +This ensures that the leader gets it's own relation-changed event, and updates the upgrade-stack for +other units to follow suit. 
+ +```python +@override +def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: + self.charm.snap.stop_snap_service() + + if not self.charm.snap.install(): + logger.error("Unable to install ZooKeeper Snap") + self.set_unit_failed() + return None + + logger.info(f"{self.charm.unit.name} upgrading service...") + self.charm.snap.restart_snap_service() + + try: + logger.debug("Running post-upgrade check...") + self.pre_upgrade_check() + + logger.debug("Marking unit completed...") + self.set_unit_completed() + + # ensures leader gets it's own relation-changed when it upgrades + if self.charm.unit.is_leader(): + logger.debug("Re-emitting upgrade-changed on leader...") + self.on_upgrade_changed(event) + + except ClusterNotReadyError as e: + logger.error(e.cause) + self.set_unit_failed() +``` + +#### Implementation of `log_rollback_instructions()` + +If the upgrade fails, manual intervention may be required for cluster recovery. +Here, charm authors can log out any necessary steps to take to recover from a failed upgrade. +When a unit fails, this library will automatically log out this message. + +```python +@override +def log_rollback_instructions(self) -> None: + logger.error("Upgrade failed. Please run `juju refresh` to previous version.") +``` + +### Instantiating in the charm and deferring events + +Charm authors must add a class attribute for the child class of `DataUpgrade` in the main charm. +They must also ensure that any non-upgrade related events that may be unsafe to handle during +an upgrade, are deferred if the unit is not in the `idle` state - i.e not currently upgrading. + +```python +class ZooKeeperCharm(CharmBase): + def __init__(self, *args): + super().__init__(*args) + self.upgrade = ZooKeeperUpgrade( + self, + relation_name = "upgrade", + substrate = "vm", + dependency_model=ZooKeeperDependencyModel( + **DEPENDENCIES + ), + ) + + def restart(self, event) -> None: + if not self.upgrade.state == "idle": + event.defer() + return None + + self.restart_snap_service() +``` +""" + +import json +import logging +from abc import ABC, abstractmethod +from typing import Dict, List, Literal, Optional, Set, Tuple + +import poetry.core.constraints.version as poetry_version +from ops.charm import ( + ActionEvent, + CharmBase, + CharmEvents, + RelationCreatedEvent, + UpgradeCharmEvent, +) +from ops.framework import EventBase, EventSource, Object +from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, Relation, Unit, WaitingStatus +from pydantic import BaseModel, root_validator, validator + +# The unique Charmhub library identifier, never change it +LIBID = "156258aefb79435a93d933409a8c8684" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 18 + +PYDEPS = ["pydantic>=1.10,<2", "poetry-core"] + +logger = logging.getLogger(__name__) + +# --- DEPENDENCY RESOLUTION FUNCTIONS --- + + +def verify_requirements(version: str, requirement: str) -> bool: + """Verifies a specified version against defined constraint. + + Supports Poetry version constraints + https://python-poetry.org/docs/dependency-specification/#version-constraints + + Args: + version: the version currently in use + requirement: Poetry version constraint + + Returns: + True if `version` meets defined `requirement`. 
Otherwise False + """ + return poetry_version.parse_constraint(requirement).allows( + poetry_version.Version.parse(version) + ) + + +# --- DEPENDENCY MODEL TYPES --- + + +class DependencyModel(BaseModel): + """Manager for a single dependency. + + To be used as part of another model representing a collection of arbitrary dependencies. + + Example:: + + class KafkaDependenciesModel(BaseModel): + kafka_charm: DependencyModel + kafka_service: DependencyModel + + deps = { + "kafka_charm": { + "dependencies": {"zookeeper": ">5"}, + "name": "kafka", + "upgrade_supported": ">5", + "version": "10", + }, + "kafka_service": { + "dependencies": {"zookeeper": "^3.6"}, + "name": "kafka", + "upgrade_supported": "~3.3", + "version": "3.3.2", + }, + } + + model = KafkaDependenciesModel(**deps) # loading dict in to model + + print(model.dict()) # exporting back validated deps + """ + + dependencies: Dict[str, str] + name: str + upgrade_supported: str + version: str + + @validator("dependencies", "upgrade_supported", each_item=True) + @classmethod + def dependencies_validator(cls, value): + """Validates version constraint.""" + if isinstance(value, dict): + deps = value.values() + else: + deps = [value] + + for dep in deps: + poetry_version.parse_constraint(dep) + + return value + + @root_validator(skip_on_failure=True) + @classmethod + def version_upgrade_supported_validator(cls, values): + """Validates specified `version` meets `upgrade_supported` requirement.""" + if not verify_requirements( + version=values.get("version"), requirement=values.get("upgrade_supported") + ): + raise ValueError( + f"upgrade_supported value {values.get('upgrade_supported')} greater than version value {values.get('version')} for {values.get('name')}." + ) + + return values + + def can_upgrade(self, dependency: "DependencyModel") -> bool: + """Compares two instances of :class:`DependencyModel` for upgradability. + + Args: + dependency: a dependency model to compare this model against + + Returns: + True if current model can upgrade from dependent model. Otherwise False + """ + return verify_requirements(version=self.version, requirement=dependency.upgrade_supported) + + +# --- CUSTOM EXCEPTIONS --- + + +class UpgradeError(Exception): + """Base class for upgrade related exceptions in the module.""" + + def __init__(self, message: str, cause: Optional[str], resolution: Optional[str]): + super().__init__(message) + self.message = message + self.cause = cause or "" + self.resolution = resolution or "" + + def __repr__(self): + """Representation of the UpgradeError class.""" + return f"{type(self).__module__}.{type(self).__name__} - {str(vars(self))}" + + def __str__(self): + """String representation of the UpgradeError class.""" + return repr(self) + + +class ClusterNotReadyError(UpgradeError): + """Exception flagging that the cluster is not ready to start upgrading. + + For example, if the cluster fails :class:`DataUpgrade._on_pre_upgrade_check_action` + + Args: + message: string message to be logged out + cause: short human-readable description of the cause of the error + resolution: short human-readable instructions for manual error resolution (optional) + """ + + def __init__(self, message: str, cause: str, resolution: Optional[str] = None): + super().__init__(message, cause=cause, resolution=resolution) + + +class KubernetesClientError(UpgradeError): + """Exception flagging that a call to Kubernetes API failed. 
+ + For example, if the cluster fails :class:`DataUpgrade._set_rolling_update_partition` + + Args: + message: string message to be logged out + cause: short human-readable description of the cause of the error + resolution: short human-readable instructions for manual error resolution (optional) + """ + + def __init__(self, message: str, cause: str, resolution: Optional[str] = None): + super().__init__(message, cause=cause, resolution=resolution) + + +class VersionError(UpgradeError): + """Exception flagging that the old `version` fails to meet the new `upgrade_supported`s. + + For example, upgrades from version `2.x` --> `4.x`, + but `4.x` only supports upgrading from `3.x` onwards + + Args: + message: string message to be logged out + cause: short human-readable description of the cause of the error + resolution: short human-readable instructions for manual solutions to the error (optional) + """ + + def __init__(self, message: str, cause: str, resolution: Optional[str] = None): + super().__init__(message, cause=cause, resolution=resolution) + + +class DependencyError(UpgradeError): + """Exception flagging that some new `dependency` is not being met. + + For example, new version requires related App version `2.x`, but currently is `1.x` + + Args: + message: string message to be logged out + cause: short human-readable description of the cause of the error + resolution: short human-readable instructions for manual solutions to the error (optional) + """ + + def __init__(self, message: str, cause: str, resolution: Optional[str] = None): + super().__init__(message, cause=cause, resolution=resolution) + + +# --- CUSTOM EVENTS --- + + +class UpgradeGrantedEvent(EventBase): + """Used to tell units that they can process an upgrade.""" + + +class UpgradeFinishedEvent(EventBase): + """Used to tell units that they finished the upgrade.""" + + +class UpgradeEvents(CharmEvents): + """Upgrade events. + + This class defines the events that the lib can emit. 
+ """ + + upgrade_granted = EventSource(UpgradeGrantedEvent) + upgrade_finished = EventSource(UpgradeFinishedEvent) + + +# --- EVENT HANDLER --- + + +class DataUpgrade(Object, ABC): + """Manages `upgrade` relation operations for in-place upgrades.""" + + STATES = ["recovery", "failed", "idle", "ready", "upgrading", "completed"] + + on = UpgradeEvents() # pyright: ignore [reportAssignmentType] + + def __init__( + self, + charm: CharmBase, + dependency_model: BaseModel, + relation_name: str = "upgrade", + substrate: Literal["vm", "k8s"] = "vm", + ): + super().__init__(charm, relation_name) + self.charm = charm + self.dependency_model = dependency_model + self.relation_name = relation_name + self.substrate = substrate + self._upgrade_stack = None + + # events + self.framework.observe( + self.charm.on[relation_name].relation_created, self._on_upgrade_created + ) + self.framework.observe( + self.charm.on[relation_name].relation_changed, self.on_upgrade_changed + ) + self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm) + self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted) + self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished) + + # actions + self.framework.observe( + getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action + ) + if self.substrate == "k8s": + self.framework.observe( + getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action + ) + + @property + def peer_relation(self) -> Optional[Relation]: + """The upgrade peer relation.""" + return self.charm.model.get_relation(self.relation_name) + + @property + def app_units(self) -> Set[Unit]: + """The peer-related units in the application.""" + if not self.peer_relation: + return set() + + return set([self.charm.unit] + list(self.peer_relation.units)) + + @property + def state(self) -> Optional[str]: + """The unit state from the upgrade peer relation.""" + if not self.peer_relation: + return None + + return self.peer_relation.data[self.charm.unit].get("state", None) + + @property + def stored_dependencies(self) -> Optional[BaseModel]: + """The application dependencies from the upgrade peer relation.""" + if not self.peer_relation: + return None + + if not (deps := self.peer_relation.data[self.charm.app].get("dependencies", "")): + return None + + return type(self.dependency_model)(**json.loads(deps)) + + @property + def upgrade_stack(self) -> Optional[List[int]]: + """Gets the upgrade stack from the upgrade peer relation. + + Unit.ids are ordered Last-In-First-Out (LIFO). + i.e unit.id at index `-1` is the first unit to upgrade. + unit.id at index `0` is the last unit to upgrade. + + Returns: + List of integer unit.ids, ordered in upgrade order in a stack + """ + if not self.peer_relation: + return None + + # lazy-load + if self._upgrade_stack is None: + self._upgrade_stack = ( + json.loads(self.peer_relation.data[self.charm.app].get("upgrade-stack", "[]")) + or None + ) + + return self._upgrade_stack + + @upgrade_stack.setter + def upgrade_stack(self, stack: List[int]) -> None: + """Sets the upgrade stack to the upgrade peer relation. + + Unit.ids are ordered Last-In-First-Out (LIFO). + i.e unit.id at index `-1` is the first unit to upgrade. + unit.id at index `0` is the last unit to upgrade. 
+ """ + if not self.peer_relation: + return + + self.peer_relation.data[self.charm.app].update({"upgrade-stack": json.dumps(stack)}) + self._upgrade_stack = stack + + @property + def other_unit_states(self) -> list: + """Current upgrade state for other units. + + Returns: + Unsorted list of upgrade states for other units. + """ + if not self.peer_relation: + return [] + + return [ + self.peer_relation.data[unit].get("state", "") + for unit in list(self.peer_relation.units) + ] + + @property + def unit_states(self) -> list: + """Current upgrade state for all units. + + Returns: + Unsorted list of upgrade states for all units. + """ + if not self.peer_relation: + return [] + + return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units] + + @property + def cluster_state(self) -> Optional[str]: + """Current upgrade state for cluster units. + + Determined from :class:`DataUpgrade.STATE`, taking the lowest ordinal unit state. + + For example, if units in have states: `["ready", "upgrading", "completed"]`, + the overall state for the cluster is `ready`. + + Returns: + String of upgrade state from the furthest behind unit. + """ + if not self.unit_states: + return None + + try: + return sorted(self.unit_states, key=self.STATES.index)[0] + except (ValueError, KeyError): + return None + + @property + def idle(self) -> Optional[bool]: + """Flag for whether the cluster is in an idle upgrade state. + + Returns: + True if all application units in idle state. Otherwise False + """ + return set(self.unit_states) == {"idle"} + + @abstractmethod + def pre_upgrade_check(self) -> None: + """Runs necessary checks validating the cluster is in a healthy state to upgrade. + + Called by all units during :meth:`_on_pre_upgrade_check_action`. + + Raises: + :class:`ClusterNotReadyError`: if cluster is not ready to upgrade + """ + pass + + def build_upgrade_stack(self) -> List[int]: + """Builds ordered iterable of all application unit.ids to upgrade in. + + Called by leader unit during :meth:`_on_pre_upgrade_check_action`. + + Returns: + Iterable of integer unit.ids, LIFO ordered in upgrade order + i.e `[5, 2, 4, 1, 3]`, unit `3` upgrades first, `5` upgrades last + """ + # don't raise if k8s substrate, uses default statefulset order + if self.substrate == "k8s": + return [] + + raise NotImplementedError + + @abstractmethod + def log_rollback_instructions(self) -> None: + """Sets charm state and logs out rollback instructions. + + Called by all units when `state=failed` found during :meth:`_on_upgrade_changed`. 
+ """ + pass + + def _repair_upgrade_stack(self) -> None: + """Ensures completed units are re-added to the upgrade-stack after failure.""" + # need to update the stack as it was not refreshed by rollback run of pre-upgrade-check + # avoids difficult health check implementation by charm-authors needing to exclude dead units + + # if the first unit in the stack fails, the stack will be the same length as units + # i.e this block not ran + if ( + self.cluster_state in ["failed", "recovery"] + and self.upgrade_stack + and len(self.upgrade_stack) != len(self.app_units) + and self.charm.unit.is_leader() + ): + new_stack = self.upgrade_stack + for unit in self.app_units: + unit_id = int(unit.name.split("/")[1]) + + # if a unit fails, it rolls back first + if unit_id not in new_stack: + new_stack.insert(-1, unit_id) + logger.debug(f"Inserted {unit_id} in to upgrade-stack - {new_stack}") + + self.upgrade_stack = new_stack + + def set_unit_failed(self, cause: Optional[str] = None) -> None: + """Sets unit `state=failed` to the upgrade peer data. + + Args: + cause: short description of cause of failure + """ + if not self.peer_relation: + return None + + # needed to refresh the stack + # now leader pulls a fresh stack from newly updated relation data + if self.charm.unit.is_leader(): + self._upgrade_stack = None + + self.charm.unit.status = BlockedStatus(cause if cause else "") + self.peer_relation.data[self.charm.unit].update({"state": "failed"}) + self.log_rollback_instructions() + + def set_unit_completed(self) -> None: + """Sets unit `state=completed` to the upgrade peer data.""" + if not self.peer_relation: + return None + + # needed to refresh the stack + # now leader pulls a fresh stack from newly updated relation data + if self.charm.unit.is_leader(): + self._upgrade_stack = None + + self.charm.unit.status = MaintenanceStatus("upgrade completed") + self.peer_relation.data[self.charm.unit].update({"state": "completed"}) + + # Emit upgrade_finished event to run unit's post upgrade operations. + if self.substrate == "k8s": + logger.debug( + f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..." 
+ ) + getattr(self.on, "upgrade_finished").emit() + + def _on_upgrade_created(self, event: RelationCreatedEvent) -> None: + """Handler for `upgrade-relation-created` events.""" + if not self.peer_relation: + event.defer() + return + + # setting initial idle state needed to avoid execution on upgrade-changed events + self.peer_relation.data[self.charm.unit].update({"state": "idle"}) + + if self.charm.unit.is_leader(): + logger.debug("Persisting dependencies to upgrade relation data...") + self.peer_relation.data[self.charm.app].update( + {"dependencies": json.dumps(self.dependency_model.dict())} + ) + + def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None: + """Handler for `pre-upgrade-check-action` events.""" + if not self.peer_relation: + event.fail(message="Could not find upgrade relation.") + return + + if not self.charm.unit.is_leader(): + event.fail(message="Action must be ran on the Juju leader.") + return + + if self.cluster_state == "failed": + logger.info("Entering recovery state for rolling-back to previous version...") + self._repair_upgrade_stack() + self.charm.unit.status = BlockedStatus("ready to rollback application") + self.peer_relation.data[self.charm.unit].update({"state": "recovery"}) + return + + # checking if upgrade in progress + if self.cluster_state != "idle": + event.fail("Cannot run pre-upgrade checks, cluster already upgrading.") + return + + try: + logger.info("Running pre-upgrade-check...") + self.pre_upgrade_check() + + if self.substrate == "k8s": + logger.info("Building upgrade-stack for K8s...") + built_upgrade_stack = sorted( + [int(unit.name.split("/")[1]) for unit in self.app_units] + ) + else: + logger.info("Building upgrade-stack for VMs...") + built_upgrade_stack = self.build_upgrade_stack() + + logger.debug(f"Built upgrade stack of {built_upgrade_stack}") + + except ClusterNotReadyError as e: + logger.error(e) + event.fail(message=e.message) + return + except Exception as e: + logger.error(e) + event.fail(message="Unknown error found.") + return + + logger.info("Setting upgrade-stack to relation data...") + self.upgrade_stack = built_upgrade_stack + + def _on_resume_upgrade_action(self, event: ActionEvent) -> None: + """Handle resume upgrade action. + + Continue the upgrade by setting the partition to the next unit. + """ + if not self.peer_relation: + event.fail(message="Could not find upgrade relation.") + return + + if not self.charm.unit.is_leader(): + event.fail(message="Action must be ran on the Juju leader.") + return + + if not self.upgrade_stack: + event.fail(message="Nothing to resume, upgrade stack unset.") + return + + # Check whether this is being run after juju refresh was called + # (the size of the upgrade stack should match the number of total + # unit minus one). + if len(self.upgrade_stack) != len(self.peer_relation.units): + event.fail(message="Upgrade can be resumed only once after juju refresh is called.") + return + + try: + next_partition = self.upgrade_stack[-1] + self._set_rolling_update_partition(partition=next_partition) + event.set_results({"message": f"Upgrade will resume on unit {next_partition}"}) + except KubernetesClientError: + event.fail(message="Cannot set rolling update partition.") + + def _upgrade_supported_check(self) -> None: + """Checks if previous versions can be upgraded to new versions. 
+ + Raises: + :class:`VersionError` if upgrading to existing `version` is not supported + """ + keys = self.dependency_model.__fields__.keys() + + compatible = True + incompatibilities: List[Tuple[str, str, str, str]] = [] + for key in keys: + old_dep: DependencyModel = getattr(self.stored_dependencies, key) + new_dep: DependencyModel = getattr(self.dependency_model, key) + + if not old_dep.can_upgrade(dependency=new_dep): + compatible = False + incompatibilities.append( + (key, old_dep.version, new_dep.version, new_dep.upgrade_supported) + ) + + base_message = "Versions incompatible" + base_cause = "Upgrades only supported for specific versions" + if not compatible: + for incompat in incompatibilities: + base_message += ( + f", {incompat[0]} {incompat[1]} can not be upgraded to {incompat[2]}" + ) + base_cause += f", {incompat[0]} versions satisfying requirement {incompat[3]}" + + raise VersionError( + message=base_message, + cause=base_cause, + ) + + def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None: + """Handler for `upgrade-charm` events.""" + # defer if not all units have pre-upgraded + if not self.peer_relation: + event.defer() + return + + if not self.upgrade_stack: + logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.") + return + + if self.substrate == "vm": + # for VM run version checks on leader only + if self.charm.unit.is_leader(): + try: + self._upgrade_supported_check() + except VersionError as e: # not ready if not passed check + logger.error(e) + self.set_unit_failed() + return + top_unit_id = self.upgrade_stack[-1] + top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}") + if ( + top_unit == self.charm.unit + and self.peer_relation.data[self.charm.unit].get("state") == "recovery" + ): + # While in a rollback and the Juju leader unit is the top unit in the upgrade stack, emit the event + # for this unit to start the rollback. + self.peer_relation.data[self.charm.unit].update({"state": "ready"}) + self.on_upgrade_changed(event) + return + self.charm.unit.status = WaitingStatus("other units upgrading first...") + self.peer_relation.data[self.charm.unit].update({"state": "ready"}) + + if len(self.app_units) == 1: + # single unit upgrade, emit upgrade_granted event right away + getattr(self.on, "upgrade_granted").emit() + + else: + # for k8s run version checks only on highest ordinal unit + if ( + self.charm.unit.name + == f"{self.charm.app.name}/{self.charm.app.planned_units() -1}" + ): + try: + self._upgrade_supported_check() + except VersionError as e: # not ready if not passed check + logger.error(e) + self.set_unit_failed() + return + # On K8s an unit that receives the upgrade-charm event is upgrading + self.charm.unit.status = MaintenanceStatus("upgrading unit") + self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) + + def on_upgrade_changed(self, event: EventBase) -> None: + """Handler for `upgrade-relation-changed` events.""" + if not self.peer_relation: + return + + # if any other unit failed, don't continue with upgrade + if self.cluster_state == "failed": + logger.debug("Cluster failed to upgrade, exiting...") + return + + if self.substrate == "vm" and self.cluster_state == "recovery": + # skip run while in recovery. 
The event will be retrigged when the cluster is ready + logger.debug("Cluster in recovery, skip...") + return + + # if all units completed, mark as complete + if not self.upgrade_stack: + if self.state == "completed" and self.cluster_state in ["idle", "completed"]: + logger.info("All units completed upgrade, setting idle upgrade state...") + self.charm.unit.status = ActiveStatus() + self.peer_relation.data[self.charm.unit].update({"state": "idle"}) + + if self.charm.unit.is_leader(): + logger.debug("Persisting new dependencies to upgrade relation data...") + self.peer_relation.data[self.charm.app].update( + {"dependencies": json.dumps(self.dependency_model.dict())} + ) + return + + if self.cluster_state == "idle": + logger.debug("upgrade-changed event handled before pre-checks, exiting...") + return + + logger.debug("Did not find upgrade-stack or completed cluster state, skipping...") + return + + # upgrade ongoing, set status for waiting units + if "upgrading" in self.unit_states and self.state in ["idle", "ready"]: + self.charm.unit.status = WaitingStatus("other units upgrading first...") + + # pop mutates the `upgrade_stack` attr + top_unit_id = self.upgrade_stack.pop() + top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}") + top_state = self.peer_relation.data[top_unit].get("state") + + # if top of stack is completed, leader pops it + if self.charm.unit.is_leader() and top_state == "completed": + logger.debug(f"{top_unit} has finished upgrading, updating stack...") + + # writes the mutated attr back to rel data + self.peer_relation.data[self.charm.app].update( + {"upgrade-stack": json.dumps(self.upgrade_stack)} + ) + + # recurse on leader to ensure relation changed event not lost + # in case leader is next or the last unit to complete + self.on_upgrade_changed(event) + + # if unit top of stack and all units ready (i.e stack), emit granted event + if ( + self.charm.unit == top_unit + and top_state in ["ready", "upgrading"] + and self.cluster_state == "ready" + and "upgrading" not in self.other_unit_states + ): + logger.debug( + f"{top_unit.name} is next to upgrade, emitting `upgrade_granted` event and upgrading..." + ) + self.charm.unit.status = MaintenanceStatus("upgrading...") + self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) + + try: + getattr(self.on, "upgrade_granted").emit() + except DependencyError as e: + logger.error(e) + self.set_unit_failed() + return + + def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: + """Handler for `upgrade-granted` events. + + Handlers of this event must meet the following: + - SHOULD check for related application deps from :class:`DataUpgrade.dependencies` + - MAY raise :class:`DependencyError` if dependency not met + - MUST update unit `state` after validating the success of the upgrade, calling one of: + - :class:`DataUpgrade.set_unit_failed` if the unit upgrade fails + - :class:`DataUpgrade.set_unit_completed` if the unit upgrade succeeds + - MUST call :class:`DataUpgarde.on_upgrade_changed` on exit so event not lost on leader + """ + # don't raise if k8s substrate, only return + if self.substrate == "k8s": + return + + raise NotImplementedError + + def _on_upgrade_finished(self, _) -> None: + """Handler for `upgrade-finished` events.""" + if self.substrate == "vm" or not self.peer_relation: + return + + # Emit the upgrade relation changed event in the leader to update the upgrade_stack. 
+ if self.charm.unit.is_leader(): + self.charm.on[self.relation_name].relation_changed.emit( + self.model.get_relation(self.relation_name) + ) + + # This hook shouldn't run for the last unit (the first that is upgraded). For that unit it + # should be done through an action after the upgrade success on that unit is double-checked. + unit_number = int(self.charm.unit.name.split("/")[1]) + if unit_number == len(self.peer_relation.units): + logger.info( + f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade" + ) + return + + # Also, the hook shouldn't run for the first unit (the last that is upgraded). + if unit_number == 0: + logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete") + return + + try: + # Use the unit number instead of the upgrade stack to avoid race conditions + # (i.e. the leader updates the upgrade stack after this hook runs). + next_partition = unit_number - 1 + logger.debug(f"Set rolling update partition to unit {next_partition}") + self._set_rolling_update_partition(partition=next_partition) + except KubernetesClientError: + logger.exception("Cannot set rolling update partition") + self.set_unit_failed() + self.log_rollback_instructions() + + def _set_rolling_update_partition(self, partition: int) -> None: + """Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`. + + Args: + partition: partition to set. + + K8s only. It should decrement the rolling update strategy partition by using a code + like the following: + + from lightkube.core.client import Client + from lightkube.core.exceptions import ApiError + from lightkube.resources.apps_v1 import StatefulSet + + try: + patch = {"spec": {"updateStrategy": {"rollingUpdate": {"partition": partition}}}} + Client().patch(StatefulSet, name=self.charm.model.app.name, namespace=self.charm.model.name, obj=patch) + logger.debug(f"Kubernetes StatefulSet partition set to {partition}") + except ApiError as e: + if e.status.code == 403: + cause = "`juju trust` needed" + else: + cause = str(e) + raise KubernetesClientError("Kubernetes StatefulSet patch failed", cause) + """ + if self.substrate == "vm": + return + + raise NotImplementedError diff --git a/charms/worker/k8s/requirements.txt b/charms/worker/k8s/requirements.txt index 8f2e0805..01a13b16 100644 --- a/charms/worker/k8s/requirements.txt +++ b/charms/worker/k8s/requirements.txt @@ -10,3 +10,5 @@ tomli ==2.1.0 tomli-w == 1.0.0 typing_extensions==4.12.2 websocket-client==1.8.0 +poetry-core==1.9.1 +lightkube==0.15.5 diff --git a/charms/worker/k8s/src/charm.py b/charms/worker/k8s/src/charm.py index 8d28884a..d3775be0 100755 --- a/charms/worker/k8s/src/charm.py +++ b/charms/worker/k8s/src/charm.py @@ -56,10 +56,13 @@ from charms.node_base import LabelMaker from charms.reconciler import Reconciler from cos_integration import COSIntegration +from inspector import ClusterInspector +from literals import DEPENDENCIES from snap import management as snap_management from snap import version as snap_version from token_distributor import ClusterTokenType, TokenCollector, TokenDistributor, TokenStrategy from typing_extensions import Literal +from upgrade import K8sDependenciesModel, K8sUpgrade # Log messages can be retrieved using juju debug-log log = logging.getLogger(__name__) @@ -126,6 +129,14 @@ def __init__(self, *args): self.api_manager = K8sdAPIManager(factory) xcp_relation = "external-cloud-provider" if self.is_control_plane else "" self.xcp = ExternalCloudProvider(self, xcp_relation) + 
self.cluster_inspector = ClusterInspector(kubeconfig_path=KUBECONFIG) + self.upgrade = K8sUpgrade( + self, + node_manager=self.cluster_inspector, + relation_name="upgrade", + substrate="vm", + dependency_model=K8sDependenciesModel(**DEPENDENCIES), + ) self.cos = COSIntegration(self) self.reconciler = Reconciler(self, self._reconcile) self.distributor = TokenDistributor(self, self.get_node_name(), self.api_manager) @@ -293,7 +304,7 @@ def _bootstrap_k8s_snap(self): log.info("K8s cluster already bootstrapped") return - bootstrap_config = BootstrapConfig() + bootstrap_config = BootstrapConfig.construct() self._configure_datastore(bootstrap_config) self._configure_cloud_provider(bootstrap_config) self._configure_annotations(bootstrap_config) @@ -610,6 +621,7 @@ def _update_kubernetes_version(self): """ relation = self.model.get_relation("cluster") if not relation: + status.add(ops.BlockedStatus("Missing cluster integration")) raise ReconcilerError("Missing cluster integration") if version := snap_version("k8s"): relation.data[self.unit]["version"] = version diff --git a/charms/worker/k8s/src/inspector.py b/charms/worker/k8s/src/inspector.py new file mode 100644 index 00000000..1b3f1a18 --- /dev/null +++ b/charms/worker/k8s/src/inspector.py @@ -0,0 +1,94 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""A module for inspecting a Kubernetes cluster.""" + +import logging +from pathlib import Path +from typing import List, Optional + +from lightkube import ApiError, Client, KubeConfig +from lightkube.core.client import LabelSelector +from lightkube.resources.core_v1 import Node, Pod + +log = logging.getLogger(__name__) + + +class ClusterInspector: + """A helper class for inspecting a Kubernetes cluster.""" + + class ClusterInspectorError(Exception): + """Base exception for ClusterInspector errors.""" + + def __init__( + self, + kubeconfig_path: Path, + ): + """Initialize the ClusterInspector. + + Args: + kubeconfig_path: The path to the kubeconfig file. + """ + self.kubeconfig_path = kubeconfig_path + # NOTE (mateoflorido): The client is set to None to avoid + # initializing it when the object is created (e.g. during + # the charm install as we don't have the kubeconfig yet). + # The client will be initialized when it's needed using the + # _get_client method. + self.client: Optional[Client] = None + + def _get_client(self) -> Client: + """Return the client instance.""" + if self.client is None: + config = KubeConfig.from_file(str(self.kubeconfig_path)) + self.client = Client(config=config.get()) + return self.client + + def get_nodes(self, labels: LabelSelector) -> Optional[List[Node]]: + """Get nodes from the cluster. + + Args: + labels: A dictionary of labels to filter nodes. + + Returns: + A list of the failed nodes that match the label selector. + + Raises: + ClusterInspectorError: If the nodes cannot be retrieved. + """ + client = self._get_client() + unready_nodes = [] + try: + for node in client.list(Node, labels=labels): + if node.status != "Ready": + unready_nodes.append(node) + except ApiError as e: + raise ClusterInspector.ClusterInspectorError(f"Failed to get nodes: {e}") from e + return unready_nodes or None + + def verify_pods_running(self, namespaces: List[str]) -> Optional[str]: + """Verify that all pods in the specified namespaces are running. + + Args: + namespaces: A list of namespaces to check. + + Returns: + None if all pods are running, otherwise returns a string + containing the namespaces that have pods not running. 
+ + Raises: + ClusterInspectorError: If the pods cannot be retrieved. + """ + client = self._get_client() + + failing_pods = [] + try: + for namespace in namespaces: + for pod in client.list(Pod, namespace=namespace): + if pod.status.phase != "Running": # type: ignore + failing_pods.append(f"{namespace}/{pod.metadata.name}") # type: ignore + if failing_pods: + return ", ".join(failing_pods) + except ApiError as e: + raise ClusterInspector.ClusterInspectorError(f"Failed to get pods: {e}") from e + return None diff --git a/charms/worker/k8s/src/literals.py b/charms/worker/k8s/src/literals.py new file mode 100644 index 00000000..656f95af --- /dev/null +++ b/charms/worker/k8s/src/literals.py @@ -0,0 +1,21 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Literals for the charm.""" + +DEPENDENCIES = { + # NOTE: Update the dependencies for the k8s-charm before releasing. + "k8s_charm": { + "dependencies": {"k8s-worker": ">2"}, + "name": "k8s", + "upgrade_supported": ">=1", + "version": "2", + }, + # NOTE: Update the dependencies for the k8s-service before releasing. + "k8s_service": { + "dependencies": {"k8s-worker": "^1.31.0"}, + "name": "k8s", + "upgrade_supported": "^1.30.0", + "version": "1.31.2", + }, +} diff --git a/charms/worker/k8s/src/token_distributor.py b/charms/worker/k8s/src/token_distributor.py index d507d606..257e1de8 100644 --- a/charms/worker/k8s/src/token_distributor.py +++ b/charms/worker/k8s/src/token_distributor.py @@ -35,9 +35,20 @@ class K8sCharm(Protocol): unit (ops.Unit): The unit object. """ - app: ops.Application - model: ops.Model - unit: ops.Unit + @property + def app(self) -> ops.Application: + """The application object.""" + ... # pylint: disable=unnecessary-ellipsis + + @property + def model(self) -> ops.Model: + """The model object.""" + ... # pylint: disable=unnecessary-ellipsis + + @property + def unit(self) -> ops.Unit: + """The unit object.""" + ... # pylint: disable=unnecessary-ellipsis def get_cluster_name(self) -> str: """Get the cluster name.""" diff --git a/charms/worker/k8s/src/upgrade.py b/charms/worker/k8s/src/upgrade.py new file mode 100644 index 00000000..fc2548f6 --- /dev/null +++ b/charms/worker/k8s/src/upgrade.py @@ -0,0 +1,99 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""A module for upgrading the k8s and k8s-worker charms.""" + +import logging +from typing import List + +from charms.data_platform_libs.v0.upgrade import ClusterNotReadyError, DataUpgrade, DependencyModel +from inspector import ClusterInspector +from pydantic import BaseModel + +log = logging.getLogger(__name__) + + +class K8sDependenciesModel(BaseModel): + """A model for the k8s and k8s-worker charm dependencies. + + Attributes: + k8s_charm: The k8s charm dependency model. + k8s_service: The k8s-service charm dependency model. + """ + + k8s_charm: DependencyModel + k8s_service: DependencyModel + + +class K8sUpgrade(DataUpgrade): + """A helper class for upgrading the k8s and k8s-worker charms.""" + + def __init__(self, charm, node_manager: ClusterInspector, **kwargs): + """Initialize the K8sUpgrade. + + Args: + charm: The charm instance. + node_manager: The ClusterInspector instance. + kwargs: Additional keyword arguments. + """ + super().__init__(charm, **kwargs) + self.charm = charm + self.node_manager = node_manager + + def pre_upgrade_check(self) -> None: + """Check if the cluster is ready for an upgrade. 
+ + It verifies that the cluster nodes are ready before proceeding and + if the pods in the specified namespace are ready. + + Raises: + ClusterNotReadyError: If the cluster is not ready for an upgrade. + """ + try: + nodes = self.node_manager.get_nodes( + labels={"juju-charm": "k8s-worker" if self.charm.is_worker else "k8s"} + ) + except ClusterInspector.ClusterInspectorError as e: + raise ClusterNotReadyError( + message="Cluster is not ready for an upgrade", + cause=str(e), + resolution="""API server may not be reachable. + Please check that the API server is up and running.""", + ) from e + + unready_nodes = nodes or [] + + if unready_nodes: + raise ClusterNotReadyError( + message="Cluster is not ready for an upgrade", + cause=f"Nodes not ready: {', '.join(unready_nodes)}", + resolution="""Node(s) may be in a bad state. + Please check the node(s) for more information.""", + ) + + if failing_pods := self.node_manager.verify_pods_running(["kube-system"]): + raise ClusterNotReadyError( + message="Cluster is not ready", + cause=f"Pods not running in namespace(s): {failing_pods}", + resolution="Check the logs for the failing pods.", + ) + + def build_upgrade_stack(self) -> List[int]: + """Return a list of unit numbers to upgrade in order. + + Returns: + A list of unit numbers to upgrade in order. + """ + relation = self.charm.model.get_relation("cluster") + if not relation: + return [int(self.charm.unit.name.split("/")[-1])] + + return [ + int(unit.name.split("/")[-1]) for unit in ({self.charm.unit} | set(relation.units)) + ] + + def log_rollback_instructions(self) -> None: + """Log instructions for rolling back the upgrade.""" + log.critical( + "To rollback the upgrade, run: `juju refresh` to the previously deployed revision." + ) diff --git a/charms/worker/k8s/tests/unit/test_inspector.py b/charms/worker/k8s/tests/unit/test_inspector.py new file mode 100644 index 00000000..2e532095 --- /dev/null +++ b/charms/worker/k8s/tests/unit/test_inspector.py @@ -0,0 +1,101 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Tests for the inspector module.""" + +import unittest +from pathlib import Path +from typing import List +from unittest.mock import MagicMock + +from inspector import ClusterInspector +from lightkube.core.exceptions import ApiError +from lightkube.resources.core_v1 import Node, Pod + + +class TestClusterInspector(unittest.TestCase): + """Tests for the ClusterInspector class.""" + + def setUp(self): + """Set up common test fixtures.""" + self.inspector = ClusterInspector(Path("/path/to/kubeconfig")) + self.mock_client = MagicMock() + self.inspector.client = self.mock_client + + def test_get_nodes_returns_unready(self): + """Test that get_nodes returns unready nodes.""" + mock_node1 = MagicMock(spec=Node) + mock_node1.status = "Ready" + mock_node1.metadata.name = "node1" + + mock_node2 = MagicMock(spec=Node) + mock_node2.status = "NotReady" + mock_node2.metadata.name = "node2" + + self.mock_client.list.return_value = [mock_node1, mock_node2] + + nodes: List[Node] = self.inspector.get_nodes({"role": "control-plane"}) + + self.mock_client.list.assert_called_once_with(Node, labels={"role": "control-plane"}) + self.assertEqual(len(nodes), 1) + # pylint: disable=unsubscriptable-object + self.assertEqual(nodes[0].metadata.name, "node2") # type: ignore + + def test_get_nodes_api_error(self): + """Test get_nodes handles API errors.""" + self.mock_client.list.side_effect = ApiError(response=MagicMock()) + with self.assertRaises(ClusterInspector.ClusterInspectorError): + self.inspector.get_nodes({"role": "control-plane"}) + + def test_verify_pods_running_failed_pods(self): + """Test verify_pods_running when some pods are not running.""" + mock_pod = MagicMock(spec=Pod) + mock_pod.status.phase = "Running" + mock_pod.metadata.name = "pod1" + + mock_pod2 = MagicMock(spec=Pod) + mock_pod2.status.phase = "Failed" + mock_pod2.metadata.name = "pod2" + + self.mock_client.list.return_value = [mock_pod, mock_pod2] + + result = self.inspector.verify_pods_running(["kube-system"]) + + self.assertEqual(result, "kube-system/pod2") + self.mock_client.list.assert_called_once_with(Pod, namespace="kube-system") + + def test_verify_pods_running_multiple_namespaces(self): + """Test verify_pods_running with multiple namespaces.""" + + def mock_list_pods(_, namespace): + """Mock the list method to return pods in different states. + + Args: + namespace: The namespace to list pods from. + + Returns: + A list of pods in different states. + """ + if namespace == "ns1": + mock_pod = MagicMock(spec=Pod) + mock_pod.status.phase = "Running" + mock_pod.metadata.name = "pod1" + return [mock_pod] + mock_pod = MagicMock(spec=Pod) + mock_pod.status.phase = "Failed" + mock_pod.metadata.name = "pod2" + return [mock_pod] + + self.mock_client.list.side_effect = mock_list_pods + + result = self.inspector.verify_pods_running(["ns1", "ns2"]) + + self.assertEqual(result, "ns2/pod2") + self.assertEqual(self.mock_client.list.call_count, 2) + + def test_verify_pods_running_api_error(self): + """Test verify_pods_running handles API errors.""" + self.mock_client.list.side_effect = ApiError(response=MagicMock()) + + with self.assertRaises(ClusterInspector.ClusterInspectorError): + self.inspector.verify_pods_running(["default"]) diff --git a/charms/worker/k8s/tests/unit/test_upgrade.py b/charms/worker/k8s/tests/unit/test_upgrade.py new file mode 100644 index 00000000..edeb5003 --- /dev/null +++ b/charms/worker/k8s/tests/unit/test_upgrade.py @@ -0,0 +1,119 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Tests for the upgrade module.""" + +import unittest +from unittest.mock import MagicMock + +from charms.data_platform_libs.v0.upgrade import ClusterNotReadyError +from inspector import ClusterInspector +from upgrade import K8sDependenciesModel, K8sUpgrade + + +class TestK8sUpgrade(unittest.TestCase): + """Tests for the K8sUpgrade class.""" + + def setUp(self): + """Set up common test fixtures.""" + self.charm = MagicMock() + self.node_manager = MagicMock(spec=ClusterInspector) + self.upgrade = K8sUpgrade( + self.charm, + node_manager=self.node_manager, + relation_name="upgrade", + substrate="vm", + dependency_model=K8sDependenciesModel( + **{ + "k8s_charm": { + "dependencies": {"k8s-worker": ">50"}, + "name": "k8s", + "upgrade_supported": ">90", + "version": "100", + }, + "k8s_service": { + "dependencies": {"k8s-worker": "^3"}, + "name": "k8s", + "upgrade_supported": ">=0.8", + "version": "1.31.1", + }, + } + ), + ) + + def test_pre_upgrade_check_worker_success(self): + """Test pre_upgrade_check succeeds for worker nodes.""" + self.charm.is_worker = True + self.node_manager.get_nodes.return_value = [] + self.node_manager.verify_pods_running.return_value = None + + self.upgrade.pre_upgrade_check() + + self.node_manager.get_nodes.assert_called_once_with(labels={"juju-charm": "k8s-worker"}) + self.node_manager.verify_pods_running.assert_called_once_with(["kube-system"]) + + def test_pre_upgrade_check_control_plane_success(self): + """Test pre_upgrade_check succeeds for control plane nodes.""" + self.charm.is_worker = False + self.node_manager.get_nodes.return_value = [] + self.node_manager.verify_pods_running.return_value = None + + self.upgrade.pre_upgrade_check() + + self.node_manager.get_nodes.assert_called_once_with(labels={"juju-charm": "k8s"}) + + def test_pre_upgrade_check_unready_nodes(self): + """Test pre_upgrade_check fails when nodes are not ready.""" + self.charm.is_worker = True + self.node_manager.get_nodes.return_value = [ + "worker-1", + "worker-2", + "worker-3", + ] + + with self.assertRaises(ClusterNotReadyError): + self.upgrade.pre_upgrade_check() + + def test_pre_upgrade_check_cluster_inspector_error(self): + """Test pre_upgrade_check handles ClusterInspectorError.""" + self.node_manager.get_nodes.side_effect = ClusterInspector.ClusterInspectorError( + "test error" + ) + + with self.assertRaises(ClusterNotReadyError): + self.upgrade.pre_upgrade_check() + + def test_pre_upgrade_check_pods_not_ready(self): + """Test pre_upgrade_check fails when pods are not ready.""" + self.charm.is_worker = True + self.node_manager.get_nodes.return_value = None + self.node_manager.verify_pods_running.return_value = "kube-system/pod-1" + + with self.assertRaises(ClusterNotReadyError): + self.upgrade.pre_upgrade_check() + + def test_build_upgrade_stack_no_relation(self): + """Test build_upgrade_stack when no cluster relation exists.""" + self.charm.unit.name = "k8s/0" + self.charm.model.get_relation.return_value = None + + result = self.upgrade.build_upgrade_stack() + + self.assertEqual(result, [0]) + self.charm.model.get_relation.assert_called_once_with("cluster") + + def test_build_upgrade_stack_with_relation(self): + """Test build_upgrade_stack with cluster relation.""" + self.charm.unit.name = "k8s/0" + relation = MagicMock() + unit_1 = MagicMock() + unit_1.name = "k8s/1" + unit_2 = MagicMock() + unit_2.name = "k8s/2" + relation.units = {unit_1, unit_2} + self.charm.model.get_relation.return_value = relation + + result = self.upgrade.build_upgrade_stack() + + 
self.assertEqual(sorted(result), [0, 1, 2]) + self.charm.model.get_relation.assert_called_once_with("cluster")
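
A quick sketch of how the new action is exercised once this patch is deployed — not part of the diff above; it assumes an application named `k8s` and Juju 3.x action syntax (on Juju 2.9 the equivalent is `juju run-action k8s/leader pre-upgrade-check --wait`):

```bash
# Run the cluster health checks and persist the upgrade stack on the leader.
juju run k8s/leader pre-upgrade-check

# If the checks pass, refresh the charm; units then upgrade following the recorded stack order.
juju refresh k8s
```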