From b0bf05615fa3c2d5c0aa0065a1903fdc41ebfb72 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Sat, 23 Nov 2024 22:18:45 +0000 Subject: [PATCH 1/2] chore(slurmctld, slurm): clean up relation data These changes remove the unused "cluster_name" from the relation data sent to slurmd on the slurmd relation. Additionally, make the "cluster_name" property private. --- charms/slurmctld/charmcraft.yaml | 2 +- charms/slurmctld/src/charm.py | 4 ++-- charms/slurmctld/src/interface_slurmd.py | 1 - charms/slurmctld/tests/unit/test_charm.py | 6 +++--- charms/slurmd/src/interface_slurmctld.py | 4 ---- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/charms/slurmctld/charmcraft.yaml b/charms/slurmctld/charmcraft.yaml index c5470e7..3f58b65 100644 --- a/charms/slurmctld/charmcraft.yaml +++ b/charms/slurmctld/charmcraft.yaml @@ -67,7 +67,7 @@ config: options: cluster-name: type: string - default: osd-cluster + default: "osd-cluster" description: | Name to be recorded in database for jobs from this cluster. diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index 99ec654..80ffea3 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -302,7 +302,7 @@ def _assemble_slurmctld_parameters() -> dict[str, Any]: } slurm_conf = SlurmConfig( - ClusterName=self.cluster_name, + ClusterName=self._cluster_name, SlurmctldAddr=self._slurmd_ingress_address, SlurmctldHost=[self._slurmctld.hostname], SlurmctldParameters=_assemble_slurmctld_parameters(), @@ -377,7 +377,7 @@ def _resume_nodes(self, nodelist: List[str]) -> None: self._slurmctld.scontrol(update_cmd) @property - def cluster_name(self) -> str: + def _cluster_name(self) -> str: """Return the cluster name.""" cluster_name = "charmedhpc" if cluster_name_from_config := self.config.get("cluster-name"): diff --git a/charms/slurmctld/src/interface_slurmd.py b/charms/slurmctld/src/interface_slurmd.py index dba0c13..84dcaae 100644 --- a/charms/slurmctld/src/interface_slurmd.py +++ b/charms/slurmctld/src/interface_slurmd.py @@ -89,7 +89,6 @@ def _on_relation_created(self, event: RelationCreatedEvent) -> None: { "munge_key": self._charm.get_munge_key(), "slurmctld_host": self._charm.hostname, - "cluster_name": self._charm.cluster_name, "nhc_params": health_check_params, } ) diff --git a/charms/slurmctld/tests/unit/test_charm.py b/charms/slurmctld/tests/unit/test_charm.py index 353b3ff..3f886ef 100644 --- a/charms/slurmctld/tests/unit/test_charm.py +++ b/charms/slurmctld/tests/unit/test_charm.py @@ -34,12 +34,12 @@ def setUp(self): self.harness.begin() def test_cluster_name(self) -> None: - """Test that the cluster_name property works.""" - self.assertEqual(self.harness.charm.cluster_name, "osd-cluster") + """Test that the _cluster_name property works.""" + self.assertEqual(self.harness.charm._cluster_name, "osd-cluster") def test_cluster_info(self) -> None: """Test the cluster_info property works.""" - self.assertEqual(type(self.harness.charm.cluster_name), str) + self.assertEqual(type(self.harness.charm._cluster_name), str) def test_is_slurm_installed(self) -> None: """Test that the is_slurm_installed method works.""" diff --git a/charms/slurmd/src/interface_slurmctld.py b/charms/slurmd/src/interface_slurmctld.py index 50b0e9e..1aac04f 100644 --- a/charms/slurmd/src/interface_slurmctld.py +++ b/charms/slurmd/src/interface_slurmctld.py @@ -24,14 +24,12 @@ class SlurmctldAvailableEvent(EventBase): def __init__( self, handle, - cluster_name, munge_key, nhc_params, slurmctld_host, ): super().__init__(handle) - self.cluster_name = cluster_name self.munge_key = munge_key self.nhc_params = nhc_params self.slurmctld_host = slurmctld_host @@ -39,7 +37,6 @@ def __init__( def snapshot(self): """Snapshot the event data.""" return { - "cluster_name": self.cluster_name, "munge_key": self.munge_key, "nhc_params": self.nhc_params, "slurmctld_host": self.slurmctld_host, @@ -47,7 +44,6 @@ def snapshot(self): def restore(self, snapshot): """Restore the snapshot of the event data.""" - self.cluster_name = snapshot.get("cluster_name") self.munge_key = snapshot.get("munge_key") self.nhc_params = snapshot.get("nhc_params") self.slurmctld_host = snapshot.get("slurmctld_host") From 2ffe50510eab46d11fef04319803c2af98a80e1b Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Sun, 24 Nov 2024 17:20:31 +0000 Subject: [PATCH 2/2] chore(slurmctld): use peer relation for ingress ip These changes add a peer relation for the slurmctld charm and replace using the slurmd interface to obtain the ingress_address with the new slurmctld-peer relation. The reason for this change is that we do not want to depend on the existence of the slurmd relation in order to know our ip. Using a peer relation we will always have resolvability so long as juju knows the ip address of the unit. --- charms/slurmctld/charmcraft.yaml | 4 ++++ charms/slurmctld/src/charm.py | 18 ++++++++++-------- charms/slurmctld/src/constants.py | 2 ++ charms/slurmctld/src/exceptions.py | 13 +++++++++++++ charms/slurmctld/tests/unit/test_charm.py | 1 + 5 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 charms/slurmctld/src/exceptions.py diff --git a/charms/slurmctld/charmcraft.yaml b/charms/slurmctld/charmcraft.yaml index 3f58b65..ecfaaf5 100644 --- a/charms/slurmctld/charmcraft.yaml +++ b/charms/slurmctld/charmcraft.yaml @@ -36,6 +36,10 @@ provides: interface: cos_agent limit: 1 +peers: + slurmctld-peer: + interface: slurmctld-peer + assumes: - juju diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index 80ffea3..59b51cb 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -9,7 +9,8 @@ import subprocess from typing import Any, Dict, List, Optional, Union -from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS +from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, PEER_RELATION +from exceptions import IngressAddressUnavailableError from interface_slurmd import ( PartitionAvailableEvent, PartitionUnavailableEvent, @@ -303,7 +304,7 @@ def _assemble_slurmctld_parameters() -> dict[str, Any]: slurm_conf = SlurmConfig( ClusterName=self._cluster_name, - SlurmctldAddr=self._slurmd_ingress_address, + SlurmctldAddr=self._ingress_address, SlurmctldHost=[self._slurmctld.hostname], SlurmctldParameters=_assemble_slurmctld_parameters(), ProctrackType="proctrack/linuxproc" if is_container() else "proctrack/cgroup", @@ -403,12 +404,13 @@ def hostname(self) -> str: return self._slurmctld.hostname @property - def _slurmd_ingress_address(self) -> str: - """Return the ingress_address from the slurmd relation if it exists.""" - ingress_address = "" - if binding := self.model.get_binding("slurmd"): - ingress_address = f"{binding.network.ingress_address}" - return ingress_address + def _ingress_address(self) -> str: + """Return the ingress_address from the peer relation if it exists.""" + if (peer_binding := self.model.get_binding(PEER_RELATION)) is not None: + ingress_address = f"{peer_binding.network.ingress_address}" + logger.debug(f"Slurmctld ingress_address: {ingress_address}") + return ingress_address + raise IngressAddressUnavailableError("Ingress address unavailable") @property def slurm_installed(self) -> bool: diff --git a/charms/slurmctld/src/constants.py b/charms/slurmctld/src/constants.py index b9334b5..91acce1 100644 --- a/charms/slurmctld/src/constants.py +++ b/charms/slurmctld/src/constants.py @@ -3,6 +3,8 @@ """This module provides constants for the slurmctld-operator charm.""" +PEER_RELATION = "slurmctld-peer" + CHARM_MAINTAINED_SLURM_CONF_PARAMETERS = { "AuthAltParameters": {"jwt_key": "/var/lib/slurm/checkpoint/jwt_hs256.key"}, "AuthAltTypes": ["auth/jwt"], diff --git a/charms/slurmctld/src/exceptions.py b/charms/slurmctld/src/exceptions.py new file mode 100644 index 0000000..472c28f --- /dev/null +++ b/charms/slurmctld/src/exceptions.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Omnivector Corp +# See LICENSE file for licensing details. + +"""Custom exceptions for the slurmctld operator.""" + + +class IngressAddressUnavailableError(Exception): + """Exception raised when a slurm operation failed.""" + + @property + def message(self) -> str: + """Return message passed as argument to exception.""" + return self.args[0] diff --git a/charms/slurmctld/tests/unit/test_charm.py b/charms/slurmctld/tests/unit/test_charm.py index 3f886ef..29fc2bc 100644 --- a/charms/slurmctld/tests/unit/test_charm.py +++ b/charms/slurmctld/tests/unit/test_charm.py @@ -160,6 +160,7 @@ def test_on_slurmdbd_unavailable(self) -> None: def test_get_user_supplied_parameters(self, *_) -> None: """Test that user supplied parameters are parsed correctly.""" self.harness.add_relation("slurmd", "slurmd") + self.harness.add_relation("slurmctld-peer", self.harness.charm.app.name) self.harness.update_config( {"slurm-conf-parameters": "JobAcctGatherFrequency=task=30,network=40"} )