feat(all): implement slurm_ops charm library #35

Merged: 30 commits from feat-impl-slurm-ops into charmed-hpc:main on Nov 18, 2024
Commits (30)
85623fc
feat(slurmd): add util modules for nhc, machine info, and service man…
NucciTheBoss Nov 8, 2024
752edb1
feat(slurmd): use `slurm_ops` to provide `SlurmdManager`
NucciTheBoss Nov 8, 2024
4c22d5b
tests(slurmd): update unit tests for slurmd to patch `slurm_ops`
NucciTheBoss Nov 8, 2024
9ccf425
chore(slurmctld): bump version of `slurmutils` to 0.8.0
NucciTheBoss Nov 8, 2024
68355b6
feat(slurmctld): use `slurm_ops` to provide `SlurmctldManager`
NucciTheBoss Nov 8, 2024
12a935a
test(slurmctld): update unit tests for slurmctld to patch `slurm_ops`
NucciTheBoss Nov 8, 2024
0f8ab63
chore(slurmdbd): add `slurmutils` as dep
NucciTheBoss Nov 8, 2024
580166c
feat(slurmdbd): use `slurm_ops` to provide `SlurmdbdManager`
NucciTheBoss Nov 8, 2024
a975ccb
tests(slurmdbd): update unit tests for slurmdbd to patch `slurm_ops`
NucciTheBoss Nov 8, 2024
056d349
chore(slurmrestd): add `slurmutils` as dep
NucciTheBoss Nov 8, 2024
5e45874
feat(slurmrestd): use `slurm_ops` to provide `SlurmrestdManager`
NucciTheBoss Nov 8, 2024
074651d
tests(slurmrestd): update unit tests for slurmrestd to patch `slurm_ops`
NucciTheBoss Nov 8, 2024
5153464
chore(tests): update test dependencies
NucciTheBoss Nov 8, 2024
ae6e78f
fix: apply ruff fixes when running `tox run -e fmt` action
NucciTheBoss Nov 8, 2024
d3a92f2
feat: add `slurm_ops` and `is_container` charm libs to project
NucciTheBoss Nov 8, 2024
4b5bc51
fix(slurmctld,slurmd): replace word sanity in TODO comments
NucciTheBoss Nov 8, 2024
07ec196
tests(slurmdbd): remove patch decorators for `slurmdbd_ops`
NucciTheBoss Nov 8, 2024
648e032
fix(slurmctld,slurmd): correct typing for static checker
NucciTheBoss Nov 8, 2024
81a19ff
fix(all): add parts definition to install `rustc` compiler
NucciTheBoss Nov 8, 2024
7964c44
Merge branch 'charmed-hpc:main' into feat-impl-slurm-ops
NucciTheBoss Nov 12, 2024
607c484
fix: remove unused `constants` module from slurmd charm
NucciTheBoss Nov 12, 2024
7b71612
fix(slurmd): strip first level of nhc tarball
NucciTheBoss Nov 14, 2024
43d3a08
fix(slurmctld): adapt config values to slurm.conf editor
NucciTheBoss Nov 14, 2024
b0e11a6
fix(slurmd): increase max cyclomatic complexity to 15
NucciTheBoss Nov 14, 2024
26ca3a3
fix(slurmdbd): adapt config values to slurmdbd.conf editor
NucciTheBoss Nov 14, 2024
9e3bbb1
chore(all): bump version of external charm lib dependencies
NucciTheBoss Nov 15, 2024
14b32f0
fix(slurmctld,slurmdbd): correct type hints for pyright
NucciTheBoss Nov 15, 2024
c751b0a
fix(slurmd): correct order of debug messages
NucciTheBoss Nov 18, 2024
f0cadeb
refactor(slurmd): raise `FileNotFoundError` if `/etc/nhc/nhc.conf` do…
NucciTheBoss Nov 18, 2024
2ba972f
fix(slurmctld,slurmd): clean up TODO comments
NucciTheBoss Nov 18, 2024
2 changes: 2 additions & 0 deletions charms/slurmctld/build.yaml
@@ -1,3 +1,5 @@
external-libraries:
- charms.hpc_libs.v0.is_container
- charms.hpc_libs.v0.slurm_ops
- charms.operator_libs_linux.v0.apt
- charms.operator_libs_linux.v1.systemd
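Both `charms.hpc_libs` entries are the new external libraries this PR introduces. As a sketch of how external charm libraries are normally vendored into a charm's source tree with the standard `charmcraft` workflow (assuming these libraries are published on Charmhub under the hpc-libs charm):

    charmcraft fetch-lib charms.hpc_libs.v0.slurm_ops
    charmcraft fetch-lib charms.hpc_libs.v0.is_container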
14 changes: 14 additions & 0 deletions charms/slurmctld/charmcraft.yaml
@@ -1,5 +1,6 @@
# Copyright 2020 Omnivector Solutions, LLC
# See LICENSE file for licensing details.

name: slurmctld
summary: |
Slurmctld, the central management daemon of Slurm.
@@ -43,6 +44,19 @@ bases:
channel: "22.04"
architectures: [amd64]

parts:
charm:
charm-requirements: ["requirements.txt"]
override-build: |
cp /usr/bin/rustc-1.80 /usr/bin/rustc
craftctl default
build-packages:
- libffi-dev
- libssl-dev
- rustc-1.80
- cargo
- pkg-config
- git

config:
options:
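A note on the `parts` stanza above: the `override-build` step copies Ubuntu's versioned `rustc-1.80` binary over plain `rustc` before handing control back to `craftctl default`, presumably because a transitive Python dependency builds a Rust extension and expects an unversioned `rustc` on the PATH (the versioned package only installs `/usr/bin/rustc-1.80`). A quick check inside the build environment, as a sketch:

    rustc --version   # expected to report rustc 1.80.x after the copy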
2 changes: 1 addition & 1 deletion charms/slurmctld/requirements.txt
@@ -1,3 +1,3 @@
ops==2.15.0
distro==1.9.0
pycryptodome==3.20.0
slurmutils~=0.8.0
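The new `slurmutils~=0.8.0` pin uses PEP 440's compatible-release operator: it accepts any 0.8.x release but excludes 0.9.0, i.e. it is equivalent to:

    slurmutils>=0.8.0,<0.9.0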
149 changes: 70 additions & 79 deletions charms/slurmctld/src/charm.py
@@ -9,23 +9,16 @@
import subprocess
from typing import Any, Dict, List, Optional, Union

from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH
from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS
from interface_slurmd import (
PartitionAvailableEvent,
PartitionUnavailableEvent,
Slurmd,
SlurmdAvailableEvent,
SlurmdDepartedEvent,
)
from interface_slurmdbd import (
Slurmdbd,
SlurmdbdAvailableEvent,
SlurmdbdUnavailableEvent,
)
from interface_slurmrestd import (
Slurmrestd,
SlurmrestdAvailableEvent,
)
from interface_slurmdbd import Slurmdbd, SlurmdbdAvailableEvent, SlurmdbdUnavailableEvent
from interface_slurmrestd import Slurmrestd, SlurmrestdAvailableEvent
from ops import (
ActionEvent,
ActiveStatus,
@@ -38,8 +31,10 @@
WaitingStatus,
main,
)
from slurm_conf_editor import slurm_conf_as_string
from slurmctld_ops import SlurmctldManager, is_container
from slurmutils.models import CgroupConfig, SlurmConfig

from charms.hpc_libs.v0.is_container import is_container
from charms.hpc_libs.v0.slurm_ops import SlurmctldManager, SlurmOpsError

logger = logging.getLogger()

@@ -64,8 +59,7 @@ def __init__(self, *args):
user_supplied_slurm_conf_params=str(),
)

self._slurmctld_manager = SlurmctldManager()

self._slurmctld = SlurmctldManager(snap=False)
self._slurmd = Slurmd(self, "slurmd")
self._slurmdbd = Slurmdbd(self, "slurmdbd")
self._slurmrestd = Slurmrestd(self, "slurmrestd")
@@ -90,36 +84,30 @@

def _on_install(self, event: InstallEvent) -> None:
"""Perform installation operations for slurmctld."""
self.unit.status = WaitingStatus("Installing slurmctld")

if self._slurmctld_manager.install():

# Store the munge_key and jwt_rsa key in the stored state.
# NOTE: Use secrets instead of stored state when secrets are supported by the framework.
if self.model.unit.is_leader():
jwt_rsa = self._slurmctld_manager.generate_jwt_rsa()
self._stored.jwt_rsa = jwt_rsa

munge_key = self._slurmctld_manager.generate_munge_key()
self._stored.munge_key = munge_key

self._slurmctld_manager.stop_munged()
self._slurmctld_manager.write_munge_key(munge_key)
self._slurmctld_manager.start_munged()

self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_jwt_rsa(jwt_rsa)
self._slurmctld_manager.start_slurmctld()
self.unit.status = WaitingStatus("installing slurmctld")
try:
if self.unit.is_leader():
self._slurmctld.install()

# TODO: https://github.com/charmed-hpc/slurm-charms/issues/38 -
# Use Juju Secrets instead of StoredState for exchanging keys between units.
self._slurmctld.jwt.generate()
self._stored.jwt_rsa = self._slurmctld.jwt.get()
self._slurmctld.munge.key.generate()
self._stored.munge_key = self._slurmctld.munge.key.get()
self._slurmctld.munge.service.restart()
self._slurmctld.service.restart()
self.unit.set_workload_version(self._slurmctld.version())

self.unit.set_workload_version(self._slurmctld_manager.version())
self.slurm_installed = True
else:
self.unit.status = BlockedStatus("Only singleton slurmctld is supported.")
logger.debug("Secondary slurmctld not supported.")
self.unit.status = BlockedStatus("slurmctld high-availability not supported")
logger.warning(
"slurmctld high-availability is not supported yet. please scale down application."
)
event.defer()
else:
self.unit.status = BlockedStatus("Error installing slurmctld")
logger.error("Cannot install slurmctld, please debug.")
except SlurmOpsError as e:
logger.error(e.message)
event.defer()

self._on_write_slurm_conf(event)
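
For reference, a minimal standalone sketch of the `slurm_ops` manager API exactly as the install handler above exercises it (all calls taken from this diff; `snap=False` presumably selects the apt-packaged Slurm rather than the snap):

    from charms.hpc_libs.v0.slurm_ops import SlurmctldManager, SlurmOpsError

    slurmctld = SlurmctldManager(snap=False)
    try:
        slurmctld.install()                # install slurmctld and its runtime
        slurmctld.jwt.generate()           # create the JWT signing key
        jwt_rsa = slurmctld.jwt.get()      # read it back for relation data
        slurmctld.munge.key.generate()     # create the munge key
        munge_key = slurmctld.munge.key.get()
        slurmctld.munge.service.restart()  # restart munged with the new key
        slurmctld.service.restart()        # restart slurmctld itself
        print(slurmctld.version())         # workload version string
    except SlurmOpsError as e:
        print(e.message)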
@@ -154,21 +142,21 @@ def _on_config_changed(self, event: ConfigChangedEvent) -> None:
logger.debug("## Emitting write-slurm-config event.")
self._on_write_slurm_conf(event)

def _on_update_status(self, event: UpdateStatusEvent) -> None:
def _on_update_status(self, _: UpdateStatusEvent) -> None:
"""Handle update status."""
self._check_status()

def _on_show_current_config_action(self, event: ActionEvent) -> None:
"""Show current slurm.conf."""
slurm_conf = SLURM_CONF_PATH.read_text()
event.set_results({"slurm.conf": slurm_conf})
event.set_results({"slurm.conf": str(self._slurmctld.config.load())})

def _on_slurmrestd_available(self, event: SlurmrestdAvailableEvent) -> None:
"""Check that we have slurm_config when slurmrestd available otherwise defer the event."""
if self.model.unit.is_leader():
if self._check_status():
slurm_conf = slurm_conf_as_string(self._assemble_slurm_conf())
self._slurmrestd.set_slurm_config_on_app_relation_data(slurm_conf)
self._slurmrestd.set_slurm_config_on_app_relation_data(
str(self._slurmctld.config.load())
)
return
logger.debug("Cluster not ready yet, deferring event.")
event.defer()
Expand All @@ -177,7 +165,7 @@ def _on_slurmdbd_available(self, event: SlurmdbdAvailableEvent) -> None:
self._stored.slurmdbd_host = event.slurmdbd_host
self._on_write_slurm_conf(event)

def _on_slurmdbd_unavailable(self, event: SlurmdbdUnavailableEvent) -> None:
def _on_slurmdbd_unavailable(self, _: SlurmdbdUnavailableEvent) -> None:
self._stored.slurmdbd_host = ""
self._check_status()

@@ -234,16 +222,17 @@ def _on_write_slurm_conf(
return

if slurm_config := self._assemble_slurm_conf():
self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_slurm_conf(slurm_config)
self._slurmctld.service.disable()
self._slurmctld.config.dump(slurm_config)

# Write out any user_supplied_cgroup_parameters to /etc/slurm/cgroup.conf.
if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters", ""):
self._slurmctld_manager.write_cgroup_conf(str(user_supplied_cgroup_parameters))

self._slurmctld_manager.start_slurmctld()
self._slurmctld.cgroup.dump(
CgroupConfig.from_str(str(user_supplied_cgroup_parameters))
)

self._slurmctld_manager.slurm_cmd("scontrol", "reconfigure")
self._slurmctld.service.enable()
self._slurmctld.scontrol("reconfigure")

# Transitioning Nodes
#
@@ -267,34 +256,32 @@

# slurmrestd needs the slurm.conf file, so send it every time it changes.
if self._slurmrestd.is_joined is not False:
slurm_conf = slurm_conf_as_string(slurm_config)
self._slurmrestd.set_slurm_config_on_app_relation_data(slurm_conf)
self._slurmrestd.set_slurm_config_on_app_relation_data(str(slurm_config))
else:
logger.debug("## Should write slurm.conf, but we don't have it. " "Deferring.")
event.defer()

def _assemble_slurm_conf(self) -> Dict[str, Any]:
def _assemble_slurm_conf(self) -> SlurmConfig:
"""Return the slurm.conf parameters."""
user_supplied_parameters = self._get_user_supplied_parameters()

slurmd_parameters = self._slurmd.get_new_nodes_and_nodes_and_partitions()

def _assemble_slurmctld_parameters() -> str:
def _assemble_slurmctld_parameters() -> dict[str, Any]:
# Preprocess merging slurmctld_parameters if they exist in the context
slurmctld_param_config = CHARM_MAINTAINED_SLURM_CONF_PARAMETERS[
"SlurmctldParameters"
].split(",")
user_config = []
slurmctld_parameters = {"enable_configless": True}

if (
user_supplied_slurmctld_parameters := user_supplied_parameters.get(
"SlurmctldParameters", ""
)
!= ""
):
user_config.extend(user_supplied_slurmctld_parameters.split(","))
for opt in user_supplied_slurmctld_parameters.split(","):
k, v = opt.split("=", maxsplit=1)
slurmctld_parameters.update({k: v})

return ",".join(slurmctld_param_config + user_config)
return slurmctld_parameters

accounting_params = {}
if (slurmdbd_host := self._stored.slurmdbd_host) != "":
@@ -305,20 +292,20 @@ def _assemble_slurmctld_parameters() -> str:
"AccountingStoragePort": "6819",
}

slurm_conf = {
"ClusterName": self.cluster_name,
"SlurmctldAddr": self._slurmd_ingress_address,
"SlurmctldHost": self.hostname,
"SlurmctldParameters": _assemble_slurmctld_parameters(),
"ProctrackType": "proctrack/linuxproc" if is_container() else "proctrack/cgroup",
"TaskPlugin": "task/affinity" if is_container() else "task/cgroup,task/affinity",
slurm_conf = SlurmConfig(
ClusterName=self.cluster_name,
SlurmctldAddr=self._slurmd_ingress_address,
SlurmctldHost=[self._slurmctld.hostname],
SlurmctldParameters=_assemble_slurmctld_parameters(),
ProctrackType="proctrack/linuxproc" if is_container() else "proctrack/cgroup",
TaskPlugin=["task/affinity"] if is_container() else ["task/cgroup", "task/affinity"],
**accounting_params,
**CHARM_MAINTAINED_SLURM_CONF_PARAMETERS,
**slurmd_parameters,
**user_supplied_parameters,
}
)

logger.debug(f"slurm.conf: {slurm_conf}")
logger.debug(f"slurm.conf: {slurm_conf.dict()}")
return slurm_conf
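
`_assemble_slurm_conf` now returns a typed `slurmutils` model instead of a plain dict. A minimal sketch of the `slurmutils` API as used in this diff, with illustrative parameter values (only the keyword construction, `str()` rendering, and `CgroupConfig.from_str` parsing are taken from the diff itself):

    from slurmutils.models import CgroupConfig, SlurmConfig

    # Merge user-supplied "key=value,key=value" SlurmctldParameters into the
    # charm default, mirroring _assemble_slurmctld_parameters above.
    slurmctld_parameters = {"enable_configless": True}
    for opt in "idle_on_node_suspend=true".split(","):  # illustrative input
        k, v = opt.split("=", maxsplit=1)
        slurmctld_parameters[k] = v

    config = SlurmConfig(
        ClusterName="demo-cluster",        # illustrative values throughout
        SlurmctldHost=["slurmctld-0"],
        SlurmctldParameters=slurmctld_parameters,
        ProctrackType="proctrack/cgroup",
        TaskPlugin=["task/cgroup", "task/affinity"],
    )
    print(str(config))                     # renders slurm.conf-formatted text

    cgroup = CgroupConfig.from_str("ConstrainCores=yes")  # parse cgroup.conf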

@@ -333,11 +320,11 @@ def _get_user_supplied_parameters(self) -> Dict[Any, Any]:
return user_supplied_parameters

def _get_new_node_names_from_slurm_config(
self, slurm_config: Dict[str, Any]
self, slurm_config: SlurmConfig
) -> List[Optional[str]]:
"""Given the slurm_config, return the nodes that are DownNodes with reason 'New node.'."""
new_node_names = []
if down_nodes_from_slurm_config := slurm_config.get("down_nodes"):
if down_nodes_from_slurm_config := slurm_config.down_nodes:
for down_nodes_entry in down_nodes_from_slurm_config:
for down_node_name in down_nodes_entry["DownNodes"]:
if down_nodes_entry["Reason"] == "New node.":
@@ -352,12 +339,16 @@ def _check_status(self) -> bool:  # noqa C901
- Munge running
"""
if self.slurm_installed is not True:
self.unit.status = BlockedStatus("Error installing slurmctld")
self.unit.status = BlockedStatus(
"failed to install slurmctld. see logs for further details"
)
return False

if not self._slurmctld_manager.check_munged():
self.unit.status = BlockedStatus("Error configuring munge key")
return False
# TODO: https://github.com/charmed-hpc/hpc-libs/issues/18 -
# Re-enable munge key validation check when supported by `slurm_ops` charm library.
# if not self._slurmctld.check_munged():
# self.unit.status = BlockedStatus("Error configuring munge key")
# return False

self.unit.status = ActiveStatus("")
return True
@@ -374,7 +365,7 @@ def _resume_nodes(self, nodelist: List[str]) -> None:
"""Run scontrol to resume the specified node list."""
nodes = ",".join(nodelist)
update_cmd = f"update nodename={nodes} state=resume"
self._slurmctld_manager.slurm_cmd("scontrol", update_cmd)
self._slurmctld.scontrol(update_cmd)
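
`_resume_nodes` builds the standard `scontrol update` subcommand; run by hand against the same controller it would look like this (node names illustrative):

    scontrol update nodename=node-1,node-2 state=resume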

@property
def cluster_name(self) -> str:
@@ -400,7 +391,7 @@ def new_nodes(self, new_nodes: List[Any]) -> None:
@property
def hostname(self) -> str:
"""Return the hostname."""
return self._slurmctld_manager.hostname
return self._slurmctld.hostname

@property
def _slurmd_ingress_address(self) -> str: