Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
feat: use new manager for controlling slurmctld
Browse files Browse the repository at this point in the history
Still using some methods provided by the legacy manager
that we don't have time to refactor this pulse

Signed-off-by: Jason C. Nucciarone <[email protected]>
  • Loading branch information
NucciTheBoss committed Jul 15, 2024
1 parent 85fc688 commit d9849ce
Showing 1 changed file with 39 additions and 42 deletions.
81 changes: 39 additions & 42 deletions src/charm.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/usr/bin/env python3
# Copyright 2020 Omnivector Solutions, LLC
# Copyright 2020-2024 Omnivector, LLC.
# See LICENSE file for licensing details.

"""SlurmctldCharm."""
"""Slurmctld charm operator."""

import logging
import shlex
import subprocess
from typing import Any, Dict, List, Optional, Union

import charms.hpc_libs.v0.slurm_ops as slurm
from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH
from interface_slurmd import (
PartitionAvailableEvent,
Expand Down Expand Up @@ -39,7 +40,8 @@
main,
)
from slurm_conf_editor import slurm_conf_as_string
from slurmctld_ops import SlurmctldManager, is_container
from slurmctld_ops import LegacySlurmctldManager, is_container
from utils import SlurmctldManager, scontrol

logger = logging.getLogger()

Expand All @@ -64,7 +66,9 @@ def __init__(self, *args):
user_supplied_slurm_conf_params=str(),
)

self._slurmctld_manager = SlurmctldManager()
self._slurmctld = SlurmctldManager()
# Legacy manager provides operations we're still fleshing out.
self._legacy_manager = LegacySlurmctldManager()

self._slurmd = Slurmd(self, "slurmd")
self._slurmdbd = Slurmdbd(self, "slurmdbd")
Expand All @@ -90,37 +94,30 @@ def __init__(self, *args):

def _on_install(self, event: InstallEvent) -> None:
"""Perform installation operations for slurmctld."""
self.unit.status = WaitingStatus("Installing slurmctld")

if self._slurmctld_manager.install():

# Store the munge_key and jwt_rsa key in the stored state.
# NOTE: Use secrets instead of stored state when secrets are supported the framework.
if self.model.unit.is_leader():
jwt_rsa = self._slurmctld_manager.generate_jwt_rsa()
self._stored.jwt_rsa = jwt_rsa
if not self.unit.is_leader():
self.unit.status = BlockedStatus("multiple slurmctld controllers not supported")
logger.error("multiple controllers not support. we're working on enabling support :)")
event.defer()
return

munge_key = self._slurmctld_manager.generate_munge_key()
self._stored.munge_key = munge_key
self.unit.status = WaitingStatus("installing slurmctld")
try:
slurm.install()
self.unit.set_workload_version(slurm.version())

self._slurmctld_manager.stop_munged()
self._slurmctld_manager.write_munge_key(munge_key)
self._slurmctld_manager.start_munged()
self._stored.munge_key = self._slurmctld.munge.get_key()

self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_jwt_rsa(jwt_rsa)
self._slurmctld_manager.start_slurmctld()
jwt_rsa = self._legacy_manager.generate_jwt_rsa()
self._stored.jwt_rsa = jwt_rsa
self._legacy_manager.write_jwt_rsa(jwt_rsa)

self.unit.set_workload_version(self._slurmctld_manager.version())
self.slurm_installed = True
else:
self.unit.status = BlockedStatus("Only singleton slurmctld is supported.")
logger.debug("Secondary slurmctld not supported.")
event.defer()
else:
self.unit.status = BlockedStatus("Error installing slurmctld")
logger.error("Cannot install slurmctld, please debug.")
self._slurmctld.enable()
self.slurm_installed = True
except slurm.SlurmOpsError as e:
self.unit.status = BlockedStatus("error installing slurmctld. check log for more info")
logger.error(e.message)
event.defer()
return

self._on_write_slurm_conf(event)

Expand Down Expand Up @@ -234,16 +231,15 @@ def _on_write_slurm_conf(
return

if slurm_config := self._assemble_slurm_conf():
self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_slurm_conf(slurm_config)
self._slurmctld.disable()
self._legacy_manager.write_slurm_conf(slurm_config)

# Write out any user_supplied_cgroup_parameters to /etc/slurm/cgroup.conf.
if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters", ""):
self._slurmctld_manager.write_cgroup_conf(str(user_supplied_cgroup_parameters))
self._legacy_manager.write_cgroup_conf(str(user_supplied_cgroup_parameters))

self._slurmctld_manager.start_slurmctld()

self._slurmctld_manager.slurm_cmd("scontrol", "reconfigure")
self._slurmctld.enable()
scontrol("reconfigure")

# Transitioning Nodes
#
Expand Down Expand Up @@ -354,9 +350,11 @@ def _check_status(self) -> bool: # noqa C901
self.unit.status = BlockedStatus("Error installing slurmctld")
return False

if not self._slurmctld_manager.check_munged():
self.unit.status = BlockedStatus("Error configuring munge key")
return False
# FIXME: Returns false because systemd is looking for the
# `munge.service` file and not the snap one.
# if not self._legacy_manager.check_munged():
# self.unit.status = BlockedStatus("Error configuring munge key")
# return False

self.unit.status = ActiveStatus("")
return True
Expand All @@ -372,8 +370,7 @@ def get_jwt_rsa(self) -> Optional[str]:
def _resume_nodes(self, nodelist: List[str]) -> None:
"""Run scontrol to resume the specified node list."""
nodes = ",".join(nodelist)
update_cmd = f"update nodename={nodes} state=resume"
self._slurmctld_manager.slurm_cmd("scontrol", update_cmd)
scontrol("update", f"nodename={nodes}", "state=resume")

@property
def cluster_name(self) -> str:
Expand All @@ -399,7 +396,7 @@ def new_nodes(self, new_nodes: List[Any]) -> None:
@property
def hostname(self) -> str:
"""Return the hostname."""
return self._slurmctld_manager.hostname
return self._legacy_manager.hostname

@property
def _slurmd_ingress_address(self) -> str:
Expand Down

0 comments on commit d9849ce

Please sign in to comment.