Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
feat: replace ppa with the Slurm snap
Browse files Browse the repository at this point in the history
  • Loading branch information
jedel1043 committed Jul 11, 2024
1 parent 70a61a9 commit c98d186
Show file tree
Hide file tree
Showing 6 changed files with 317 additions and 158 deletions.
280 changes: 280 additions & 0 deletions lib/charms/hpc_libs/v0/slurm_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
# Copyright 2024 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Abstractions for managing Slurm operations via snap.

This library contains the `SlurmManagerBase` and `ServiceType` classes
which provide high-level interfaces for managing Slurm within charmed operators.

### Example Usage

#### Managing a Slurm service

The `SlurmManagerBase` constructor receives a `ServiceType` enum. The enum instructs
the inheriting Slurm service manager how to manage its corresponding Slurm service on the host.

```python3
import charms.hpc_libs.v0.slurm_ops as slurm
from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType

class SlurmctldManager(SlurmManagerBase):
    # Manage `slurmctld` service on host.

    def __init__(self) -> None:
        super().__init__(ServiceType.SLURMCTLD)

class ApplicationCharm(CharmBase):
    # Application charm that needs to use the Slurm snap.

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self._slurm_manager = SlurmctldManager()
        self.framework.observe(
            self.on.install,
            self._on_install,
        )

    def _on_install(self, _) -> None:
        slurm.install()
        self.unit.set_workload_version(slurm.version())
        self._slurm_manager.config.set({"cluster-name": "cluster"})
```
"""

# Public API of this library. `SlurmOpsError` is exported because `_call`
# raises it when a command fails, so callers need the name to catch it.
__all__ = [
    "format_key",
    "install",
    "version",
    "ConfigurationManager",
    "ServiceType",
    "SlurmManagerBase",
    "SlurmOpsError",
]

import json
import logging
import re
import subprocess
from collections.abc import Mapping
from enum import Enum
from typing import Any, Optional

import yaml

# The unique Charmhub library identifier, never change it
LIBID = "541fd767f90b40539cf7cd6e7db8fabf"

# Increment this major API version when introducing breaking changes
LIBAPI = 0

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 3

# Charm library dependencies to fetch during `charmcraft pack`.
PYDEPS = ["pyyaml>=6.0.1"]

# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
# Zero-width match between an upper-case letter and an upper-case letter that
# is itself followed by a lower-case letter, e.g. between `P` and `T` in
# `TCPTimeout`. `format_key` inserts a hyphen at these positions.
_acronym = re.compile(r"(?<=[A-Z])(?=[A-Z][a-z])")
# Zero-width match between a lower-case letter or digit and an upper-case
# letter, e.g. between `d` and `H` in `SlurmctldHost`. `format_key` inserts a
# hyphen at these positions.
_kebabize = re.compile(r"(?<=[a-z0-9])(?=[A-Z])")


class SlurmOpsError(Exception):
    """Exception raised when a slurm operation failed."""

    @property
    def message(self) -> str:
        """Return message passed as argument to exception.

        Returns:
            The first positional argument given to the exception, or an empty
            string if the exception was created without arguments.
        """
        # `args` is empty for a bare `SlurmOpsError()`; indexing it
        # unconditionally would raise IndexError while the caller is already
        # handling a failure.
        return self.args[0] if self.args else ""


def format_key(key: str) -> str:
    """Convert a Slurm configuration key from Slurm's casing into kebab case.

    Args:
        key: Slurm configuration key to convert to kebab case.

    Returns:
        The lower-cased, hyphen-separated form of `key`.

    Notes:
        Slurm configuration keys do not follow strict PascalCase, so they
        cannot be passed directly through a kebab case converter. Some keys
        need preprocessing first. For example, `CPUs` would become `cp-us`
        unless it is rewritten to `Cpus` before being kebabized.
    """
    # `str.replace` returns the key unchanged when `CPUs` does not occur.
    normalized = key.replace("CPUs", "Cpus")
    # Separate a run of capitals from the capitalized word that follows it.
    with_acronyms_split = _acronym.sub(r"-", normalized)
    # Separate every lower-case/digit to upper-case transition.
    hyphenated = _kebabize.sub(r"-", with_acronyms_split)
    return hyphenated.lower()


def install() -> None:
    """Install the `slurm` snap by running `snap install` with `--classic`."""
    # FIXME: Pin slurm to the stable channel
    channel = "latest/candidate"
    _snap("install", "slurm", "--channel", channel, "--classic")


def version() -> str:
    """Get the current version of Slurm installed on the system.

    Returns:
        The first whitespace-separated token of the `installed` field in the
        YAML printed by `snap info slurm`.

    Raises:
        SlurmOpsError: Raised if `snap info slurm` fails, or if its output
            does not contain an `installed` field.
    """
    info = yaml.safe_load(_snap("info", "slurm"))
    # NOTE(review): `installed` is presumably absent when the snap has not
    # been installed yet - confirm against `snap info` output. Report that as
    # the library's own error instead of a bare KeyError/TypeError.
    installed = info.get("installed") if isinstance(info, dict) else None
    if not installed:
        raise SlurmOpsError(
            "unable to retrieve snap info. ensure slurm is correctly installed"
        )
    return str(installed).split(maxsplit=1)[0]


def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str:
    """Call a command with logging.

    Args:
        cmd: Executable to run.
        *args: Arguments to pass to the executable.
        stdin: Optional text to send to the command's standard input.

    Returns:
        The command's standard output with surrounding whitespace stripped.

    Raises:
        SlurmOpsError: Raised if the command fails.
    """
    # Use a separate name so the `str` parameter is not rebound to a list.
    command = [cmd, *args]
    _logger.debug("Executing command %s", command)
    try:
        output = subprocess.check_output(
            command, input=stdin, stderr=subprocess.PIPE, text=True
        )
    except subprocess.CalledProcessError as e:
        # `text=True` means `e.stderr` is already a `str`; calling `.decode()`
        # on it raised AttributeError here and hid the real failure.
        stderr = e.stderr or ""
        _logger.error("`%s` failed", " ".join(command))
        _logger.error("stderr: %s", stderr)
        # Chain the original exception so the return code stays reachable.
        raise SlurmOpsError(f"command {command[0]} failed. Reason:\n{stderr}") from e
    return output.strip()


def _snap(*args: str) -> str:
    """Control snap via executed `snap ...` commands.

    Args:
        *args: Arguments to pass to `snap`.

    Returns:
        The value returned by `_call` for the `snap` invocation.

    Raises:
        SlurmOpsError: Raised if snap command fails.
    """
    return _call("snap", *args)


def _mungectl(*args: str, stdin: Optional[str] = None) -> str:
    """Control munge via `slurm.mungectl ...`.

    Args:
        *args: Arguments to pass to `mungectl`.
        stdin: Input to pass to `mungectl` via stdin.

    Returns:
        The value returned by `_call` for the `slurm.mungectl` invocation.

    Raises:
        SlurmOpsError: Raised if `mungectl` command fails.
    """
    return _call("slurm.mungectl", *args, stdin=stdin)


class ServiceType(Enum):
    """Enumeration of the Slurm services this library can manage."""

    MUNGED = "munged"
    SLURMD = "slurmd"
    SLURMCTLD = "slurmctld"
    SLURMDBD = "slurmdbd"
    SLURMRESTD = "slurmrestd"

    @property
    def config_name(self) -> str:
        """Configuration name on the slurm snap for this service type.

        Returns:
            `"slurm"` for `SLURMCTLD`, `"munge"` for `MUNGED`, and the member's
            own value for every other service.
        """
        # Only these two services use a configuration name that differs from
        # their service name.
        renamed = {"slurmctld": "slurm", "munged": "munge"}
        return renamed.get(self.value, self.value)


class ServiceManager:
    """Start, stop and restart a single service of the slurm snap.

    The methods read `self._service`, which this class does not assign itself;
    `MungeManager` and `SlurmManagerBase` set it in their constructors.
    """

    def enable(self) -> None:
        """Run `snap start --enable` for the managed service."""
        target = f"slurm.{self._service.value}"
        _snap("start", "--enable", target)

    def disable(self) -> None:
        """Run `snap stop --disable` for the managed service."""
        target = f"slurm.{self._service.value}"
        _snap("stop", "--disable", target)

    def restart(self) -> None:
        """Run `snap restart` for the managed service."""
        target = f"slurm.{self._service.value}"
        _snap("restart", target)


class ConfigurationManager:
    """Read and write the snap configuration of one Slurm component."""

    def __init__(self, name: str) -> None:
        # Prefix under which this component's options live in the snap config.
        self._name = name

    def get_options(self, *keys: str) -> Mapping[str, Any]:
        """Get several configuration values for the Slurm component.

        Args:
            *keys: Option keys, relative to the component name.

        Returns:
            A mapping from the last dot-separated segment of each key to the
            value returned by `get` for that key.
        """
        return {key.rsplit(".", maxsplit=1)[-1]: self.get(key) for key in keys}

    def get(self, key: Optional[str] = None) -> Any:
        """Get one configuration value, or the component's whole configuration.

        Args:
            key: Option key relative to the component name. When omitted or
                empty, the component name itself is queried.

        Returns:
            The entry stored under the fully qualified key in the JSON document
            printed by `snap get -d slurm <key>`.
        """
        qualified = self._name if not key else f"{self._name}.{key}"
        document = json.loads(_snap("get", "-d", "slurm", qualified))
        return document[qualified]

    def set(self, config: Mapping[str, Any]) -> None:
        """Set configuration for the Slurm component.

        Args:
            config: Mapping of option keys to values. Each value is serialized
                with `json.dumps` before being passed to `snap set`.
        """
        assignments = []
        for option, value in config.items():
            assignments.append(f"{self._name}.{option}={json.dumps(value)}")
        _snap("set", "slurm", *assignments)

    def unset(self, *keys: str) -> None:
        """Unset configuration for the Slurm component.

        Args:
            *keys: Option keys to unset. When none are given, the component
                name itself is passed to `snap unset`.
        """
        if keys:
            targets = [f"{self._name}.{k}" for k in keys]
        else:
            targets = [self._name]
        _snap("unset", "slurm", *targets)


class MungeManager(ServiceManager):
    """Service and key management for `munged`."""

    def __init__(self) -> None:
        self._service = ServiceType.MUNGED
        self.config = ConfigurationManager(self._service.config_name)

    def get_key(self) -> str:
        """Get the current munge key.

        Returns:
            The current munge key as a base64-encoded string.
        """
        current_key = _mungectl("key", "get")
        return current_key

    def set_key(self, key: str) -> None:
        """Set a new munge key.

        Args:
            key: A new, base64-encoded munge key. It is passed to `mungectl`
                through stdin.
        """
        _mungectl("key", "set", stdin=key)

    def generate_key(self) -> None:
        """Generate a new, cryptographically secure munge key."""
        _mungectl("key", "generate")


class SlurmManagerBase(ServiceManager):
    """Common base for managers of individual Slurm services.

    Instances expose `config`, a `ConfigurationManager` for the given service,
    and `munge`, a `MungeManager`.
    """

    def __init__(self, service: ServiceType) -> None:
        config_name = service.config_name
        self._service = service
        self.config = ConfigurationManager(config_name)
        self.munge = MungeManager()
11 changes: 6 additions & 5 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def _on_install(self, event: InstallEvent) -> None:

if self._slurmd_manager.install():
self.unit.set_workload_version(self._slurmd_manager.version())
slurmd.override_service()
self._systemd_notices.subscribe()

self._stored.slurm_installed = True
Expand Down Expand Up @@ -142,7 +141,7 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None:

if (slurmctld_host := event.slurmctld_host) != self._stored.slurmctld_host:
if slurmctld_host is not None:
slurmd.override_default(slurmctld_host)
self._slurmd_manager.set_conf_server(slurmctld_host)
self._stored.slurmctld_host = slurmctld_host
logger.debug(f"slurmctld_host={slurmctld_host}")
else:
Expand Down Expand Up @@ -178,7 +177,8 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None:
else:
logger.error("## Unable to restart munge")

slurmd.restart()
self._slurmd_manager._manager.enable()
self._slurmd_manager._manager.restart()
self._check_status()

def _on_slurmctld_unavailable(self, event) -> None:
Expand All @@ -188,7 +188,7 @@ def _on_slurmctld_unavailable(self, event) -> None:
self._stored.nhc_params = ""
self._stored.munge_key = ""
self._stored.slurmctld_host = ""
slurmd.stop()
self._slurmd_manager._manager.disable()
self._check_status()

def _on_slurmd_started(self, _: ServiceStartedEvent) -> None:
Expand All @@ -204,7 +204,8 @@ def _on_node_configured_action(self, _: ActionEvent) -> None:
# Trigger reconfiguration of slurmd node.
self._new_node = False
self._slurmctld.set_node()
slurmd.restart()
self._slurmd_manager._manager.enable()
self._slurmd_manager._manager.restart()
logger.debug("### This node is not new anymore")

def _on_show_nhc_config(self, event: ActionEvent) -> None:
Expand Down
35 changes: 1 addition & 34 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,4 @@
SLURM_USER = "root"
SLURM_GROUP = "root"

MUNGE_KEY_PATH = Path("/etc/munge/munge.key")

UBUNTU_HPC_PPA_KEY = """
-----BEGIN PGP PUBLIC KEY BLOCK-----
Comment: Hostname:
Version: Hockeypuck 2.1.1-10-gec3b0e7
xsFNBGTuZb8BEACtJ1CnZe6/hv84DceHv+a54y3Pqq0gqED0xhTKnbj/E2ByJpmT
NlDNkpeITwPAAN1e3824Me76Qn31RkogTMoPJ2o2XfG253RXd67MPxYhfKTJcnM3
CEkmeI4u2Lynh3O6RQ08nAFS2AGTeFVFH2GPNWrfOsGZW03Jas85TZ0k7LXVHiBs
W6qonbsFJhshvwC3SryG4XYT+z/+35x5fus4rPtMrrEOD65hij7EtQNaE8owuAju
Kcd0m2b+crMXNcllWFWmYMV0VjksQvYD7jwGrWeKs+EeHgU8ZuqaIP4pYHvoQjag
umqnH9Qsaq5NAXiuAIAGDIIV4RdAfQIR4opGaVgIFJdvoSwYe3oh2JlrLPBlyxyY
dayDifd3X8jxq6/oAuyH1h5K/QLs46jLSR8fUbG98SCHlRmvozTuWGk+e07ALtGe
sGv78ToHKwoM2buXaTTHMwYwu7Rx8LZ4bZPHdersN1VW/m9yn1n5hMzwbFKy2s6/
D4Q2ZBsqlN+5aW2q0IUmO+m0GhcdaDv8U7RVto1cWWPr50HhiCi7Yvei1qZiD9jq
57oYZVqTUNCTPxi6NeTOdEc+YqNynWNArx4PHh38LT0bqKtlZCGHNfoAJLPVYhbB
b2AHj9edYtHU9AAFSIy+HstET6P0UDxy02IeyE2yxoUBqdlXyv6FL44E+wARAQAB
zRxMYXVuY2hwYWQgUFBBIGZvciBVYnVudHUgSFBDwsGOBBMBCgA4FiEErocSHcPk
oLD4H/Aj9tDF1ca+s3sFAmTuZb8CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AA
CgkQ9tDF1ca+s3sz3w//RNawsgydrutcbKf0yphDhzWS53wgfrs2KF1KgB0u/H+u
6Kn2C6jrVM0vuY4NKpbEPCduOj21pTCepL6PoCLv++tICOLVok5wY7Zn3WQFq0js
Iy1wO5t3kA1cTD/05v/qQVBGZ2j4DsJo33iMcQS5AjHvSr0nu7XSvDDEE3cQE55D
87vL7lgGjuTOikPh5FpCoS1gpemBfwm2Lbm4P8vGOA4/witRjGgfC1fv1idUnZLM
TbGrDlhVie8pX2kgB6yTYbJ3P3kpC1ZPpXSRWO/cQ8xoYpLBTXOOtqwZZUnxyzHh
gM+hv42vPTOnCo+apD97/VArsp59pDqEVoAtMTk72fdBqR+BB77g2hBkKESgQIEq
EiE1/TOISioMkE0AuUdaJ2ebyQXugSHHuBaqbEC47v8t5DVN5Qr9OriuzCuSDNFn
6SBHpahN9ZNi9w0A/Yh1+lFfpkVw2t04Q2LNuupqOpW+h3/62AeUqjUIAIrmfeML
IDRE2VdquYdIXKuhNvfpJYGdyvx/wAbiAeBWg0uPSepwTfTG59VPQmj0FtalkMnN
ya2212K5q68O5eXOfCnGeMvqIXxqzpdukxSZnLkgk40uFJnJVESd/CxHquqHPUDE
fy6i2AnB3kUI27D4HY2YSlXLSRbjiSxTfVwNCzDsIh7Czefsm6ITK2+cVWs0hNQ=
=cs1s
-----END PGP PUBLIC KEY BLOCK-----
"""
SLURM_SNAP = Path("/snap/slurm/current")
Loading

0 comments on commit c98d186

Please sign in to comment.