From c98d18664b8f9992c5a32d8d76958b60cd590a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Thu, 11 Jul 2024 13:00:46 -0600 Subject: [PATCH] feat: replace ppa with the Slurm snap --- lib/charms/hpc_libs/v0/slurm_ops.py | 280 ++++++++++++++++++++++++++++ src/charm.py | 11 +- src/constants.py | 35 +--- src/slurmd_ops.py | 131 +++---------- src/templates/override.conf | 3 - src/utils/slurmd.py | 15 -- 6 files changed, 317 insertions(+), 158 deletions(-) create mode 100644 lib/charms/hpc_libs/v0/slurm_ops.py delete mode 100644 src/templates/override.conf diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py new file mode 100644 index 0000000..d49a38a --- /dev/null +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -0,0 +1,280 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Abstractions for managing Slurm operations via snap. + +This library contains the `SlurmManagerBase` and `ServiceType` class +which provide high-level interfaces for managing Slurm within charmed operators. + +### Example Usage + +#### Managing a Slurm service + +The `SlurmManagerBase` constructor receives a `ServiceType` enum. The enum instructs +the inheriting Slurm service manager how to manage its corresponding Slurm service on the host. 
+ +```python3 +import charms.hpc_libs.v0.slurm_ops as slurm +from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType + +class SlurmctldManager(SlurmManagerBase): + # Manage `slurmctld` service on host. + + def __init__(self) -> None: + super().__init__(ServiceType.SLURMCTLD) + + +class ApplicationCharm(CharmBase): + # Application charm that needs to use the Slurm snap. + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self._slurm_manager = SlurmctldManager() + self.framework.observe( + self.on.install, + self._on_install, + ) + + def _on_install(self, _) -> None: + slurm.install() + self.unit.set_workload_version(slurm.version()) + self._slurm_manager.config.set({"cluster-name": "cluster"}) +``` +""" + +__all__ = [ + "format_key", + "install", + "version", + "ConfigurationManager", + "ServiceType", + "SlurmManagerBase", +] + +import json +import logging +import re +import subprocess +from collections.abc import Mapping +from enum import Enum +from typing import Any, Optional + +import yaml + +# The unique Charmhub library identifier, never change it +LIBID = "541fd767f90b40539cf7cd6e7db8fabf" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 3 + +# Charm library dependencies to fetch during `charmcraft pack`. +PYDEPS = ["pyyaml>=6.0.1"] + +_logger = logging.getLogger(__name__) +_acronym = re.compile(r"(?<=[A-Z])(?=[A-Z][a-z])") +_kebabize = re.compile(r"(?<=[a-z0-9])(?=[A-Z])") + + +class SlurmOpsError(Exception): + """Exception raised when a slurm operation failed.""" + + @property + def message(self) -> str: + """Return message passed as argument to exception.""" + return self.args[0] + + +def format_key(key: str) -> str: + """Format Slurm configuration keys from SlurmCASe into kebab case. 
+ + Args: + key: Slurm configuration key to convert to kebab case. + + Notes: + Slurm configuration syntax does not follow proper PascalCasing + format, so we cannot put keys directly through a kebab case converter + to get the desired format. Some additional processing is needed for + certain keys before the key can be properly kebabized. + + For example, without additional preprocessing, the key `CPUs` will + become `cp-us` if put through a kebabizer without being preformatted to `Cpus`. + """ + if "CPUs" in key: + key = key.replace("CPUs", "Cpus") + key = _acronym.sub(r"-", key) + return _kebabize.sub(r"-", key).lower() + + +def install() -> None: + """Install Slurm.""" + # FIXME: Pin slurm to the stable channel + _snap("install", "slurm", "--channel", "latest/candidate", "--classic") + + +def version() -> str: + """Get the current version of Slurm installed on the system.""" + info = yaml.safe_load(_snap("info", "slurm")) + ver: str = info["installed"] + return ver.split(maxsplit=1)[0] + + +def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str: + """Call a command with logging. + + Raises: + SlurmOpsError: Raised if the command fails. + """ + cmd = [cmd, *args] + _logger.debug(f"Executing command {cmd}") + try: + return subprocess.check_output(cmd, input=stdin, stderr=subprocess.PIPE, text=True).strip() + except subprocess.CalledProcessError as e: + _logger.error(f"`{' '.join(cmd)}` failed") + _logger.error(f"stderr: {e.stderr.decode()}") + raise SlurmOpsError(f"command {cmd[0]} failed. Reason:\n{e.stderr.decode()}") + + +def _snap(*args) -> str: + """Control snap via executed `snap ...` commands. + + Raises: + SlurmOpsError: Raised if snap command fails. + """ + return _call("snap", *args) + + +def _mungectl(*args: str, stdin: Optional[str] = None) -> str: + """Control munge via `slurm.mungectl ...`. + + Args: + *args: Arguments to pass to `mungectl`. + stdin: Input to pass to `mungectl` via stdin. 
+ + Raises: + SlurmOpsError: Raised if `mungectl` command fails. + """ + return _call("slurm.mungectl", *args, stdin=stdin) + + +class ServiceType(Enum): + """Type of Slurm service to manage.""" + + MUNGED = "munged" + SLURMD = "slurmd" + SLURMCTLD = "slurmctld" + SLURMDBD = "slurmdbd" + SLURMRESTD = "slurmrestd" + + @property + def config_name(self) -> str: + """Configuration name on the slurm snap for this service type.""" + if self is ServiceType.SLURMCTLD: + return "slurm" + if self is ServiceType.MUNGED: + return "munge" + + return self.value + + +class ServiceManager: + """Control a Slurm service.""" + + def enable(self) -> None: + """Enable service.""" + _snap("start", "--enable", f"slurm.{self._service.value}") + + def disable(self) -> None: + """Disable service.""" + _snap("stop", "--disable", f"slurm.{self._service.value}") + + def restart(self) -> None: + """Restart service.""" + _snap("restart", f"slurm.{self._service.value}") + + +class ConfigurationManager: + """Control configuration of a Slurm component.""" + + def __init__(self, name: str) -> None: + self._name = name + + def get_options(self, *keys: str) -> Mapping[str, Any]: + """Get given configuration values for Slurm component.""" + configs = {} + for key in keys: + config = self.get(key) + target = key.rsplit(".", maxsplit=1)[-1] + configs[target] = config + + return configs + + def get(self, key: Optional[str] = None) -> Any: + """Get specific configuration value for Slurm component.""" + key = f"{self._name}.{key}" if key else self._name + config = json.loads(_snap("get", "-d", "slurm", key)) + return config[key] + + def set(self, config: Mapping[str, Any]) -> None: + """Set configuration for Slurm component.""" + args = [f"{self._name}.{k}={json.dumps(v)}" for k, v in config.items()] + _snap("set", "slurm", *args) + + def unset(self, *keys: str) -> None: + """Unset configuration for Slurm component.""" + args = [f"{self._name}.{k}" for k in keys] if len(keys) > 0 else 
[self._name] + _snap("unset", "slurm", *args) + + +class MungeManager(ServiceManager): + """Manage `munged` service operations.""" + + def __init__(self) -> None: + service = ServiceType.MUNGED + self._service = service + self.config = ConfigurationManager(service.config_name) + + def get_key(self) -> str: + """Get the current munge key. + + Returns: + The current munge key as a base64-encoded string. + """ + return _mungectl("key", "get") + + def set_key(self, key: str) -> None: + """Set a new munge key. + + Args: + key: A new, base64-encoded munge key. + """ + _mungectl("key", "set", stdin=key) + + def generate_key(self) -> None: + """Generate a new, cryptographically secure munge key.""" + _mungectl("key", "generate") + + +class SlurmManagerBase(ServiceManager): + """Base manager for Slurm services.""" + + def __init__(self, service: ServiceType) -> None: + self._service = service + self.config = ConfigurationManager(service.config_name) + self.munge = MungeManager() diff --git a/src/charm.py b/src/charm.py index c004fcf..db9ef43 100755 --- a/src/charm.py +++ b/src/charm.py @@ -82,7 +82,6 @@ def _on_install(self, event: InstallEvent) -> None: if self._slurmd_manager.install(): self.unit.set_workload_version(self._slurmd_manager.version()) - slurmd.override_service() self._systemd_notices.subscribe() self._stored.slurm_installed = True @@ -142,7 +141,7 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: if (slurmctld_host := event.slurmctld_host) != self._stored.slurmctld_host: if slurmctld_host is not None: - slurmd.override_default(slurmctld_host) + self._slurmd_manager.set_conf_server(slurmctld_host) self._stored.slurmctld_host = slurmctld_host logger.debug(f"slurmctld_host={slurmctld_host}") else: @@ -178,7 +177,8 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: else: logger.error("## Unable to restart munge") - slurmd.restart() + self._slurmd_manager._manager.enable() + 
self._slurmd_manager._manager.restart() self._check_status() def _on_slurmctld_unavailable(self, event) -> None: @@ -188,7 +188,7 @@ def _on_slurmctld_unavailable(self, event) -> None: self._stored.nhc_params = "" self._stored.munge_key = "" self._stored.slurmctld_host = "" - slurmd.stop() + self._slurmd_manager._manager.disable() self._check_status() def _on_slurmd_started(self, _: ServiceStartedEvent) -> None: @@ -204,7 +204,8 @@ def _on_node_configured_action(self, _: ActionEvent) -> None: # Trigger reconfiguration of slurmd node. self._new_node = False self._slurmctld.set_node() - slurmd.restart() + self._slurmd_manager._manager.enable() + self._slurmd_manager._manager.restart() logger.debug("### This node is not new anymore") def _on_show_nhc_config(self, event: ActionEvent) -> None: diff --git a/src/constants.py b/src/constants.py index 21b428f..6eec976 100644 --- a/src/constants.py +++ b/src/constants.py @@ -6,37 +6,4 @@ SLURM_USER = "root" SLURM_GROUP = "root" -MUNGE_KEY_PATH = Path("/etc/munge/munge.key") - -UBUNTU_HPC_PPA_KEY = """ ------BEGIN PGP PUBLIC KEY BLOCK----- -Comment: Hostname: -Version: Hockeypuck 2.1.1-10-gec3b0e7 - -xsFNBGTuZb8BEACtJ1CnZe6/hv84DceHv+a54y3Pqq0gqED0xhTKnbj/E2ByJpmT -NlDNkpeITwPAAN1e3824Me76Qn31RkogTMoPJ2o2XfG253RXd67MPxYhfKTJcnM3 -CEkmeI4u2Lynh3O6RQ08nAFS2AGTeFVFH2GPNWrfOsGZW03Jas85TZ0k7LXVHiBs -W6qonbsFJhshvwC3SryG4XYT+z/+35x5fus4rPtMrrEOD65hij7EtQNaE8owuAju -Kcd0m2b+crMXNcllWFWmYMV0VjksQvYD7jwGrWeKs+EeHgU8ZuqaIP4pYHvoQjag -umqnH9Qsaq5NAXiuAIAGDIIV4RdAfQIR4opGaVgIFJdvoSwYe3oh2JlrLPBlyxyY -dayDifd3X8jxq6/oAuyH1h5K/QLs46jLSR8fUbG98SCHlRmvozTuWGk+e07ALtGe -sGv78ToHKwoM2buXaTTHMwYwu7Rx8LZ4bZPHdersN1VW/m9yn1n5hMzwbFKy2s6/ -D4Q2ZBsqlN+5aW2q0IUmO+m0GhcdaDv8U7RVto1cWWPr50HhiCi7Yvei1qZiD9jq -57oYZVqTUNCTPxi6NeTOdEc+YqNynWNArx4PHh38LT0bqKtlZCGHNfoAJLPVYhbB -b2AHj9edYtHU9AAFSIy+HstET6P0UDxy02IeyE2yxoUBqdlXyv6FL44E+wARAQAB -zRxMYXVuY2hwYWQgUFBBIGZvciBVYnVudHUgSFBDwsGOBBMBCgA4FiEErocSHcPk 
-oLD4H/Aj9tDF1ca+s3sFAmTuZb8CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AA -CgkQ9tDF1ca+s3sz3w//RNawsgydrutcbKf0yphDhzWS53wgfrs2KF1KgB0u/H+u -6Kn2C6jrVM0vuY4NKpbEPCduOj21pTCepL6PoCLv++tICOLVok5wY7Zn3WQFq0js -Iy1wO5t3kA1cTD/05v/qQVBGZ2j4DsJo33iMcQS5AjHvSr0nu7XSvDDEE3cQE55D -87vL7lgGjuTOikPh5FpCoS1gpemBfwm2Lbm4P8vGOA4/witRjGgfC1fv1idUnZLM -TbGrDlhVie8pX2kgB6yTYbJ3P3kpC1ZPpXSRWO/cQ8xoYpLBTXOOtqwZZUnxyzHh -gM+hv42vPTOnCo+apD97/VArsp59pDqEVoAtMTk72fdBqR+BB77g2hBkKESgQIEq -EiE1/TOISioMkE0AuUdaJ2ebyQXugSHHuBaqbEC47v8t5DVN5Qr9OriuzCuSDNFn -6SBHpahN9ZNi9w0A/Yh1+lFfpkVw2t04Q2LNuupqOpW+h3/62AeUqjUIAIrmfeML -IDRE2VdquYdIXKuhNvfpJYGdyvx/wAbiAeBWg0uPSepwTfTG59VPQmj0FtalkMnN -ya2212K5q68O5eXOfCnGeMvqIXxqzpdukxSZnLkgk40uFJnJVESd/CxHquqHPUDE -fy6i2AnB3kUI27D4HY2YSlXLSRbjiSxTfVwNCzDsIh7Czefsm6ITK2+cVWs0hNQ= -=cs1s ------END PGP PUBLIC KEY BLOCK----- -""" +SLURM_SNAP = Path("/snap/slurm/current") diff --git a/src/slurmd_ops.py b/src/slurmd_ops.py index dcab81b..b471141 100644 --- a/src/slurmd_ops.py +++ b/src/slurmd_ops.py @@ -7,17 +7,15 @@ import shlex import subprocess import textwrap -from base64 import b64decode from grp import getgrnam from pathlib import Path from pwd import getpwnam from shutil import rmtree from typing import Any, Dict -import charms.operator_libs_linux.v0.apt as apt # type: ignore [import-untyped] +import charms.hpc_libs.v0.slurm_ops as slurm import charms.operator_libs_linux.v1.systemd as systemd # type: ignore [import-untyped] -import distro -from constants import MUNGE_KEY_PATH, SLURM_GROUP, SLURM_USER, UBUNTU_HPC_PPA_KEY +from constants import SLURM_GROUP, SLURM_SNAP, SLURM_USER logger = logging.getLogger() @@ -39,101 +37,22 @@ def __init__(self, msg): pass -class CharmedHPCPackageLifecycleManager: - """Facilitate ubuntu-hpc slurm component package lifecycles.""" - - def __init__(self, package_name: str): - self._package_name = package_name - self._keyring_path = Path(f"/usr/share/keyrings/ubuntu-hpc-{self._package_name}.asc") - - def _repo(self) -> 
apt.DebianRepository: - """Return the ubuntu-hpc repo.""" - ppa_url = "https://ppa.launchpadcontent.net/ubuntu-hpc/slurm-wlm-23.02/ubuntu" - sources_list = f"deb [signed-by={self._keyring_path}] {ppa_url} {distro.codename()} main" - return apt.DebianRepository.from_repo_line(sources_list) - - def install(self) -> bool: - """Install package using lib apt.""" - package_installed = False - - if self._keyring_path.exists(): - self._keyring_path.unlink() - self._keyring_path.write_text(UBUNTU_HPC_PPA_KEY) - - repositories = apt.RepositoryMapping() - repositories.add(self._repo()) - - try: - apt.update() - apt.add_package([self._package_name]) - package_installed = True - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. Reason: {e.message}") - - return package_installed - - def uninstall(self) -> None: - """Uninstall the package using libapt.""" - if apt.remove_package(self._package_name): - logger.info(f"'{self._package_name}' removed from system.") - else: - logger.error(f"'{self._package_name}' not found on system.") - - repositories = apt.RepositoryMapping() - repositories.disable(self._repo()) - - if self._keyring_path.exists(): - self._keyring_path.unlink() - - def upgrade_to_latest(self) -> None: - """Upgrade package to latest.""" - try: - slurm_package = apt.DebianPackage.from_system(self._package_name) - slurm_package.ensure(apt.PackageState.Latest) - logger.info(f"Updated '{self._package_name}' to: {slurm_package.version.number}.") - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. 
Reason: {e.message}") - - def version(self) -> str: - """Return the package version.""" - slurm_package_vers = "" - try: - slurm_package_vers = apt.DebianPackage.from_installed_package( - self._package_name - ).version.number - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found on system.") - return slurm_package_vers - - class SlurmdManager: """SlurmdManager.""" def __init__(self): - self._munge_package = CharmedHPCPackageLifecycleManager("munge") - self._slurmd_package = CharmedHPCPackageLifecycleManager("slurmd") - self._slurm_client_package = CharmedHPCPackageLifecycleManager("slurm-client") + self._manager = slurm.SlurmManagerBase(slurm.ServiceType.SLURMD) def install(self) -> bool: """Install slurmd, slurm-client and munge packages to the system.""" - if self._slurmd_package.install() is not True: - logger.debug("Cannot install 'slurmd' package.") - return False + slurm.install() - systemd.service_stop("slurmd") + self._manager.disable() + self._manager.munge.disable() - if self._munge_package.install() is not True: - logger.debug("Cannot install 'munge' package.") - return False - - systemd.service_stop("munge") + os.symlink("/etc/systemd/system/snap.slurm.slurmd.service", "/etc/systemd/system/slurm.service") - if self._slurm_client_package.install() is not True: - logger.debug("Cannot install 'slurm-client' package.") + if not systemd.daemon_reload(): return False if not self._install_nhc_from_tarball(): @@ -145,19 +64,15 @@ def install(self) -> bool: spool_dir = Path("/var/spool/slurmd") spool_dir.mkdir() - slurm_user_uid, slurm_group_gid = _get_slurm_user_uid_and_slurm_group_gid() - os.chown(f"{spool_dir}", slurm_user_uid, slurm_group_gid) - return True def version(self) -> str: """Return slurm version.""" - return self._slurmd_package.version() + return slurm.version() def write_munge_key(self, munge_key: str) -> None: """Base64 decode and write the munge key.""" - key = b64decode(munge_key.encode()) - 
MUNGE_KEY_PATH.write_bytes(key) + self._manager.munge.set_key(munge_key) def _install_nhc_from_tarball(self) -> bool: """Install NHC from tarball that is packaged with the charm. @@ -271,26 +186,30 @@ def restart_munged(self) -> bool: """ try: logger.debug("## Restarting munge") - systemd.service_restart("munge") - except SlurmdException("Cannot start munge.") as e: # type: ignore [misc] + self._manager.munge.enable() + self._manager.munge.restart() + except slurm.SlurmOpsError as e: # type: ignore [misc] logger.error(e) return False return self.check_munged() def check_munged(self) -> bool: """Check if munge is working correctly.""" - if not systemd.service_running("munge"): + if not systemd.service_running("snap.slurm.munged"): return False # check if munge is working, i.e., can use the credentials correctly try: logger.debug("## Testing if munge is working correctly") - cmd = "munge -n" + cmd = "slurm.munge -n" munge = subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE ) unmunge = subprocess.Popen( - ["unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["slurm.unmunge"], + stdin=munge.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) if munge is not None: munge.stdout.close() # type: ignore [union-attr] @@ -310,7 +229,9 @@ def get_node_config(self) -> Dict[Any, Any]: """Return the node configuration options as reported by slurmd -C.""" slurmd_config_options = "" try: - slurmd_config_options = subprocess.check_output(["slurmd", "-C"], text=True).strip() + slurmd_config_options = subprocess.check_output( + [SLURM_SNAP / "sbin" / "slurmd", "-C"], text=True + ).strip() except subprocess.CalledProcessError as e: logger.error(e) raise e @@ -327,3 +248,11 @@ def get_node_config(self) -> Dict[Any, Any]: raise e return slurmd_config_options_parsed + + def set_conf_server(self, server: str) -> None: + """Set the config server that provides the config file. 
+ + Args: + server: Server hostname of the slurmctld service. + """ + self._manager.config.set({"config-server": server}) diff --git a/src/templates/override.conf b/src/templates/override.conf deleted file mode 100644 index d880806..0000000 --- a/src/templates/override.conf +++ /dev/null @@ -1,3 +0,0 @@ -[Service] -LimitMEMLOCK=infinity -LimitNOFILE=1048576 diff --git a/src/utils/slurmd.py b/src/utils/slurmd.py index ab5ed2b..9a0c5e7 100644 --- a/src/utils/slurmd.py +++ b/src/utils/slurmd.py @@ -32,21 +32,6 @@ _logger = logging.getLogger(__name__) -def start() -> None: - """Start slurmd service.""" - systemd.service_start("slurmd") - - -def stop() -> None: - """Stop slurmd service.""" - systemd.service_stop("slurmd") - - -def restart() -> None: - """Restart slurmd service.""" - systemd.service_restart("slurmd") - - def override_default(host: str) -> None: """Override the /etc/default/slurmd file.