Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
pr feedback integration
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbeedy committed Jun 25, 2024
1 parent 6005ab0 commit ccefec7
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 470 deletions.
91 changes: 49 additions & 42 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import subprocess
from typing import Any, Dict, List, Optional, Union

from charms.fluentbit.v0.fluentbit import FluentbitClient # type: ignore
from charms.fluentbit.v0.fluentbit import FluentbitClient
from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, FLUENTBIT_CONFIG, SLURM_CONF_PATH
from interface_slurmd import (
PartitionAvailableEvent,
PartitionUnavailableEvent,
Expand Down Expand Up @@ -40,7 +41,7 @@
main,
)
from slurm_conf_editor import slurm_conf_as_string
from slurmctld_ops import SlurmctldManager
from slurmctld_ops import SlurmctldManager, is_container

logger = logging.getLogger()

Expand All @@ -65,13 +66,9 @@ def __init__(self, *args):
user_supplied_slurm_conf_params=str(),
)

# Fluentbit relation
self._fluentbit = FluentbitClient(self, "fluentbit")

# SlurmctldManager
self._slurmctld_manager = SlurmctldManager(self, "slurmctld")
self._slurmctld_manager = SlurmctldManager()

# Slurm components
self._fluentbit = FluentbitClient(self, "fluentbit")
self._slurmd = Slurmd(self, "slurmd")
self._slurmdbd = Slurmdbd(self, "slurmdbd")
self._slurmrestd = Slurmrestd(self, "slurmrestd")
Expand All @@ -91,7 +88,6 @@ def __init__(self, *args):
self._slurmd.on.slurmd_departed: self._on_write_slurm_conf,
# slurmrestd available
self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available,
# NOTE: a second slurmctld should get the jwt/munge keys and configure them
# fluentbit
self.on["fluentbit"].relation_created: self._on_fluentbit_relation_created,
# actions
Expand All @@ -111,29 +107,43 @@ def _on_install(self, event: InstallEvent) -> None:
# Store the munge_key and jwt_rsa key in the stored state.
# NOTE: Use secrets instead of stored state when secrets are supported by the framework.
if self.model.unit.is_leader():
self._stored.jwt_rsa = self._slurmctld_manager.generate_jwt_rsa()
self._stored.munge_key = self._slurmctld_manager.get_munge_key()
self._slurmctld_manager.write_jwt_rsa(self.get_jwt_rsa())
self._slurmctld_manager.restart_munged()
jwt_rsa = self._slurmctld_manager.generate_jwt_rsa()
self._stored.jwt_rsa = jwt_rsa

munge_key = self._slurmctld_manager.generate_munge_key()
self._stored.munge_key = munge_key

self._slurmctld_manager.stop_munged()
self._slurmctld_manager.write_munge_key(munge_key)
self._slurmctld_manager.start_munged()

self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_jwt_rsa(jwt_rsa)
self._slurmctld_manager.start_slurmctld()

self.unit.set_workload_version(self._slurmctld_manager.version())
self.slurm_installed = True
else:
self.unit.status = BlockedStatus("Only singleton slurmctld is supported.")
logger.debug("Secondary slurmctld not supported.")
event.defer()
else:
self.unit.status = BlockedStatus("Error installing slurmctld")
logger.error("Cannot install slurmctld, please debug.")
event.defer()

self._on_write_slurm_conf(event)

def _on_config_changed(self, event: ConfigChangedEvent) -> None:
"""Perform config-changed operations."""
charm_config_nhc_params = self.config.get("health-check-params")
if (user_supplied_nhc_params := charm_config_nhc_params) != self._stored.nhc_params:
charm_config_nhc_params = str(self.config.get("health-check-params", ""))
if (charm_config_nhc_params != self._stored.nhc_params) and (
charm_config_nhc_params != ""
):
logger.debug("## NHC user supplied params changed, sending to slurmd.")
self._stored.nhc_params = user_supplied_nhc_params
self._stored.nhc_params = charm_config_nhc_params
# Send the custom NHC parameters to all slurmd.
self._slurmd.set_nhc_params(user_supplied_nhc_params)
self._slurmd.set_nhc_params(charm_config_nhc_params)

write_slurm_conf = False
if charm_config_default_partition := self.config.get("default-partition"):
Expand All @@ -160,15 +170,13 @@ def _on_update_status(self, event: UpdateStatusEvent) -> None:

def _on_show_current_config_action(self, event: ActionEvent) -> None:
"""Show current slurm.conf."""
slurm_conf = self._slurmctld_manager.slurm_conf_path.read_text()
slurm_conf = SLURM_CONF_PATH.read_text()
event.set_results({"slurm.conf": slurm_conf})

def _on_fluentbit_relation_created(self, event: RelationCreatedEvent) -> None:
"""Set up Fluentbit log forwarding."""
logger.debug("## Configuring fluentbit")
cfg = []
cfg.extend(self._slurmctld_manager.fluentbit_config_slurm)
self._fluentbit.configure(cfg)
self._fluentbit.configure(FLUENTBIT_CONFIG)

def _on_slurmrestd_available(self, event: SlurmrestdAvailableEvent) -> None:
"""Check that we have slurm_config when slurmrestd available otherwise defer the event."""
Expand Down Expand Up @@ -241,16 +249,15 @@ def _on_write_slurm_conf(
return

if slurm_config := self._assemble_slurm_conf():
self._slurmctld_manager.stop_slurmctld()
self._slurmctld_manager.write_slurm_conf(slurm_config)

# Write out any user_supplied_cgroup_parameters to /etc/slurm/cgroup.conf.
if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters"):
self._slurmctld_manager.write_cgroup_conf(user_supplied_cgroup_parameters)
if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters", ""):
self._slurmctld_manager.write_cgroup_conf(str(user_supplied_cgroup_parameters))

self._slurmctld_manager.start_slurmctld()

# Restart is needed if nodes are added/removed from the cluster, but since we don't
# currently have a method of identifying if nodes are being added or removed, simply
# restart every time after writing slurm.conf.
self._slurmctld_manager.restart_slurmctld()
self._slurmctld_manager.slurm_cmd("scontrol", "reconfigure")

# Transitioning Nodes
Expand Down Expand Up @@ -281,29 +288,28 @@ def _on_write_slurm_conf(
logger.debug("## Should write slurm.conf, but we don't have it. " "Deferring.")
event.defer()

def _assemble_slurm_conf(self) -> Dict[Any, Any]:
def _assemble_slurm_conf(self) -> Dict[str, Any]:
"""Return the slurm.conf parameters."""
slurmctld_manager = self._slurmctld_manager

charm_maintained_parameters = slurmctld_manager.charm_maintained_slurm_conf_parameters()
user_supplied_parameters = self._get_user_supplied_parameters()

slurmd_parameters = self._slurmd.get_new_nodes_and_nodes_and_partitions()

def _assemble_slurmctld_parameters() -> str:
# Preprocess merging slurmctld_parameters if they exist in the context
slurmctld_param_config = charm_maintained_parameters["SlurmctldParameters"].split(",")
if user_supplied_slurmctld_parameters := user_supplied_parameters.get(
slurmctld_param_config = CHARM_MAINTAINED_SLURM_CONF_PARAMETERS[
"SlurmctldParameters"
):
slurmctld_param_config = list(
set(
slurmctld_param_config.extend(
user_supplied_slurmctld_parameters.split(",")
)
)
].split(",")
user_config = []

if (
user_supplied_slurmctld_parameters := user_supplied_parameters.get(
"SlurmctldParameters", ""
)
return ",".join(slurmctld_param_config)
!= ""
):
user_config.extend(user_supplied_slurmctld_parameters.split(","))

return ",".join(slurmctld_param_config + user_config)

accounting_params = {}
if (slurmdbd_host := self._stored.slurmdbd_host) != "":
Expand All @@ -319,8 +325,9 @@ def _assemble_slurmctld_parameters() -> str:
"SlurmctldAddr": self._slurmd_ingress_address,
"SlurmctldHost": self.hostname,
"SlurmctldParameters": _assemble_slurmctld_parameters(),
"ProctrackType": "proctrack/linuxproc" if is_container() else "proctrack/cgroup",
**accounting_params,
**charm_maintained_parameters,
**CHARM_MAINTAINED_SLURM_CONF_PARAMETERS,
**slurmd_parameters,
**user_supplied_parameters,
}
Expand Down
99 changes: 99 additions & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2024 Omnivector, LLC.
# See LICENSE file for licensing details.
"""This module provides constants for the slurmctld-operator charm."""
from pathlib import Path

# Location of the slurm.conf file.
SLURM_CONF_PATH = Path("/etc/slurm/slurm.conf")
# System user and group names used for slurm.
SLURM_USER = "slurm"
SLURM_GROUP = "slurm"

# slurm.conf parameters that the charm itself owns. Insertion order is kept
# exactly as before in case anything downstream depends on it.
CHARM_MAINTAINED_SLURM_CONF_PARAMETERS = {
    # The key file belongs inside the /var/spool/slurmctld directory (the same
    # directory used for StateSaveLocation below); the previous value was
    # missing the path separator and pointed at
    # /var/spool/slurmctldjwt_hs256.key instead.
    # NOTE(review): confirm this matches the path the key is written to.
    "AuthAltParameters": "jwt_key=/var/spool/slurmctld/jwt_hs256.key",
    "AuthAltTypes": "auth/jwt",
    "AuthInfo": "/var/run/munge/munge.socket.2",
    "AuthType": "auth/munge",
    "GresTypes": "gpu",
    "HealthCheckInterval": "600",
    "HealthCheckNodeState": "ANY,CYCLE",
    "HealthCheckProgram": "/usr/sbin/omni-nhc-wrapper",
    "MailProg": "/usr/bin/mail.mailutils",
    # NOTE(review): this plugin path is architecture specific (x86_64).
    "PluginDir": "/usr/lib/x86_64-linux-gnu/slurm-wlm",
    "PlugStackConfig": "/etc/slurm/plugstack.conf.d/plugstack.conf",
    "SelectType": "select/cons_tres",
    "SlurmctldPort": "6817",
    "SlurmdPort": "6818",
    "StateSaveLocation": "/var/spool/slurmctld",
    "SlurmdSpoolDir": "/var/spool/slurmd",
    "SlurmctldParameters": "enable_configless",
    "SlurmctldLogFile": "/var/log/slurm/slurmctld.log",
    # slurmd gets its own log file; the previous value duplicated the
    # slurmctld log path (copy-paste slip).
    "SlurmdLogFile": "/var/log/slurm/slurmd.log",
    "SlurmdPidFile": "/var/run/slurmd.pid",
    "SlurmctldPidFile": "/var/run/slurmctld.pid",
    "SlurmUser": SLURM_USER,
    "SlurmdUser": "root",
    # The inner double quotes are part of the value, so the command and its
    # argument stay together as one quoted string.
    "RebootProgram": '"/usr/sbin/reboot --reboot"',
}


# Fluent Bit configuration for forwarding the slurmctld log, expressed as a
# list of single-key sections. Each section maps its name to an ordered list of
# (key, value) pairs; pairs are used instead of a dict so that a key such as
# "record" can appear more than once within a section.

# Tail the slurmctld log file, tag its records "slurmctld" and run them through
# the "slurm" parser.
_SLURMCTLD_TAIL_INPUT = [
    ("name", "tail"),
    ("path", "/var/log/slurm/slurmctld.log"),
    ("path_key", "filename"),
    ("tag", "slurmctld"),
    ("parser", "slurm"),
]

# Regex parser: a leading "[...]" is captured as `time`, the remainder of the
# line as `log`.
_SLURM_LOG_PARSER = [
    ("name", "slurm"),
    ("format", "regex"),
    ("regex", r"^\[(?<time>[^\]]*)\] (?<log>.*)$"),
    ("time_key", "time"),
    ("time_format", "%Y-%m-%dT%H:%M:%S.%L"),
]

# Add hostname and service fields to every record tagged "slurmctld".
_SLURMCTLD_RECORD_FILTER = [
    ("name", "record_modifier"),
    ("match", "slurmctld"),
    ("record", "hostname ${HOSTNAME}"),
    ("record", "service slurmctld"),
]

FLUENTBIT_CONFIG = [
    {"input": _SLURMCTLD_TAIL_INPUT},
    {"parser": _SLURM_LOG_PARSER},
    {"filter": _SLURMCTLD_RECORD_FILTER},
]


# ASCII-armored OpenPGP public key block. The embedded user id in the key
# material reads "Launchpad PPA for Ubuntu HPC".
#
# The empty line after the "Version:" header is required: the OpenPGP armor
# format separates the armor headers from the base64 body with a blank line,
# and strict parsers reject a block without it. The key material itself is
# unchanged.
UBUNTU_HPC_PPA_KEY = """
-----BEGIN PGP PUBLIC KEY BLOCK-----
Comment: Hostname:
Version: Hockeypuck 2.1.1-10-gec3b0e7

xsFNBGTuZb8BEACtJ1CnZe6/hv84DceHv+a54y3Pqq0gqED0xhTKnbj/E2ByJpmT
NlDNkpeITwPAAN1e3824Me76Qn31RkogTMoPJ2o2XfG253RXd67MPxYhfKTJcnM3
CEkmeI4u2Lynh3O6RQ08nAFS2AGTeFVFH2GPNWrfOsGZW03Jas85TZ0k7LXVHiBs
W6qonbsFJhshvwC3SryG4XYT+z/+35x5fus4rPtMrrEOD65hij7EtQNaE8owuAju
Kcd0m2b+crMXNcllWFWmYMV0VjksQvYD7jwGrWeKs+EeHgU8ZuqaIP4pYHvoQjag
umqnH9Qsaq5NAXiuAIAGDIIV4RdAfQIR4opGaVgIFJdvoSwYe3oh2JlrLPBlyxyY
dayDifd3X8jxq6/oAuyH1h5K/QLs46jLSR8fUbG98SCHlRmvozTuWGk+e07ALtGe
sGv78ToHKwoM2buXaTTHMwYwu7Rx8LZ4bZPHdersN1VW/m9yn1n5hMzwbFKy2s6/
D4Q2ZBsqlN+5aW2q0IUmO+m0GhcdaDv8U7RVto1cWWPr50HhiCi7Yvei1qZiD9jq
57oYZVqTUNCTPxi6NeTOdEc+YqNynWNArx4PHh38LT0bqKtlZCGHNfoAJLPVYhbB
b2AHj9edYtHU9AAFSIy+HstET6P0UDxy02IeyE2yxoUBqdlXyv6FL44E+wARAQAB
zRxMYXVuY2hwYWQgUFBBIGZvciBVYnVudHUgSFBDwsGOBBMBCgA4FiEErocSHcPk
oLD4H/Aj9tDF1ca+s3sFAmTuZb8CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AA
CgkQ9tDF1ca+s3sz3w//RNawsgydrutcbKf0yphDhzWS53wgfrs2KF1KgB0u/H+u
6Kn2C6jrVM0vuY4NKpbEPCduOj21pTCepL6PoCLv++tICOLVok5wY7Zn3WQFq0js
Iy1wO5t3kA1cTD/05v/qQVBGZ2j4DsJo33iMcQS5AjHvSr0nu7XSvDDEE3cQE55D
87vL7lgGjuTOikPh5FpCoS1gpemBfwm2Lbm4P8vGOA4/witRjGgfC1fv1idUnZLM
TbGrDlhVie8pX2kgB6yTYbJ3P3kpC1ZPpXSRWO/cQ8xoYpLBTXOOtqwZZUnxyzHh
gM+hv42vPTOnCo+apD97/VArsp59pDqEVoAtMTk72fdBqR+BB77g2hBkKESgQIEq
EiE1/TOISioMkE0AuUdaJ2ebyQXugSHHuBaqbEC47v8t5DVN5Qr9OriuzCuSDNFn
6SBHpahN9ZNi9w0A/Yh1+lFfpkVw2t04Q2LNuupqOpW+h3/62AeUqjUIAIrmfeML
IDRE2VdquYdIXKuhNvfpJYGdyvx/wAbiAeBWg0uPSepwTfTG59VPQmj0FtalkMnN
ya2212K5q68O5eXOfCnGeMvqIXxqzpdukxSZnLkgk40uFJnJVESd/CxHquqHPUDE
fy6i2AnB3kUI27D4HY2YSlXLSRbjiSxTfVwNCzDsIh7Czefsm6ITK2+cVWs0hNQ=
=cs1s
-----END PGP PUBLIC KEY BLOCK-----
"""
18 changes: 9 additions & 9 deletions src/interface_slurmd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""Interface slurmd."""
"""Slurmctld interface to slurmd."""

import json
import logging
from typing import Any, Dict
Expand Down Expand Up @@ -36,8 +36,8 @@ class SlurmdDepartedEvent(EventBase):
"""Emitted when one slurmd departs."""


class PartitionInventoryEvents(ObjectEvents):
"""SlurmClusterProviderRelationEvents."""
class Events(ObjectEvents):
"""Slurmd interface events."""

partition_available = EventSource(PartitionAvailableEvent)
partition_unavailable = EventSource(PartitionUnavailableEvent)
Expand All @@ -48,7 +48,7 @@ class PartitionInventoryEvents(ObjectEvents):
class Slurmd(Object):
"""Slurmd inventory interface."""

on = PartitionInventoryEvents()
on = Events() # pyright: ignore [reportIncompatibleMethodOverride, reportAssignmentType]

def __init__(self, charm, relation_name):
"""Set self._relation_name and self.charm."""
Expand Down Expand Up @@ -122,7 +122,7 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
raise (e)

if node.get("new_node"):
if node_config := node.get("node_config"):
if node_config := node.get("node_parameters"):
if node_name := node_config.get("NodeName"):
self._charm.new_nodes = list(set(self._charm.new_nodes + [node_name]))
self.on.slurmd_available.emit()
Expand All @@ -143,7 +143,7 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
event.relation.data[self.model.app]["cluster_info"] = ""
self.on.partition_unavailable.emit()

def set_nhc_params(self, params: str = "#") -> None:
def set_nhc_params(self, params: str) -> None:
"""Send NHC parameters to all slurmd."""
# juju does not allow setting empty data/strings on the relation data,
# so we set it to something that behaves like empty
Expand Down Expand Up @@ -214,7 +214,7 @@ def get_new_nodes_and_nodes_and_partitions(self) -> Dict[str, Any]:
if node := self._get_node_from_relation(relation, unit):

# Check that the data we expect to exist, exists.
if node_config := node.get("node_config"):
if node_config := node.get("node_parameters"):

# Get the NodeName and append to the partition nodes
node_name = node_config["NodeName"]
Expand All @@ -236,7 +236,7 @@ def get_new_nodes_and_nodes_and_partitions(self) -> Dict[str, Any]:
if self._charm.model.config.get("default-partition") == partition_name:
partition_parameters["Default"] = "YES"

partitions[partition_name] = partition_parameters
partitions[partition_name] = partition_parameters

# If we have down nodes because they are new nodes, then set them here.
new_node_down_nodes = (
Expand Down
10 changes: 5 additions & 5 deletions src/interface_slurmdbd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""Slurmdbd."""
"""Slurmctld interface to slurmdbd."""

import json
import logging

Expand Down Expand Up @@ -37,8 +37,8 @@ class SlurmdbdUnavailableEvent(EventBase):
"""Emits slurmdbd_unavailable."""


class SlurmdbdAvailableEvents(ObjectEvents):
"""SlurmdbdAvailableEvents."""
class Events(ObjectEvents):
"""Slurmdbd interface events."""

slurmdbd_available = EventSource(SlurmdbdAvailableEvent)
slurmdbd_unavailable = EventSource(SlurmdbdUnavailableEvent)
Expand All @@ -47,7 +47,7 @@ class SlurmdbdAvailableEvents(ObjectEvents):
class Slurmdbd(Object):
"""Facilitate slurmdbd lifecycle events."""

on = SlurmdbdAvailableEvents()
on = Events() # pyright: ignore [reportIncompatibleMethodOverride, reportAssignmentType]

def __init__(self, charm, relation_name):
"""Set the initial attribute values for this interface."""
Expand Down
Loading

0 comments on commit ccefec7

Please sign in to comment.