From 0ba7588a498d22c601ac7208a806600a11c0b05b Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Tue, 26 Nov 2024 11:48:13 +0000 Subject: [PATCH] feat(sackd): add sackd operator Adds a new Slurm charm for deploying sackd: the login node daemon which gets configuration files from the controller in 'configless' setups and provides support for the 'auth/slurm' authentication method. --- charms/sackd/LICENSE | 202 ++++++++++++++++++++++ charms/sackd/README.md | 44 +++++ charms/sackd/build.yaml | 4 + charms/sackd/charmcraft.yaml | 50 ++++++ charms/sackd/requirements.txt | 2 + charms/sackd/src/charm.py | 163 +++++++++++++++++ charms/sackd/src/interface_slurmctld.py | 124 +++++++++++++ charms/sackd/terraform/main.tf | 27 +++ charms/sackd/terraform/outputs.tf | 23 +++ charms/sackd/terraform/variables.tf | 47 +++++ charms/sackd/terraform/versions.tf | 22 +++ charms/sackd/tests/unit/test_charm.py | 109 ++++++++++++ charms/slurmctld/charmcraft.yaml | 2 + charms/slurmctld/src/charm.py | 2 + charms/slurmctld/src/interface_sackd.py | 47 +++++ charms/slurmctld/tests/unit/test_charm.py | 26 ++- test-requirements.txt | 1 + tests/integration/conftest.py | 20 ++- tests/integration/test_charm.py | 18 +- 19 files changed, 927 insertions(+), 6 deletions(-) create mode 100644 charms/sackd/LICENSE create mode 100644 charms/sackd/README.md create mode 100644 charms/sackd/build.yaml create mode 100644 charms/sackd/charmcraft.yaml create mode 100644 charms/sackd/requirements.txt create mode 100755 charms/sackd/src/charm.py create mode 100644 charms/sackd/src/interface_slurmctld.py create mode 100644 charms/sackd/terraform/main.tf create mode 100644 charms/sackd/terraform/outputs.tf create mode 100644 charms/sackd/terraform/variables.tf create mode 100644 charms/sackd/terraform/versions.tf create mode 100644 charms/sackd/tests/unit/test_charm.py create mode 100644 charms/slurmctld/src/interface_sackd.py diff --git a/charms/sackd/LICENSE b/charms/sackd/LICENSE new file mode 100644 index 
0000000..a05b724 --- /dev/null +++ b/charms/sackd/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Omnivector, LLC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/charms/sackd/README.md b/charms/sackd/README.md new file mode 100644 index 0000000..3bdf95a --- /dev/null +++ b/charms/sackd/README.md @@ -0,0 +1,44 @@ +
+ +# sackd operator + +A [Juju](https://juju.is) operator for sackd - the login node daemon of [Slurm](https://slurm.schedmd.com/overview.html). + +[![Charmhub Badge](https://charmhub.io/sackd/badge.svg)](https://charmhub.io/sackd) +[![Matrix](https://img.shields.io/matrix/ubuntu-hpc%3Amatrix.org?logo=matrix&label=ubuntu-hpc)](https://matrix.to/#/#ubuntu-hpc:matrix.org) + +
+ +## Features + +The sackd operator provides and manages the sackd daemon. This daemon provides the login node service for machines enlisted as login nodes in Charmed Slurm clusters. + +## Usage + +This operator should be used with Juju 3.x or greater. + +#### Deploy a minimal Charmed Slurm cluster with a login node + +```shell +$ juju deploy slurmctld --channel edge +$ juju deploy slurmd --channel edge +$ juju deploy sackd --channel edge +$ juju integrate slurmctld:slurmd slurmd:slurmctld +$ juju integrate slurmctld:login-node sackd:slurmctld +``` + +## Project & Community + +The sackd operator is a project of the [Ubuntu HPC](https://discourse.ubuntu.com/t/high-performance-computing-team/35988) +community. It is an open source project that welcomes community involvement, contributions, suggestions, fixes, and +constructive feedback. Interested in being involved with development? Check out these links below: + +* [Join our online chat](https://matrix.to/#/#ubuntu-hpc:matrix.org) +* [Contributing guidelines](./CONTRIBUTING.md) +* [Code of conduct](https://ubuntu.com/community/ethos/code-of-conduct) +* [File a bug report](https://github.com/charmed-hpc/slurm-charms/issues) +* [Juju SDK docs](https://juju.is/docs/sdk) + +## License + +The sackd operator is free software, distributed under the Apache Software License, version 2.0. See the [LICENSE](./LICENSE) file for more information. diff --git a/charms/sackd/build.yaml b/charms/sackd/build.yaml new file mode 100644 index 0000000..acff2da --- /dev/null +++ b/charms/sackd/build.yaml @@ -0,0 +1,4 @@ +external-libraries: + - charms.hpc_libs.v0.slurm_ops + - charms.operator_libs_linux.v0.apt + - charms.operator_libs_linux.v0.juju_systemd_notices diff --git a/charms/sackd/charmcraft.yaml b/charms/sackd/charmcraft.yaml new file mode 100644 index 0000000..5a481cf --- /dev/null +++ b/charms/sackd/charmcraft.yaml @@ -0,0 +1,50 @@ +name: sackd +type: charm + +summary: | + Sackd, the login node daemon of Slurm.
+ +description: | + This charm provides sackd to facilitate deployment of a login node for a + Slurm cluster. + + sackd is the login node daemon of Slurm. It enables authentication to the + cluster and retrieval of configuration files when running in configless + mode. + +links: + contact: https://matrix.to/#/#hpc:ubuntu.com + + issues: + - https://github.com/charmed-hpc/slurm-charms/issues + + source: + - https://github.com/charmed-hpc/slurm-charms + +assumes: + - juju + +base: ubuntu@24.04 +platforms: + amd64: + build-on: amd64 + build-for: amd64 + +parts: + charm: + charm-requirements: [ "requirements.txt" ] + override-build: | + cp /usr/bin/rustc-1.80 /usr/bin/rustc + craftctl default + build-packages: + - libffi-dev + - libssl-dev + - rustc-1.80 + - cargo + - pkg-config + - git + +provides: + slurmctld: + interface: sackd + limit: 1 diff --git a/charms/sackd/requirements.txt b/charms/sackd/requirements.txt new file mode 100644 index 0000000..5faa048 --- /dev/null +++ b/charms/sackd/requirements.txt @@ -0,0 +1,2 @@ +ops==2.15.0 +distro==1.9.0 diff --git a/charms/sackd/src/charm.py b/charms/sackd/src/charm.py new file mode 100755 index 0000000..7f6839c --- /dev/null +++ b/charms/sackd/src/charm.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# Copyright 2020-2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Sackd Operator Charm.""" + +import logging + +from interface_slurmctld import Slurmctld, SlurmctldAvailableEvent +from ops import ( + ActiveStatus, + BlockedStatus, + CharmBase, + InstallEvent, + StoredState, + UpdateStatusEvent, + WaitingStatus, + main, +) + +from charms.hpc_libs.v0.slurm_ops import SackdManager, SlurmOpsError +from charms.operator_libs_linux.v0.juju_systemd_notices import ( # type: ignore[import-untyped] + ServiceStartedEvent, + ServiceStoppedEvent, + SystemdNotices, +) + +logger = logging.getLogger(__name__) + + +class SackdCharm(CharmBase): + """Sackd lifecycle events.""" + + _stored = StoredState() + + def __init__(self, *args, **kwargs): + """Init _stored attributes and interfaces, observe events.""" + super().__init__(*args, **kwargs) + + self._stored.set_default( + auth_key=str(), + sackd_installed=False, + slurmctld_available=False, + slurmctld_host=str(), + ) + + self._sackd = SackdManager(snap=False) + self._slurmctld = Slurmctld(self, "slurmctld") + self._systemd_notices = SystemdNotices(self, ["sackd"]) + + event_handler_bindings = { + self.on.install: self._on_install, + self.on.update_status: self._on_update_status, + self._slurmctld.on.slurmctld_available: self._on_slurmctld_available, + self._slurmctld.on.slurmctld_unavailable: self._on_slurmctld_unavailable, + self.on.service_sackd_started: self._on_sackd_started, + self.on.service_sackd_stopped: self._on_sackd_stopped, + } + for event, handler in event_handler_bindings.items(): + self.framework.observe(event, handler) + + def _on_install(self, event: InstallEvent) -> None: + """Perform installation operations for sackd.""" + self.unit.status = WaitingStatus("installing sackd") + + try: + self._sackd.install() + # Ensure sackd does not start before relation established + self._sackd.service.disable() + self.unit.set_workload_version(self._sackd.version()) + self._systemd_notices.subscribe() + self._stored.sackd_installed = True + except SlurmOpsError as e: + 
logger.error(e.message) + event.defer() + + self._check_status() + + def _on_update_status(self, _: UpdateStatusEvent) -> None: + """Handle update status.""" + self._check_status() + + def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: + """Retrieve the slurmctld_available event data and store in charm state.""" + if self._stored.sackd_installed is not True: + event.defer() + return + + if (slurmctld_host := event.slurmctld_host) != self._stored.slurmctld_host: + if slurmctld_host is not None: + self._sackd.config_server = f"{slurmctld_host}:6817" + self._stored.slurmctld_host = slurmctld_host + logger.debug(f"slurmctld_host={slurmctld_host}") + else: + logger.debug("'slurmctld_host' not in event data.") + return + + if (auth_key := event.auth_key) != self._stored.auth_key: + if auth_key is not None: + self._stored.auth_key = auth_key + self._sackd.munge.key.set(auth_key) # TODO change this once auth/slurm in place + else: + logger.debug("'auth_key' not in event data.") + return + + logger.debug("#### Storing slurmctld_available event relation data in charm StoredState.") + self._stored.slurmctld_available = True + + # Restart sackd after we write event data to respective locations. 
+ self._sackd.munge.service.restart() # TODO change this once auth/slurm in place + self._sackd.service.enable() + self._check_status() + + def _on_slurmctld_unavailable(self, _) -> None: + """Stop sackd and set slurmctld_available = False when we lose slurmctld.""" + logger.debug("## Slurmctld unavailable") + self._stored.slurmctld_available = False + self._stored.auth_key = "" + self._stored.slurmctld_host = "" + self._sackd.service.disable() + self._check_status() + + def _on_sackd_started(self, _: ServiceStartedEvent) -> None: + """Handle event emitted by systemd after sackd daemon successfully starts.""" + self.unit.status = ActiveStatus() + + def _on_sackd_stopped(self, _: ServiceStoppedEvent) -> None: + """Handle event emitted by systemd after sackd daemon is stopped.""" + self.unit.status = BlockedStatus("sackd not running") + + def _check_status(self) -> bool: + """Check if we have all needed components. + + - sackd installed + - slurmctld available and working + - auth key configured and working + """ + if self._stored.sackd_installed is not True: + self.unit.status = BlockedStatus( + "failed to install sackd. see logs for further details" + ) + return False + + if self._slurmctld.is_joined is not True: + self.unit.status = BlockedStatus("Need relations: slurmctld") + return False + + if self._stored.slurmctld_available is not True: + self.unit.status = WaitingStatus("Waiting on: slurmctld") + return False + + # TODO: https://github.com/charmed-hpc/hpc-libs/issues/18 - + # Re-enable auth key validation check when supported by `slurm_ops` charm library. + # No longer using munge - this is slurm.key now.
+ # if not self._sackd.check_munged(): + # self.unit.status = BlockedStatus("Error configuring auth key") + # return False + + return True + + +if __name__ == "__main__": # pragma: nocover + main.main(SackdCharm) diff --git a/charms/sackd/src/interface_slurmctld.py b/charms/sackd/src/interface_slurmctld.py new file mode 100644 index 0000000..7e12d11 --- /dev/null +++ b/charms/sackd/src/interface_slurmctld.py @@ -0,0 +1,124 @@ +"""Slurmctld interface for sackd.""" + +import json +import logging +from typing import Union + +from ops import ( + EventBase, + EventSource, + Object, + ObjectEvents, + Relation, + RelationBrokenEvent, + RelationChangedEvent, +) + +logger = logging.getLogger(__name__) + + +class SlurmctldAvailableEvent(EventBase): + """Emitted when slurmctld is available.""" + + def __init__( + self, + handle, + auth_key, + slurmctld_host, + ): + super().__init__(handle) + + self.auth_key = auth_key + self.slurmctld_host = slurmctld_host + + def snapshot(self): + """Snapshot the event data.""" + return { + "auth_key": self.auth_key, + "slurmctld_host": self.slurmctld_host, + } + + def restore(self, snapshot): + """Restore the snapshot of the event data.""" + self.auth_key = snapshot.get("auth_key") + self.slurmctld_host = snapshot.get("slurmctld_host") + + +class SlurmctldUnavailableEvent(EventBase): + """Emit when the relation to slurmctld is broken.""" + + +class Events(ObjectEvents): + """Sackd emitted events.""" + + slurmctld_available = EventSource(SlurmctldAvailableEvent) + slurmctld_unavailable = EventSource(SlurmctldUnavailableEvent) + + +class Slurmctld(Object): + """Slurmctld integration.""" + + on = Events() # pyright: ignore [reportIncompatibleMethodOverride, reportAssignmentType] + + def __init__(self, charm, relation_name): + """Set initial data and observe interface events.""" + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + + self.framework.observe( + 
self._charm.on[self._relation_name].relation_changed, + self._on_relation_changed, + ) + + self.framework.observe( + self._charm.on[self._relation_name].relation_broken, + self._on_relation_broken, + ) + + def _on_relation_changed(self, event: RelationChangedEvent) -> None: + """Handle the relation-changed event. + + Get the cluster_info from slurmctld and emit the slurmctld_available event. + + Ensure all cases are accounted for: + - no application in event + - no application data in relation + - no cluster_info in application relation data + - application exists in event, and application data exists on relation, cluster_info + exists in application relation data + """ + if app := event.app: + if app_data := event.relation.data.get(app): + if cluster_info_json := app_data.get("cluster_info"): + try: + cluster_info = json.loads(cluster_info_json) + except json.JSONDecodeError as e: + logger.error(e) + raise (e) + + logger.debug(f"cluster_info: {cluster_info}") + self.on.slurmctld_available.emit(**cluster_info) + else: + logger.debug( + f"No cluster_info in application data, deferring {self._relation_name}" + ) + event.defer() + else: + logger.debug("No application data on relation.") + else: + logger.debug("No application on the event.") + + def _on_relation_broken(self, event: RelationBrokenEvent) -> None: + """Emit slurmctld_unavailable when the relation-broken event occurs.""" + self.on.slurmctld_unavailable.emit() + + @property + def _relation(self) -> Union[Relation, None]: + """Return the relation.""" + return self.model.get_relation(self._relation_name) + + @property + def is_joined(self) -> bool: + """Return True if relation is joined.""" + return True if self.model.relations.get(self._relation_name) else False diff --git a/charms/sackd/terraform/main.tf b/charms/sackd/terraform/main.tf new file mode 100644 index 0000000..588e440 --- /dev/null +++ b/charms/sackd/terraform/main.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "juju_application" "sackd" { + name = var.app_name + model = var.model_name + + charm { + name = "sackd" + channel = var.channel + revision = var.revision + } + + config = var.config + units = var.units +} diff --git a/charms/sackd/terraform/outputs.tf b/charms/sackd/terraform/outputs.tf new file mode 100644 index 0000000..c2cf45d --- /dev/null +++ b/charms/sackd/terraform/outputs.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "app_name" { + value = juju_application.sackd.name +} + +output "provides" { + value = { + slurmctld = "slurmctld" + } +} diff --git a/charms/sackd/terraform/variables.tf b/charms/sackd/terraform/variables.tf new file mode 100644 index 0000000..46ef2cc --- /dev/null +++ b/charms/sackd/terraform/variables.tf @@ -0,0 +1,47 @@ +# Copyright 2024 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "app_name" { + description = "Name of the sackd application within the Juju model." + type = string +} + +variable "channel" { + description = "Channel to deploy the sackd charm from." + type = string + default = "latest/stable" +} + +variable "config" { + description = "Initial configuration for deployed sackd charm." + type = map(string) + default = {} +} + +variable "model_name" { + description = "Name of model to deploy sackd charm to." + type = string +} + +variable "revision" { + description = "Revision of the sackd charm to deploy." + type = number + default = null +} + +variable "units" { + description = "Number of sackd units to deploy." + type = number + default = 1 +} diff --git a/charms/sackd/terraform/versions.tf b/charms/sackd/terraform/versions.tf new file mode 100644 index 0000000..cc63922 --- /dev/null +++ b/charms/sackd/terraform/versions.tf @@ -0,0 +1,22 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + juju = { + source = "juju/juju" + version = ">= 0.13.0" + } + } +} diff --git a/charms/sackd/tests/unit/test_charm.py b/charms/sackd/tests/unit/test_charm.py new file mode 100644 index 0000000..415751d --- /dev/null +++ b/charms/sackd/tests/unit/test_charm.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Copyright 2023-2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for the sackd operator.""" + +from unittest import TestCase +from unittest.mock import Mock, PropertyMock, patch + +from charm import SackdCharm +from ops.model import ActiveStatus, BlockedStatus +from scenario import Context, State + +from charms.hpc_libs.v0.slurm_ops import SlurmOpsError + + +class TestCharm(TestCase): + """Unit test sackd charm.""" + + def setUp(self) -> None: + """Set up unit test.""" + self.ctx = Context(SackdCharm) + + @patch("charms.operator_libs_linux.v0.juju_systemd_notices.SystemdNotices.subscribe") + @patch("ops.framework.EventBase.defer") + def test_install_success(self, defer, *_) -> None: + """Test install success behavior.""" + with self.ctx(self.ctx.on.install(), State()) as manager: + manager.charm._sackd.install = Mock() + manager.charm._sackd.service.disable = Mock() + manager.charm._sackd.version = Mock(return_value="24.05.2-1") + manager.run() + self.assertTrue(manager.charm._stored.sackd_installed) + + defer.assert_not_called() + + @patch("ops.framework.EventBase.defer") + def test_install_fail(self, defer) -> None: + """Test install failure behavior.""" + with self.ctx(self.ctx.on.install(), State()) as manager: + manager.charm._sackd.install = Mock( + side_effect=SlurmOpsError("failed to install sackd") + ) + manager.run() + + self.assertEqual( + manager.charm.unit.status, + BlockedStatus("failed to install sackd. see logs for further details"), + ) + self.assertFalse(manager.charm._stored.sackd_installed) + + defer.assert_called() + + def test_service_sackd_start(self) -> None: + """Test service_sackd_started event handler.""" + with self.ctx(self.ctx.on.start(), State()) as manager: + # Run method directly rather than emit a ServiceStartedEvent. + # TODO: Refactor once Scenario has restored support for running custom events. 
See: + # https://github.com/canonical/operator/issues/1421 + manager.charm._on_sackd_started(None) + self.assertEqual(manager.charm.unit.status, ActiveStatus()) + + def test_service_sackd_stopped(self) -> None: + """Test service_sackd_stopped event handler.""" + with self.ctx(self.ctx.on.stop(), State()) as manager: + # Run method directly rather than emit a ServiceStoppedEvent. + # TODO: Refactor once Scenario has restored support for running custom events. See: + # https://github.com/canonical/operator/issues/1421 + manager.charm._on_sackd_stopped(None) + self.assertEqual(manager.charm.unit.status, BlockedStatus("sackd not running")) + + @patch("interface_slurmctld.Slurmctld.is_joined", new_callable=PropertyMock(return_value=True)) + def test_update_status_success(self, *_) -> None: + """Test `UpdateStatusEvent` hook success.""" + with self.ctx(self.ctx.on.update_status(), State()) as manager: + manager.charm._stored.sackd_installed = True + manager.charm._stored.slurmctld_available = True + manager.charm.unit.status = ActiveStatus() + manager.run() + # ActiveStatus is the expected value when _check_status does not + # modify the current state of the unit and should return True. + self.assertTrue(manager.charm._check_status()) + self.assertEqual(manager.charm.unit.status, ActiveStatus()) + + def test_update_status_install_fail(self) -> None: + """Test `UpdateStatusEvent` hook failure.""" + with self.ctx(self.ctx.on.update_status(), State()) as manager: + manager.run() + self.assertEqual( + manager.charm.unit.status, + BlockedStatus("failed to install sackd. 
see logs for further details"), + ) + + +if __name__ == "__main__": + import unittest + + unittest.main() diff --git a/charms/slurmctld/charmcraft.yaml b/charms/slurmctld/charmcraft.yaml index 19f6e26..0a0fc37 100644 --- a/charms/slurmctld/charmcraft.yaml +++ b/charms/slurmctld/charmcraft.yaml @@ -30,6 +30,8 @@ requires: interface: slurmdbd slurmrestd: interface: slurmrestd + login-node: + interface: sackd provides: cos-agent: diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index 187154d..8e0790d 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -15,6 +15,7 @@ PEER_RELATION, ) from exceptions import IngressAddressUnavailableError +from interface_sackd import Sackd from interface_slurmd import ( PartitionAvailableEvent, PartitionUnavailableEvent, @@ -66,6 +67,7 @@ def __init__(self, *args): ) self._slurmctld = SlurmctldManager(snap=False) + self._sackd = Sackd(self, "login-node") self._slurmd = Slurmd(self, "slurmd") self._slurmdbd = Slurmdbd(self, "slurmdbd") self._slurmrestd = Slurmrestd(self, "slurmrestd") diff --git a/charms/slurmctld/src/interface_sackd.py b/charms/slurmctld/src/interface_sackd.py new file mode 100644 index 0000000..078e531 --- /dev/null +++ b/charms/slurmctld/src/interface_sackd.py @@ -0,0 +1,47 @@ +"""Slurmctld interface to sackd.""" + +import json +import logging + +from ops import Object, RelationBrokenEvent, RelationCreatedEvent + +logger = logging.getLogger() + + +class Sackd(Object): + """Sackd inventory interface.""" + + def __init__(self, charm, relation_name): + """Set self._relation_name and self.charm.""" + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + + self.framework.observe( + self._charm.on[self._relation_name].relation_created, + self._on_relation_created, + ) + + self.framework.observe( + self._charm.on[self._relation_name].relation_broken, + self._on_relation_broken, + ) + + def _on_relation_created(self, event: 
RelationCreatedEvent) -> None: + """Set our data on the relation.""" + # Need to wait until the charm has installed slurm before we can proceed. + if not self._charm.slurm_installed: + event.defer() + return + + event.relation.data[self.model.app]["cluster_info"] = json.dumps( + { + "auth_key": self._charm.get_munge_key(), # TODO: change this once munge is auth/slurm + "slurmctld_host": self._charm.hostname, + } + ) + + def _on_relation_broken(self, event: RelationBrokenEvent) -> None: + """Clear the cluster info if the relation is broken.""" + if self.framework.model.unit.is_leader(): + event.relation.data[self.model.app]["cluster_info"] = "" diff --git a/charms/slurmctld/tests/unit/test_charm.py b/charms/slurmctld/tests/unit/test_charm.py index 29fc2bc..99cdca4 100644 --- a/charms/slurmctld/tests/unit/test_charm.py +++ b/charms/slurmctld/tests/unit/test_charm.py @@ -15,7 +15,7 @@ """Test default charm events such as install, etc.""" -from unittest.mock import Mock, patch +from unittest.mock import Mock, PropertyMock, patch from charm import SlurmctldCharm from ops.model import BlockedStatus @@ -156,6 +156,30 @@ def test_on_slurmdbd_unavailable(self) -> None: self.harness.charm._slurmdbd.on.slurmdbd_unavailable.emit() self.assertEqual(self.harness.charm._stored.slurmdbd_host, "") + @patch( + "charms.hpc_libs.v0.slurm_ops.SlurmctldManager.hostname", + new_callable=PropertyMock(return_value="test_hostname"), + ) + def test_sackd_on_relation_created(self, *_) -> None: + """Test that sackd relation is created successfully.""" + self.harness.set_leader(True) + # Patch StoredState + setattr(self.harness.charm._stored, "slurm_installed", True) + setattr(self.harness.charm._stored, "munge_key", "=ABC=") + + relation_id = self.harness.add_relation("login-node", "sackd") + self.assertEqual( + self.harness.get_relation_data(relation_id, "slurmctld")["cluster_info"], + '{"auth_key": "=ABC=", "slurmctld_host": "test_hostname"}', + ) + + @patch("ops.framework.EventBase.defer") 
+ def test_sackd_fail_on_relation_created(self, defer) -> None: + """Test sackd relation when slurm is not installed.""" + setattr(self.harness.charm._stored, "slurm_installed", False) # Patch StoredState + self.harness.add_relation("login-node", "sackd") + defer.assert_called() + @patch("charm.is_container", return_value=True) def test_get_user_supplied_parameters(self, *_) -> None: """Test that user supplied parameters are parsed correctly.""" diff --git a/test-requirements.txt b/test-requirements.txt index 2729e30..9c791e3 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,4 +1,5 @@ ops==2.15.0 +ops-scenario==7.0.5 cryptography~=43.0.1 distro==1.9.0 python-dotenv~=1.0.1 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d113c3d..3fb91b8 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -28,13 +28,14 @@ SLURMD_DIR = Path(slurmd) if (slurmd := os.getenv("SLURMD_DIR")) else None SLURMDBD_DIR = Path(slurmdbd) if (slurmdbd := os.getenv("SLURMDBD_DIR")) else None SLURMRESTD_DIR = Path(slurmrestd) if (slurmrestd := os.getenv("SLURMRESTD_DIR")) else None +SACKD_DIR = Path(sackd) if (sackd := os.getenv("SACKD_DIR")) else None def pytest_addoption(parser) -> None: parser.addoption( "--charm-base", action="store", - default="ubuntu@22.04", + default="ubuntu@24.04", help="Charm base version to use for integration tests", ) @@ -111,3 +112,20 @@ async def slurmrestd_charm(request, ops_test: OpsTest) -> Union[str, Path]: return "slurmrestd" return await ops_test.build_charm(SLURMRESTD_DIR, verbosity="verbose") + + +@pytest.fixture(scope="module") +async def sackd_charm(request, ops_test: OpsTest) -> Union[str, Path]: + """Pack sackd charm to use for integration tests. + + If the `SACKD_DIR` environment variable is not set, this will pull the charm from + Charmhub instead. + + Returns: + `Path` if "sackd" is built locally. `str` otherwise. 
+ """ + if not SACKD_DIR: + logger.info("Pulling sackd from Charmhub") + return "sackd" + + return await ops_test.build_charm(SACKD_DIR, verbosity="verbose") diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 4d6467e..59754ef 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -28,9 +28,10 @@ SLURMD = "slurmd" SLURMDBD = "slurmdbd" SLURMRESTD = "slurmrestd" +SACKD = "sackd" DATABASE = "mysql" ROUTER = "mysql-router" -SLURM_APPS = [SLURMCTLD, SLURMD, SLURMDBD, SLURMRESTD] +SLURM_APPS = [SLURMCTLD, SLURMD, SLURMDBD, SLURMRESTD, SACKD] @pytest.mark.abort_on_fail @@ -43,12 +44,13 @@ async def test_build_and_deploy_against_edge( slurmd_charm, slurmdbd_charm, slurmrestd_charm, + sackd_charm, ) -> None: - """Test that the slurmctld charm can stabilize against slurmd, slurmdbd, slurmrestd, and MySQL.""" + """Test that the slurmctld charm can stabilize against slurmd, slurmdbd, slurmrestd, sackd, and MySQL.""" logger.info(f"Deploying {', '.join(SLURM_APPS)}, and {DATABASE}") # Pack charms and download NHC resource for the slurmd operator. - slurmctld, slurmd, slurmdbd, slurmrestd = await asyncio.gather( - slurmctld_charm, slurmd_charm, slurmdbd_charm, slurmrestd_charm + slurmctld, slurmd, slurmdbd, slurmrestd, sackd = await asyncio.gather( + slurmctld_charm, slurmd_charm, slurmdbd_charm, slurmrestd_charm, sackd_charm ) # Deploy the test Charmed SLURM cloud. await asyncio.gather( @@ -79,6 +81,13 @@ async def test_build_and_deploy_against_edge( num_units=1, base=charm_base, ), + ops_test.model.deploy( + str(sackd), + application_name=SACKD, + channel="edge" if isinstance(sackd, str) else None, + num_units=1, + base=charm_base, + ), # TODO: # Re-enable `mysql-router` in the integration tests once `dpe/edge` # channel supports the `ubuntu@24.04` base. 
@@ -101,6 +110,7 @@ async def test_build_and_deploy_against_edge( await ops_test.model.integrate(f"{SLURMCTLD}:{SLURMD}", f"{SLURMD}:{SLURMCTLD}") await ops_test.model.integrate(f"{SLURMCTLD}:{SLURMDBD}", f"{SLURMDBD}:{SLURMCTLD}") await ops_test.model.integrate(f"{SLURMCTLD}:{SLURMRESTD}", f"{SLURMRESTD}:{SLURMCTLD}") + await ops_test.model.integrate(f"{SLURMCTLD}:login-node", f"{SACKD}:{SLURMCTLD}") # await ops_test.model.integrate(f"{SLURMDBD}-{ROUTER}:backend-database", f"{DATABASE}:database") await ops_test.model.integrate(f"{SLURMDBD}:database", f"{DATABASE}:database") # Reduce the update status frequency to accelerate the triggering of deferred events.