diff --git a/actions.yaml b/actions.yaml
deleted file mode 100644
index ce7ef05..0000000
--- a/actions.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-show-current-config:
-  description: >
-    Display the currently used `slurm.conf`.
-
-    Note: This file only exists in `slurmctld` charm and is automatically
-    distributed to all compute nodes by Slurm.
-
-    Example usage:
-    $ juju run-action slurmctld/leader --format=json --wait | jq .[].results.slurm.conf | xargs -I % -0 python3 -c 'print(%)'
-drain:
-  description: >
-    Drain specified nodes.
-
-    Example usage:
-    $ juju run-action slurmctld/leader drain nodename=node-[1,2] reason="Updating kernel"
-  params:
-    nodename:
-      type: string
-      description: The nodes to drain, using the Slurm format, e.g. `node-[1,2]`.
-    reason:
-      type: string
-      description: Reason to drain the nodes.
-  required:
-    - nodename
-    - reason
-resume:
-  description: >
-    Resume specified nodes.
-
-    Note: Newly added nodes will remain in the `down` state until configured,
-    with the `node-configured` action.
-
-    Example usage: $ juju run-action slurmctld/leader resume nodename=node-[1,2]
-  params:
-    nodename:
-      type: string
-      description: >
-        The nodes to resume, using the Slurm format, e.g. `node-[1,2]`.
-  required:
-    - nodename
-
-influxdb-info:
-  description: >
-    Get InfluxDB info.
-
-    This action returns the host, port, username, password, database, and
-    retention policy regarding to InfluxDB.
diff --git a/charmcraft.yaml b/charmcraft.yaml
index 125880e..9b57f0f 100644
--- a/charmcraft.yaml
+++ b/charmcraft.yaml
@@ -1,5 +1,52 @@
 # Copyright 2020 Omnivector Solutions, LLC
 # See LICENSE file for licensing details.
+name: slurmctld
+summary: |
+  Slurmctld, the central management daemon of Slurm.
+description: |
+  This charm provides slurmctld, munged, and the bindings to other utilities
+  that make lifecycle operations a breeze.
+
+  slurmctld is the central management daemon of SLURM. It monitors all other
+  SLURM daemons and resources, accepts work (jobs), and allocates resources
+  to those jobs. Given the critical functionality of slurmctld, there may be
+  a backup server to assume these functions in the event that the primary
+  server fails.
+
+links:
+  contact: https://matrix.to/#/#hpc:ubuntu.com
+
+  issues:
+  - https://github.com/charmed-hpc/slurmctld-operator/issues
+
+  source:
+  - https://github.com/charmed-hpc/slurmctld-operator
+
+peers:
+  slurmctld-peer:
+    interface: slurmctld-peer
+requires:
+  slurmd:
+    interface: slurmd
+  slurmdbd:
+    interface: slurmdbd
+  slurmrestd:
+    interface: slurmrestd
+  influxdb-api:
+    interface: influxdb-api
+  elasticsearch:
+    interface: elasticsearch
+  fluentbit:
+    interface: fluentbit
+provides:
+  prolog-epilog:
+    interface: prolog-epilog
+  grafana-source:
+    interface: grafana-source
+    scope: global
+
+assumes:
+  - juju
 
 type: charm
 bases:
@@ -29,3 +76,145 @@ parts:
       echo $VERSION > $CRAFT_PART_INSTALL/version
     stage:
     - version
+
+config:
+  options:
+    custom-slurm-repo:
+      type: string
+      default: ""
+      description: >
+        Use a custom repository for Slurm installation.
+
+        This can be set to the Organization's local mirror/cache of packages and
+        supersedes the Omnivector repositories. Alternatively, it can be used to
+        track a `testing` Slurm version, e.g. by setting to
+        `ppa:omnivector/osd-testing`.
+
+        Note: The configuration `custom-slurm-repo` must be set *before*
+        deploying the units. Changing this value after deploying the units will
+        not reinstall Slurm.
+    cluster-name:
+      type: string
+      default: osd-cluster
+      description: >
+        Name to be recorded in database for jobs from this cluster.
+
+        This is important if a single database is used to record information from
+        multiple Slurm-managed clusters.
+    default-partition:
+      type: string
+      default: ""
+      description: >
+        Default Slurm partition. This is only used if defined, and must match an
+        existing partition.
+    custom-config:
+      type: string
+      default: ""
+      description: >
+        User supplied Slurm configuration.
+
+        This value supplements the charm supplied `slurm.conf` that is used for
+        Slurm Controller and Compute nodes.
+
+        Example usage:
+        $ juju config slurmctld custom-config="FirstJobId=1234"
+    proctrack-type:
+      type: string
+      default: proctrack/cgroup
+      description: >
+        Identifies the plugin to be used for process tracking on a job step
+        basis.
+    cgroup-config:
+      type: string
+      default: |
+        CgroupAutomount=yes
+        ConstrainCores=yes
+      description: >
+        Configuration content for `cgroup.conf`.
+
+    health-check-params:
+      default: ""
+      type: string
+      description: >
+        Extra parameters for the NHC command.
+
+        This option can be used to customize how NHC is called, e.g. to send an
+        e-mail to an admin when NHC detects an error, set this value to
+        `-M admin@domain.com`.
+    health-check-interval:
+      default: 600
+      type: int
+      description: Interval in seconds between executions of the Health Check.
+    health-check-state:
+      default: "ANY,CYCLE"
+      type: string
+      description: Only run the Health Check on nodes in this state.
+
+    acct-gather-frequency:
+      type: string
+      default: "task=30"
+      description: >
+        Accounting and profiling sampling intervals for the acct_gather plugins.
+
+        Note: A value of `0` disables the periodic sampling. In this case, the
+        accounting information is collected when the job terminates.
+
+        Example usage:
+        $ juju config slurmctld acct-gather-frequency="task=30,network=30"
+    acct-gather-custom:
+      type: string
+      default: ""
+      description: >
+        User supplied `acct_gather.conf` configuration.
+
+        This value supplements the charm supplied `acct_gather.conf` file that is
+        used for configuring the acct_gather plugins.
+
+actions:
+  show-current-config:
+    description: >
+      Display the currently used `slurm.conf`.
+
+      Note: This file only exists in the `slurmctld` charm and is automatically
+      distributed to all compute nodes by Slurm.
+
+      Example usage:
+      $ juju run-action slurmctld/leader show-current-config --format=json --wait | jq .[].results.slurm.conf | xargs -I % -0 python3 -c 'print(%)'
+  drain:
+    description: >
+      Drain specified nodes.
+
+      Example usage:
+      $ juju run-action slurmctld/leader drain nodename=node-[1,2] reason="Updating kernel"
+    params:
+      nodename:
+        type: string
+        description: The nodes to drain, using the Slurm format, e.g. `node-[1,2]`.
+      reason:
+        type: string
+        description: Reason to drain the nodes.
+    required:
+      - nodename
+      - reason
+  resume:
+    description: >
+      Resume specified nodes.
+
+      Note: Newly added nodes will remain in the `down` state until configured
+      with the `node-configured` action.
+
+      Example usage: $ juju run-action slurmctld/leader resume nodename=node-[1,2]
+    params:
+      nodename:
+        type: string
+        description: >
+          The nodes to resume, using the Slurm format, e.g. `node-[1,2]`.
+    required:
+      - nodename
+
+  influxdb-info:
+    description: >
+      Get InfluxDB info.
+
+      This action returns the host, port, username, password, database, and
+      retention policy related to InfluxDB.
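With `metadata.yaml`, `config.yaml`, and `actions.yaml` now folded into `charmcraft.yaml`, the options and actions above are exercised exactly as before. A minimal usage sketch, assuming a deployed application named `slurmctld` (the values shown are illustrative, not taken from this diff):

    $ juju config slurmctld cluster-name=my-cluster
    $ juju config slurmctld health-check-interval=300
    $ juju run-action slurmctld/leader drain nodename=node-[1,2] reason="Updating kernel" --wait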
diff --git a/config.yaml b/config.yaml
deleted file mode 100644
index 2d4798e..0000000
--- a/config.yaml
+++ /dev/null
@@ -1,91 +0,0 @@
-options:
-  custom-slurm-repo:
-    type: string
-    default: ""
-    description: >
-      Use a custom repository for Slurm installation.
-
-      This can be set to the Organization's local mirror/cache of packages and
-      supersedes the Omnivector repositories. Alternatively, it can be used to
-      track a `testing` Slurm version, e.g. by setting to
-      `ppa:omnivector/osd-testing`.
-
-      Note: The configuration `custom-slurm-repo` must be set *before*
-      deploying the units. Changing this value after deploying the units will
-      not reinstall Slurm.
-  cluster-name:
-    type: string
-    default: osd-cluster
-    description: >
-      Name to be recorded in database for jobs from this cluster.
-
-      This is important if a single database is used to record information from
-      multiple Slurm-managed clusters.
-  default-partition:
-    type: string
-    default: ""
-    description: >
-      Default Slurm partition. This is only used if defined, and must match an
-      existing partition.
-  custom-config:
-    type: string
-    default: ""
-    description: >
-      User supplied Slurm configuration.
-
-      This value supplements the charm supplied `slurm.conf` that is used for
-      Slurm Controller and Compute nodes.
-
-      Example usage:
-      $ juju config slurmcltd custom-config="FirstJobId=1234"
-  proctrack-type:
-    type: string
-    default: proctrack/cgroup
-    description: >
-      Identifies the plugin to be used for process tracking on a job step
-      basis.
-  cgroup-config:
-    type: string
-    default: |
-      CgroupAutomount=yes
-      ConstrainCores=yes
-    description: >
-      Configuration content for `cgroup.conf`.
-
-  health-check-params:
-    default: ""
-    type: string
-    description: >
-      Extra parameters for NHC command.
-
-      This option can be used to customize how NHC is called, e.g. to send an
-      e-mail to an admin when NHC detects an error set this value to
-      `-M admin@domain.com`.
-  health-check-interval:
-    default: 600
-    type: int
-    description: Interval in seconds between executions of the Health Check.
-  health-check-state:
-    default: "ANY,CYCLE"
-    type: string
-    description: Only run the Health Check on nodes in this state.
-
-  acct-gather-frequency:
-    type: string
-    default: "task=30"
-    description: >
-      Accounting and profiling sampling intervals for the acct_gather plugins.
-
-      Note: A value of `0` disables the periodic sampling. In this case, the
-      accounting information is collected when the job terminates.
-
-      Example usage:
-      $ juju config slurmcltd acct-gather-frequency="task=30,network=30"
-  acct-gather-custom:
-    type: string
-    default: ""
-    description: >
-      User supplied `acct_gather.conf` configuration.
-
-      This value supplements the charm supplied `acct_gather.conf` file that is
-      used for configuring the acct_gather plugins.
diff --git a/metadata.yaml b/metadata.yaml
deleted file mode 100644
index 713b150..0000000
--- a/metadata.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: slurmctld
-summary: |
-  Slurmctld, the central management daemon of Slurm.
-description: |
-  This charm provides slurmctld, munged, and the bindings to other utilities
-  that make lifecycle operations a breeze.
-
-  slurmctld is the central management daemon of SLURM. It monitors all other
-  SLURM daemons and resources, accepts work (jobs), and allocates resources
-  to those jobs. Given the critical functionality of slurmctld, there may be
-  a backup server to assume these functions in the event that the primary
-  server fails.
-source: https://github.com/omnivector-solutions/slurmctld-operator
-issues: https://github.com/omnivector-solutions/slurmctld-operator/issues
-maintainers:
-  - OmniVector Solutions
-  - Jason C. Nucciarone
-  - David Gomez
-
-peers:
-  slurmctld-peer:
-    interface: slurmctld-peer
-requires:
-  slurmd:
-    interface: slurmd
-  slurmdbd:
-    interface: slurmdbd
-  slurmrestd:
-    interface: slurmrestd
-  influxdb-api:
-    interface: influxdb-api
-  elasticsearch:
-    interface: elasticsearch
-  fluentbit:
-    interface: fluentbit
-provides:
-  prolog-epilog:
-    interface: prolog-epilog
-  grafana-source:
-    interface: grafana-source
-    scope: global
-
-assumes:
-  - juju
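The endpoints removed from `metadata.yaml` reappear unchanged under `peers`, `requires`, and `provides` in `charmcraft.yaml`, so existing integrations keep working. A hedged sketch of wiring the controller to a compute application, assuming a `slurmd` charm that exposes the matching interface:

    $ juju deploy slurmctld
    $ juju deploy slurmd
    $ juju relate slurmd slurmctld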