Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add submit options to the cluster configuration #35

Merged
merged 3 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/src/clusters/cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ be one of:
* `"slurm"`
* `"bash"`

## submit_options

`cluster.submit_options`: **array** of **strings** - Scheduler submission options that
are passed to every job on this cluster.

## partition

`cluster.partition`: **array** of **tables** - Define the scheduler partitions that
Expand Down
3 changes: 3 additions & 0 deletions doc/src/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* Edit links to documentation pages.
* New arguments to `show status` display actions that are in the requested states:
`--completed`, `--eligible`, `--submitted`, and `--waiting`.
* `cluster.submit_options` configuration option in `clusters.toml`.

*Changed:*

Expand All @@ -15,6 +16,8 @@
* `show status` hides actions with 0 directories by default. Pass `--all` to show all
actions.
* `clean` now cleans all caches by default.
* Submit jobs with `--constraint="scratch"` by default on Delta.
* Submit jobs with `--constraint="nvme"` by default on Frontier.

*Fixed:*

Expand Down
6 changes: 6 additions & 0 deletions src/builtin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ fn andes() -> Cluster {
name: "andes".into(),
identify: IdentificationMethod::ByEnvironment("LMOD_SYSTEM_NAME".into(), "andes".into()),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![
// Auto-detected partitions: batch
Partition {
Expand All @@ -92,6 +93,7 @@ fn anvil() -> Cluster {
name: "anvil".into(),
identify: IdentificationMethod::ByEnvironment("RCAC_CLUSTER".into(), "anvil".into()),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![
// Auto-detected partitions: shared | wholenode | gpu
Partition {
Expand Down Expand Up @@ -149,6 +151,7 @@ fn delta() -> Cluster {
name: "delta".into(),
identify: IdentificationMethod::ByEnvironment("LMOD_SYSTEM_NAME".into(), "Delta".into()),
scheduler: SchedulerType::Slurm,
submit_options: vec!["--constraint=\"scratch\"".to_string()],
partition: vec![
// Auto-detected partitions: cpu | gpuA100x4
Partition {
Expand Down Expand Up @@ -206,6 +209,7 @@ fn frontier() -> Cluster {
name: "frontier".into(),
identify: IdentificationMethod::ByEnvironment("LMOD_SYSTEM_NAME".into(), "frontier".into()),
scheduler: SchedulerType::Slurm,
submit_options: vec!["--constraint=\"nvme\"".to_string()],
partition: vec![
// Auto-detected partitions: batch
Partition {
Expand All @@ -225,6 +229,7 @@ fn greatlakes() -> Cluster {
name: "greatlakes".into(),
identify: IdentificationMethod::ByEnvironment("CLUSTER_NAME".into(), "greatlakes".into()),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![
// Auto-detected partitions: standard | gpu_mig40,gpu | gpu.
Partition {
Expand Down Expand Up @@ -295,6 +300,7 @@ fn none() -> Cluster {
name: "none".into(),
identify: IdentificationMethod::Always(true),
scheduler: SchedulerType::Bash,
submit_options: Vec::new(),
partition: vec![Partition {
name: "none".into(),
..Partition::default()
Expand Down
13 changes: 13 additions & 0 deletions src/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ pub struct Cluster {

/// The partitions in the cluster's queue.
pub partition: Vec<Partition>,

/// Submit options to include in every job submitted to this cluster.
#[serde(default)]
pub submit_options: Vec<String>,
}

/// Methods to identify clusters.
Expand Down Expand Up @@ -400,30 +404,35 @@ mod tests {
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Bash,
partition: Vec::new(),
submit_options: Vec::new(),
},
Cluster {
name: "cluster1".into(),
identify: IdentificationMethod::ByEnvironment("_row_select".into(), "a".into()),
scheduler: SchedulerType::Bash,
partition: Vec::new(),
submit_options: Vec::new(),
},
Cluster {
name: "cluster2".into(),
identify: IdentificationMethod::ByEnvironment("_row_select".into(), "b".into()),
scheduler: SchedulerType::Bash,
partition: Vec::new(),
submit_options: Vec::new(),
},
Cluster {
name: "cluster3".into(),
identify: IdentificationMethod::Always(true),
scheduler: SchedulerType::Bash,
partition: Vec::new(),
submit_options: Vec::new(),
},
Cluster {
name: "cluster4".into(),
identify: IdentificationMethod::ByEnvironment("_row_Select".into(), "b".into()),
scheduler: SchedulerType::Bash,
partition: Vec::new(),
submit_options: Vec::new(),
},
];
let cluster_configuration = Configuration { cluster: clusters };
Expand Down Expand Up @@ -591,6 +600,7 @@ mod tests {
identify: IdentificationMethod::Always(true),
scheduler: SchedulerType::Bash,
partition: partitions,
submit_options: Vec::new(),
};

let cpu_resources = Resources {
Expand Down Expand Up @@ -728,6 +738,7 @@ name = "b"
assert_eq!(cluster.name, "a");
assert_eq!(cluster.identify, IdentificationMethod::Always(true));
assert_eq!(cluster.scheduler, SchedulerType::Bash);
assert!(cluster.submit_options.is_empty());
assert_eq!(
cluster.partition,
vec![Partition {
Expand All @@ -748,6 +759,7 @@ name = "b"
name = "a"
identify.by_environment = ["b", "c"]
scheduler = "slurm"
submit_options = ["option1", "option2"]

[[cluster.partition]]
name = "d"
Expand Down Expand Up @@ -777,6 +789,7 @@ account_suffix = "-gpu"
IdentificationMethod::ByEnvironment("b".into(), "c".into())
);
assert_eq!(cluster.scheduler, SchedulerType::Slurm);
assert_eq!(cluster.submit_options, vec!["option1", "option2"]);
assert_eq!(
cluster.partition,
vec![Partition {
Expand Down
1 change: 1 addition & 0 deletions src/scheduler/bash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,7 @@ mod tests {
scheduler: SchedulerType::Bash,
identify: IdentificationMethod::Always(false),
partition: Vec::new(),
submit_options: Vec::new(),
};
let script = Bash::new(cluster, launchers)
.make_script(&action, &directories)
Expand Down
31 changes: 31 additions & 0 deletions src/scheduler/slurm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ impl Scheduler for Slurm {
let minutes = (total + 59) / 60;
let _ = writeln!(preamble, "#SBATCH --time={minutes}");

// Add global cluster submit options first so that users can override them.
for option in &self.cluster.submit_options {
let _ = writeln!(preamble, "#SBATCH {option}");
}

// Use provided submission options
if let Some(submit_options) = action.submit_options.get(&self.cluster.name) {
if let Some(ref account) = submit_options.account {
Expand Down Expand Up @@ -299,6 +304,7 @@ mod tests {
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Slurm,
partition: vec![Partition::default()],
submit_options: Vec::new(),
};

let slurm = Slurm::new(cluster, launchers.by_cluster("cluster"));
Expand All @@ -323,6 +329,27 @@ mod tests {
assert!(script.contains("#SBATCH --time=180"));
}

#[test]
#[parallel]
fn cluster_submit_options() {
    // A cluster-wide submit option must be emitted into the generated
    // script in addition to the standard #SBATCH directives.
    let (action, directories, mut slurm) = setup();
    slurm.cluster.submit_options = vec!["--option=value".to_string()];

    let script = slurm
        .make_script(&action, &directories)
        .expect("valid script");
    println!("{script}");

    // Directives that must be present, including the injected option.
    for expected in [
        "#SBATCH --job-name=action",
        "#SBATCH --ntasks=1",
        "#SBATCH --partition=partition",
        "#SBATCH --time=180",
        "#SBATCH --option=value",
    ] {
        assert!(script.contains(expected), "missing directive: {expected}");
    }

    // Directives that the default setup must not produce.
    for forbidden in [
        "#SBATCH --account",
        "#SBATCH --cpus-per-task",
        "#SBATCH --gpus-per-task",
    ] {
        assert!(!script.contains(forbidden), "unexpected directive: {forbidden}");
    }
}

#[test]
#[parallel]
fn ntasks() {
Expand Down Expand Up @@ -421,6 +448,7 @@ mod tests {
name: "cluster".into(),
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![Partition {
memory_per_cpu: Some("a".into()),
..Partition::default()
Expand All @@ -447,6 +475,7 @@ mod tests {
name: "cluster".into(),
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![Partition {
memory_per_gpu: Some("b".into()),
..Partition::default()
Expand Down Expand Up @@ -475,6 +504,7 @@ mod tests {
name: "cluster".into(),
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![Partition {
cpus_per_node: Some(10),
..Partition::default()
Expand Down Expand Up @@ -503,6 +533,7 @@ mod tests {
name: "cluster".into(),
identify: IdentificationMethod::Always(false),
scheduler: SchedulerType::Slurm,
submit_options: Vec::new(),
partition: vec![Partition {
gpus_per_node: Some(5),
..Partition::default()
Expand Down