-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathnotify_nodes_drained
executable file
·54 lines (45 loc) · 1.59 KB
/
notify_nodes_drained
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env bash
# Slurm trigger script for nodes in failing states
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# MUST be executed by the slurm user, consistent with:
# scontrol show config | grep SlurmUser
# Site-specific settings -- adjust these for your system:
# Account this script must run as (compared against "id -nu" below)
slurm_user="slurm"
# Recipient handed to the mail program for the notification
slurm_notify="root"
# Mail program used to send the notification
my_mail="/usr/bin/mailx"
# Node state to act on, one of: down,drained,fail,up (see "man strigger")
my_state="drained"
# Fields printed by sacct
SACCT_FORMAT="JobID,JobName,Partition,User,Account,NodeList,State,ExitCode"
export SACCT_FORMAT
# Uncomment to make the trigger permanent:
# my_flags="--flags=PERM"
# Guard: bail out unless the current user name matches $slurm_user
[[ "$(id -nu)" == "$slurm_user" ]] || {
  echo ERROR: The strigger command must be executed by the $slurm_user user
  exit -1
}
#######################################
# Commands to be run for notification: print the node status, reboot the
# nodes whose Reason contains "Kill task failed", and set a new trigger.
# Globals:
#   my_state (read) - node state used for the sinfo filter and the trigger
#   my_flags (read) - optional extra strigger flags; may be unset or empty
# Arguments:
#   None
# Outputs:
#   Writes the report to stdout (the caller pipes it to the mail program)
#######################################
my_tasks() {
  local nodelist

  # Print node info
  sinfo -lRN

  # Select Nodelist with Reason="Kill task failed" and reboot those nodes
  # (the paste command is from the coreutils package).
  # "paste -s" is required to join ALL matching lines into one
  # comma-separated list; without -s each line is passed through unchanged
  # and a multi-line result would be split into separate arguments below.
  nodelist=$(
    sinfo -h -t "$my_state" -o "%N %E" \
      | grep -i "Kill task failed" \
      | awk '{print $1}' \
      | paste -s -d , -
  )

  if [[ -n "$nodelist" ]]; then
    echo
    echo "Recent jobs running on nodes $nodelist"
    sacct --nodelist="$nodelist" -S now-600
    echo
    echo "Now reboot and resume nodes $nodelist"
    scontrol reboot nextstate=resume "$nodelist"
    sinfo -N -n "$nodelist"
  fi

  # Submit trigger for next event ($0 = this script)
  echo
  echo "Setting new trigger --node --$my_state --program=$0"
  # my_flags is deliberately unquoted: it may be unset/empty (no argument
  # must be passed then) or hold several space-separated flags.
  # shellcheck disable=SC2086
  strigger --set --node "--$my_state" "--program=$0" $my_flags
}
# Notify Slurm administrator of node state.
# The script arguments (presumably the node names supplied by strigger --
# confirm against "man strigger") are joined with "$*" so the subject stays a
# single word. With "$@" inside the string, a second or later argument would
# become a separate word and be taken by the mail program as a recipient.
# my_mail and slurm_notify stay unquoted so they may hold a command with
# options or several space-separated recipients.
# shellcheck disable=SC2086
my_tasks 2>&1 | $my_mail -s "Nodes $my_state: $*" $slurm_notify