-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathpsnode
executable file
·105 lines (95 loc) · 2.94 KB
/
psnode
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env bash
# Do a "ps" process status on a node-list, but exclude system processes
# Usage: psnode [-c columns | -h] node-list
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# Default value: output width (in characters) handed to "ps --columns"
export columns=100

# Print the one-line usage summary on stdout.
usage() {
  echo "Usage: psnode [-c columns | -h] node-list"
}

# check_columns VALUE
# Succeed (status 0) when VALUE consists only of decimal digits and lies in
# the range 5..1024.  Otherwise print the reason on stderr and return 1.
check_columns() {
  local digits=$1
  if ! [[ $digits =~ ^[0-9]+$ ]]; then
    echo "Number of columns $1 must be a positive number" >&2
    return 1
  fi
  # Drop leading zeros: bash arithmetic reads a leading 0 as octal, so "08"
  # would raise an error and "02000" would be taken for 1024.
  while [[ $digits == 0?* ]]; do
    digits=${digits#0}
  done
  # More than 4 significant digits is already above 1024; testing the length
  # first keeps very long inputs away from the integer comparison.
  if (( ${#digits} > 4 || digits < 5 || digits > 1024 )); then
    echo "Unreasonable number of columns $1" >&2
    return 1
  fi
  return 0
}

# Parse command options.
# "h" is part of the option string so that the documented -h flag is accepted
# instead of being reported by getopts as an illegal option.
while getopts "c:h" options; do
  case $options in
    c)
      check_columns "$OPTARG" || exit 1
      # Store the value in plain decimal form (no leading zeros)
      export columns=$((10#$OPTARG))
      ;;
    h)
      # Help was asked for explicitly: not an error
      usage
      exit 0
      ;;
    *)
      # Unknown option or missing option argument
      usage >&2
      exit 1
      ;;
  esac
done
# Discard the parsed options so that only the node-list remains in "$@"
shift $((OPTIND-1))
# After option parsing exactly one positional argument (the node-list) must
# be left; anything else is a usage error.
if [[ $# -ne 1 ]]; then
  printf '%s\n' "ERROR:" "Usage: psnode [-c columns | -h] node-list"
  exit -1
fi
# The single remaining argument is the list of nodes
nodelist=$1
# Remove SQUEUE_FORMAT2 from this shell's environment before squeue is called
unset SQUEUE_FORMAT2

# Remote process listing: the ps binary and its output format.  The width
# comes from the "columns" setting chosen during option parsing.
PS='/bin/ps'
PSFLAGS='-o pid,nlwp,state,user,start,cputime,%cpu,rssize,command'
PSFLAGS="${PSFLAGS} --columns ${columns}"

# Commands used to reach a node
SSH='ssh -n -x'
PING='/bin/ping -c 1 -w 3'

# GPU reporting: enable these variables if the gpustat command has been
# installed on nodes with GPUs
enable_gpustat=1
GPUNAME_WIDTH='--gpuname-width 24'
# Set color="--color" to hand that flag to gpustat; empty passes nothing
color=
# System accounts whose processes are EXCLUDED from the process list
USERLIST="root rpc rpcuser daemon ntp smmsp sshd hpsmh named dbus 68 chrony polkitd munge"

# Build the deselect-list from those entries that "getent passwd" can resolve
# on this host; entries without a passwd record are skipped.  The result is a
# single space-separated string.
deselect_list=""
for candidate in $USERLIST; do
  [[ -n "$(getent passwd "$candidate")" ]] || continue
  # Prefix a blank only when the list already holds an entry
  deselect_list+="${deselect_list:+ }${candidate}"
done
# Separator line printed ahead of a node report once an earlier node has
# been reported; empty until then.
sep=""
# "scontrol show hostnames" expands the node-list expression into individual
# hostnames.  The expression is quoted: an unquoted value such as node[1-4]
# is a valid shell glob and could be replaced by a matching file name in the
# current directory before scontrol ever sees it.
for node in $(scontrol show hostnames "$nodelist"); do
  if [[ -n "$sep" ]]; then
    echo "$sep"
  fi
  # Job IDs that squeue lists for this node; xargs joins them onto one line
  # and produces nothing at all when squeue printed nothing.
  jobids="$(squeue -h -O JobID -w "$node" | xargs --no-run-if-empty)"
  if [[ -z "$jobids" ]]; then
    echo "Node $node has no running jobs"
    # Note: the separator is left unchanged for nodes skipped here
    continue
  fi
  # Print node status information:
  echo "Node ${node} information:"
  sinfo -N -O "NodeList:10 ,Partition:16 ,CPUs:4 ,CPUsLoad:9 ,SocketCoreThread:8 ,Memory:7 ,StateLong:11 ,Reason:20 ,Gres" -n "$node"
  echo "Jobid and username list:"
  squeue -h -O JobID,Username -w "$node"
  # Only try ssh when the node answers a ping
  if $PING "$node" >/dev/null 2>&1; then
    echo "User processes:"
    # $SSH, $PS and $PSFLAGS each hold a command plus options and are left
    # unquoted on purpose so that they split into separate words.  The escaped
    # quotes travel to the remote shell, which hands the whole deselect list
    # to "ps -u" as one argument.
    # awk echoes every line and also counts the number of processes (numprocs)
    # and threads (numthreads) when field $1 is a number.
    $SSH "$node" $PS $PSFLAGS --deselect -u \""${deselect_list}"\" |
      awk '{print $0; if($1~/^[0-9]+$/) {numprocs++; numthreads+=$2}} END {printf("Total: %d processes and %d threads\n", numprocs, numthreads)}'
    # Print GPU tasks with gpustat (if applicable).
    # enable_gpustat is compared numerically ("-gt"); ">" inside [[ ]] would
    # compare the two values as strings.
    if [[ $enable_gpustat -gt 0 && "$(sinfo -h -N -n "$node" -O Gres | grep -i GPU)" =~ "gpu:" ]]; then
      echo
      echo "Running GPU tasks on node $node:"
      echo "ID GPU-type | Temp GPU% | Mem / Tot | user:process/PID(Mem)"
      # $GPUNAME_WIDTH and $color are unquoted so that an empty value adds no
      # argument and "--gpuname-width 24" splits into two words
      $SSH "$node" gpustat -upc $GPUNAME_WIDTH $color --no-header
    fi
  else
    echo "'*** WARNING ***' Cannot ping host ${node} !"
  fi
  sep="====================================="
done