This repository has been archived by the owner on Aug 8, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
/
multinode_runfabtests.sh
executable file
·193 lines (169 loc) · 6.54 KB
/
multinode_runfabtests.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
. ~/.bash_profile
# Build the fork_checker probe from ~/fork_checker.c and run it locally.
#
# Arguments: none are read (anything passed in is ignored).
# Returns:
#   1 - ~/fork_checker exited with status 0
#   0 - ~/fork_checker exited with any non-zero status
# Note the inversion: a successful probe maps to return value 1.
check_kernel_has_fork_support()
{
    # Compile the probe, linking against libefa and libibverbs.
    gcc -I/usr/include -o ~/fork_checker ~/fork_checker.c -lefa -libverbs
    ~/fork_checker
    # Translate the probe's exit status into this function's inverted result.
    case $? in
        0) return 1 ;;
        *) return 0 ;;
    esac
}
# Run a server/client test pair over ssh and compare the outcome with an
# expectation.
#
# Arguments:
#   $1 - host on which the server command is started
#   $2 - host on which the client command is started
#   $3 - server command line (executed on $1 through ssh)
#   $4 - client command line (executed on $2 through ssh); the server host
#        ($1) is appended to it as the last argument
#   $5 - expected outcome: "FAIL" means at least one side must exit non-zero;
#        any other value means both sides must exit with status 0
# Globals:
#   PROGRAM_TO_RUN (read, optional) - test name printed in the result line;
#        when unset or empty the server command line is printed instead
# Outputs:
#   The result line followed by the captured server and client output on
#   stdout. server.out and client.out are (over)written in the current
#   directory.
# Returns:
#   0 when the outcome matches the expectation, 1 otherwise.
run_test_with_expected_ret()
{
    local SERVER_IP=$1
    local CLIENT_IP=$2
    local SERVER_CMD=$3
    local CLIENT_CMD=$4
    local EXPECT_RESULT=$5
    local server_pid client_pid
    local server_ret=0
    local client_ret=0
    local ret=0
    # Fall back to the server command so the result line always names a test.
    local test_name=${PROGRAM_TO_RUN:-${SERVER_CMD}}

    ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no "${SERVER_IP}" "${SERVER_CMD}" >& server.out &
    server_pid=$!
    # Presumably gives the server time to start before the client connects.
    sleep 1
    ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no "${CLIENT_IP}" "${CLIENT_CMD}" "${SERVER_IP}" >& client.out &
    client_pid=$!

    # "|| var=$?" captures the status without tripping errexit in the caller.
    wait "$client_pid" || client_ret=$?
    if [ "$client_ret" -ne 0 ]; then
        # A failed client may leave the server waiting forever, so stop it.
        # The server may already be gone; a failing kill is not an error here.
        kill -9 "$server_pid" || true
    fi
    wait "$server_pid" || server_ret=$?

    if [[ "${EXPECT_RESULT}" == "FAIL" ]]; then
        if [ "$server_ret" -ne 0 ] || [ "$client_ret" -ne 0 ]; then
            echo "Test ${test_name} Passed!"
        else
            echo "Test ${test_name} Failed!"
            ret=1
        fi
    else
        if [ "$server_ret" -eq 0 ] && [ "$client_ret" -eq 0 ]; then
            echo "Test ${test_name} Passed!"
        else
            echo "Test ${test_name} Failed!"
            ret=1
        fi
    fi

    echo "server output:"
    cat server.out
    echo "client output:"
    cat client.out
    return $ret
}
# Trace every command and abort on the first unhandled failure.
set -xe

# Positional arguments:
#   $1 - libfabric provider under test (e.g. "efa")
#   $2 - server host
#   $3 - client host
#   $4 - client GID; only read for efa when runfabtests.sh has no "-b" option
#   $5 - BUILD_GDR flag
PROVIDER=$1
SERVER_IP=$2
CLIENT_IP=$3
BUILD_GDR=$5

# Runs all the tests in the fabtests suite while only expanding failed cases
exclude_file="${HOME}/libfabric/fabtests/install/share/fabtests/test_configs/${PROVIDER}/${PROVIDER}.exclude"
if [[ -f "${exclude_file}" ]]; then
    EXCLUDE="-R -f ${exclude_file}"
else
    EXCLUDE=""
fi

# Each individual test has a "-b" option and "-E" option. Both will
# use out-of-band address exchange.
# The difference is "-b" will use out-of-band synchronization, -E
# does not.
#
# runfabtests.sh's "-b" option actually uses the -E option of each individual
# test (for historical reasons).
#
runfabtests_script="${HOME}/libfabric/fabtests/install/bin/runfabtests.sh"
# "grep -e" lets the pattern start with a dash without a backslash escape.
# "|| true" keeps errexit from aborting when the option is not listed.
b_option_available="$("$runfabtests_script" -h 2>&1 | grep -e '-b' || true)"
# Check if '-P' option (Run provider specific fabtests) is available
P_option_available="$("$runfabtests_script" -h 2>&1 | grep -e '-P' || true)"

# The options are kept in one string with embedded quotes; it is re-parsed by
# the "bash -c" call below, which is what turns LD_LIBRARY_PATH="..." and
# "-P 0" into single words.
FABTESTS_OPTS="-E LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\" -vvv ${EXCLUDE}"
FABTESTS_OPTS+=" -p ${HOME}/libfabric/fabtests/install/bin/"
if [[ "${PROVIDER}" == "efa" ]]; then
    if [[ -n "$P_option_available" ]]; then
        FABTESTS_OPTS+=" -P"
    fi
    if [[ -n "$b_option_available" ]]; then
        FABTESTS_OPTS+=" -b -t all"
    else
        # Without "-b" the GIDs are handed to runfabtests.sh explicitly:
        # the client's from $4, the server's from the local ibv_devinfo output.
        gid_c=$4
        gid_s=$(ibv_devinfo -v | grep GID | awk '{print $3}')
        FABTESTS_OPTS+=" -C \"-P 0\" -s $gid_s -c $gid_c -t all"
    fi
fi

bash -c "$runfabtests_script ${FABTESTS_OPTS} ${PROVIDER} ${SERVER_IP} ${CLIENT_IP}"
if [[ "${PROVIDER}" == "efa" ]]; then
    # dgram_pingpong test has been excluded during installation
    # (in install-fabtests.sh), because it does not work with "-E" option.
    # So here we run it separately using "-b" option

    # The tests below record failures in exit_code instead of aborting on the
    # first one, so errexit is switched off if it is currently enabled.
    bash_option=$-
    restore_e=0
    if [[ $bash_option =~ e ]]; then
        restore_e=1
        set +e
    fi

    exit_code=0
    fabtests_bin="${HOME}/libfabric/fabtests/install/bin"
    ami_arch=$(uname -m)

    # Run fi_dgram_pingpong on x86 only as it currently does not work on c6gn instances.
    # This change will be reverted once the issue is fixed.
    if [[ "$ami_arch" == "x86_64" ]]; then
        echo "Run fi_dgram_pingpong with out-of-band synchronization"
        SERVER_CMD="${fabtests_bin}/fi_dgram_pingpong -k -p efa -b"
        CLIENT_CMD="${SERVER_CMD}"
        run_test_with_expected_ret "${SERVER_IP}" "${CLIENT_IP}" "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1
    fi

    # Run fi_rdm_tagged_bw with fork when different environment variables are set.
    # "grep -e" lets the pattern start with a dash without a backslash escape.
    fork_option_available=$("${fabtests_bin}/fi_rdm_tagged_bw" -h 2>&1 | grep -e '-K' || true)
    if [[ -n "$fork_option_available" ]]; then
        echo "Run fi_rdm_tagged_bw with fork"
        SERVER_CMD="${fabtests_bin}/fi_rdm_tagged_bw -p efa -K -E"
        CLIENT_CMD="${SERVER_CMD}"
        # If an application uses fork, it needs to enable rdma-core's user space fork support to keep the
        # kernel's copy-on-write mechanism from being applied to pinned memory; otherwise there will be
        # data corruption. To make sure fork support is enabled properly, libfabric registers a fork
        # handler, which will abort the application if fork support is not enabled.
        #
        # Kernel 5.13 and newer will not apply CoW on pinned memory, hence the user space fork support
        # is unneeded. Libfabric will detect that support via rdma-core's ibv_is_fork_initialized() API, and
        # will not register that fork handler on kernel 5.13.
        #
        # In all, the "fi_rdm_tagged_bw with fork" test is expected to pass on 5.13 and newer, but fail on
        # older kernels.
        #
        # check_kernel_has_fork_support returns 1 (not 0) when its probe succeeds.
        fork_support=0
        check_kernel_has_fork_support "${SERVER_IP}" || fork_support=$?
        if [[ "${fork_support}" -eq 1 ]]; then
            expected_result="PASS"
        else
            expected_result="FAIL"
        fi
        run_test_with_expected_ret "${SERVER_IP}" "${CLIENT_IP}" "${SERVER_CMD}" "${CLIENT_CMD}" "$expected_result" \
            || exit_code=1

        echo "Run fi_rdm_tagged_bw with fork and RDMAV_FORK_SAFE set"
        SERVER_CMD="RDMAV_FORK_SAFE=1 ${fabtests_bin}/fi_rdm_tagged_bw -v -p efa -K -E"
        CLIENT_CMD="${SERVER_CMD}"
        run_test_with_expected_ret "${SERVER_IP}" "${CLIENT_IP}" "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1

        echo "Run fi_rdm_tagged_bw with fork and FI_EFA_FORK_SAFE set"
        SERVER_CMD="FI_EFA_FORK_SAFE=1 ${fabtests_bin}/fi_rdm_tagged_bw -v -p efa -K -E"
        CLIENT_CMD="${SERVER_CMD}"
        run_test_with_expected_ret "${SERVER_IP}" "${CLIENT_IP}" "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1
    fi

    if [[ ${BUILD_GDR} -eq 1 ]]; then
        echo "Run fi_rdm_tagged_bw with server using device (GPU) memory and client using host memory"
        CLIENT_CMD="FI_EFA_USE_DEVICE_RDMA=1 ${fabtests_bin}/fi_rdm_tagged_bw -p efa -E"
        # The server runs the same command with "-D cuda" appended.
        SERVER_CMD="${CLIENT_CMD} -D cuda"
        run_test_with_expected_ret "${SERVER_IP}" "${CLIENT_IP}" "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1
    fi

    if [[ "$restore_e" -eq 1 ]]; then
        set -e
    fi
    # Non-zero when any of the extra tests above did not match its expectation.
    exit "$exit_code"
fi