Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add retries to SSH iperf on server #342

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 126 additions & 43 deletions pkg/hhfab/hhnet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,137 @@
# Copyright 2024 Hedgehog
# SPDX-License-Identifier: Apache-2.0


# Abort the script as soon as a command fails without being handled
# (commands followed by `|| true` or used in a condition are exempt).
set -e

# Directory in which this script writes and removes its systemd-networkd
# unit files (*.netdev / *.network).
NETWORKD_PATH="/etc/systemd/network"

# cleanup
#
# Best-effort teardown of everything this script may have configured:
#   - bond0..bond3
#   - enp2s1..enp2s9 (set down) and their VLAN sub-interfaces 1000..1020
#   - the systemd-networkd unit files written under $NETWORKD_PATH
# followed by a restart of systemd-networkd.
#
# Globals:   NETWORKD_PATH (read)
# Arguments: none
# Returns:   0 unless one of the `rm` calls fails; every link operation is
#            allowed to fail because the link may simply not exist.
function cleanup() {
    local i j link

    for i in {0..3}; do
        link="bond$i"
        sudo ip l d "$link" 2> /dev/null || true
        sudo networkctl down "$link" 2> /dev/null || true
        sudo networkctl delete "$link" 2> /dev/null || true
    done

    for i in {1..9}; do
        sudo ip l s "enp2s$i" down 2> /dev/null || true
        sudo networkctl down "enp2s$i" 2> /dev/null || true
        for j in {1000..1020}; do
            link="enp2s$i.$j"
            sudo ip l d "$link" 2> /dev/null || true
            sudo networkctl down "$link" 2> /dev/null || true
            sudo networkctl delete "$link" 2> /dev/null || true
        done
    done

    sleep 1
    sudo rm -f "$NETWORKD_PATH"/10-bond*.network "$NETWORKD_PATH"/10-bond*.netdev
    sudo rm -f "$NETWORKD_PATH"/20-slave*.network
    sudo rm -f "$NETWORKD_PATH"/30-vlan*.network "$NETWORKD_PATH"/30-vlan*.netdev
    # setup_vlan also writes "30-<parent>.network" for the parent link, which
    # the 30-vlan* globs above do not match. Remove it for the parents this
    # function cleans up (bondN, enp2sN) so no stale VLAN= reference survives.
    sudo rm -f "$NETWORKD_PATH"/30-bond*.network "$NETWORKD_PATH"/30-enp2s*.network

    sudo systemctl restart systemd-networkd 2> /dev/null || true
    sleep 2
}

# setup_bond BOND_NAME IFACE [IFACE ...]
#
# NOTE(review): this span is a rendered diff with removed and added lines
# interleaved. It is not valid shell as shown: the `for` loop over "${@:2}"
# below is never closed with `done`. Resolve it to one side of the diff
# before relying on it.
#
# Two approaches are visible:
#   - imperative: `ip link add ... type bond`, set each member down and
#     enslave it, then bring the bond up;
#   - declarative: write a systemd-networkd .netdev for the bond and one
#     .network file per member under $NETWORKD_PATH, then restart
#     systemd-networkd.
function setup_bond() {
local bond_name=$1

sudo ip l a "$bond_name" type bond miimon 100 mode 802.3ad

for iface in "${@:2}"; do
# cannot enslave interface if it is up
sudo ip l s "$iface" down 2> /dev/null || true
sudo ip l s "$iface" master "$bond_name"
# Drop the bond name so that "$@" holds only the member interfaces.
shift
local interfaces=("$@")
# Refuse to continue if any member is unknown to networkctl.
for iface in "${interfaces[@]}"; do
if ! sudo networkctl status "$iface" &> /dev/null; then
echo "Interface $iface not found"
return 1
fi
done
# Bond device definition: 802.3ad mode, fast LACP rate, 1s MII monitoring.
cat << EOF | sudo tee "$NETWORKD_PATH/10-$bond_name.netdev" > /dev/null
[NetDev]
Name=$bond_name
Kind=bond
[Bond]
Mode=802.3ad
LACPTransmitRate=fast
MIIMonitorSec=1s
EOF
# One .network file per member, attaching it to the bond with MTU 9036.
for iface in "${interfaces[@]}"; do
cat << EOF | sudo tee "$NETWORKD_PATH/20-slave-$iface.network" > /dev/null
[Match]
Name=$iface
[Network]
Bond=$bond_name
[Link]
MTUBytes=9036
EOF
done

sudo ip l s "$bond_name" up
# Restart failures are ignored; the fixed sleep gives networkd time to apply
# the new files.
sudo systemctl restart systemd-networkd 2> /dev/null || true
sleep 5
}

# setup_vlan PARENT_IFACE VLAN_ID
#
# NOTE(review): this span is a rendered diff with removed and added lines
# interleaved. `iface_name` and `parent_iface` are both bound to $1, and the
# `ip link` commands sit next to the systemd-networkd file-based setup.
# Resolve it to one side of the diff before relying on it.
#
# Globals:   NETWORKD_PATH (read)
# Arguments: $1 - parent interface, $2 - VLAN id
# Returns:   1 (after printing a message) if networkctl does not know the
#            parent interface
function setup_vlan() {
local iface_name=$1
local parent_iface=$1
local vlan_id=$2

sudo ip l s "$iface_name" up
sudo ip l a link "$iface_name" name "$iface_name.$vlan_id" type vlan id "$vlan_id"
sudo ip l s "$iface_name.$vlan_id" up
if ! sudo networkctl status "$parent_iface" &> /dev/null; then
echo "Parent interface $parent_iface not found"
return 1
fi

# Bringing the parent up is best-effort; failures are ignored.
sudo networkctl up "$parent_iface" 2> /dev/null || true
sleep 2

# VLAN device definition, named "<parent>.<vlan_id>".
cat << EOF | sudo tee "$NETWORKD_PATH/30-vlan-$parent_iface-$vlan_id.netdev" > /dev/null
[NetDev]
Name=$parent_iface.$vlan_id
Kind=vlan

[VLAN]
Id=$vlan_id
EOF

# The VLAN interface gets its address via DHCP.
cat << EOF | sudo tee "$NETWORKD_PATH/30-vlan-$parent_iface-$vlan_id.network" > /dev/null
[Match]
Name=$parent_iface.$vlan_id

[Network]
DHCP=yes
EOF

# Parent link configuration: attaches the VLAN and sets MTU 9036.
# NOTE(review): this file name does not start with "30-vlan", so the
# 30-vlan* globs in cleanup do not match it - confirm that it is removed.
cat << EOF | sudo tee "$NETWORKD_PATH/30-$parent_iface.network" > /dev/null
[Match]
Name=$parent_iface

[Network]
VLAN=$parent_iface.$vlan_id

[Link]
MTUBytes=9036
EOF

# Restart failures are ignored; the fixed sleep gives networkd time to apply
# the new files.
sudo systemctl restart systemd-networkd 2> /dev/null || true
sleep 5
}

# wait_for_interface IFACE
#
# Polls `networkctl status IFACE` once per second, for at most 60 attempts,
# until the reported state is routable, carrier or degraded.
#
# Arguments: $1 - interface name
# Outputs:   on timeout, a one-line failure message on stdout
# Returns:   0 as soon as one of the accepted states is seen,
#            1 once the last attempt has failed
function wait_for_interface() {
    local iface_name=$1
    local max_attempts=60
    local retry_delay=1
    local attempt=0
    local status

    while true; do
        attempt=$((attempt + 1))

        # networkctl exits non-zero while the link does not exist yet. The
        # script runs under `set -e`, so an unguarded failing substitution
        # would abort on the first poll whenever the caller does not use
        # `||`. Treat a failed query as "not ready yet" and keep polling.
        status=$(sudo networkctl status "$iface_name" 2> /dev/null) || true

        if grep -Eq "State: (routable|carrier|degraded)" <<< "$status"; then
            return 0
        fi

        if [ "$attempt" -ge "$max_attempts" ]; then
            echo "Interface $iface_name failed to become ready after $max_attempts attempts"
            return 1
        fi

        sleep "$retry_delay"
    done
}

function get_ip() {
Expand All @@ -62,53 +154,44 @@ function get_ip() {
echo "$ip"
}

# Usage:
# hhnet cleanup
# hhnet bond 1000 enp2s1 enp2s2 enp2s3 enp2s4
# hhnet vlan 1000 enp2s1

# usage
#
# Prints the command-line help for this script to stderr.
#
# NOTE(review): this span is a rendered diff, so the removed and the added
# help lines both appear and, as shown, the help text would be printed twice
# with two different synopsis lines. Keep only one set.
function usage() {
echo "Usage: $0 <cleanup|bond|vlan> [<args> ...]" >&2
echo " Cleanup all interfaces (enp2s1-9, bond0-3, vlans 1000-1020): " >&2
echo " hhnet cleanup" >&2
echo " Setup bond from provided interfaces (at least one) and vlan on top of it" >&2
echo " hhnet bond 1000 enp2s1 enp2s2 enp2s3 enp2s4" >&2
echo " Setup vlan on top of provided interface (exactly one)" >&2
echo " hhnet vlan 1000 enp2s1" >&2
echo "Usage: $0 [command] [...]" >&2
echo " Cleanup all interfaces (enp2s1-9, bond0-3, vlans 1000-1020): " >&2
echo " hhnet cleanup" >&2
echo " Setup bond from provided interfaces (at least one) and vlan on top of it" >&2
echo " hhnet bond 1000 enp2s1 enp2s2 enp2s3 enp2s4" >&2
echo " Setup vlan on top of provided interface (exactly one)" >&2
echo " hhnet vlan 1000 enp2s1" >&2
}

# Command dispatch on $1:
#   (none)  -> print usage, exit 1
#   cleanup -> cleanup, exit 0
#   bond    -> needs <vlan_id> and at least one interface; builds bond0 from
#              the interfaces and a VLAN on top of it, then calls get_ip
#   vlan    -> needs exactly <vlan_id> <interface>; builds the VLAN on that
#              interface, then calls get_ip
#   other   -> print usage, exit 1
#
# NOTE(review): this span is a rendered diff. Each branch shows both the old
# calls (setup_*/get_ip with no error check) and the new calls (with
# `|| exit 1` and wait_for_interface), so as shown every step would run
# twice. Keep only one side.
if [ "$#" -lt 1 ]; then
usage

exit 1
elif [ "$1" == "cleanup" ]; then
cleanup

exit 0
elif [ "$1" == "bond" ]; then
if [ "$#" -lt 3 ]; then
echo "Usage: $0 bond <vlan_id> <iface1> [<iface2> ...]" >&2
echo "Usage: $0 bond <vlan_id> <interface> [...]" >&2
exit 1
fi

# $2 is the VLAN id; the interfaces to bond start at $3.
setup_bond bond0 "${@:3}"
setup_bond bond0 "${@:3}" || exit 1
sleep 1
setup_vlan bond0 "$2"
get_ip bond0."$2"

wait_for_interface bond0 || exit 1
setup_vlan bond0 "$2" || exit 1
wait_for_interface "bond0.$2" || exit 1
get_ip "bond0.$2" || exit 1
exit 0
elif [ "$1" == "vlan" ]; then
if [ "$#" -ne 3 ]; then
echo "Usage: $0 vlan <vlan_id> <iface1>" >&2
echo "Usage: $0 vlan <vlan_id> <interface>" >&2
exit 1
fi

# Arguments arrive as <vlan_id> <interface>; setup_vlan takes them in the
# opposite order.
setup_vlan "$3" "$2"
get_ip "$3"."$2"

setup_vlan "$3" "$2" || exit 1
wait_for_interface "$3.$2" || exit 1
get_ip "$3.$2" || exit 1
exit 0
else
usage

exit 1
fi
3 changes: 3 additions & 0 deletions pkg/hhfab/show-tech/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,7 @@ journalctl -k >> "$OUTPUT_FILE" 2>/dev/null
echo -e "\n=== Kernel Network Logs ===" >> "$OUTPUT_FILE"
dmesg | grep -i "network\|bond\|vlan" >> "$OUTPUT_FILE"

echo -e "\n=== SSH Logs ===" >> "$OUTPUT_FILE"
journalctl -u sshd >> "$OUTPUT_FILE" 2>/dev/null

echo "Diagnostics collected to $OUTPUT_FILE"
110 changes: 64 additions & 46 deletions pkg/hhfab/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -1261,66 +1261,84 @@ func checkIPerf(ctx context.Context, opts TestConnectivityOpts, iperfs *semaphor
if opts.IPerfsSeconds <= 0 || !expected {
return nil
}
maxRetries := 3
var lastErr error
for attempt := 1; attempt <= maxRetries; attempt++ {
if err := iperfs.Acquire(ctx, 1); err != nil {
return fmt.Errorf("acquiring iperf semaphore: %w", err)
}

if err := iperfs.Acquire(ctx, 1); err != nil {
return fmt.Errorf("acquiring iperf semaphore: %w", err)
}
defer iperfs.Release(1)
err := func() error {
defer iperfs.Release(1)
attemptCtx, cancel := context.WithTimeout(ctx, time.Duration(opts.IPerfsSeconds+30)*time.Second)
defer cancel()

ctx, cancel := context.WithTimeout(ctx, time.Duration(opts.IPerfsSeconds+30)*time.Second)
defer cancel()
slog.Debug("Running iperf", "from", from, "to", to, "attempt", attempt)
g, attemptCtx := errgroup.WithContext(attemptCtx)

slog.Debug("Running iperf", "from", from, "to", to)
g.Go(func() error {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is something I don't understand. If I read the docs for Go's errgroup correctly, each of these functions is started immediately in its own goroutine (we are not setting any limit on the errgroup), but the operations should be sequential, or at least partly so. Starting the server and polling for it from the client could run in parallel, although I'm not sure what the benefit is. But you definitely do not want to start the actual client until the server is ready, especially since that operation has no retry. Did I misinterpret something here?

out, err := toSSH.RunContext(attemptCtx, fmt.Sprintf("toolbox -q timeout -v %d iperf3 -s -1", opts.IPerfsSeconds+25))
if err != nil {
return fmt.Errorf("running iperf server: %w: %s", err, string(out))
}
return nil
})

g, ctx := errgroup.WithContext(ctx)
g.Go(func() error {
for retries := 0; retries < 10; retries++ {
cmd := fmt.Sprintf("nc -zv %s 5201", toIP.String())
out, err := fromSSH.RunContext(attemptCtx, cmd)
if err == nil {
slog.Debug("iperf3 server is ready", "server", to)
break
}
slog.Debug("iperf3 server not ready, retrying", "server", to, "retry", retries+1, "error", err, "output", string(out))
time.Sleep(1 * time.Second)
}
return nil
})

g.Go(func() error {
out, err := toSSH.RunContext(ctx, fmt.Sprintf("toolbox -q timeout -v %d iperf3 -s -1", opts.IPerfsSeconds+25))
if err != nil {
return fmt.Errorf("running iperf server: %w: %s", err, string(out))
}
g.Go(func() error {
cmd := fmt.Sprintf("toolbox -q timeout -v %d iperf3 -P 4 -J -c %s -t %d", opts.IPerfsSeconds+25, toIP.String(), opts.IPerfsSeconds)
slog.Debug("Running iperf3 client", "from", from, "to", to, "cmd", cmd)
out, err := fromSSH.RunContext(attemptCtx, cmd)
if err != nil {
return fmt.Errorf("running iperf client: %w: %s", err, string(out))
}
report, err := parseIPerf3Report(out)
if err != nil {
return fmt.Errorf("parsing iperf report: %w", err)
}

return nil
})
slog.Debug("IPerf3 result", "from", from, "to", to,
"sendSpeed", asMbps(report.End.SumSent.BitsPerSecond),
"receiveSpeed", asMbps(report.End.SumReceived.BitsPerSecond),
"sent", asMB(float64(report.End.SumSent.Bytes)),
"received", asMB(float64(report.End.SumReceived.Bytes)))

g.Go(func() error {
time.Sleep(1 * time.Second) // TODO think about more reliable way to wait for server to start
if opts.IPerfsMinSpeed > 0 {
if report.End.SumSent.BitsPerSecond < opts.IPerfsMinSpeed*1_000_000 {
return fmt.Errorf("iperf send speed too low: %s < %s", asMbps(report.End.SumSent.BitsPerSecond), asMbps(opts.IPerfsMinSpeed*1_000_000))
}
if report.End.SumReceived.BitsPerSecond < opts.IPerfsMinSpeed*1_000_000 {
return fmt.Errorf("iperf receive speed too low: %s < %s", asMbps(report.End.SumReceived.BitsPerSecond), asMbps(opts.IPerfsMinSpeed*1_000_000))
}
}
return nil
})

cmd := fmt.Sprintf("toolbox -q timeout -v %d iperf3 -P 4 -J -c %s -t %d", opts.IPerfsSeconds+25, toIP.String(), opts.IPerfsSeconds)
out, err := fromSSH.RunContext(ctx, cmd)
if err != nil {
return fmt.Errorf("running iperf client: %w: %s", err, string(out))
}
return g.Wait()
}()

report, err := parseIPerf3Report(out)
if err != nil {
return fmt.Errorf("parsing iperf report: %w", err)
}

slog.Debug("IPerf3 result", "from", from, "to", to,
"sendSpeed", asMbps(report.End.SumSent.BitsPerSecond),
"receiveSpeed", asMbps(report.End.SumReceived.BitsPerSecond),
"sent", asMB(float64(report.End.SumSent.Bytes)),
"received", asMB(float64(report.End.SumReceived.Bytes)),
)

if opts.IPerfsMinSpeed > 0 {
if report.End.SumSent.BitsPerSecond < opts.IPerfsMinSpeed*1_000_000 {
return fmt.Errorf("iperf send speed too low: %s < %s", asMbps(report.End.SumSent.BitsPerSecond), asMbps(opts.IPerfsMinSpeed*1_000_000))
}
if report.End.SumReceived.BitsPerSecond < opts.IPerfsMinSpeed*1_000_000 {
return fmt.Errorf("iperf receive speed too low: %s < %s", asMbps(report.End.SumReceived.BitsPerSecond), asMbps(opts.IPerfsMinSpeed*1_000_000))
}
slog.Warn("iperf attempt failed", "from", from, "to", to, "attempt", attempt, "error", err)
lastErr = err
continue
}

return nil
})

if err := g.Wait(); err != nil {
return fmt.Errorf("running iperf: %w", err)
}

return nil
return fmt.Errorf("iperf test failed after %d attempts: %w", maxRetries, lastErr)
}

func checkCurl(ctx context.Context, opts TestConnectivityOpts, curls *semaphore.Weighted, from string, fromSSH *goph.Client, toIP string, expected bool) error {
Expand Down
Loading