Skip to content

test(gossipsub): make CI test to be more strict 0-tolerance for missed published messages #2679

test(gossipsub): make CI test to be more strict 0-tolerance for missed published messages

test(gossipsub): make CI test to be more strict 0-tolerance for missed published messages #2679

Workflow file for this run

name: Memory Check
on:
# tests must run for a PR to be valid and pass merge queue muster
# on main, we want to know that all commits are passing at a glance, any deviation should help bisecting errors
# the merge run checks should show on master and enable this clear test/passing history
merge_group:
branches: [main]
pull_request:
branches: ["*"]
env:
CLIENT_DATA_PATH: /home/runner/.local/share/safe/client
NODE_DATA_PATH: /home/runner/.local/share/safe/node
BOOTSTRAP_NODE_DATA_PATH: /home/runner/.local/share/safe/bootstrap_node
jobs:
memory-check:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: install ripgrep
shell: bash
run: sudo apt-get install -y ripgrep
- name: Build sn bins
run: cargo build --release --bins
timeout-minutes: 30
- name: Build churn tests
run: cargo test --release -p sn_node --test data_with_churn --no-run
timeout-minutes: 30
- name: Start a node instance that does not undergo churn
run: |
mkdir -p $BOOTSTRAP_NODE_DATA_PATH
./target/release/safenode \
--root-dir $BOOTSTRAP_NODE_DATA_PATH --log-output-dest $BOOTSTRAP_NODE_DATA_PATH --local &
sleep 10
env:
SN_LOG: "all"
- name: Set SAFE_PEERS
run: |
safe_peers=$(rg "listening on \".+\"" $BOOTSTRAP_NODE_DATA_PATH -u | \
rg '/ip4.*$' -m1 -o | rg '"' -r '')
echo "SAFE_PEERS=$safe_peers" >> $GITHUB_ENV
- name: Check SAFE_PEERS was set
shell: bash
run: echo "The SAFE_PEERS variable has been set to $SAFE_PEERS"
- name: Start a local network
env:
SN_LOG: all
uses: maidsafe/sn-local-testnet-action@main
with:
action: start
interval: 2000
node-path: target/release/safenode
faucet-path: target/release/faucet
platform: ubuntu-latest
set-safe-peers: false
# In this case we did *not* want SAFE_PEERS to be set to another value by starting the testnet
- name: Check SAFE_PEERS was not changed
shell: bash
run: echo "The SAFE_PEERS variable has been set to ${SAFE_PEERS}"
- name: Create and fund a wallet to pay for files storage
run: |
cargo run --bin faucet --release -- --log-output-dest=data-dir send 5000000 $(cargo run --bin safe --release -- --log-output-dest=data-dir wallet address | tail -n 1) > initial_balance_from_faucet.txt
cat initial_balance_from_faucet.txt
cat initial_balance_from_faucet.txt | tail -n 1 > transfer_hex
cat transfer_hex
cargo run --bin safe --release -- --log-output-dest=data-dir wallet receive --file transfer_hex
env:
SN_LOG: "all"
timeout-minutes: 15
# The resources file we upload may change, and with it mem consumption.
# Be aware!
- name: Start a client to upload files
run: |
ls -l ./target/release
cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/faucet"
cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/safe"
cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/safenode"
cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/testnet"
env:
SN_LOG: "all"
timeout-minutes: 25
- name: Chunks data integrity during nodes churn
run: cargo test --release -p sn_node --test data_with_churn -- --nocapture
env:
TEST_DURATION_MINS: 15
TEST_TOTAL_CHURN_CYCLES: 15
SN_LOG: "all"
timeout-minutes: 30
- name: Verify restart of nodes using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of restarts
# TODO: make this use an env var, or relate to testnet size
run : |
restart_count=$(rg "Node is restarting in" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Restart $restart_count nodes"
peer_removed=$(rg "PeerRemovedFromRoutingTable" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "PeerRemovedFromRoutingTable $peer_removed times"
if [ $peer_removed -lt $restart_count ]; then
echo "PeerRemovedFromRoutingTable times of: $peer_removed is less than the restart count of: $restart_count"
exit 1
fi
node_count=$(ls $NODE_DATA_PATH | wc -l)
echo "Node dir count is $node_count"
# TODO: reenable this once the testnet dir creation is tidied up to avoid a large count here
# if [ $restart_count -lt $node_count ]; then
# echo "Restart count of: $restart_count is less than the node count of: $node_count"
# exit 1
# fi
- name: Verify data replication using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of replication
# TODO: make this use an env var, or relate to testnet size
# As the bootstrap_node using separate folder for logging,
# hence the folder input to rg needs to cover that as well.
run : |
sending_list_count=$(rg "Sending a replication list" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Sent $sending_list_count replication lists"
received_list_count=$(rg "Received replication list from" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Received $received_list_count replication lists"
fetching_attempt_count=$(rg "FetchingKeysForReplication" $NODE_DATA_PATH -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Carried out $fetching_attempt_count fetching attempts"
if: always()
- name: Start a client to download files
run: |
cargo run --bin safe --release -- --log-output-dest=data-dir files download
ls -l $CLIENT_DATA_PATH/downloaded_files
downloaded_files=$(ls $CLIENT_DATA_PATH/downloaded_files | wc -l)
if [ $downloaded_files -lt 4 ]; then
echo "Only downloaded $downloaded_files files, less than the 4 files uploaded"
exit 1
fi
env:
SN_LOG: "all"
timeout-minutes: 10
- name: Check nodes running
shell: bash
timeout-minutes: 1
continue-on-error: true
run: pgrep safenode | wc -l
if: always()
- name: Stop the local network and upload logs
if: always()
uses: maidsafe/sn-local-testnet-action@main
with:
action: stop
log_file_prefix: safe_test_logs_memcheck
platform: ubuntu-latest
- name: Check node memory usage
shell: bash
# The resources file and churning chunk_size we upload may change, and with it mem consumption.
# This is set to a value high enough to allow for some variation depending on
# resources and node location in the network, but hopefully low enough to catch
# any wild memory issues
# Any changes to this value should be carefully considered and tested!
# As we have a bootstrap node acting as an access point for churning nodes and client,
# The memory usage here will be significantly higher here than in the benchmark test,
# where we don't have a bootstrap node.
run: |
node_peak_mem_limit_mb="300" # mb
peak_mem_usage=$(
rg '"memory_used_mb":[^,]*' $NODE_DATA_PATH/*/logs/* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/{print $2}' |
sort -n |
tail -n 1
)
echo "Node memory usage: $peak_mem_usage MB"
if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then
echo "Node memory usage exceeded threshold: $peak_mem_usage MB"
exit 1
fi
if: always()
- name: Check client memory usage
shell: bash
# limits here are lower that benchmark tests as there is less going on.
run: |
client_peak_mem_limit_mb="300" # mb
client_avg_mem_limit_mb="120" # mb
peak_mem_usage=$(
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob safe.* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/{print $2}' |
sort -n |
tail -n 1
)
echo "Peak memory usage: $peak_mem_usage MB"
if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then
echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB"
exit 1
fi
total_mem=$(
rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob safe.* -o --no-line-number --no-filename |
awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}'
)
num_of_times=$(
rg "\"memory_used_mb\"" $CLIENT_DATA_PATH/logs --glob safe.* -c --stats |
rg "(\d+) matches" |
rg "\d+" -o
)
echo "num_of_times: $num_of_times"
echo "Total memory is: $total_mem"
average_mem=$(($total_mem/$(($num_of_times))))
echo "Average memory is: $average_mem"
if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then
echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB"
exit 1
fi
- name: Upload payment wallet initialization log
uses: actions/upload-artifact@main
with:
name: payment_wallet_initialization_log
path: initial_balance_from_faucet.txt
continue-on-error: true
if: always()
- name: Upload Faucet folder
uses: actions/upload-artifact@main
with:
name: faucet_folder
path: /home/runner/.local/share/safe/test_faucet
continue-on-error: true
if: always()