test(gossipsub): make CI test to be more strict 0-tolerance for missed published messages #2680
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Memory Check workflow.
#
# Builds the release binaries, starts a dedicated bootstrap node plus a local
# testnet, uploads the built binaries as test files, runs the churn test, and
# then inspects the node/client logs for restart, replication and memory-usage
# figures. The job fails when the logged memory usage exceeds the limits set in
# the "Check node memory usage" / "Check client memory usage" steps.
name: Memory Check

on:
  # tests must run for a PR to be valid and pass merge queue muster
  # on main, we want to know that all commits are passing at a glance, any deviation should help bisecting errors
  # the merge run checks should show on main and enable this clear test/passing history
  merge_group:
    branches: [main]
  pull_request:
    # NOTE(review): "*" does not match branch names containing "/" (e.g. release/1.0);
    # use "**" if PRs against such branches must also run this check.
    branches: ["*"]

env:
  CLIENT_DATA_PATH: /home/runner/.local/share/safe/client
  NODE_DATA_PATH: /home/runner/.local/share/safe/node
  BOOTSTRAP_NODE_DATA_PATH: /home/runner/.local/share/safe/bootstrap_node

jobs:
  memory-check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true

      # Cache restore is best-effort: a cache failure must not fail the job.
      - uses: Swatinem/rust-cache@v2
        continue-on-error: true

      - name: install ripgrep
        shell: bash
        # Refresh the package index first so the install does not fail on a stale index.
        run: sudo apt-get update && sudo apt-get install -y ripgrep

      - name: Build sn bins
        run: cargo build --release --bins
        timeout-minutes: 30

      - name: Build churn tests
        run: cargo test --release -p sn_node --test data_with_churn --no-run
        timeout-minutes: 30

      - name: Start a node instance that does not undergo churn
        run: |
          mkdir -p "$BOOTSTRAP_NODE_DATA_PATH"
          ./target/release/safenode \
            --root-dir "$BOOTSTRAP_NODE_DATA_PATH" --log-output-dest "$BOOTSTRAP_NODE_DATA_PATH" --local &
          sleep 10
        env:
          SN_LOG: "all"

      - name: Set SAFE_PEERS
        run: |
          # Take the first `/ip4...` address from the bootstrap node's "listening on" log
          # lines and strip the double quotes from it.
          # `|| true` defers the failure to the explicit check below, so the job log
          # states why the step failed instead of exiting silently.
          safe_peers=$(rg "listening on \".+\"" "$BOOTSTRAP_NODE_DATA_PATH" -u | \
            rg '/ip4.*$' -m1 -o | rg '"' -r '') || true
          if [ -z "$safe_peers" ]; then
            echo "No listening address found in $BOOTSTRAP_NODE_DATA_PATH, cannot set SAFE_PEERS"
            exit 1
          fi
          echo "SAFE_PEERS=$safe_peers" >> "$GITHUB_ENV"

      - name: Check SAFE_PEERS was set
        shell: bash
        run: echo "The SAFE_PEERS variable has been set to $SAFE_PEERS"

      - name: Start a local network
        env:
          SN_LOG: "all"
        uses: maidsafe/sn-local-testnet-action@main
        with:
          action: start
          interval: 2000
          node-path: target/release/safenode
          faucet-path: target/release/faucet
          platform: ubuntu-latest
          # In this case we did *not* want SAFE_PEERS to be set to another value by starting the testnet
          set-safe-peers: false

      - name: Check SAFE_PEERS was not changed
        shell: bash
        run: echo "The SAFE_PEERS variable has been set to ${SAFE_PEERS}"

      - name: Create and fund a wallet to pay for files storage
        run: |
          cargo run --bin faucet --release -- --log-output-dest=data-dir send 5000000 $(cargo run --bin safe --release -- --log-output-dest=data-dir wallet address | tail -n 1) > initial_balance_from_faucet.txt
          cat initial_balance_from_faucet.txt
          # The last line of the faucet output is passed to `wallet receive` as the transfer.
          cat initial_balance_from_faucet.txt | tail -n 1 > transfer_hex
          cat transfer_hex
          cargo run --bin safe --release -- --log-output-dest=data-dir wallet receive --file transfer_hex
        env:
          SN_LOG: "all"
        timeout-minutes: 15

      # The resources file we upload may change, and with it mem consumption.
      # Be aware!
      - name: Start a client to upload files
        run: |
          ls -l ./target/release
          cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/faucet"
          cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/safe"
          cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/safenode"
          cargo run --bin safe --release -- --log-output-dest=data-dir files upload -- "./target/release/testnet"
        env:
          SN_LOG: "all"
        timeout-minutes: 25

      - name: Chunks data integrity during nodes churn
        run: cargo test --release -p sn_node --test data_with_churn -- --nocapture
        env:
          TEST_DURATION_MINS: 15
          TEST_TOTAL_CHURN_CYCLES: 15
          SN_LOG: "all"
        timeout-minutes: 30

      - name: Verify restart of nodes using rg
        shell: bash
        timeout-minutes: 1
        # get the counts, then the specific line, and then the digit count only
        # then check we have an expected level of restarts
        # TODO: make this use an env var, or relate to testnet size
        run: |
          restart_count=$(rg "Node is restarting in" "$NODE_DATA_PATH" -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Restart $restart_count nodes"
          peer_removed=$(rg "PeerRemovedFromRoutingTable" "$NODE_DATA_PATH" -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "PeerRemovedFromRoutingTable $peer_removed times"
          # Fail when fewer peer removals than restarts were logged.
          if [ "$peer_removed" -lt "$restart_count" ]; then
            echo "PeerRemovedFromRoutingTable times of: $peer_removed is less than the restart count of: $restart_count"
            exit 1
          fi
          node_count=$(ls "$NODE_DATA_PATH" | wc -l)
          echo "Node dir count is $node_count"
          # TODO: reenable this once the testnet dir creation is tidied up to avoid a large count here
          # if [ $restart_count -lt $node_count ]; then
          #   echo "Restart count of: $restart_count is less than the node count of: $node_count"
          #   exit 1
          # fi

      - name: Verify data replication using rg
        shell: bash
        timeout-minutes: 1
        # get the counts, then the specific line, and then the digit count only
        # then check we have an expected level of replication
        # TODO: make this use an env var, or relate to testnet size
        # The bootstrap node logs to a separate folder, so the folder input to rg
        # needs to cover that as well.
        # NOTE(review): the commands below only search $NODE_DATA_PATH, not
        # $BOOTSTRAP_NODE_DATA_PATH -- confirm whether the bootstrap folder should be added.
        run: |
          sending_list_count=$(rg "Sending a replication list" "$NODE_DATA_PATH" -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Sent $sending_list_count replication lists"
          received_list_count=$(rg "Received replication list from" "$NODE_DATA_PATH" -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Received $received_list_count replication lists"
          fetching_attempt_count=$(rg "FetchingKeysForReplication" "$NODE_DATA_PATH" -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Carried out $fetching_attempt_count fetching attempts"
        if: always()

      - name: Start a client to download files
        run: |
          cargo run --bin safe --release -- --log-output-dest=data-dir files download
          ls -l "$CLIENT_DATA_PATH/downloaded_files"
          downloaded_files=$(ls "$CLIENT_DATA_PATH/downloaded_files" | wc -l)
          # Four files were uploaded by the "Start a client to upload files" step.
          if [ "$downloaded_files" -lt 4 ]; then
            echo "Only downloaded $downloaded_files files, less than the 4 files uploaded"
            exit 1
          fi
        env:
          SN_LOG: "all"
        timeout-minutes: 10

      # Informational only: prints how many safenode processes are still running.
      - name: Check nodes running
        shell: bash
        timeout-minutes: 1
        continue-on-error: true
        run: pgrep safenode | wc -l
        if: always()

      - name: Stop the local network and upload logs
        if: always()
        uses: maidsafe/sn-local-testnet-action@main
        with:
          action: stop
          log_file_prefix: safe_test_logs_memcheck
          platform: ubuntu-latest

      - name: Check node memory usage
        shell: bash
        # The resources file and churning chunk_size we upload may change, and with it mem consumption.
        # This is set to a value high enough to allow for some variation depending on
        # resources and node location in the network, but hopefully low enough to catch
        # any wild memory issues
        # Any changes to this value should be carefully considered and tested!
        # As we have a bootstrap node acting as an access point for churning nodes and client,
        # the memory usage here will be significantly higher than in the benchmark test,
        # where we don't have a bootstrap node.
        run: |
          node_peak_mem_limit_mb="300" # mb
          # Largest "memory_used_mb" value found across all node logs.
          peak_mem_usage=$(
            rg '"memory_used_mb":[^,]*' "$NODE_DATA_PATH"/*/logs/* -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/{print $2}' |
            sort -n |
            tail -n 1
          )
          echo "Node memory usage: $peak_mem_usage MB"
          if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then
            echo "Node memory usage of $peak_mem_usage MB exceeded threshold: $node_peak_mem_limit_mb MB"
            exit 1
          fi
        if: always()

      - name: Check client memory usage
        shell: bash
        # limits here are lower than benchmark tests as there is less going on.
        run: |
          client_peak_mem_limit_mb="300" # mb
          client_avg_mem_limit_mb="120" # mb
          # The glob is quoted so the shell passes it to rg verbatim instead of
          # expanding it against files in the working directory.
          peak_mem_usage=$(
            rg '"memory_used_mb":[^,]*' "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/{print $2}' |
            sort -n |
            tail -n 1
          )
          echo "Peak memory usage: $peak_mem_usage MB"
          if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then
            echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB"
            exit 1
          fi
          # Sum of all logged readings, rounded to an integer for the shell arithmetic below.
          total_mem=$(
            rg '"memory_used_mb":[^,]*' "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}'
          )
          # Number of readings, taken from rg's "N matches" stats line.
          num_of_times=$(
            rg "\"memory_used_mb\"" "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -c --stats |
            rg "(\d+) matches" |
            rg "\d+" -o
          )
          echo "num_of_times: $num_of_times"
          echo "Total memory is: $total_mem"
          average_mem=$(( total_mem / num_of_times ))
          echo "Average memory is: $average_mem"
          if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then
            echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB"
            exit 1
          fi

      # Artifact uploads are best-effort diagnostics; they run even after a failure
      # and never fail the job themselves.
      - name: Upload payment wallet initialization log
        # Pinned to a major version tag rather than the moving `main` branch.
        uses: actions/upload-artifact@v4
        with:
          name: payment_wallet_initialization_log
          path: initial_balance_from_faucet.txt
        continue-on-error: true
        if: always()

      - name: Upload Faucet folder
        # Pinned to a major version tag rather than the moving `main` branch.
        uses: actions/upload-artifact@v4
        with:
          name: faucet_folder
          path: /home/runner/.local/share/safe/test_faucet
        continue-on-error: true
        if: always()