diff --git a/.github/workflows/merge.yml b/.github/workflows/merge.yml index 1d59de2431..aebccee63f 100644 --- a/.github/workflows/merge.yml +++ b/.github/workflows/merge.yml @@ -408,7 +408,7 @@ jobs: if: matrix.os != 'windows-latest' run: | set -e - for i in {1..100}; do + for i in {1..50}; do dd if=/dev/urandom of=random_file_$i.bin bs=1M count=1 status=none ./target/release/ant --log-output-dest data-dir file upload random_file_$i.bin --public ./target/release/ant --log-output-dest data-dir file upload random_file_$i.bin @@ -423,7 +423,7 @@ jobs: shell: pwsh run: | $ErrorActionPreference = "Stop" - for ($i = 1; $i -le 100; $i++) { + for ($i = 1; $i -le 50; $i++) { $fileName = "random_file_$i.bin" $byteArray = [byte[]]@(0xFF) * (1MB) # Create a 1 MB array filled with 0xFF [System.IO.File]::WriteAllBytes($fileName, $byteArray) diff --git a/ant-cli/Cargo.toml b/ant-cli/Cargo.toml index 0239975d03..7834564d07 100644 --- a/ant-cli/Cargo.toml +++ b/ant-cli/Cargo.toml @@ -15,7 +15,7 @@ path = "src/main.rs" [features] default = ["metrics"] -local = ["ant-bootstrap/local", "autonomi/local"] +local = ["ant-bootstrap/local", "autonomi/local", "ant-logging/process-metrics"] metrics = ["ant-logging/process-metrics"] nightly = [] diff --git a/ant-cli/src/main.rs b/ant-cli/src/main.rs index 279a354e5d..971c38fd6a 100644 --- a/ant-cli/src/main.rs +++ b/ant-cli/src/main.rs @@ -24,7 +24,7 @@ pub use access::user_data; use clap::Parser; use color_eyre::Result; -#[cfg(feature = "metrics")] +#[cfg(feature = "local")] use ant_logging::metrics::init_metrics; use ant_logging::{LogBuilder, LogFormat, ReloadHandle, WorkerGuard}; use ant_protocol::version; @@ -73,7 +73,7 @@ async fn main() -> Result<()> { } let _log_guards = init_logging_and_metrics(&opt)?; - #[cfg(feature = "metrics")] + #[cfg(feature = "local")] tokio::spawn(init_metrics(std::process::id())); info!("\"{}\"", std::env::args().collect::>().join(" ")); diff --git a/ant-networking/src/driver.rs b/ant-networking/src/driver.rs index 4534b49110..bb1637a099 100644 --- a/ant-networking/src/driver.rs +++ b/ant-networking/src/driver.rs @@ -135,7 +135,7 @@ const PERIODIC_KAD_BOOTSTRAP_INTERVAL_MAX_S: u64 = 21600; // Init during compilation, instead of runtime error that should never happen // Option::expect will be stabilised as const in the future (https://github.com/rust-lang/rust/issues/67441) -const REPLICATION_FACTOR: NonZeroUsize = match NonZeroUsize::new(CLOSE_GROUP_SIZE) { +const REPLICATION_FACTOR: NonZeroUsize = match NonZeroUsize::new(CLOSE_GROUP_SIZE + 2) { Some(v) => v, None => panic!("CLOSE_GROUP_SIZE should not be zero"), }; diff --git a/ant-networking/src/lib.rs b/ant-networking/src/lib.rs index 434aa192ad..fca47f18d0 100644 --- a/ant-networking/src/lib.rs +++ b/ant-networking/src/lib.rs @@ -387,6 +387,10 @@ impl Network { .await?; // Filter out results from the ignored peers. close_nodes.retain(|peer_id| !ignore_peers.contains(peer_id)); + info!( + "For record {record_address:?} quoting {} nodes. ignore_peers is {ignore_peers:?}", + close_nodes.len() + ); if close_nodes.is_empty() { error!("Can't get store_cost of {record_address:?}, as all close_nodes are ignored"); diff --git a/ant-node/Cargo.toml b/ant-node/Cargo.toml index 053390041e..a7b9b817b7 100644 --- a/ant-node/Cargo.toml +++ b/ant-node/Cargo.toml @@ -17,9 +17,9 @@ path = "src/bin/antnode/main.rs" default = ["metrics", "upnp", "open-metrics", "encrypt-records"] encrypt-records = ["ant-networking/encrypt-records"] extension-module = ["pyo3/extension-module"] -local = ["ant-networking/local", "ant-evm/local", "ant-bootstrap/local"] +local = ["ant-networking/local", "ant-evm/local", "ant-bootstrap/local", "ant-logging/process-metrics"] loud = ["ant-networking/loud"] # loud mode: print important messages to console -metrics = ["ant-logging/process-metrics"] +metrics = [] nightly = [] open-metrics = ["ant-networking/open-metrics", "prometheus-client"] otlp = ["ant-logging/otlp"] diff --git a/ant-node/src/bin/antnode/main.rs b/ant-node/src/bin/antnode/main.rs index db40d00101..3397d81461 100644 --- a/ant-node/src/bin/antnode/main.rs +++ b/ant-node/src/bin/antnode/main.rs @@ -15,7 +15,7 @@ mod subcommands; use crate::subcommands::EvmNetworkCommand; use ant_bootstrap::{BootstrapCacheConfig, BootstrapCacheStore, PeersArgs}; use ant_evm::{get_evm_network_from_env, EvmNetwork, RewardsAddress}; -#[cfg(feature = "metrics")] +#[cfg(feature = "local")] use ant_logging::metrics::init_metrics; use ant_logging::{Level, LogFormat, LogOutputDest, ReloadHandle}; use ant_node::{Marker, NodeBuilder, NodeEvent, NodeEventsReceiver}; @@ -306,7 +306,7 @@ fn main() -> Result<()> { // Create a tokio runtime per `run_node` attempt, this ensures // any spawned tasks are closed before we would attempt to run // another process with these args. - #[cfg(feature = "metrics")] + #[cfg(feature = "local")] rt.spawn(init_metrics(std::process::id())); let initial_peres = rt.block_on(opt.peers.get_addrs(None, Some(100)))?; debug!("Node's owner set to: {:?}", opt.owner); diff --git a/ant-node/src/node.rs b/ant-node/src/node.rs index 4908c0bc23..2515af6344 100644 --- a/ant-node/src/node.rs +++ b/ant-node/src/node.rs @@ -16,7 +16,9 @@ use ant_bootstrap::BootstrapCacheStore; use ant_evm::RewardsAddress; #[cfg(feature = "open-metrics")] use ant_networking::MetricsRegistries; -use ant_networking::{Instant, Network, NetworkBuilder, NetworkEvent, NodeIssue, SwarmDriver}; +use ant_networking::{ + target_arch::sleep, Instant, Network, NetworkBuilder, NetworkEvent, NodeIssue, SwarmDriver, +}; use ant_protocol::{ convert_distance_to_u256, error::Error as ProtocolError, @@ -969,7 +971,7 @@ impl Node { } } // Sleep a short while to avoid causing a spike on resource usage. - std::thread::sleep(std::time::Duration::from_secs(10)); + sleep(std::time::Duration::from_secs(10)).await; } } } diff --git a/autonomi/src/client/mod.rs b/autonomi/src/client/mod.rs index f245833b91..d118a5f065 100644 --- a/autonomi/src/client/mod.rs +++ b/autonomi/src/client/mod.rs @@ -251,6 +251,12 @@ impl Client { receiver.await.expect("sender should not close")?; debug!("Client is connected to the network"); + // With the switch to the new bootstrap cache scheme, + // Seems the too many `initial dial`s could result in failure, + // when startup quoting/upload tasks got started up immediatly. + // Hence, put in a forced wait to allow `initial network discovery` to be completed. + ant_networking::target_arch::sleep(Duration::from_secs(5)).await; + Ok(Self { network, client_event_sender: Arc::new(None), diff --git a/autonomi/src/client/quote.rs b/autonomi/src/client/quote.rs index 9794f165d7..38dfd7f6fd 100644 --- a/autonomi/src/client/quote.rs +++ b/autonomi/src/client/quote.rs @@ -11,7 +11,7 @@ use crate::client::rate_limiter::RateLimiter; use ant_evm::payment_vault::get_market_price; use ant_evm::{Amount, EvmNetwork, PaymentQuote, QuotePayment, QuotingMetrics}; use ant_networking::{Network, NetworkError}; -use ant_protocol::{storage::ChunkAddress, NetworkAddress}; +use ant_protocol::{storage::ChunkAddress, NetworkAddress, CLOSE_GROUP_SIZE}; use libp2p::PeerId; use std::collections::HashMap; use xor_name::XorName; @@ -159,6 +159,14 @@ async fn fetch_store_quote_with_retries( loop { match fetch_store_quote(network, content_addr).await { Ok(quote) => { + if quote.len() < CLOSE_GROUP_SIZE { + retries += 1; + error!("Error while fetching store quote: not enough quotes ({}/{CLOSE_GROUP_SIZE}), retry #{retries}, quotes {quote:?}", + quote.len()); + if retries > 2 { + break Err(CostError::CouldNotGetStoreQuote(content_addr)); + } + } break Ok((content_addr, quote)); } Err(err) if retries < 2 => { @@ -172,6 +180,9 @@ async fn fetch_store_quote_with_retries( break Err(CostError::CouldNotGetStoreQuote(content_addr)); } } + // Shall have a sleep between retries to avoid choking the network. + // This shall be rare to happen though. + std::thread::sleep(std::time::Duration::from_secs(5)); } }