Skip to content

Commit

Permalink
Executor benchmark revamps (#15127)
Browse files Browse the repository at this point in the history
## Description

* Separately reporting signature_verification and ledger_update stages.
* Changing "block execution time" from the VM_EXECUTE_BLOCK counter to BLOCK_EXECUTOR_EXECUTE_BLOCK, as it counts BlockSTM + VM instead of just VM. Adding BLOCK_EXECUTOR_INNER_EXECUTE_BLOCK for when better granularity is needed.
* Changed so that AptosVM is decoupled from BlockSTM, i.e. AptosVM no longer implements TransactionBlockExecutor; instead there is a new AptosVMBlockExecutor. That allows creating NativeVMBlockExecutor in a follow-up PR, and allows TransactionBlockExecutor to carry state if needed, via a new() constructor and a &self argument.
* Fixed split_stages to split all pipeline stages, and the initial delay to only create transactions, without starting the pipeline (i.e. verification) beforehand.

Followup PR will introduce different native executors.


## How Has This Been Tested?
performance benchmark
  • Loading branch information
igor-aptos authored Nov 7, 2024
1 parent 08cd86d commit 0d53727
Show file tree
Hide file tree
Showing 26 changed files with 953 additions and 409 deletions.
7 changes: 5 additions & 2 deletions api/test-context/src/test_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ use aptos_config::{
};
use aptos_crypto::{ed25519::Ed25519PrivateKey, hash::HashValue, SigningKey};
use aptos_db::AptosDB;
use aptos_executor::{block_executor::BlockExecutor, db_bootstrapper};
use aptos_executor::{
block_executor::{AptosVMBlockExecutor, BlockExecutor},
db_bootstrapper,
};
use aptos_executor_types::BlockExecutorTrait;
use aptos_framework::BuiltPackage;
use aptos_indexer_grpc_table_info::internal_indexer_db_service::MockInternalIndexerDBService;
Expand Down Expand Up @@ -204,7 +207,7 @@ pub fn new_test_context_inner(
rng,
root_key,
validator_owner,
Box::new(BlockExecutor::<AptosVM>::new(db_rw)),
Box::new(BlockExecutor::<AptosVMBlockExecutor>::new(db_rw)),
mempool,
db,
test_name,
Expand Down
12 changes: 12 additions & 0 deletions aptos-move/block-executor/src/counters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ fn output_buckets() -> std::vec::Vec<f64> {
.unwrap()
}

/// Histogram of wall-clock seconds spent in the innermost part of block
/// execution (for BlockSTM: the parallel or sequential run itself).
/// Buckets are exponential: 20 buckets starting at 1 ms, doubling each step.
/// Lazily registered on first access via `Lazy`.
pub static BLOCK_EXECUTOR_INNER_EXECUTE_BLOCK: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
// metric name
"aptos_executor_block_executor_inner_execute_block_seconds",
// metric description
"The time spent in the most-inner part of executing a block of transactions, \
i.e. for BlockSTM that is how long parallel or sequential execution took.",
exponential_buckets(/*start=*/ 1e-3, /*factor=*/ 2.0, /*count=*/ 20).unwrap(),
)
.unwrap()
});

/// Count of times the module publishing fallback was triggered in parallel execution.
pub static MODULE_PUBLISHING_FALLBACK_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
Expand Down
8 changes: 5 additions & 3 deletions aptos-move/block-executor/src/executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

use crate::{
code_cache_global::ImmutableModuleCache,
counters,
counters::{
PARALLEL_EXECUTION_SECONDS, RAYON_EXECUTION_SECONDS, TASK_EXECUTE_SECONDS,
TASK_VALIDATE_SECONDS, VM_INIT_SECONDS, WORK_WITH_TASK_SECONDS,
self, BLOCK_EXECUTOR_INNER_EXECUTE_BLOCK, PARALLEL_EXECUTION_SECONDS,
RAYON_EXECUTION_SECONDS, TASK_EXECUTE_SECONDS, TASK_VALIDATE_SECONDS, VM_INIT_SECONDS,
WORK_WITH_TASK_SECONDS,
},
errors::*,
executor_utilities::*,
Expand Down Expand Up @@ -1692,6 +1692,8 @@ where
signature_verified_block: &[T],
base_view: &S,
) -> BlockExecutionResult<BlockOutput<E::Output>, E::Error> {
let _timer = BLOCK_EXECUTOR_INNER_EXECUTE_BLOCK.start_timer();

if self.config.local.concurrency_level > 1 {
let parallel_result =
self.execute_transactions_parallel(&env, signature_verified_block, base_view);
Expand Down
128 changes: 88 additions & 40 deletions aptos-move/e2e-benchmark/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use aptos_transaction_generator_lib::{
use aptos_types::{account_address::AccountAddress, transaction::TransactionPayload};
use rand::{rngs::StdRng, SeedableRng};
use serde_json::json;
use std::process::exit;
use std::{collections::HashMap, process::exit};

pub fn execute_txn(
executor: &mut FakeExecutor,
Expand Down Expand Up @@ -80,70 +80,113 @@ const ALLOWED_REGRESSION: f32 = 0.15;
const ALLOWED_IMPROVEMENT: f32 = 0.15;
const ABSOLUTE_BUFFER_US: f32 = 2.0;

/// Compiled-in calibration table consumed by `get_parsed_calibration_values`.
/// Columns are tab-separated (the parser splits on '\t'); per line:
/// entry-point `Debug` representation, a count, two ratio-looking values
/// (presumably min/max bounds from calibration runs — TODO confirm), and the
/// expected wall-clock time in microseconds (last column, the one parsed).
/// NOTE(review): the field separators must remain literal tab characters or
/// parsing will panic.
const CALIBRATION_VALUES: &str = "
Loop { loop_count: Some(100000), loop_type: NoOp } 6 0.988 1.039 41212.4
Loop { loop_count: Some(10000), loop_type: Arithmetic } 6 0.977 1.038 25868.8
CreateObjects { num_objects: 10, object_payload_size: 0 } 6 0.940 1.026 152.1
CreateObjects { num_objects: 10, object_payload_size: 10240 } 6 0.934 1.051 9731.3
CreateObjects { num_objects: 100, object_payload_size: 0 } 6 0.966 1.051 1458.3
CreateObjects { num_objects: 100, object_payload_size: 10240 } 6 0.969 1.077 11196.4
InitializeVectorPicture { length: 40 } 6 0.973 1.066 75.0
VectorPicture { length: 40 } 6 0.955 1.092 22.0
VectorPictureRead { length: 40 } 6 0.952 1.047 21.0
InitializeVectorPicture { length: 30720 } 6 0.969 1.071 27295.8
VectorPicture { length: 30720 } 6 0.957 1.066 6560.2
VectorPictureRead { length: 30720 } 6 0.948 1.053 6642.8
SmartTablePicture { length: 30720, num_points_per_txn: 200 } 6 0.972 1.024 42660.4
SmartTablePicture { length: 1048576, num_points_per_txn: 300 } 6 0.961 1.020 73725.5
ResourceGroupsSenderWriteTag { string_length: 1024 } 6 0.867 1.001 15.0
ResourceGroupsSenderMultiChange { string_length: 1024 } 6 0.966 1.069 29.0
TokenV1MintAndTransferFT 6 0.972 1.045 356.8
TokenV1MintAndTransferNFTSequential 6 0.991 1.067 543.7
TokenV2AmbassadorMint { numbered: true } 6 0.987 1.052 474.4
LiquidityPoolSwap { is_stable: true } 6 0.970 1.042 555.4
LiquidityPoolSwap { is_stable: false } 6 0.925 1.001 535.3
";

/// Calibration baseline for a single benchmarked entry point, parsed from
/// one line of `CALIBRATION_VALUES`.
struct CalibrationInfo {
// count: usize,
// Expected wall-clock time in microseconds (last column of the
// calibration line); compared against measured time to flag regressions.
expected_time: f32,
}

/// Parses the compiled-in `CALIBRATION_VALUES` table into a map keyed by the
/// entry point's `Debug` representation (the first tab-separated column).
/// The value's `expected_time` comes from the last column of each line
/// (expected wall-clock time in microseconds).
///
/// # Panics
/// Panics with a message naming the offending line if its last column cannot
/// be parsed as `f32` — the table is a compile-time constant, so a parse
/// failure is a programmer error best surfaced immediately and loudly.
fn get_parsed_calibration_values() -> HashMap<String, CalibrationInfo> {
    CALIBRATION_VALUES
        .trim()
        // `lines()` instead of `split('\n')`: also tolerates \r\n endings.
        .lines()
        .map(|line| {
            let parts = line.split('\t').collect::<Vec<_>>();
            let expected_time = parts
                .last()
                // split() always yields at least one element, so this can't fail.
                .expect("calibration line is empty")
                .parse()
                .unwrap_or_else(|e| {
                    panic!("bad expected_time in calibration line {:?}: {}", line, e)
                });
            (parts[0].to_string(), CalibrationInfo {
                // count: parts[1].parse().unwrap(),
                expected_time,
            })
        })
        .collect()
}

fn main() {
let executor = FakeExecutor::from_head_genesis();
let mut executor = executor.set_not_parallel();

let calibration_values = get_parsed_calibration_values();

let entry_points = vec![
// too fast for the timer
// (, EntryPoints::Nop),
// (, EntryPoints::BytesMakeOrChange {
// data_length: Some(32),
// }),
// (, EntryPoints::IncGlobal),
(34651, EntryPoints::Loop {
EntryPoints::Loop {
loop_count: Some(100000),
loop_type: LoopType::NoOp,
}),
(21145, EntryPoints::Loop {
},
EntryPoints::Loop {
loop_count: Some(10000),
loop_type: LoopType::Arithmetic,
}),
},
// This is a cheap bcs (serializing vec<u8>), so not representative of what BCS native call should cost.
// (, EntryPoints::Loop { loop_count: Some(1000), loop_type: LoopType::BCS { len: 1024 }}),
(124, EntryPoints::CreateObjects {
EntryPoints::CreateObjects {
num_objects: 10,
object_payload_size: 0,
}),
(8090, EntryPoints::CreateObjects {
},
EntryPoints::CreateObjects {
num_objects: 10,
object_payload_size: 10 * 1024,
}),
(1246, EntryPoints::CreateObjects {
},
EntryPoints::CreateObjects {
num_objects: 100,
object_payload_size: 0,
}),
(9556, EntryPoints::CreateObjects {
},
EntryPoints::CreateObjects {
num_objects: 100,
object_payload_size: 10 * 1024,
}),
(61, EntryPoints::InitializeVectorPicture { length: 40 }),
(16, EntryPoints::VectorPicture { length: 40 }),
(16, EntryPoints::VectorPictureRead { length: 40 }),
(23256, EntryPoints::InitializeVectorPicture {
length: 30 * 1024,
}),
(5860, EntryPoints::VectorPicture { length: 30 * 1024 }),
(5849, EntryPoints::VectorPictureRead { length: 30 * 1024 }),
(35440, EntryPoints::SmartTablePicture {
},
EntryPoints::InitializeVectorPicture { length: 40 },
EntryPoints::VectorPicture { length: 40 },
EntryPoints::VectorPictureRead { length: 40 },
EntryPoints::InitializeVectorPicture { length: 30 * 1024 },
EntryPoints::VectorPicture { length: 30 * 1024 },
EntryPoints::VectorPictureRead { length: 30 * 1024 },
EntryPoints::SmartTablePicture {
length: 30 * 1024,
num_points_per_txn: 200,
}),
(60464, EntryPoints::SmartTablePicture {
},
EntryPoints::SmartTablePicture {
length: 1024 * 1024,
num_points_per_txn: 300,
}),
(13, EntryPoints::ResourceGroupsSenderWriteTag {
},
EntryPoints::ResourceGroupsSenderWriteTag {
string_length: 1024,
}),
(27, EntryPoints::ResourceGroupsSenderMultiChange {
},
EntryPoints::ResourceGroupsSenderMultiChange {
string_length: 1024,
}),
(291, EntryPoints::TokenV1MintAndTransferFT),
(468, EntryPoints::TokenV1MintAndTransferNFTSequential),
(386, EntryPoints::TokenV2AmbassadorMint { numbered: true }),
(467, EntryPoints::LiquidityPoolSwap { is_stable: true }),
(429, EntryPoints::LiquidityPoolSwap { is_stable: false }),
},
EntryPoints::TokenV1MintAndTransferFT,
EntryPoints::TokenV1MintAndTransferNFTSequential,
EntryPoints::TokenV2AmbassadorMint { numbered: true },
EntryPoints::LiquidityPoolSwap { is_stable: true },
EntryPoints::LiquidityPoolSwap { is_stable: false },
];

let mut failures = Vec::new();
Expand All @@ -154,7 +197,12 @@ fn main() {
"wall time (us)", "expected (us)", "diff(- is impr)"
);

for (index, (expected_time, entry_point)) in entry_points.into_iter().enumerate() {
for (index, entry_point) in entry_points.into_iter().enumerate() {
let entry_point_name = format!("{:?}", entry_point);
let expected_time = calibration_values
.get(&entry_point_name)
.unwrap()
.expected_time;
let publisher = executor.new_account_at(AccountAddress::random());

let mut package_handler = PackageHandler::new(entry_point.package_name());
Expand Down Expand Up @@ -189,23 +237,23 @@ fn main() {
&package,
publisher.address(),
&mut executor,
if expected_time > 10000 {
if expected_time > 10000.0 {
6
} else if expected_time > 1000 {
} else if expected_time > 1000.0 {
10
} else {
100
},
);
let diff = (elapsed_micros as f32 - expected_time as f32) / (expected_time as f32) * 100.0;
let diff = (elapsed_micros as f32 - expected_time) / expected_time * 100.0;
println!(
"{:15} {:15} {:14.1}% {:?}",
"{:15} {:15.1} {:14.1}% {:?}",
elapsed_micros, expected_time, diff, entry_point
);

json_lines.push(json!({
"grep": "grep_json_aptos_move_vm_perf",
"transaction_type": format!("{:?}", entry_point),
"transaction_type": entry_point_name,
"wall_time_us": elapsed_micros,
"expected_wall_time_us": expected_time,
"test_index": index,
Expand Down
1 change: 0 additions & 1 deletion consensus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ aptos-temppath = { workspace = true }
aptos-time-service = { workspace = true }
aptos-types = { workspace = true }
aptos-validator-transaction-pool = { workspace = true }
aptos-vm = { workspace = true }
async-trait = { workspace = true }
bcs = { workspace = true }
byteorder = { workspace = true }
Expand Down
7 changes: 3 additions & 4 deletions consensus/src/consensus_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,13 @@ use aptos_channels::aptos_channel::Receiver;
use aptos_config::config::NodeConfig;
use aptos_consensus_notifications::ConsensusNotificationSender;
use aptos_event_notifications::{DbBackedOnChainConfig, ReconfigNotificationListener};
use aptos_executor::block_executor::BlockExecutor;
use aptos_executor::block_executor::{AptosVMBlockExecutor, BlockExecutor};
use aptos_logger::prelude::*;
use aptos_mempool::QuorumStoreRequest;
use aptos_network::application::interface::{NetworkClient, NetworkServiceEvents};
use aptos_storage_interface::DbReaderWriter;
use aptos_time_service::TimeService;
use aptos_validator_transaction_pool::VTxnPoolState;
use aptos_vm::AptosVM;
use futures::channel::mpsc;
use move_core_types::account_address::AccountAddress;
use std::{collections::HashMap, sync::Arc};
Expand Down Expand Up @@ -65,7 +64,7 @@ pub fn start_consensus(
));

let execution_proxy = ExecutionProxy::new(
Arc::new(BlockExecutor::<AptosVM>::new(aptos_db)),
Arc::new(BlockExecutor::<AptosVMBlockExecutor>::new(aptos_db)),
txn_notifier,
state_sync_notifier,
runtime.handle(),
Expand Down Expand Up @@ -158,7 +157,7 @@ pub fn start_consensus_observer(
node_config.consensus.mempool_executed_txn_timeout_ms,
));
let execution_proxy = ExecutionProxy::new(
Arc::new(BlockExecutor::<AptosVM>::new(aptos_db.clone())),
Arc::new(BlockExecutor::<AptosVMBlockExecutor>::new(aptos_db.clone())),
txn_notifier,
state_sync_notifier,
consensus_observer_runtime.handle(),
Expand Down
6 changes: 5 additions & 1 deletion consensus/src/execution_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ use std::{
};
use tokio::sync::{mpsc, oneshot};

/// Smallest number of transactions Rayon should put into a single worker task.
/// Same as in execution/executor-benchmark/src/block_preparation.rs —
/// keep the two in sync so consensus and the benchmark split signature
/// verification work identically.
pub const SIG_VERIFY_RAYON_MIN_THRESHOLD: usize = 32;

pub type PreCommitHook =
Box<dyn 'static + FnOnce(&StateComputeResult) -> BoxFuture<'static, ()> + Send>;

Expand Down Expand Up @@ -156,7 +160,7 @@ impl ExecutionPipeline {
let num_txns = txns_to_execute.len();
txns_to_execute
.into_par_iter()
.with_min_len(optimal_min_len(num_txns, 32))
.with_min_len(optimal_min_len(num_txns, SIG_VERIFY_RAYON_MIN_THRESHOLD))
.map(|t| t.into())
.collect::<Vec<_>>()
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ mod tests {
// Check that the transaction is valid JSON
let transaction = serde_json::from_slice::<Transaction>(json_bytes).unwrap();

assert_eq!(transaction.version, 61);
assert_eq!(transaction.version, 53);
}
}
Loading

0 comments on commit 0d53727

Please sign in to comment.