Skip to content

Commit

Permalink
approval-voting: Add no shows debug information (paritytech#4726)
Browse files Browse the repository at this point in the history
Add some debug logs to be able to identify the validators and parachains
that have most no-shows, this metric is valuable because it will help us
identify validators and parachains that regularly have this problem.

From the validator_index we can then query the on-chain information and
identify the exact validator that is causing the no-shows.

---------

Signed-off-by: Alexandru Gheorghe <[email protected]>
  • Loading branch information
alexggh authored Jun 10, 2024
1 parent 497d64e commit 2869fd6
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 18 deletions.
41 changes: 34 additions & 7 deletions polkadot/node/core/approval-voting/src/approval_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use polkadot_primitives::ValidatorIndex;
use crate::{
persisted_entries::{ApprovalEntry, CandidateEntry, TrancheEntry},
time::Tick,
MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE,
};

/// Result of counting the necessary tranches needed for approving a block.
Expand All @@ -32,6 +33,7 @@ pub struct TranchesToApproveResult {
pub required_tranches: RequiredTranches,
/// The total number of no_shows at the moment we are doing the counting.
pub total_observed_no_shows: usize,
pub no_show_validators: Vec<ValidatorIndex>,
}

/// The required tranches of assignments needed to determine whether a candidate is approved.
Expand Down Expand Up @@ -188,6 +190,8 @@ struct State {
/// The last tick at which a considered assignment was received.
last_assignment_tick: Option<Tick>,
total_observed_no_shows: usize,
// The validator's index that are no-shows.
no_show_validators: Vec<ValidatorIndex>,
}

impl State {
Expand All @@ -203,6 +207,7 @@ impl State {
return TranchesToApproveResult {
required_tranches: RequiredTranches::All,
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}

Expand All @@ -217,6 +222,7 @@ impl State {
last_assignment_tick: self.last_assignment_tick,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}

Expand All @@ -234,6 +240,7 @@ impl State {
clock_drift,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
} else {
TranchesToApproveResult {
Expand All @@ -244,6 +251,7 @@ impl State {
clock_drift,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}
}
Expand All @@ -253,11 +261,12 @@ impl State {
}

fn advance(
&self,
mut self,
new_assignments: usize,
new_no_shows: usize,
next_no_show: Option<Tick>,
last_assignment_tick: Option<Tick>,
no_show_validators: Vec<ValidatorIndex>,
) -> State {
let new_covered = if self.depth == 0 {
new_assignments
Expand Down Expand Up @@ -290,6 +299,17 @@ impl State {
(self.depth, covering, uncovered)
};

// Make sure we don't store too many no-show validators, since this metric
// is valuable if there are just a few of them to identify the problematic
// validators.
// If there are a lot then we've got bigger problems and no need to make this
// array unnecessarily large.
if self.no_show_validators.len() + no_show_validators.len() <
MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE
{
self.no_show_validators.extend(no_show_validators);
}

State {
assignments,
depth,
Expand All @@ -299,6 +319,7 @@ impl State {
next_no_show,
last_assignment_tick,
total_observed_no_shows: self.total_observed_no_shows + new_no_shows,
no_show_validators: self.no_show_validators,
}
}
}
Expand Down Expand Up @@ -354,8 +375,9 @@ fn count_no_shows(
block_tick: Tick,
no_show_duration: Tick,
drifted_tick_now: Tick,
) -> (usize, Option<u64>) {
) -> (usize, Option<u64>, Vec<ValidatorIndex>) {
let mut next_no_show = None;
let mut no_show_validators = Vec::new();
let no_shows = assignments
.iter()
.map(|(v_index, tick)| {
Expand All @@ -379,12 +401,14 @@ fn count_no_shows(
// the clock drift will be removed again to do the comparison above.
next_no_show = super::min_prefer_some(next_no_show, Some(no_show_at + clock_drift));
}

if is_no_show {
no_show_validators.push(*v_index);
}
is_no_show
})
.count();

(no_shows, next_no_show)
(no_shows, next_no_show, no_show_validators)
}

/// Determine the amount of tranches of assignments needed to determine approval of a candidate.
Expand All @@ -408,6 +432,7 @@ pub fn tranches_to_approve(
next_no_show: None,
last_assignment_tick: None,
total_observed_no_shows: 0,
no_show_validators: Vec::new(),
};

// The `ApprovalEntry` doesn't have any data for empty tranches. We still want to iterate over
Expand Down Expand Up @@ -446,7 +471,7 @@ pub fn tranches_to_approve(
//
// While we count the no-shows, we also determine the next possible no-show we might
// see within this tranche.
let (no_shows, next_no_show) = count_no_shows(
let (no_shows, next_no_show, no_show_validators) = count_no_shows(
assignments,
approvals,
clock_drift,
Expand All @@ -455,7 +480,7 @@ pub fn tranches_to_approve(
drifted_tick_now,
);

let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick);
let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick, no_show_validators);
let output = s.output(tranche, needed_approvals, n_validators, no_show_duration);

*state = match output.required_tranches {
Expand Down Expand Up @@ -1186,7 +1211,7 @@ mod tests {
approvals.set(v_index, true);
}

let (no_shows, next_no_show) = count_no_shows(
let (no_shows, next_no_show, _) = count_no_shows(
&test.assignments,
&approvals,
test.clock_drift,
Expand Down Expand Up @@ -1390,6 +1415,7 @@ mod tests {
next_no_show: None,
last_assignment_tick: None,
total_observed_no_shows: 0,
no_show_validators: Default::default(),
};

assert_eq!(
Expand All @@ -1414,6 +1440,7 @@ mod tests {
next_no_show: None,
last_assignment_tick: None,
total_observed_no_shows: 0,
no_show_validators: Default::default(),
};

assert_eq!(
Expand Down
1 change: 1 addition & 0 deletions polkadot/node/core/approval-voting/src/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,7 @@ pub(crate) mod tests {
per_block_assignments_gathering_times: LruMap::new(ByLength::new(
MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
)),
no_show_stats: Default::default(),
}
}

Expand Down
98 changes: 87 additions & 11 deletions polkadot/node/core/approval-voting/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ struct ApprovalStatus {
tranche_now: DelayTranche,
block_tick: Tick,
last_no_shows: usize,
no_show_validators: Vec<ValidatorIndex>,
}

#[derive(Copy, Clone)]
Expand Down Expand Up @@ -832,6 +833,51 @@ struct State {
// tranches we trigger and why.
per_block_assignments_gathering_times:
LruMap<BlockNumber, HashMap<(Hash, CandidateHash), AssignmentGatheringRecord>>,
no_show_stats: NoShowStats,
}

// Regularly dump the no-show stats at this block number frequency.
const NO_SHOW_DUMP_FREQUENCY: BlockNumber = 50;
// The maximum number of validators we record no-shows for, per candidate.
pub(crate) const MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE: usize = 20;

// No show stats per validator and per parachain.
// This is valuable information when we have to debug live network issue, because
// it gives information if things are going wrong only for some validators or just
// for some parachains.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct NoShowStats {
per_validator_no_show: HashMap<SessionIndex, HashMap<ValidatorIndex, usize>>,
per_parachain_no_show: HashMap<u32, usize>,
last_dumped_block_number: BlockNumber,
}

impl NoShowStats {
// Print the no-show stats if NO_SHOW_DUMP_FREQUENCY blocks have passed since the last
// print.
fn maybe_print(&mut self, current_block_number: BlockNumber) {
if self.last_dumped_block_number > current_block_number ||
current_block_number - self.last_dumped_block_number < NO_SHOW_DUMP_FREQUENCY
{
return
}
if self.per_parachain_no_show.is_empty() && self.per_validator_no_show.is_empty() {
return
}

gum::debug!(
target: LOG_TARGET,
"Validators with no_show {:?} and parachains with no_shows {:?} since {:}",
self.per_validator_no_show,
self.per_parachain_no_show,
self.last_dumped_block_number
);

self.last_dumped_block_number = current_block_number;

self.per_validator_no_show.clear();
self.per_parachain_no_show.clear();
}
}

#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -887,21 +933,25 @@ impl State {
);

if let Some(approval_entry) = candidate_entry.approval_entry(&block_hash) {
let TranchesToApproveResult { required_tranches, total_observed_no_shows } =
approval_checking::tranches_to_approve(
approval_entry,
candidate_entry.approvals(),
tranche_now,
block_tick,
no_show_duration,
session_info.needed_approvals as _,
);
let TranchesToApproveResult {
required_tranches,
total_observed_no_shows,
no_show_validators,
} = approval_checking::tranches_to_approve(
approval_entry,
candidate_entry.approvals(),
tranche_now,
block_tick,
no_show_duration,
session_info.needed_approvals as _,
);

let status = ApprovalStatus {
required_tranches,
block_tick,
tranche_now,
last_no_shows: total_observed_no_shows,
no_show_validators,
};

Some((approval_entry, status))
Expand Down Expand Up @@ -1044,6 +1094,26 @@ impl State {
},
}
}

fn record_no_shows(
&mut self,
session_index: SessionIndex,
para_id: u32,
no_show_validators: &Vec<ValidatorIndex>,
) {
if !no_show_validators.is_empty() {
*self.no_show_stats.per_parachain_no_show.entry(para_id.into()).or_default() += 1;
}
for validator_index in no_show_validators {
*self
.no_show_stats
.per_validator_no_show
.entry(session_index)
.or_default()
.entry(*validator_index)
.or_default() += 1;
}
}
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -1096,6 +1166,7 @@ where
per_block_assignments_gathering_times: LruMap::new(ByLength::new(
MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
)),
no_show_stats: NoShowStats::default(),
};

// `None` on start-up. Gets initialized/updated on leaf update
Expand Down Expand Up @@ -1740,6 +1811,8 @@ async fn handle_from_overseer<Context>(
"Imported new block.",
);

state.no_show_stats.maybe_print(block_batch.block_number);

for (c_hash, c_entry) in block_batch.imported_candidates {
metrics.on_candidate_imported();

Expand Down Expand Up @@ -2904,7 +2977,8 @@ where
let mut actions = Vec::new();
let block_hash = block_entry.block_hash();
let block_number = block_entry.block_number();

let session_index = block_entry.session();
let para_id = candidate_entry.candidate_receipt().descriptor().para_id;
let tick_now = state.clock.tick_now();

let (is_approved, status) = if let Some((approval_entry, status)) = state
Expand Down Expand Up @@ -3000,7 +3074,9 @@ where
if is_approved {
approval_entry.mark_approved();
}

if newly_approved {
state.record_no_shows(session_index, para_id.into(), &status.no_show_validators);
}
actions.extend(schedule_wakeup_action(
&approval_entry,
block_hash,
Expand Down
2 changes: 2 additions & 0 deletions polkadot/node/core/approval-voting/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5068,6 +5068,7 @@ fn test_gathering_assignments_statements() {
per_block_assignments_gathering_times: LruMap::new(ByLength::new(
MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
)),
no_show_stats: NoShowStats::default(),
};

for i in 0..200i32 {
Expand Down Expand Up @@ -5162,6 +5163,7 @@ fn test_observe_assignment_gathering_status() {
per_block_assignments_gathering_times: LruMap::new(ByLength::new(
MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
)),
no_show_stats: NoShowStats::default(),
};

let metrics_inner = MetricsInner {
Expand Down

0 comments on commit 2869fd6

Please sign in to comment.