From 34a814db7899a0c4a66f08dbacb2a17beeb476a1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 3 Jul 2024 13:53:26 -0700 Subject: [PATCH] do not let failure list grow without bound --- nexus/db-queries/src/db/saga_recovery.rs | 2 +- nexus/src/app/background/tasks/saga_recovery.rs | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/saga_recovery.rs b/nexus/db-queries/src/db/saga_recovery.rs index 34a99b6de2..036169a348 100644 --- a/nexus/db-queries/src/db/saga_recovery.rs +++ b/nexus/db-queries/src/db/saga_recovery.rs @@ -25,7 +25,7 @@ use std::collections::BTreeSet; use std::sync::Arc; use steno::SagaId; -/// Describes the result [`recover`] +/// Describes the result of [`recover()`] pub struct SagasRecovered { recovered: BTreeMap>>, skipped: BTreeSet, diff --git a/nexus/src/app/background/tasks/saga_recovery.rs b/nexus/src/app/background/tasks/saga_recovery.rs index cfabaab332..580e286e60 100644 --- a/nexus/src/app/background/tasks/saga_recovery.rs +++ b/nexus/src/app/background/tasks/saga_recovery.rs @@ -16,10 +16,14 @@ use nexus_db_queries::db::DataStore; use serde::Serialize; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::collections::VecDeque; use std::sync::Arc; use steno::SagaId; use uuid::Uuid; +/// Maximum number of recent failures to keep track of for debugging +const N_FAILED_SAGA_HISTORY: usize = 16; + /// Background task that recovers sagas assigned to this Nexus /// /// Normally, this task only does anything of note once, when Nexus starts up. @@ -39,7 +43,7 @@ pub struct SagaRecovery { registry: Arc, sagas_recovered: BTreeMap>, - recent_failures: Vec, + recent_failures: VecDeque, last_pass: LastPass, } @@ -48,7 +52,7 @@ pub struct SagaRecovery { #[derive(Clone, Serialize)] pub struct SagaRecoveryTaskStatus { all_recovered: BTreeMap>, - recent_failures: Vec, + recent_failures: VecDeque, last_pass: LastPass, } @@ -85,7 +89,7 @@ impl SagaRecovery { sec, registry, sagas_recovered: BTreeMap::new(), - recent_failures: Vec::new(), + recent_failures: VecDeque::with_capacity(N_FAILED_SAGA_HISTORY), last_pass: LastPass::NeverStarted, } } @@ -149,7 +153,10 @@ impl BackgroundTask for SagaRecovery { } for (saga_id, error) in ok.iter_failed() { - self.recent_failures.push(RecoveryFailure { + if self.recent_failures.len() == N_FAILED_SAGA_HISTORY { + let _ = self.recent_failures.pop_front(); + } + self.recent_failures.push_back(RecoveryFailure { time: now, saga_id, message: InlineErrorChain::new(error).to_string(),