Skip to content

Commit

Permalink
refactor: extract common parsing of SpUpdateStatus into MgsClients
Browse files Browse the repository at this point in the history
  • Loading branch information
jgallagher committed Nov 15, 2023
1 parent 46b6d27 commit 2a6d27c
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 271 deletions.
152 changes: 152 additions & 0 deletions nexus/src/app/update/mgs_clients.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,53 @@
//! Module providing support for handling failover between multiple MGS clients
use futures::Future;
use gateway_client::types::SpType;
use gateway_client::types::SpUpdateStatus;
use gateway_client::Client;
use slog::Logger;
use std::collections::VecDeque;
use std::sync::Arc;
use uuid::Uuid;

/// Shorthand for the `gateway_client` error type parameterized by
/// `gateway_client::types::Error`; this is the error type used for failed MGS
/// client calls throughout this module.
pub(super) type GatewayClientError =
gateway_client::Error<gateway_client::types::Error>;

/// Non-error outcome of polling the status of an update that is expected to
/// be underway.
///
/// Where present, `progress` is the completed fraction (`current / total`) of
/// the phase. It is `None` when no usable progress information was reported:
/// the total was zero, the reported values were inconsistent, or (for
/// `Preparing`) no progress was reported at all.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(super) enum PollUpdateStatus {
    /// The expected update is still being prepared.
    Preparing { progress: Option<f64> },
    /// The expected update is being delivered.
    InProgress { progress: Option<f64> },
    /// The expected update has completed.
    Complete,
}

/// Ways in which a polled update status can indicate that the expected update
/// will not complete.
///
/// The `DifferentUpdate*` variants carry the update ID that was reported in
/// place of the one the caller asked about.
#[derive(Debug, thiserror::Error)]
pub enum UpdateStatusError {
/// The reported status is `Preparing`, but for a different update ID.
#[error("different update is now preparing ({0})")]
DifferentUpdatePreparing(Uuid),
/// The reported status is `InProgress`, but for a different update ID.
#[error("different update is now in progress ({0})")]
DifferentUpdateInProgress(Uuid),
/// The reported status is `Complete`, but for a different update ID.
#[error("different update is now complete ({0})")]
DifferentUpdateComplete(Uuid),
/// The reported status is `Aborted`, but for a different update ID.
#[error("different update is now aborted ({0})")]
DifferentUpdateAborted(Uuid),
/// The reported status is a failure (`Failed` or `RotError`), but for a
/// different update ID.
#[error("different update failed ({0})")]
DifferentUpdateFailed(Uuid),
/// No update status was reported at all (`SpUpdateStatus::None`).
#[error("update status lost (did the SP reset?)")]
UpdateStatusLost,
/// The expected update was reported as aborted.
#[error("update was aborted")]
UpdateAborted,
/// The expected update was reported as failed with a numeric error code.
#[error("update failed (error code {0})")]
UpdateFailedWithCode(u32),
/// The expected update was reported as failed with a textual message.
#[error("update failed (error message {0})")]
UpdateFailedWithMessage(String),
}

/// Error returned when polling for the status of an update.
///
/// Both variants are `transparent`: their `Display` output is that of the
/// wrapped error.
#[derive(Debug, thiserror::Error)]
pub(super) enum PollUpdateStatusError {
/// A status was retrieved, but it shows the expected update is no longer on
/// track to complete.
#[error(transparent)]
StatusError(#[from] UpdateStatusError),
/// The status could not be retrieved from MGS.
#[error(transparent)]
ClientError(#[from] GatewayClientError),
}

#[derive(Debug, Clone)]
pub struct MgsClients {
clients: VecDeque<Arc<Client>>,
Expand Down Expand Up @@ -89,4 +128,117 @@ impl MgsClients {
// errors. Return the error from the last MGS we tried.
Err(GatewayClientError::CommunicationError(last_err.unwrap()))
}

/// Poll for the status of an expected-to-be-in-progress update.
pub(super) async fn poll_update_status(
&mut self,
sp_type: SpType,
sp_slot: u32,
component: &'static str,
update_id: Uuid,
log: &Logger,
) -> Result<PollUpdateStatus, PollUpdateStatusError> {
let update_status = self
.try_all(log, |client| async move {
let update_status = client
.sp_component_update_status(sp_type, sp_slot, component)
.await?;

debug!(
log, "got update status";
"mgs_addr" => client.baseurl(),
"status" => ?update_status,
);

Ok(update_status)
})
.await?
.into_inner();

match update_status {
// For `Preparing` and `InProgress`, we could check the progress
// information returned by these steps and try to check that
// we're still _making_ progress, but every Nexus instance needs
// to do that anyway in case we (or the MGS instance delivering
// the update) crash, so we'll omit that check here. Instead, we
// just sleep and we'll poll again shortly.
SpUpdateStatus::Preparing { id, progress } => {
if id == update_id {
let progress = progress.and_then(|progress| {
if progress.current > progress.total {
warn!(
log, "nonsense preparing progress";
"current" => progress.current,
"total" => progress.total,
);
None
} else if progress.total == 0 {
None
} else {
Some(
f64::from(progress.current)
/ f64::from(progress.total),
)
}
});
Ok(PollUpdateStatus::Preparing { progress })
} else {
Err(UpdateStatusError::DifferentUpdatePreparing(id).into())
}
}
SpUpdateStatus::InProgress { id, bytes_received, total_bytes } => {
if id == update_id {
let progress = if bytes_received > total_bytes {
warn!(
log, "nonsense update progress";
"bytes_received" => bytes_received,
"total_bytes" => total_bytes,
);
None
} else if total_bytes == 0 {
None
} else {
Some(f64::from(bytes_received) / f64::from(total_bytes))
};
Ok(PollUpdateStatus::InProgress { progress })
} else {
Err(UpdateStatusError::DifferentUpdateInProgress(id).into())
}
}
SpUpdateStatus::Complete { id } => {
if id == update_id {
Ok(PollUpdateStatus::Complete)
} else {
Err(UpdateStatusError::DifferentUpdateComplete(id).into())
}
}
SpUpdateStatus::None => {
Err(UpdateStatusError::UpdateStatusLost.into())
}
SpUpdateStatus::Aborted { id } => {
if id == update_id {
Err(UpdateStatusError::UpdateAborted.into())
} else {
Err(UpdateStatusError::DifferentUpdateAborted(id).into())
}
}
SpUpdateStatus::Failed { code, id } => {
if id == update_id {
Err(UpdateStatusError::UpdateFailedWithCode(code).into())
} else {
Err(UpdateStatusError::DifferentUpdateFailed(id).into())
}
}
SpUpdateStatus::RotError { id, message } => {
if id == update_id {
Err(UpdateStatusError::UpdateFailedWithMessage(format!(
"rot error: {message}"
))
.into())
} else {
Err(UpdateStatusError::DifferentUpdateFailed(id).into())
}
}
}
}
}
2 changes: 1 addition & 1 deletion nexus/src/app/update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ mod mgs_clients;
mod rot_updater;
mod sp_updater;

pub use mgs_clients::MgsClients;
pub use mgs_clients::{MgsClients, UpdateStatusError};
pub use rot_updater::{RotUpdateError, RotUpdater};
pub use sp_updater::{SpUpdateError, SpUpdater};

Expand Down
179 changes: 46 additions & 133 deletions nexus/src/app/update/rot_updater.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@

//! Module containing types for updating RoTs via MGS.
use std::time::Duration;

use super::mgs_clients::PollUpdateStatusError;
use super::MgsClients;
use super::UpdateProgress;
use super::UpdateStatusError;
use crate::app::update::mgs_clients::PollUpdateStatus;
use gateway_client::types::RotSlot;
use gateway_client::types::SpComponentFirmwareSlot;
use gateway_client::types::SpType;
use gateway_client::types::SpUpdateStatus;
use gateway_client::SpComponent;
use slog::Logger;
use std::time::Duration;
use tokio::sync::watch;
use uuid::Uuid;

Expand All @@ -24,10 +25,17 @@ pub enum RotUpdateError {
#[error("error communicating with MGS")]
MgsCommunication(#[from] GatewayClientError),

// Error returned when we successfully start an update but it fails to
// complete successfully.
#[error("RoT update failed to complete: {0}")]
FailedToComplete(String),
#[error("failed checking update status: {0}")]
PollUpdateStatus(#[from] UpdateStatusError),
}

/// Flattens a `PollUpdateStatusError` into the matching `RotUpdateError`
/// variant so that polling failures can be propagated with `?`.
impl From<PollUpdateStatusError> for RotUpdateError {
    fn from(err: PollUpdateStatusError) -> Self {
        match err {
            // The update itself went wrong (aborted, failed, superseded, ...).
            PollUpdateStatusError::StatusError(status) => {
                Self::PollUpdateStatus(status)
            }
            // The status request to MGS failed.
            PollUpdateStatusError::ClientError(client) => {
                Self::MgsCommunication(client)
            }
        }
    }
}

pub struct RotUpdater {
Expand Down Expand Up @@ -112,6 +120,11 @@ impl RotUpdater {
})
.await?;

// wait for any progress watchers to be dropped before we return;
// otherwise, they'll get `RecvError`s when trying to check the current
// status
self.progress.closed().await;

Ok(())
}

Expand Down Expand Up @@ -151,136 +164,36 @@ impl RotUpdater {
const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3);

loop {
let update_status = mgs_clients
.try_all(&self.log, |client| async move {
let update_status = client
.sp_component_update_status(
self.sp_type,
self.sp_slot,
SpComponent::ROT.const_as_str(),
)
.await?;

info!(
self.log, "got SP update status";
"mgs_addr" => client.baseurl(),
"status" => ?update_status,
);

Ok(update_status)
})
.await?
.into_inner();

// The majority of possible update statuses indicate failure; we'll
// handle the small number of non-failure cases by either
// `continue`ing or `return`ing; all other branches will give us an
// error string that we can report.
let error_message = match update_status {
// For `Preparing` and `InProgress`, we could check the progress
// information returned by these steps and try to check that
// we're still _making_ progress, but every Nexus instance needs
// to do that anyway in case we (or the MGS instance delivering
// the update) crash, so we'll omit that check here. Instead, we
// just sleep and we'll poll again shortly.
SpUpdateStatus::Preparing { id, progress } => {
if id == self.update_id {
let progress = progress.and_then(|progress| {
if progress.current > progress.total {
warn!(
self.log, "nonsense preparing progress";
"current" => progress.current,
"total" => progress.total,
);
None
} else if progress.total == 0 {
None
} else {
Some(
f64::from(progress.current)
/ f64::from(progress.total),
)
}
});
self.progress.send_replace(Some(
UpdateProgress::Preparing { progress },
));
tokio::time::sleep(STATUS_POLL_INTERVAL).await;
continue;
} else {
format!("different update is now preparing ({id})")
}
}
SpUpdateStatus::InProgress {
id,
bytes_received,
total_bytes,
} => {
if id == self.update_id {
let progress = if bytes_received > total_bytes {
warn!(
self.log, "nonsense progress";
"bytes_received" => bytes_received,
"total_bytes" => total_bytes,
);
None
} else if total_bytes == 0 {
None
} else {
Some(
f64::from(bytes_received)
/ f64::from(total_bytes),
)
};
self.progress.send_replace(Some(
UpdateProgress::InProgress { progress },
));
tokio::time::sleep(STATUS_POLL_INTERVAL).await;
continue;
} else {
format!("different update is now in progress ({id})")
}
}
SpUpdateStatus::Complete { id } => {
if id == self.update_id {
self.progress.send_replace(Some(
UpdateProgress::InProgress { progress: Some(1.0) },
));
return Ok(());
} else {
format!("different update is now in complete ({id})")
}
}
SpUpdateStatus::None => {
"update status lost (did the SP reset?)".to_string()
}
SpUpdateStatus::Aborted { id } => {
if id == self.update_id {
"update was aborted".to_string()
} else {
format!("different update is now in complete ({id})")
}
let status = mgs_clients
.poll_update_status(
self.sp_type,
self.sp_slot,
SpComponent::ROT.const_as_str(),
self.update_id,
&self.log,
)
.await?;

match status {
PollUpdateStatus::Preparing { progress } => {
self.progress.send_replace(Some(
UpdateProgress::Preparing { progress },
));
}
SpUpdateStatus::Failed { code, id } => {
if id == self.update_id {
format!("update failed (error code {code})")
} else {
format!("different update failed ({id})")
}
PollUpdateStatus::InProgress { progress } => {
self.progress.send_replace(Some(
UpdateProgress::InProgress { progress },
));
}
SpUpdateStatus::RotError { id, message } => {
if id == self.update_id {
format!("update failed (rot error: {message})")
} else {
format!("different update failed with rot error ({id})")
}
PollUpdateStatus::Complete => {
self.progress.send_replace(Some(
UpdateProgress::InProgress { progress: Some(1.0) },
));
return Ok(());
}
};
}

self.progress.send_replace(Some(UpdateProgress::Failed(
error_message.clone(),
)));
return Err(RotUpdateError::FailedToComplete(error_message));
tokio::time::sleep(STATUS_POLL_INTERVAL).await;
}
}

Expand Down
Loading

0 comments on commit 2a6d27c

Please sign in to comment.