Skip to content

Commit

Permalink
Accept notifications from Crucible (#5135)
Browse files Browse the repository at this point in the history
Allow any Upstairs to notify Nexus about the start or completion (plus
status) of live repairs. The motivation for this was to be used in the
final stage of region replacement to notify Nexus that the replacement
has finished, but more generally this can be used to keep track of how
many times repair occurs for each region.

Also accept notifications for:

- when a downstairs client is requested to stop
- when a downstairs client stops

These will be used as breadcrumbs to determine when downstairs were
having problems, why repairs started in the first place, and more.

Fixes #5120
  • Loading branch information
jmpesp authored Mar 14, 2024
1 parent e6029bb commit 2406d9d
Show file tree
Hide file tree
Showing 35 changed files with 2,447 additions and 91 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions clients/nexus-client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ serde_json.workspace = true
slog.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
omicron-uuid-kinds.workspace = true
4 changes: 4 additions & 0 deletions clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ progenitor::generate_api!(
NewPasswordHash = omicron_passwords::NewPasswordHash,
NetworkInterface = omicron_common::api::internal::shared::NetworkInterface,
NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
},
patch = {
SledAgentInfo = { derives = [PartialEq, Eq] },
Expand Down
1 change: 1 addition & 0 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ tokio = { workspace = true, features = ["full"] }
uuid.workspace = true
parse-display.workspace = true
progenitor.workspace = true
progenitor-client.workspace = true
omicron-workspace-hack.workspace = true
once_cell.workspace = true
regress.workspace = true
Expand Down
83 changes: 83 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ use crate::api::external::{
InstanceState, IpNet, SemverVersion, Vni,
};
use chrono::{DateTime, Utc};
use omicron_uuid_kinds::DownstairsRegionKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsRepairKind;
use omicron_uuid_kinds::UpstairsSessionKind;
use parse_display::{Display, FromStr};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -251,3 +255,82 @@ pub enum HostIdentifier {
Ip(IpNet),
Vpc(Vni),
}

// The kind of repair being reported: `Live` or `Reconciliation`.
//
// Serialized with serde's `snake_case` renaming, i.e. as `live` and
// `reconciliation`.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum UpstairsRepairType {
Live,
Reconciliation,
}

// One entry in the `repairs` list carried by `RepairStartInfo` and
// `RepairFinishInfo`.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsUnderRepair {
// UUID of the Downstairs region.
pub region_uuid: TypedUuid<DownstairsRegionKind>,
// IPv6 socket address of the target.
// NOTE(review): presumably the address of the Downstairs being repaired --
// confirm against the sender.
pub target_addr: std::net::SocketAddrV6,
}

// Payload describing the start of a repair.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairStartInfo {
// Timestamp attached to the notification.
// NOTE(review): presumably the sender's clock rather than the receiver's --
// confirm.
pub time: DateTime<Utc>,
// Upstairs session this repair belongs to.
pub session_id: TypedUuid<UpstairsSessionKind>,
// Identifier of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this is a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The Downstairs involved in this repair.
pub repairs: Vec<DownstairsUnderRepair>,
}

// Payload describing the end of a repair. Carries the same fields as
// `RepairStartInfo` plus the `aborted` flag.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairFinishInfo {
// Timestamp attached to the notification.
// NOTE(review): presumably the sender's clock rather than the receiver's --
// confirm.
pub time: DateTime<Utc>,
// Upstairs session this repair belongs to.
pub session_id: TypedUuid<UpstairsSessionKind>,
// Identifier of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this was a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The Downstairs involved in this repair.
pub repairs: Vec<DownstairsUnderRepair>,
// Whether the repair was aborted.
// NOTE(review): presumably `false` means it ran to completion -- confirm.
pub aborted: bool,
}

// Progress report for a repair: a position (`current_item`) out of a total
// (`total_items`), stamped with a time.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairProgress {
// Timestamp attached to the progress report.
pub time: DateTime<Utc>,
// NOTE(review): both counters are signed 64-bit; nothing here rejects
// negative values or `current_item > total_items` -- confirm whether the
// receiver validates them.
pub current_item: i64,
pub total_items: i64,
}

// Why a Downstairs client was requested to stop.
//
// Variants are serialized using serde's `snake_case` renaming.
// NOTE(review): that rule inserts an underscore before every uppercase
// letter after the first, so `IOError` is rendered as `i_o_error` rather
// than `io_error` -- confirm this is the intended wire name before relying
// on it.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DownstairsClientStopRequestReason {
Replacing,
Disabled,
FailedReconcile,
IOError,
BadNegotiationOrder,
Incompatible,
FailedLiveRepair,
TooManyOutstandingJobs,
Deactivated,
}

// Payload reporting that a Downstairs client was requested to stop.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsClientStopRequest {
// Timestamp attached to the notification.
pub time: DateTime<Utc>,
// Why the stop was requested.
pub reason: DownstairsClientStopRequestReason,
}

// Why a Downstairs client stopped.
//
// Variants are serialized using serde's `snake_case` renaming (for example
// `ConnectionTimeout` becomes `connection_timeout`).
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DownstairsClientStoppedReason {
ConnectionTimeout,
ConnectionFailed,
Timeout,
WriteFailed,
ReadFailed,
RequestedStop,
Finished,
QueueClosed,
ReceiveTaskCancelled,
}

// Payload reporting that a Downstairs client stopped.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsClientStopped {
// Timestamp attached to the notification.
pub time: DateTime<Utc>,
// Why the client stopped.
pub reason: DownstairsClientStoppedReason,
}
81 changes: 81 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,84 @@ impl slog::KV for FileKv {
}

pub const OMICRON_DPD_TAG: &str = "omicron";

use futures::Future;
use slog::warn;

/// Retry a progenitor client operation until a known result is returned.
///
/// Saga execution relies on the outcome of an external call being known: since
/// they are idempotent, reissue the external call until a known result comes
/// back. Retry if a communication error is seen, or if another retryable error
/// is seen.
///
/// Note that retrying is only valid if the call itself is idempotent.
pub async fn retry_until_known_result<F, T, E, Fut>(
log: &slog::Logger,
mut f: F,
) -> Result<T, progenitor_client::Error<E>>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
E: std::fmt::Debug,
{
backoff::retry_notify(
backoff::retry_policy_internal_service(),
move || {
let fut = f();
async move {
match fut.await {
Err(progenitor_client::Error::CommunicationError(e)) => {
warn!(
log,
"saw transient communication error {}, retrying...",
e,
);

Err(backoff::BackoffError::transient(
progenitor_client::Error::CommunicationError(e),
))
}

Err(progenitor_client::Error::ErrorResponse(
response_value,
)) => {
match response_value.status() {
// Retry on 503 or 429
http::StatusCode::SERVICE_UNAVAILABLE
| http::StatusCode::TOO_MANY_REQUESTS => {
Err(backoff::BackoffError::transient(
progenitor_client::Error::ErrorResponse(
response_value,
),
))
}

// Anything else is a permanent error
_ => Err(backoff::BackoffError::Permanent(
progenitor_client::Error::ErrorResponse(
response_value,
),
)),
}
}

Err(e) => {
warn!(log, "saw permanent error {}, aborting", e,);

Err(backoff::BackoffError::Permanent(e))
}

Ok(v) => Ok(v),
}
}
},
|error: progenitor_client::Error<_>, delay| {
warn!(
log,
"failed external call ({:?}), will retry in {:?}", error, delay,
);
},
)
.await
}
1 change: 1 addition & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ rustls = { workspace = true }
rustls-pemfile = { workspace = true }
update-common.workspace = true
omicron-workspace-hack.workspace = true
omicron-uuid-kinds.workspace = true

[dev-dependencies]
async-bb8-diesel.workspace = true
Expand Down
133 changes: 133 additions & 0 deletions nexus/db-model/src/downstairs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use crate::schema::downstairs_client_stop_request_notification;
use crate::schema::downstairs_client_stopped_notification;
use crate::typed_uuid::DbTypedUuid;
use chrono::{DateTime, Utc};
use omicron_common::api::internal;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::UpstairsKind;
use serde::{Deserialize, Serialize};

// Types for stop request notification

// Declares the database-side representation of a stop request reason:
// `DownstairsClientStopRequestReasonEnum` names the Postgres type
// `downstairs_client_stop_request_reason_type` in schema `public`, and
// `DownstairsClientStopRequestReason` is the Rust enum stored with that SQL
// type. Each `Variant => b"label"` line pairs a Rust variant with a byte-string
// label.
// NOTE(review): `impl_enum_type!` is defined elsewhere; presumably the labels
// must match the Postgres enum's values exactly -- confirm against the schema.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "downstairs_client_stop_request_reason_type", schema = "public"))]
pub struct DownstairsClientStopRequestReasonEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = DownstairsClientStopRequestReasonEnum)]
pub enum DownstairsClientStopRequestReason;

// Reason types
Replacing => b"replacing"
Disabled => b"disabled"
FailedReconcile => b"failed_reconcile"
IOError => b"io_error"
BadNegotiationOrder => b"bad_negotiation_order"
Incompatible => b"incompatible"
FailedLiveRepair => b"failed_live_repair"
TooManyOutstandingJobs => b"too_many_outstanding_jobs"
Deactivated => b"deactivated"
);

impl From<internal::nexus::DownstairsClientStopRequestReason>
for DownstairsClientStopRequestReason
{
fn from(
v: internal::nexus::DownstairsClientStopRequestReason,
) -> DownstairsClientStopRequestReason {
match v {
internal::nexus::DownstairsClientStopRequestReason::Replacing => DownstairsClientStopRequestReason::Replacing,
internal::nexus::DownstairsClientStopRequestReason::Disabled => DownstairsClientStopRequestReason::Disabled,
internal::nexus::DownstairsClientStopRequestReason::FailedReconcile => DownstairsClientStopRequestReason::FailedReconcile,
internal::nexus::DownstairsClientStopRequestReason::IOError => DownstairsClientStopRequestReason::IOError,
internal::nexus::DownstairsClientStopRequestReason::BadNegotiationOrder => DownstairsClientStopRequestReason::BadNegotiationOrder,
internal::nexus::DownstairsClientStopRequestReason::Incompatible => DownstairsClientStopRequestReason::Incompatible,
internal::nexus::DownstairsClientStopRequestReason::FailedLiveRepair => DownstairsClientStopRequestReason::FailedLiveRepair,
internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs => DownstairsClientStopRequestReason::TooManyOutstandingJobs,
internal::nexus::DownstairsClientStopRequestReason::Deactivated => DownstairsClientStopRequestReason::Deactivated,
}
}
}

/// A record of when an Upstairs requested that a Downstairs client task stop.
///
/// Rows of this type are read from and inserted into the
/// `downstairs_client_stop_request_notification` table.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = downstairs_client_stop_request_notification)]
pub struct DownstairsClientStopRequestNotification {
// Importantly, this is client time, not Nexus' time that it received the
// notification.
pub time: DateTime<Utc>,

// Which Upstairs sent this notification?
pub upstairs_id: DbTypedUuid<UpstairsKind>,

// Which Downstairs client was requested to stop?
pub downstairs_id: DbTypedUuid<DownstairsKind>,

// Why was the stop requested?
pub reason: DownstairsClientStopRequestReason,
}

// Types for stopped notification

// Declares the database-side representation of a stopped reason:
// `DownstairsClientStoppedReasonEnum` names the Postgres type
// `downstairs_client_stopped_reason_type` in schema `public`, and
// `DownstairsClientStoppedReason` is the Rust enum stored with that SQL type.
// Each `Variant => b"label"` line pairs a Rust variant with a byte-string
// label.
// NOTE(review): `impl_enum_type!` is defined elsewhere; presumably the labels
// must match the Postgres enum's values exactly -- confirm against the schema.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "downstairs_client_stopped_reason_type", schema = "public"))]
pub struct DownstairsClientStoppedReasonEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = DownstairsClientStoppedReasonEnum)]
pub enum DownstairsClientStoppedReason;

// Reason types
ConnectionTimeout => b"connection_timeout"
ConnectionFailed => b"connection_failed"
Timeout => b"timeout"
WriteFailed => b"write_failed"
ReadFailed => b"read_failed"
RequestedStop => b"requested_stop"
Finished => b"finished"
QueueClosed => b"queue_closed"
ReceiveTaskCancelled => b"receive_task_cancelled"
);

/// Converts the internal API's stopped reason into the database model's enum.
/// Each API variant maps to the model variant of the same name.
impl From<internal::nexus::DownstairsClientStoppedReason>
    for DownstairsClientStoppedReason
{
    fn from(reason: internal::nexus::DownstairsClientStoppedReason) -> Self {
        // Short local alias for the API-side enum; `Self` is the model enum.
        use internal::nexus::DownstairsClientStoppedReason as Api;

        // Deliberately exhaustive (no catch-all arm) so that adding an API
        // variant is a compile error here until it is mapped.
        match reason {
            Api::ConnectionTimeout => Self::ConnectionTimeout,
            Api::ConnectionFailed => Self::ConnectionFailed,
            Api::Timeout => Self::Timeout,
            Api::WriteFailed => Self::WriteFailed,
            Api::ReadFailed => Self::ReadFailed,
            Api::RequestedStop => Self::RequestedStop,
            Api::Finished => Self::Finished,
            Api::QueueClosed => Self::QueueClosed,
            Api::ReceiveTaskCancelled => Self::ReceiveTaskCancelled,
        }
    }
}

/// A record of when a Downstairs client task stopped.
///
/// Rows of this type are read from and inserted into the
/// `downstairs_client_stopped_notification` table.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = downstairs_client_stopped_notification)]
pub struct DownstairsClientStoppedNotification {
// Importantly, this is client time, not Nexus' time that it received the
// notification.
pub time: DateTime<Utc>,

// Which Upstairs sent this notification?
pub upstairs_id: DbTypedUuid<UpstairsKind>,

// Which Downstairs client was stopped?
pub downstairs_id: DbTypedUuid<DownstairsKind>,

// Why did the client stop?
pub reason: DownstairsClientStoppedReason,
}
Loading

0 comments on commit 2406d9d

Please sign in to comment.