Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept notifications from Crucible #5135

Merged
merged 29 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
128a998
Accept live repair status reports from Crucible
jmpesp Feb 23, 2024
169b478
use replace directive for TypedUuidFor*
jmpesp Feb 23, 2024
7e7934e
no more underscores
jmpesp Feb 26, 2024
ff08b00
support status for live repair and reconciliation
jmpesp Feb 27, 2024
b1940f0
schema 37 -> 38
jmpesp Feb 27, 2024
1b03223
more schema update
jmpesp Feb 27, 2024
0cd8601
accept upstairs repair progress
jmpesp Feb 27, 2024
d2bf5f5
tests pass
jmpesp Feb 27, 2024
d6f41f3
Merge branch 'main' into crucible_repair_status_reports
jmpesp Feb 27, 2024
832e649
simple mismatched record type check
jmpesp Feb 28, 2024
6c44342
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 1, 2024
53947b3
bad merge
jmpesp Mar 1, 2024
1c81d75
move retry_until_known_result into common
jmpesp Mar 1, 2024
c39094e
prepend /crucible/0/
jmpesp Mar 8, 2024
63d05ed
add downstairs client task stopped notification
jmpesp Mar 11, 2024
567ce0c
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 11, 2024
a5e0c9f
schema 38 -> 41
jmpesp Mar 11, 2024
2b6fa26
downstairs_client_stopped_notification sql
jmpesp Mar 11, 2024
83106c3
fmt
jmpesp Mar 11, 2024
41e7aa7
snake case please
jmpesp Mar 11, 2024
86847b4
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 12, 2024
8bc626c
update URLs with prefix
jmpesp Mar 12, 2024
0b37b63
use new Error::non_resourcetype_not_found
jmpesp Mar 12, 2024
3dfabf0
use a variable, they are the same requests
jmpesp Mar 12, 2024
6e8b63c
test_upstairs_notify_downstairs_client_stop
jmpesp Mar 12, 2024
a5410aa
fmt
jmpesp Mar 12, 2024
df79ff2
separate endpoints for stop request and stopped
jmpesp Mar 13, 2024
737d18f
missing omicron.public. prefix
jmpesp Mar 14, 2024
215b014
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions clients/nexus-client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ serde_json.workspace = true
slog.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
omicron-uuid-kinds.workspace = true
4 changes: 4 additions & 0 deletions clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ progenitor::generate_api!(
NewPasswordHash = omicron_passwords::NewPasswordHash,
NetworkInterface = omicron_common::api::internal::shared::NetworkInterface,
NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
},
patch = {
SledAgentInfo = { derives = [PartialEq, Eq] },
Expand Down
1 change: 1 addition & 0 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ tokio = { workspace = true, features = ["full"] }
uuid.workspace = true
parse-display.workspace = true
progenitor.workspace = true
progenitor-client.workspace = true
omicron-workspace-hack.workspace = true
once_cell.workspace = true
regress.workspace = true
Expand Down
83 changes: 83 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ use crate::api::external::{
InstanceState, IpNet, SemverVersion, Vni,
};
use chrono::{DateTime, Utc};
use omicron_uuid_kinds::DownstairsRegionKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsRepairKind;
use omicron_uuid_kinds::UpstairsSessionKind;
use parse_display::{Display, FromStr};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -251,3 +255,82 @@ pub enum HostIdentifier {
Ip(IpNet),
Vpc(Vni),
}

// The kind of repair an Upstairs is reporting on.
//
// Serialized in snake_case (`live`, `reconciliation`) because of the
// `rename_all` attribute below.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum UpstairsRepairType {
Live,
Reconciliation,
}

// One Downstairs taking part in a repair, identified by a region UUID and an
// IPv6 socket address.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsUnderRepair {
// Typed UUID (kind: `DownstairsRegionKind`) of the region.
pub region_uuid: TypedUuid<DownstairsRegionKind>,
// NOTE(review): presumably the address of the Downstairs being repaired --
// confirm against the Crucible sender.
pub target_addr: std::net::SocketAddrV6,
}

// Payload describing the start of an Upstairs repair.
//
// Carries the same fields as `RepairFinishInfo` except for `aborted`.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairStartInfo {
// Timestamp (UTC) carried in the report.
// NOTE(review): presumably the sender's clock rather than Nexus' receive
// time -- confirm.
pub time: DateTime<Utc>,
// Typed UUID (kind: `UpstairsSessionKind`) of the Upstairs session.
pub session_id: TypedUuid<UpstairsSessionKind>,
// Typed UUID (kind: `UpstairsRepairKind`) of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this is a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The Downstairs involved in this repair.
pub repairs: Vec<DownstairsUnderRepair>,
}

// Payload describing the end of an Upstairs repair.
//
// Carries the same fields as `RepairStartInfo`, plus `aborted`.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairFinishInfo {
// Timestamp (UTC) carried in the report.
// NOTE(review): presumably the sender's clock rather than Nexus' receive
// time -- confirm.
pub time: DateTime<Utc>,
// Typed UUID (kind: `UpstairsSessionKind`) of the Upstairs session.
pub session_id: TypedUuid<UpstairsSessionKind>,
// Typed UUID (kind: `UpstairsRepairKind`) of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this was a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The Downstairs involved in this repair.
pub repairs: Vec<DownstairsUnderRepair>,
// Flag reported by the sender.
// NOTE(review): presumably `true` means the repair stopped before
// completing -- confirm.
pub aborted: bool,
}

// Progress report for an in-flight repair, expressed as a position within a
// total item count.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairProgress {
// Timestamp (UTC) carried in the report.
pub time: DateTime<Utc>,
// NOTE(review): signed 64-bit values; nothing here enforces
// `0 <= current_item <= total_items` -- confirm whether receivers validate.
pub current_item: i64,
pub total_items: i64,
}

// Why an Upstairs asked one of its Downstairs client tasks to stop.
//
// Variants are serialized in snake_case because of the `rename_all`
// attribute below.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DownstairsClientStopRequestReason {
Replacing,
Disabled,
FailedReconcile,
// NOTE(review): serde's snake_case rule inserts an underscore before every
// uppercase letter, so this variant is serialized as `i_o_error`, not
// `io_error` -- confirm senders use the same spelling.
IOError,
BadNegotiationOrder,
Incompatible,
FailedLiveRepair,
TooManyOutstandingJobs,
Deactivated,
}

// Payload reporting that a Downstairs client task was asked to stop.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsClientStopRequest {
// Timestamp (UTC) carried in the report.
pub time: DateTime<Utc>,
// Why the stop was requested.
pub reason: DownstairsClientStopRequestReason,
}

// Why a Downstairs client task stopped.
//
// Variants are serialized in snake_case (e.g. `connection_timeout`) because
// of the `rename_all` attribute below.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DownstairsClientStoppedReason {
ConnectionTimeout,
ConnectionFailed,
Timeout,
WriteFailed,
ReadFailed,
RequestedStop,
Finished,
QueueClosed,
ReceiveTaskCancelled,
}

// Payload reporting that a Downstairs client task has stopped.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsClientStopped {
// Timestamp (UTC) carried in the report.
pub time: DateTime<Utc>,
// Why the task stopped.
pub reason: DownstairsClientStoppedReason,
}
81 changes: 81 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,84 @@ impl slog::KV for FileKv {
}

pub const OMICRON_DPD_TAG: &str = "omicron";

use futures::Future;
use slog::warn;

/// Retry a progenitor client operation until a known result is returned.
///
/// Saga execution relies on the outcome of an external call being known: since
/// they are idempotent, reissue the external call until a known result comes
/// back. Retry if a communication error is seen, or if another retryable error
/// is seen.
///
/// Note that retrying is only valid if the call itself is idempotent.
pub async fn retry_until_known_result<F, T, E, Fut>(
log: &slog::Logger,
mut f: F,
) -> Result<T, progenitor_client::Error<E>>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
E: std::fmt::Debug,
{
backoff::retry_notify(
backoff::retry_policy_internal_service(),
move || {
let fut = f();
async move {
match fut.await {
Err(progenitor_client::Error::CommunicationError(e)) => {
warn!(
log,
"saw transient communication error {}, retrying...",
e,
);

Err(backoff::BackoffError::transient(
progenitor_client::Error::CommunicationError(e),
))
}

Err(progenitor_client::Error::ErrorResponse(
response_value,
)) => {
match response_value.status() {
// Retry on 503 or 429
http::StatusCode::SERVICE_UNAVAILABLE
| http::StatusCode::TOO_MANY_REQUESTS => {
Err(backoff::BackoffError::transient(
progenitor_client::Error::ErrorResponse(
response_value,
),
))
}

// Anything else is a permanent error
_ => Err(backoff::BackoffError::Permanent(
progenitor_client::Error::ErrorResponse(
response_value,
),
)),
}
}

Err(e) => {
warn!(log, "saw permanent error {}, aborting", e,);

Err(backoff::BackoffError::Permanent(e))
}

Ok(v) => Ok(v),
}
}
},
|error: progenitor_client::Error<_>, delay| {
warn!(
log,
"failed external call ({:?}), will retry in {:?}", error, delay,
);
},
)
.await
}
1 change: 1 addition & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ rustls = { workspace = true }
rustls-pemfile = { workspace = true }
update-common.workspace = true
omicron-workspace-hack.workspace = true
omicron-uuid-kinds.workspace = true

[dev-dependencies]
async-bb8-diesel.workspace = true
Expand Down
133 changes: 133 additions & 0 deletions nexus/db-model/src/downstairs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use crate::schema::downstairs_client_stop_request_notification;
use crate::schema::downstairs_client_stopped_notification;
use crate::typed_uuid::DbTypedUuid;
use chrono::{DateTime, Utc};
use omicron_common::api::internal;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::UpstairsKind;
use serde::{Deserialize, Serialize};

// Types for stop request notification

// Database representation of the reason an Upstairs asked a Downstairs
// client task to stop.
//
// Declares the SQL type marker (`DownstairsClientStopRequestReasonEnum`,
// bound to the Postgres type `downstairs_client_stop_request_reason_type`)
// and the Rust enum `DownstairsClientStopRequestReason`, with each variant
// paired with a byte-string label.
// NOTE(review): `impl_enum_type!` is defined elsewhere; presumably it also
// generates the SQL conversion glue between variants and labels -- confirm.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "downstairs_client_stop_request_reason_type", schema = "public"))]
pub struct DownstairsClientStopRequestReasonEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = DownstairsClientStopRequestReasonEnum)]
pub enum DownstairsClientStopRequestReason;

// Reason types: Rust variant => label
Replacing => b"replacing"
Disabled => b"disabled"
FailedReconcile => b"failed_reconcile"
IOError => b"io_error"
BadNegotiationOrder => b"bad_negotiation_order"
Incompatible => b"incompatible"
FailedLiveRepair => b"failed_live_repair"
TooManyOutstandingJobs => b"too_many_outstanding_jobs"
Deactivated => b"deactivated"
);

impl From<internal::nexus::DownstairsClientStopRequestReason>
for DownstairsClientStopRequestReason
{
fn from(
v: internal::nexus::DownstairsClientStopRequestReason,
) -> DownstairsClientStopRequestReason {
match v {
internal::nexus::DownstairsClientStopRequestReason::Replacing => DownstairsClientStopRequestReason::Replacing,
internal::nexus::DownstairsClientStopRequestReason::Disabled => DownstairsClientStopRequestReason::Disabled,
internal::nexus::DownstairsClientStopRequestReason::FailedReconcile => DownstairsClientStopRequestReason::FailedReconcile,
internal::nexus::DownstairsClientStopRequestReason::IOError => DownstairsClientStopRequestReason::IOError,
internal::nexus::DownstairsClientStopRequestReason::BadNegotiationOrder => DownstairsClientStopRequestReason::BadNegotiationOrder,
internal::nexus::DownstairsClientStopRequestReason::Incompatible => DownstairsClientStopRequestReason::Incompatible,
internal::nexus::DownstairsClientStopRequestReason::FailedLiveRepair => DownstairsClientStopRequestReason::FailedLiveRepair,
internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs => DownstairsClientStopRequestReason::TooManyOutstandingJobs,
internal::nexus::DownstairsClientStopRequestReason::Deactivated => DownstairsClientStopRequestReason::Deactivated,
}
}
}

/// A record of an Upstairs requesting that one of its Downstairs client
/// tasks stop.
///
/// Maps to the `downstairs_client_stop_request_notification` table.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = downstairs_client_stop_request_notification)]
pub struct DownstairsClientStopRequestNotification {
// Importantly, this is the client's timestamp, not the time at which Nexus
// received the notification.
pub time: DateTime<Utc>,

// Which Upstairs sent this notification?
pub upstairs_id: DbTypedUuid<UpstairsKind>,

// Which Downstairs client was requested to stop?
pub downstairs_id: DbTypedUuid<DownstairsKind>,

// Why the stop was requested.
pub reason: DownstairsClientStopRequestReason,
}

// Types for stopped notification

// Database representation of the reason a Downstairs client task stopped.
//
// Declares the SQL type marker (`DownstairsClientStoppedReasonEnum`, bound to
// the Postgres type `downstairs_client_stopped_reason_type`) and the Rust
// enum `DownstairsClientStoppedReason`, with each variant paired with a
// byte-string label.
// NOTE(review): `impl_enum_type!` is defined elsewhere; presumably it also
// generates the SQL conversion glue between variants and labels -- confirm.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "downstairs_client_stopped_reason_type", schema = "public"))]
pub struct DownstairsClientStoppedReasonEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = DownstairsClientStoppedReasonEnum)]
pub enum DownstairsClientStoppedReason;

// Reason types: Rust variant => label
ConnectionTimeout => b"connection_timeout"
ConnectionFailed => b"connection_failed"
Timeout => b"timeout"
WriteFailed => b"write_failed"
ReadFailed => b"read_failed"
RequestedStop => b"requested_stop"
Finished => b"finished"
QueueClosed => b"queue_closed"
ReceiveTaskCancelled => b"receive_task_cancelled"
);

impl From<internal::nexus::DownstairsClientStoppedReason>
    for DownstairsClientStoppedReason
{
    /// Maps each internal-API stopped reason onto the database enum variant
    /// of the same name.
    fn from(v: internal::nexus::DownstairsClientStoppedReason) -> Self {
        // Local alias so the one-to-one mapping stays readable.
        type ApiReason = internal::nexus::DownstairsClientStoppedReason;

        match v {
            ApiReason::ConnectionTimeout => Self::ConnectionTimeout,
            ApiReason::ConnectionFailed => Self::ConnectionFailed,
            ApiReason::Timeout => Self::Timeout,
            ApiReason::WriteFailed => Self::WriteFailed,
            ApiReason::ReadFailed => Self::ReadFailed,
            ApiReason::RequestedStop => Self::RequestedStop,
            ApiReason::Finished => Self::Finished,
            ApiReason::QueueClosed => Self::QueueClosed,
            ApiReason::ReceiveTaskCancelled => Self::ReceiveTaskCancelled,
        }
    }
}

/// A record of a Downstairs client task having stopped.
///
/// Maps to the `downstairs_client_stopped_notification` table.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = downstairs_client_stopped_notification)]
pub struct DownstairsClientStoppedNotification {
// Importantly, this is the client's timestamp, not the time at which Nexus
// received the notification.
pub time: DateTime<Utc>,

// Which Upstairs sent this notification?
pub upstairs_id: DbTypedUuid<UpstairsKind>,

// Which Downstairs client was stopped?
pub downstairs_id: DbTypedUuid<DownstairsKind>,

// Why the task stopped.
pub reason: DownstairsClientStoppedReason,
}
Loading
Loading