Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept notifications from Crucible #5135

Merged
merged 29 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
128a998
Accept live repair status reports from Crucible
jmpesp Feb 23, 2024
169b478
use replace directive for TypedUuidFor*
jmpesp Feb 23, 2024
7e7934e
no more underscores
jmpesp Feb 26, 2024
ff08b00
support status for live repair and reconciliation
jmpesp Feb 27, 2024
b1940f0
schema 37 -> 38
jmpesp Feb 27, 2024
1b03223
more schema update
jmpesp Feb 27, 2024
0cd8601
accept upstairs repair progress
jmpesp Feb 27, 2024
d2bf5f5
tests pass
jmpesp Feb 27, 2024
d6f41f3
Merge branch 'main' into crucible_repair_status_reports
jmpesp Feb 27, 2024
832e649
simple mismatched record type check
jmpesp Feb 28, 2024
6c44342
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 1, 2024
53947b3
bad merge
jmpesp Mar 1, 2024
1c81d75
move retry_until_known_result into common
jmpesp Mar 1, 2024
c39094e
prepend /crucible/0/
jmpesp Mar 8, 2024
63d05ed
add downstairs client task stopped notification
jmpesp Mar 11, 2024
567ce0c
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 11, 2024
a5e0c9f
schema 38 -> 41
jmpesp Mar 11, 2024
2b6fa26
downstairs_client_stopped_notification sql
jmpesp Mar 11, 2024
83106c3
fmt
jmpesp Mar 11, 2024
41e7aa7
snake case please
jmpesp Mar 11, 2024
86847b4
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 12, 2024
8bc626c
update URLs with prefix
jmpesp Mar 12, 2024
0b37b63
use new Error::non_resourcetype_not_found
jmpesp Mar 12, 2024
3dfabf0
use a variable, they are the same requests
jmpesp Mar 12, 2024
6e8b63c
test_upstairs_notify_downstairs_client_stop
jmpesp Mar 12, 2024
a5410aa
fmt
jmpesp Mar 12, 2024
df79ff2
separate endpoints for stop request and stopped
jmpesp Mar 13, 2024
737d18f
missing omicron.public. prefix
jmpesp Mar 14, 2024
215b014
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ progenitor::generate_api!(
NewPasswordHash = omicron_passwords::NewPasswordHash,

TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
TypedUuidForLiveRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::LiveRepairKind>,
}
);

Expand Down
1 change: 1 addition & 0 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ toml.workspace = true
uuid.workspace = true
parse-display.workspace = true
progenitor.workspace = true
progenitor-client.workspace = true
omicron-workspace-hack.workspace = true
once_cell.workspace = true
regress.workspace = true
Expand Down
24 changes: 21 additions & 3 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ use crate::api::external::{
};
use chrono::{DateTime, Utc};
use omicron_uuid_kinds::DownstairsRegionKind;
use omicron_uuid_kinds::LiveRepairKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsRepairKind;
use omicron_uuid_kinds::UpstairsSessionKind;
use parse_display::{Display, FromStr};
use schemars::JsonSchema;
Expand Down Expand Up @@ -256,6 +256,13 @@ pub enum HostIdentifier {
Vpc(Vni),
}

// The kind of repair being reported. This value is carried in the
// `repair_type` field of `RepairStartInfo` and `RepairFinishInfo`.
//
// Variants are (de)serialized in snake_case (`live`, `reconciliation`) because
// of the `rename_all` attribute below.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum UpstairsRepairType {
// NOTE(review): presumably a Crucible "live repair" — confirm the exact
// meaning against the Crucible side of this API.
Live,
// NOTE(review): presumably a Crucible "reconciliation" repair — confirm the
// exact meaning against the Crucible side of this API.
Reconciliation,
}

#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsUnderRepair {
pub region_uuid: TypedUuid<DownstairsRegionKind>,
Expand All @@ -264,15 +271,26 @@ pub struct DownstairsUnderRepair {

#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairStartInfo {
pub time: DateTime<Utc>,
pub session_id: TypedUuid<UpstairsSessionKind>,
pub repair_id: TypedUuid<LiveRepairKind>,
pub repair_id: TypedUuid<UpstairsRepairKind>,
pub repair_type: UpstairsRepairType,
pub repairs: Vec<DownstairsUnderRepair>,
}

#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairFinishInfo {
pub time: DateTime<Utc>,
pub session_id: TypedUuid<UpstairsSessionKind>,
pub repair_id: TypedUuid<LiveRepairKind>,
pub repair_id: TypedUuid<UpstairsRepairKind>,
pub repair_type: UpstairsRepairType,
pub repairs: Vec<DownstairsUnderRepair>,
pub aborted: bool,
}

// A point-in-time progress report for a repair: a timestamp plus a
// "current item out of total items" pair.
//
// NOTE(review): what an "item" is, and whether `current_item` is zero- or
// one-based, is not visible here — confirm with the sender of these reports.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairProgress {
// Timestamp attached to this progress report (UTC).
pub time: DateTime<Utc>,
// Position reached so far; signed 64-bit to match the `Int8` database column.
pub current_item: i64,
// Total number of items in the repair; signed 64-bit for the same reason.
pub total_items: i64,
}
81 changes: 81 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,84 @@ impl slog::KV for FileKv {
}

pub const OMICRON_DPD_TAG: &str = "omicron";

use futures::Future;
use slog::warn;

/// Retry a progenitor client operation until a known result is returned.
///
/// Saga execution relies on the outcome of an external call being known: since
/// they are idempotent, reissue the external call until a known result comes
/// back. Retry if a communication error is seen, or if another retryable error
/// is seen.
///
/// Note that retrying is only valid if the call itself is idempotent.
pub async fn retry_until_known_result<F, T, E, Fut>(
log: &slog::Logger,
mut f: F,
) -> Result<T, progenitor_client::Error<E>>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
E: std::fmt::Debug,
{
backoff::retry_notify(
backoff::retry_policy_internal_service(),
move || {
let fut = f();
async move {
match fut.await {
Err(progenitor_client::Error::CommunicationError(e)) => {
warn!(
log,
"saw transient communication error {}, retrying...",
e,
);

Err(backoff::BackoffError::transient(
progenitor_client::Error::CommunicationError(e),
))
}

Err(progenitor_client::Error::ErrorResponse(
response_value,
)) => {
match response_value.status() {
// Retry on 503 or 429
http::StatusCode::SERVICE_UNAVAILABLE
| http::StatusCode::TOO_MANY_REQUESTS => {
Err(backoff::BackoffError::transient(
progenitor_client::Error::ErrorResponse(
response_value,
),
))
}

// Anything else is a permanent error
_ => Err(backoff::BackoffError::Permanent(
progenitor_client::Error::ErrorResponse(
response_value,
),
)),
}
}

Err(e) => {
warn!(log, "saw permanent error {}, aborting", e,);

Err(backoff::BackoffError::Permanent(e))
}

Ok(v) => Ok(v),
}
}
},
|error: progenitor_client::Error<_>, delay| {
warn!(
log,
"failed external call ({:?}), will retry in {:?}", error, delay,
);
},
)
.await
}
2 changes: 1 addition & 1 deletion nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
tough.workspace = true
trust-dns-resolver.workspace = true
uuid.workspace = true
nexus-blueprint-execution.workspace = true

nexus-defaults.workspace = true
nexus-db-model.workspace = true
nexus-db-queries.workspace = true
Expand Down
4 changes: 2 additions & 2 deletions nexus/db-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ mod ipv4net;
pub mod ipv6;
mod ipv6net;
mod l4_port_range;
mod live_repair;
mod macaddr;
mod name;
mod network_interface;
Expand Down Expand Up @@ -85,6 +84,7 @@ mod switch;
mod tuf_repo;
mod typed_uuid;
mod unsigned;
mod upstairs_repair;
mod user_builtin;
mod utilization;
mod virtual_provisioning_collection;
Expand Down Expand Up @@ -141,7 +141,6 @@ pub use ipv4net::*;
pub use ipv6::*;
pub use ipv6net::*;
pub use l4_port_range::*;
pub use live_repair::*;
pub use name::*;
pub use network_interface::*;
pub use oximeter_info::*;
Expand Down Expand Up @@ -176,6 +175,7 @@ pub use switch_interface::*;
pub use switch_port::*;
pub use tuf_repo::*;
pub use typed_uuid::to_db_typed_uuid;
pub use upstairs_repair::*;
pub use user_builtin::*;
pub use utilization::*;
pub use virtual_provisioning_collection::*;
Expand Down
95 changes: 0 additions & 95 deletions nexus/db-model/src/live_repair.rs

This file was deleted.

16 changes: 13 additions & 3 deletions nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(38, 0, 0);
leftwo marked this conversation as resolved.
Show resolved Hide resolved

table! {
disk (id) {
Expand Down Expand Up @@ -1520,18 +1520,28 @@ table! {
}

table! {
live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
time -> Timestamptz,

repair_id -> Uuid,
repair_type -> crate::UpstairsRepairTypeEnum,
upstairs_id -> Uuid,
session_id -> Uuid,

region_id -> Uuid,
target_ip -> Inet,
target_port -> Int4,

notification_type -> crate::LiveRepairNotificationTypeEnum,
notification_type -> crate::UpstairsRepairNotificationTypeEnum,
}
}

table! {
upstairs_repair_progress (repair_id, time, current_item, total_items) {
repair_id -> Uuid,
time -> Timestamptz,
current_item -> Int8,
total_items -> Int8,
leftwo marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down
Loading
Loading