Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept notifications from Crucible #5135

Merged
merged 29 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
128a998
Accept live repair status reports from Crucible
jmpesp Feb 23, 2024
169b478
use replace directive for TypedUuidFor*
jmpesp Feb 23, 2024
7e7934e
no more underscores
jmpesp Feb 26, 2024
ff08b00
support status for live repair and reconciliation
jmpesp Feb 27, 2024
b1940f0
schema 37 -> 38
jmpesp Feb 27, 2024
1b03223
more schema update
jmpesp Feb 27, 2024
0cd8601
accept upstairs repair progress
jmpesp Feb 27, 2024
d2bf5f5
tests pass
jmpesp Feb 27, 2024
d6f41f3
Merge branch 'main' into crucible_repair_status_reports
jmpesp Feb 27, 2024
832e649
simple mismatched record type check
jmpesp Feb 28, 2024
6c44342
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 1, 2024
53947b3
bad merge
jmpesp Mar 1, 2024
1c81d75
move retry_until_known_result into common
jmpesp Mar 1, 2024
c39094e
prepend /crucible/0/
jmpesp Mar 8, 2024
63d05ed
add downstairs client task stopped notification
jmpesp Mar 11, 2024
567ce0c
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 11, 2024
a5e0c9f
schema 38 -> 41
jmpesp Mar 11, 2024
2b6fa26
downstairs_client_stopped_notification sql
jmpesp Mar 11, 2024
83106c3
fmt
jmpesp Mar 11, 2024
41e7aa7
snake case please
jmpesp Mar 11, 2024
86847b4
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 12, 2024
8bc626c
update URLs with prefix
jmpesp Mar 12, 2024
0b37b63
use new Error::non_resourcetype_not_found
jmpesp Mar 12, 2024
3dfabf0
use a variable, they are the same requests
jmpesp Mar 12, 2024
6e8b63c
test_upstairs_notify_downstairs_client_stop
jmpesp Mar 12, 2024
a5410aa
fmt
jmpesp Mar 12, 2024
df79ff2
separate endpoints for stop request and stopped
jmpesp Mar 13, 2024
737d18f
missing omicron.public. prefix
jmpesp Mar 14, 2024
215b014
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ progenitor::generate_api!(
NewPasswordHash = omicron_passwords::NewPasswordHash,

TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
TypedUuidForLiveRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::LiveRepairKind>,
}
);

Expand Down
24 changes: 21 additions & 3 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ use crate::api::external::{
};
use chrono::{DateTime, Utc};
use omicron_uuid_kinds::DownstairsRegionKind;
use omicron_uuid_kinds::LiveRepairKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsRepairKind;
use omicron_uuid_kinds::UpstairsSessionKind;
use parse_display::{Display, FromStr};
use schemars::JsonSchema;
Expand Down Expand Up @@ -256,6 +256,13 @@ pub enum HostIdentifier {
Vpc(Vni),
}

// The kind of repair a Crucible Upstairs is reporting on.
//
// Serialized in snake case (`live` / `reconciliation`) because of the
// `rename_all` attribute below.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum UpstairsRepairType {
// A live repair.
Live,
// A reconciliation.
Reconciliation,
}

#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsUnderRepair {
pub region_uuid: TypedUuid<DownstairsRegionKind>,
Expand All @@ -264,15 +271,26 @@ pub struct DownstairsUnderRepair {

// Describes the start of an Upstairs repair.
//
// NOTE(review): this hunk lists `repair_id` twice. The `LiveRepairKind` line
// looks like the pre-change field that the `UpstairsRepairKind` line
// replaces — confirm against the merged file.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairStartInfo {
// Timestamp carried by the report; presumably taken from the reporter's
// clock — TODO confirm.
pub time: DateTime<Utc>,
// Upstairs session the repair belongs to.
pub session_id: TypedUuid<UpstairsSessionKind>,
pub repair_id: TypedUuid<LiveRepairKind>,
// Identifier of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this is a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The downstairs regions covered by this repair.
pub repairs: Vec<DownstairsUnderRepair>,
}

// Describes the end of an Upstairs repair.
//
// NOTE(review): this hunk lists `repair_id` twice. The `LiveRepairKind` line
// looks like the pre-change field that the `UpstairsRepairKind` line
// replaces — confirm against the merged file.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairFinishInfo {
// Timestamp carried by the report; presumably taken from the reporter's
// clock — TODO confirm.
pub time: DateTime<Utc>,
// Upstairs session the repair belongs to.
pub session_id: TypedUuid<UpstairsSessionKind>,
pub repair_id: TypedUuid<LiveRepairKind>,
// Identifier of this repair.
pub repair_id: TypedUuid<UpstairsRepairKind>,
// Whether this was a live repair or a reconciliation.
pub repair_type: UpstairsRepairType,
// The downstairs regions covered by this repair.
pub repairs: Vec<DownstairsUnderRepair>,
// Presumably true when the repair stopped before completing — TODO confirm
// how the receiver interprets this flag.
pub aborted: bool,
}

// A progress report for a repair.
//
// `current_item` and `total_items` presumably express how far through its
// work the repair is — TODO confirm the unit of an "item" with the reporter.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct RepairProgress {
// Timestamp carried by the report.
pub time: DateTime<Utc>,
pub current_item: i64,
pub total_items: i64,
}
4 changes: 2 additions & 2 deletions nexus/db-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ mod ipv4net;
pub mod ipv6;
mod ipv6net;
mod l4_port_range;
mod live_repair;
mod macaddr;
mod name;
mod network_interface;
Expand Down Expand Up @@ -85,6 +84,7 @@ mod switch;
mod tuf_repo;
mod typed_uuid;
mod unsigned;
mod upstairs_repair;
mod user_builtin;
mod utilization;
mod virtual_provisioning_collection;
Expand Down Expand Up @@ -141,7 +141,6 @@ pub use ipv4net::*;
pub use ipv6::*;
pub use ipv6net::*;
pub use l4_port_range::*;
pub use live_repair::*;
pub use name::*;
pub use network_interface::*;
pub use oximeter_info::*;
Expand Down Expand Up @@ -176,6 +175,7 @@ pub use switch_interface::*;
pub use switch_port::*;
pub use tuf_repo::*;
pub use typed_uuid::to_db_typed_uuid;
pub use upstairs_repair::*;
pub use user_builtin::*;
pub use utilization::*;
pub use virtual_provisioning_collection::*;
Expand Down
95 changes: 0 additions & 95 deletions nexus/db-model/src/live_repair.rs

This file was deleted.

16 changes: 13 additions & 3 deletions nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(38, 0, 0);
leftwo marked this conversation as resolved.
Show resolved Hide resolved

table! {
disk (id) {
Expand Down Expand Up @@ -1519,18 +1519,28 @@ table! {
}

// Diesel schema for the table that records Upstairs repair notifications.
// The parenthesised column list after the table name is the composite
// primary key.
//
// NOTE(review): this hunk shows two header lines and two `notification_type`
// lines. The `live_repair_notification` / `LiveRepairNotificationTypeEnum`
// ones look like the pre-rename versions — confirm against the merged file.
table! {
live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
time -> Timestamptz,

repair_id -> Uuid,
repair_type -> crate::UpstairsRepairTypeEnum,
upstairs_id -> Uuid,
session_id -> Uuid,

// The region under repair and the address it is reached at, stored as
// separate IP and port columns.
region_id -> Uuid,
target_ip -> Inet,
target_port -> Int4,

notification_type -> crate::LiveRepairNotificationTypeEnum,
notification_type -> crate::UpstairsRepairNotificationTypeEnum,
}
}

// Diesel schema for the table that records Upstairs repair progress.
//
// The parenthesised column list after the table name is the primary key;
// here it spans every column of the table.
table! {
    upstairs_repair_progress (repair_id, time, current_item, total_items) {
        repair_id -> Uuid,
        time -> Timestamptz,
        current_item -> Int8,
        total_items -> Int8,
    }
}

Expand Down
154 changes: 154 additions & 0 deletions nexus/db-model/src/upstairs_repair.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use crate::ipv6;
use crate::schema::upstairs_repair_notification;
use crate::schema::upstairs_repair_progress;
use crate::typed_uuid::DbTypedUuid;
use crate::SqlU16;
use chrono::{DateTime, Utc};
use omicron_common::api::internal;
use omicron_uuid_kinds::DownstairsRegionKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsKind;
use omicron_uuid_kinds::UpstairsRepairKind;
use omicron_uuid_kinds::UpstairsSessionKind;
use serde::{Deserialize, Serialize};
use std::net::SocketAddrV6; // internal::nexus::UpstairsRepairType;

// Declares the database-facing enum for the kind of repair notification,
// tied to the Postgres type `upstairs_repair_notification_type`.
//
// Only `//` comments are used inside the invocation: a `///` comment would
// reach the macro as extra tokens.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "upstairs_repair_notification_type", schema = "public"))]
pub struct UpstairsRepairNotificationTypeEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = UpstairsRepairNotificationTypeEnum)]
pub enum UpstairsRepairNotificationType;

// Notification types. Each Rust variant is paired with a byte-string label,
// presumably the SQL enum value it is stored as — confirm against the macro.
Started => b"started"
Succeeded => b"succeeded"
Failed => b"failed"
);

// Declares the database-facing enum for the type of repair a Crucible
// Upstairs can do, tied to the Postgres type `upstairs_repair_type`.
//
// Only `//` comments are used inside the invocation: a `///` comment would
// reach the macro as extra tokens.
impl_enum_type!(
    #[derive(SqlType, Debug, QueryId)]
    #[diesel(postgres_type(name = "upstairs_repair_type", schema = "public"))]
    pub struct UpstairsRepairTypeEnum;

    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq, Hash)]
    #[diesel(sql_type = UpstairsRepairTypeEnum)]
    pub enum UpstairsRepairType;

    // Types of repair a Crucible Upstairs can do
    Live => b"live"
    Reconciliation => b"reconciliation"
);

impl From<internal::nexus::UpstairsRepairType> for UpstairsRepairType {
    /// Maps the internal-API repair type onto the database model's enum,
    /// variant for variant.
    fn from(v: internal::nexus::UpstairsRepairType) -> Self {
        // Local alias so the arms below stay on one line each.
        type Api = internal::nexus::UpstairsRepairType;

        match v {
            Api::Live => Self::Live,
            Api::Reconciliation => Self::Reconciliation,
        }
    }
}

/// A record of Crucible Upstairs repair notifications: when a repair started,
/// succeeded, failed, etc.
///
/// Each repair attempt is uniquely identified by the repair ID, upstairs ID,
/// session ID, and region ID. How those change tells Nexus about what is going
/// on:
///
/// - if all IDs are the same for different requests, Nexus knows that the
/// client is retrying the notification.
///
/// - if the upstairs ID, session ID, and region ID are all the same, but the
/// repair ID is different, then the same Upstairs is trying to repair that
/// region again. This could be due to a failed first attempt, or that
/// downstairs may have been kicked out again.
///
/// - if the upstairs ID and region ID are the same, but the session ID and
/// repair ID are different, then a different session of the same Upstairs is
/// trying to repair that Downstairs. Session IDs change each time the
/// Upstairs is created, so it could have crashed, or it could have been
/// migrated and the destination Propolis' Upstairs is attempting to repair
/// the same region.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = upstairs_repair_notification)]
pub struct UpstairsRepairNotification {
// Importantly, this is client time, not Nexus' time that it received the
// notification.
pub time: DateTime<Utc>,

// Identifies one repair attempt; a new value with otherwise identical IDs
// means the same Upstairs is repairing the region again.
pub repair_id: DbTypedUuid<UpstairsRepairKind>,

// There's a difference between the live repairs and reconciliation: the
// Upstairs can go through reconciliation without there being any error from
// a downstairs, or any region replacement request from Nexus. One example
// is if the rack power is pulled: if everything is powered back up again
// reconciliation could be required but this isn't the fault of any problem
// with a physical disk, or any error that was returned.
//
// Alternatively any record of a live repair means that there was a problem:
// Currently, either an Upstairs kicked out a Downstairs (or two) due to
// some error or because it lagged behind the others, or Nexus has
// instructed an Upstairs to perform a region replacement.
pub repair_type: UpstairsRepairType,

// The Upstairs performing the repair, and the session of that Upstairs.
pub upstairs_id: DbTypedUuid<UpstairsKind>,
pub session_id: DbTypedUuid<UpstairsSessionKind>,

// The region being repaired, plus the target's address stored as separate
// IP and port fields; `address()` recombines them into a `SocketAddrV6`.
pub region_id: DbTypedUuid<DownstairsRegionKind>,
pub target_ip: ipv6::Ipv6Addr,
pub target_port: SqlU16,

// Which event this row records: started, succeeded, or failed.
pub notification_type: UpstairsRepairNotificationType,
}

impl UpstairsRepairNotification {
    /// Builds a notification record from strongly typed inputs.
    ///
    /// Every typed UUID is converted into its database representation, and
    /// `target_addr` is split into the separately stored IP and port fields.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        time: DateTime<Utc>,
        repair_id: TypedUuid<UpstairsRepairKind>,
        repair_type: UpstairsRepairType,
        upstairs_id: TypedUuid<UpstairsKind>,
        session_id: TypedUuid<UpstairsSessionKind>,
        region_id: TypedUuid<DownstairsRegionKind>,
        target_addr: SocketAddrV6,
        notification_type: UpstairsRepairNotificationType,
    ) -> Self {
        // Take the socket address apart first so the struct literal below
        // can use field shorthand for both halves.
        let target_ip = target_addr.ip().into();
        let target_port = target_addr.port().into();

        Self {
            time,
            repair_id: repair_id.into(),
            repair_type,
            upstairs_id: upstairs_id.into(),
            session_id: session_id.into(),
            region_id: region_id.into(),
            target_ip,
            target_port,
            notification_type,
        }
    }

    /// Recombines the stored IP and port into the target's socket address.
    ///
    /// The flow info and scope ID of the returned address are both zero.
    pub fn address(&self) -> SocketAddrV6 {
        let ip = *self.target_ip;
        let port = *self.target_port;
        SocketAddrV6::new(ip, port, 0, 0)
    }
}

/// A record of Crucible Upstairs repair progress.
///
/// Maps onto the `upstairs_repair_progress` table; the field order matches
/// the column order declared in the schema.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = upstairs_repair_progress)]
pub struct UpstairsRepairProgress {
// The repair this progress sample belongs to.
pub repair_id: DbTypedUuid<UpstairsRepairKind>,
// Timestamp of the sample; presumably the reporting client's clock rather
// than Nexus' — TODO confirm.
pub time: DateTime<Utc>,
// Presumably how many of `total_items` had been processed at `time` —
// TODO confirm the unit of an "item".
pub current_item: i64,
pub total_items: i64,
}
Loading
Loading