Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept notifications from Crucible #5135

Merged
merged 29 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
128a998
Accept live repair status reports from Crucible
jmpesp Feb 23, 2024
169b478
use replace directive for TypedUuidFor*
jmpesp Feb 23, 2024
7e7934e
no more underscores
jmpesp Feb 26, 2024
ff08b00
support status for live repair and reconciliation
jmpesp Feb 27, 2024
b1940f0
schema 37 -> 38
jmpesp Feb 27, 2024
1b03223
more schema update
jmpesp Feb 27, 2024
0cd8601
accept upstairs repair progress
jmpesp Feb 27, 2024
d2bf5f5
tests pass
jmpesp Feb 27, 2024
d6f41f3
Merge branch 'main' into crucible_repair_status_reports
jmpesp Feb 27, 2024
832e649
simple mismatched record type check
jmpesp Feb 28, 2024
6c44342
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 1, 2024
53947b3
bad merge
jmpesp Mar 1, 2024
1c81d75
move retry_until_known_result into common
jmpesp Mar 1, 2024
c39094e
prepend /crucible/0/
jmpesp Mar 8, 2024
63d05ed
add downstairs client task stopped notification
jmpesp Mar 11, 2024
567ce0c
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 11, 2024
a5e0c9f
schema 38 -> 41
jmpesp Mar 11, 2024
2b6fa26
downstairs_client_stopped_notification sql
jmpesp Mar 11, 2024
83106c3
fmt
jmpesp Mar 11, 2024
41e7aa7
snake case please
jmpesp Mar 11, 2024
86847b4
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 12, 2024
8bc626c
update URLs with prefix
jmpesp Mar 12, 2024
0b37b63
use new Error::non_resourcetype_not_found
jmpesp Mar 12, 2024
3dfabf0
use a variable, they are the same requests
jmpesp Mar 12, 2024
6e8b63c
test_upstairs_notify_downstairs_client_stop
jmpesp Mar 12, 2024
a5410aa
fmt
jmpesp Mar 12, 2024
df79ff2
separate endpoints for stop request and stopped
jmpesp Mar 13, 2024
737d18f
missing omicron.public. prefix
jmpesp Mar 14, 2024
215b014
Merge branch 'main' into crucible_repair_status_reports
jmpesp Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,17 @@ progenitor::generate_api!(
MacAddr = omicron_common::api::external::MacAddr,
Name = omicron_common::api::external::Name,
NewPasswordHash = omicron_passwords::NewPasswordHash,

NetworkInterface = omicron_common::api::internal::shared::NetworkInterface,
NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
},
patch = {
SledAgentInfo = { derives = [PartialEq, Eq] },
ByteCount = { derives = [PartialEq, Eq] },
Baseboard = { derives = [PartialEq, Eq] }
}
);

Expand Down
20 changes: 20 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,23 @@ pub struct RepairProgress {
pub current_item: i64,
pub total_items: i64,
}

// The reason carried in a `DownstairsClientStopped` report.
//
// Plain `//` comments are used on purpose: this type derives `JsonSchema`,
// and `///` doc comments would presumably end up in the generated schema.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
// Variants are (de)serialized under snake_case names, not their Rust spelling.
#[serde(rename_all = "snake_case")]
pub enum DownstairsClientStopReason {
Replacing,
Disabled,
FailedReconcile,
// NOTE(review): serde's snake_case rule puts an underscore before every
// interior uppercase letter, so this variant is presumably serialized as
// "i_o_error" rather than "io_error" -- confirm that is the intended name.
IOError,
BadNegotiationOrder,
Incompatible,
FailedLiveRepair,
TooManyOutstandingJobs,
Deactivated,
}

// A report that a Downstairs client was stopped: when it happened and why.
//
// Plain `//` comments are used on purpose: this type derives `JsonSchema`,
// and `///` doc comments would presumably end up in the generated schema.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
pub struct DownstairsClientStopped {
// Timestamp supplied by the sender of the report (UTC).
pub time: DateTime<Utc>,
// Why the client was stopped.
pub reason: DownstairsClientStopReason,
}
70 changes: 70 additions & 0 deletions nexus/db-model/src/downstairs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use crate::schema::downstairs_client_stopped_notification;
use crate::typed_uuid::DbTypedUuid;
use chrono::{DateTime, Utc};
use omicron_common::api::internal;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::UpstairsKind;
use serde::{Deserialize, Serialize};

// Declares two items through the crate's `impl_enum_type!` helper:
//   * `DownstairsClientStopReasonEnum`, the diesel SQL type bound to the
//     Postgres type `downstairs_client_stop_reason_type`, and
//   * `DownstairsClientStopReason`, the Rust enum stored in columns of that
//     SQL type.
impl_enum_type!(
#[derive(SqlType, Debug, QueryId)]
#[diesel(postgres_type(name = "downstairs_client_stop_reason_type", schema = "public"))]
pub struct DownstairsClientStopReasonEnum;

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
#[diesel(sql_type = DownstairsClientStopReasonEnum)]
pub enum DownstairsClientStopReason;

// Reason types
//
// Each line pairs a Rust variant with a byte-string label; presumably the
// label is the textual value of the database enum (see `impl_enum_type!`).
Replacing => b"replacing"
Disabled => b"disabled"
FailedReconcile => b"failed_reconcile"
IOError => b"io_error"
BadNegotiationOrder => b"bad_negotiation_order"
Incompatible => b"incompatible"
FailedLiveRepair => b"failed_live_repair"
TooManyOutstandingJobs => b"too_many_outstanding_jobs"
Deactivated => b"deactivated"
);

impl From<internal::nexus::DownstairsClientStopReason>
for DownstairsClientStopReason
{
fn from(
v: internal::nexus::DownstairsClientStopReason,
) -> DownstairsClientStopReason {
match v {
internal::nexus::DownstairsClientStopReason::Replacing => DownstairsClientStopReason::Replacing,
internal::nexus::DownstairsClientStopReason::Disabled => DownstairsClientStopReason::Disabled,
internal::nexus::DownstairsClientStopReason::FailedReconcile => DownstairsClientStopReason::FailedReconcile,
internal::nexus::DownstairsClientStopReason::IOError => DownstairsClientStopReason::IOError,
internal::nexus::DownstairsClientStopReason::BadNegotiationOrder => DownstairsClientStopReason::BadNegotiationOrder,
internal::nexus::DownstairsClientStopReason::Incompatible => DownstairsClientStopReason::Incompatible,
internal::nexus::DownstairsClientStopReason::FailedLiveRepair => DownstairsClientStopReason::FailedLiveRepair,
internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs => DownstairsClientStopReason::TooManyOutstandingJobs,
internal::nexus::DownstairsClientStopReason::Deactivated => DownstairsClientStopReason::Deactivated,
}
}
}

/// A record of when an Upstairs stopped a Downstairs client task, and why.
///
/// This is the row type for the `downstairs_client_stopped_notification`
/// table; it is used both for inserting and for reading rows.
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = downstairs_client_stopped_notification)]
pub struct DownstairsClientStoppedNotification {
// Importantly, this is the client's time (the timestamp carried in the
// notification), not the time at which Nexus received the notification.
pub time: DateTime<Utc>,

// Which Upstairs sent this notification?
pub upstairs_id: DbTypedUuid<UpstairsKind>,

// Which Downstairs client was stopped?
pub downstairs_id: DbTypedUuid<DownstairsKind>,

// Why the client was stopped.
pub reason: DownstairsClientStopReason,
}
2 changes: 2 additions & 0 deletions nexus/db-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ mod digest;
mod disk;
mod disk_state;
mod dns;
mod downstairs;
mod external_ip;
mod generation;
mod identity_provider;
Expand Down Expand Up @@ -128,6 +129,7 @@ pub use digest::*;
pub use disk::*;
pub use disk_state::*;
pub use dns::*;
pub use downstairs::*;
pub use external_ip::*;
pub use generation::*;
pub use identity_provider::*;
Expand Down
24 changes: 23 additions & 1 deletion nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(40, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(41, 0, 0);

table! {
disk (id) {
Expand Down Expand Up @@ -1521,6 +1521,19 @@ table! {
}
}

// Diesel table definition for `probe`, keyed by `id`.
table! {
probe (id) {
id -> Uuid,
name -> Text,
description -> Text,
time_created -> Timestamptz,
time_modified -> Timestamptz,
// Nullable: presumably NULL until the row is soft-deleted -- confirm.
time_deleted -> Nullable<Timestamptz>,
project_id -> Uuid,
sled -> Uuid,
}
}

table! {
upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
time -> Timestamptz,
Expand All @@ -1547,6 +1560,15 @@ table! {
}
}

// Diesel table definition for notifications that a Downstairs client task
// was stopped.
//
// The key names every column, in the same order as the conflict target used
// when these rows are inserted (time, upstairs_id, downstairs_id, reason).
// The earlier key left out `upstairs_id`, so diesel's idea of row identity
// disagreed with the uniqueness the insert path relies on.
// NOTE(review): confirm this matches the PRIMARY KEY declared in the SQL
// schema for this table.
table! {
    downstairs_client_stopped_notification (time, upstairs_id, downstairs_id, reason) {
        time -> Timestamptz,
        upstairs_id -> Uuid,
        downstairs_id -> Uuid,
        reason -> crate::DownstairsClientStopReasonEnum,
    }
}

table! {
db_metadata (singleton) {
singleton -> Bool,
Expand Down
39 changes: 37 additions & 2 deletions nexus/db-queries/src/db/datastore/volume.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::identity::Asset;
use crate::db::model::Dataset;
use crate::db::model::DownstairsClientStoppedNotification;
use crate::db::model::Region;
use crate::db::model::RegionSnapshot;
use crate::db::model::UpstairsRepairNotification;
Expand All @@ -29,7 +30,9 @@ use omicron_common::api::external::Error;
use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::ResourceType;
use omicron_common::api::internal::nexus::DownstairsClientStopped;
use omicron_common::api::internal::nexus::RepairProgress;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsKind;
use omicron_uuid_kinds::UpstairsRepairKind;
Expand Down Expand Up @@ -960,8 +963,7 @@ impl DataStore {
.optional()?;

if matching_repair.is_none() {
// XXX should be 404
return Err(err.bail(Error::invalid_request(&format!(
return Err(err.bail(Error::non_resourcetype_not_found(&format!(
"upstairs {upstairs_id} repair {repair_id} not found"
))));
}
Expand All @@ -988,6 +990,39 @@ impl DataStore {
}
})
}

/// Record when a Downstairs client is stopped, and why
pub async fn downstairs_stopped_notification(
&self,
opctx: &OpContext,
upstairs_id: TypedUuid<UpstairsKind>,
downstairs_id: TypedUuid<DownstairsKind>,
downstairs_client_stopped: DownstairsClientStopped,
) -> Result<(), Error> {
use db::schema::downstairs_client_stopped_notification::dsl;

let conn = self.pool_connection_authorized(opctx).await?;

diesel::insert_into(dsl::downstairs_client_stopped_notification)
.values(DownstairsClientStoppedNotification {
time: downstairs_client_stopped.time,
upstairs_id: upstairs_id.into(),
downstairs_id: downstairs_id.into(),
reason: downstairs_client_stopped.reason.into(),
})
.on_conflict((
dsl::time,
dsl::upstairs_id,
dsl::downstairs_id,
dsl::reason,
))
.do_nothing()
.execute_async(&*conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

Ok(())
}
}

#[derive(Default, Clone, Debug, Serialize, Deserialize)]
Expand Down
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/pool_connection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[
"caboose_which",
"dataset_kind",
"dns_group",
"downstairs_client_stop_reason_type",
"hw_power_state",
"hw_rot_slot",
"identity_type",
Expand Down
26 changes: 26 additions & 0 deletions nexus/src/app/volume.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ use nexus_db_model::UpstairsRepairNotificationType;
use nexus_db_queries::authn;
use nexus_db_queries::context::OpContext;
use omicron_common::api::external::DeleteResult;
use omicron_common::api::internal::nexus::DownstairsClientStopped;
use omicron_common::api::internal::nexus::RepairFinishInfo;
use omicron_common::api::internal::nexus::RepairProgress;
use omicron_common::api::internal::nexus::RepairStartInfo;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsKind;
use omicron_uuid_kinds::UpstairsRepairKind;
Expand Down Expand Up @@ -140,4 +142,28 @@ impl super::Nexus {
)
.await
}

/// Handle a report from an Upstairs that a Downstairs client task was
/// stopped: log what was received, then pass it to the datastore.
pub(crate) async fn downstairs_stopped_notification(
    self: &Arc<Self>,
    opctx: &OpContext,
    upstairs_id: TypedUuid<UpstairsKind>,
    downstairs_id: TypedUuid<DownstairsKind>,
    downstairs_client_stopped: DownstairsClientStopped,
) -> DeleteResult {
    info!(
        self.log,
        "received downstairs_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}",
        downstairs_client_stopped,
    );

    // The datastore's result is returned to the caller unchanged.
    let datastore = &self.db_datastore;
    datastore
        .downstairs_stopped_notification(
            opctx,
            upstairs_id,
            downstairs_id,
            downstairs_client_stopped,
        )
        .await
}
}
45 changes: 42 additions & 3 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@ use omicron_common::api::external::http_pagination::ScanById;
use omicron_common::api::external::http_pagination::ScanParams;
use omicron_common::api::external::Error;
use omicron_common::api::internal::nexus::DiskRuntimeState;
use omicron_common::api::internal::nexus::DownstairsClientStopped;
use omicron_common::api::internal::nexus::ProducerEndpoint;
use omicron_common::api::internal::nexus::RepairFinishInfo;
use omicron_common::api::internal::nexus::RepairProgress;
use omicron_common::api::internal::nexus::RepairStartInfo;
use omicron_common::api::internal::nexus::SledInstanceState;
use omicron_common::update::ArtifactId;
use omicron_uuid_kinds::DownstairsKind;
use omicron_uuid_kinds::TypedUuid;
use omicron_uuid_kinds::UpstairsKind;
use omicron_uuid_kinds::UpstairsRepairKind;
Expand Down Expand Up @@ -85,6 +87,7 @@ pub(crate) fn internal_api() -> NexusApiDescription {
api.register(cpapi_upstairs_repair_start)?;
api.register(cpapi_upstairs_repair_finish)?;
api.register(cpapi_upstairs_repair_progress)?;
api.register(cpapi_downstairs_stopped)?;

api.register(saga_list)?;
api.register(saga_view)?;
Expand Down Expand Up @@ -528,7 +531,7 @@ struct UpstairsPathParam {
/// An Upstairs will notify this endpoint when a repair starts
#[endpoint {
method = POST,
path = "/upstairs/{upstairs_id}/repair-start",
path = "/crucible/0/upstairs/{upstairs_id}/repair-start",
}]
async fn cpapi_upstairs_repair_start(
rqctx: RequestContext<Arc<ServerContext>>,
Expand Down Expand Up @@ -556,7 +559,7 @@ async fn cpapi_upstairs_repair_start(
/// An Upstairs will notify this endpoint when a repair finishes.
#[endpoint {
method = POST,
path = "/upstairs/{upstairs_id}/repair-finish",
path = "/crucible/0/upstairs/{upstairs_id}/repair-finish",
}]
async fn cpapi_upstairs_repair_finish(
rqctx: RequestContext<Arc<ServerContext>>,
Expand Down Expand Up @@ -591,7 +594,7 @@ struct UpstairsRepairPathParam {
/// An Upstairs will update this endpoint with the progress of a repair
#[endpoint {
method = POST,
path = "/upstairs/{upstairs_id}/repair/{repair_id}/progress",
path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress",
}]
async fn cpapi_upstairs_repair_progress(
rqctx: RequestContext<Arc<ServerContext>>,
Expand All @@ -617,6 +620,42 @@ async fn cpapi_upstairs_repair_progress(
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Path parameters for Downstairs requests (internal API)
#[derive(Deserialize, JsonSchema)]
struct UpstairsDownstairsPathParam {
// The Upstairs named by the `{upstairs_id}` segment of the request path.
upstairs_id: TypedUuid<UpstairsKind>,
// The Downstairs named by the `{downstairs_id}` segment of the request path.
downstairs_id: TypedUuid<DownstairsKind>,
}

/// An Upstairs will update this endpoint if a Downstairs client task is stopped
#[endpoint {
method = POST,
path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped",
}]
async fn cpapi_downstairs_stopped(
rqctx: RequestContext<Arc<ServerContext>>,
path_params: Path<UpstairsDownstairsPathParam>,
downstairs_client_stopped: TypedBody<DownstairsClientStopped>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let path = path_params.into_inner();

let handler = async {
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
nexus
.downstairs_stopped_notification(
&opctx,
path.upstairs_id,
path.downstairs_id,
downstairs_client_stopped.into_inner(),
)
.await?;
Ok(HttpResponseUpdatedNoContent())
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

// Sagas

/// List sagas
Expand Down
Loading
Loading