Skip to content

Commit

Permalink
Reject add sled requests for sleds that already exist (#5675)
Browse files Browse the repository at this point in the history
I ran into this as a part of #5625, where adding a previously-expunged
sled appeared to succeed, but didn't actually add anything new.

Today if we try to add a sled that is already _running_, we get a 500,
because Nexus fails when it tries to tell the sled-agent to start. But
with this PR, we fail earlier: adding a sled that already has a subnet
allocation fails before we even try to talk to the sled-agent, because
it means someone has already added this sled:

```
root@oxz_switch:~# omdb -w nexus sleds add g2 i86pc
added sled g2 (i86pc): 90413e40-8139-43b4-9081-365dab6e5579
root@oxz_switch:~# omdb -w nexus sleds add g2 i86pc
Error: adding sled

Caused by:
    Error Response: status: 400 Bad Request; headers: {"content-type": "application/json", "x-request-id": "9eb95a9f-3fe0-4f75-8846-13490b95500e", "content-length": "188", "date": "Tue, 30 Apr 2024 20:54:49 GMT"}; value: Error { error_code: Some("ObjectAlreadyExists"), message: "already exists: sled \"g2 / i86pc (90413e40-8139-43b4-9081-365dab6e5579)\"", request_id: "9eb95a9f-3fe0-4f75-8846-13490b95500e" }
```

This does change the external API slightly (204 -> 201 created, and we
now return the ID), but I think (?) that's probably fine since we have
no real consumers of that yet.
  • Loading branch information
jgallagher authored May 1, 2024
1 parent d901636 commit 472d9f7
Show file tree
Hide file tree
Showing 14 changed files with 286 additions and 65 deletions.
1 change: 1 addition & 0 deletions clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ progenitor::generate_api!(
NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
TypedUuidForCollectionKind = omicron_uuid_kinds::CollectionUuid,
TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
TypedUuidForSledKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::SledKind>,
TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
Expand Down
8 changes: 5 additions & 3 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1231,14 +1231,16 @@ async fn cmd_nexus_sled_add(
args: &SledAddArgs,
_destruction_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
client
let sled_id = client
.sled_add(&UninitializedSledId {
part: args.part.clone(),
serial: args.serial.clone(),
})
.await
.context("adding sled")?;
eprintln!("added sled {} ({})", args.serial, args.part);
.context("adding sled")?
.into_inner()
.id;
eprintln!("added sled {} ({}): {sled_id}", args.serial, args.part);
Ok(())
}

Expand Down
4 changes: 3 additions & 1 deletion nexus/db-model/src/sled_underlay_subnet_allocation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::schema::sled_underlay_subnet_allocation;
use crate::typed_uuid::DbTypedUuid;
use omicron_uuid_kinds::SledKind;
use uuid::Uuid;

/// Underlay allocation for a sled added to an initialized rack
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
#[diesel(table_name = sled_underlay_subnet_allocation)]
pub struct SledUnderlaySubnetAllocation {
pub rack_id: Uuid,
pub sled_id: Uuid,
pub sled_id: DbTypedUuid<SledKind>,
pub subnet_octet: i16,
pub hw_baseboard_id: Uuid,
}
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/datastore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ pub use inventory::DataStoreInventoryTest;
use nexus_db_model::AllSchemaVersions;
pub use probe::ProbeInfo;
pub use rack::RackInit;
pub use rack::SledUnderlayAllocationResult;
pub use silo::Discoverability;
pub use switch_port::SwitchPortSettingsCombinedResult;
pub use virtual_provisioning_collection::StorageType;
Expand Down
74 changes: 57 additions & 17 deletions nexus/db-queries/src/db/datastore/rack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ use omicron_common::api::external::ResourceType;
use omicron_common::api::external::UpdateResult;
use omicron_common::bail_unless;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledUuid;
use slog_error_chain::InlineErrorChain;
use std::sync::{Arc, OnceLock};
use uuid::Uuid;
Expand Down Expand Up @@ -172,6 +173,15 @@ impl From<RackInitError> for Error {
}
}

/// Possible results of attempting a new sled underlay allocation
#[derive(Debug, Clone)]
pub enum SledUnderlayAllocationResult {
/// A new allocation was created
New(SledUnderlaySubnetAllocation),
/// A prior allocation was found
Existing(SledUnderlaySubnetAllocation),
}

impl DataStore {
pub async fn rack_list(
&self,
Expand Down Expand Up @@ -295,7 +305,7 @@ impl DataStore {
opctx: &OpContext,
rack_id: Uuid,
hw_baseboard_id: Uuid,
) -> Result<SledUnderlaySubnetAllocation, Error> {
) -> Result<SledUnderlayAllocationResult, Error> {
// Fetch all the existing allocations via self.rack_id
let allocations = self.rack_subnet_allocations(opctx, rack_id).await?;

Expand All @@ -306,17 +316,14 @@ impl DataStore {
const MIN_SUBNET_OCTET: i16 = 33;
let mut new_allocation = SledUnderlaySubnetAllocation {
rack_id,
sled_id: Uuid::new_v4(),
sled_id: SledUuid::new_v4().into(),
subnet_octet: MIN_SUBNET_OCTET,
hw_baseboard_id,
};
let mut allocation_already_exists = false;
for allocation in allocations {
if allocation.hw_baseboard_id == new_allocation.hw_baseboard_id {
// We already have an allocation for this sled.
new_allocation = allocation;
allocation_already_exists = true;
break;
return Ok(SledUnderlayAllocationResult::Existing(allocation));
}
if allocation.subnet_octet == new_allocation.subnet_octet {
bail_unless!(
Expand All @@ -332,11 +339,8 @@ impl DataStore {
// allocations when sleds are being added. We will need another
// mechanism ala generation numbers when we must interleave additions
// and removals of sleds.
if !allocation_already_exists {
self.sled_subnet_allocation_insert(opctx, &new_allocation).await?;
}

Ok(new_allocation)
self.sled_subnet_allocation_insert(opctx, &new_allocation).await?;
Ok(SledUnderlayAllocationResult::New(new_allocation))
}

/// Return all current underlay allocations for the rack.
Expand Down Expand Up @@ -2121,7 +2125,7 @@ mod test {
for i in 0..5i16 {
let allocation = SledUnderlaySubnetAllocation {
rack_id,
sled_id: Uuid::new_v4(),
sled_id: SledUuid::new_v4().into(),
subnet_octet: 33 + i,
hw_baseboard_id: Uuid::new_v4(),
};
Expand All @@ -2141,7 +2145,7 @@ mod test {
// sled_id. Ensure we get an error due to a unique constraint.
let mut should_fail_allocation = SledUnderlaySubnetAllocation {
rack_id,
sled_id: Uuid::new_v4(),
sled_id: SledUuid::new_v4().into(),
subnet_octet: 37,
hw_baseboard_id: Uuid::new_v4(),
};
Expand Down Expand Up @@ -2169,7 +2173,7 @@ mod test {
// Allocations outside our expected range fail
let mut should_fail_allocation = SledUnderlaySubnetAllocation {
rack_id,
sled_id: Uuid::new_v4(),
sled_id: SledUuid::new_v4().into(),
subnet_octet: 32,
hw_baseboard_id: Uuid::new_v4(),
};
Expand Down Expand Up @@ -2205,18 +2209,28 @@ mod test {

let rack_id = Uuid::new_v4();

let mut hw_baseboard_ids = vec![];
let mut allocated_octets = vec![];
for _ in 0..5 {
let hw_baseboard_id = Uuid::new_v4();
hw_baseboard_ids.push(hw_baseboard_id);
allocated_octets.push(
datastore
match datastore
.allocate_sled_underlay_subnet_octets(
&opctx,
rack_id,
Uuid::new_v4(),
hw_baseboard_id,
)
.await
.unwrap()
.subnet_octet,
{
SledUnderlayAllocationResult::New(allocation) => {
allocation.subnet_octet
}
SledUnderlayAllocationResult::Existing(allocation) => {
panic!("unexpected allocation {allocation:?}");
}
},
);
}

Expand All @@ -2232,6 +2246,32 @@ mod test {
allocations.iter().map(|a| a.subnet_octet).collect::<Vec<_>>()
);

// If we attempt to insert the same baseboards again, we should get the
// existing allocations back.
for (hw_baseboard_id, expected_octet) in
hw_baseboard_ids.into_iter().zip(expected)
{
match datastore
.allocate_sled_underlay_subnet_octets(
&opctx,
rack_id,
hw_baseboard_id,
)
.await
.unwrap()
{
SledUnderlayAllocationResult::New(allocation) => {
panic!("unexpected allocation {allocation:?}");
}
SledUnderlayAllocationResult::Existing(allocation) => {
assert_eq!(
allocation.subnet_octet, expected_octet,
"unexpected octet for {allocation:?}"
);
}
}
}

db.cleanup().await.unwrap();
logctx.cleanup_successful();
}
Expand Down
26 changes: 21 additions & 5 deletions nexus/src/app/rack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
use nexus_db_queries::db::datastore::RackInit;
use nexus_db_queries::db::datastore::SledUnderlayAllocationResult;
use nexus_db_queries::db::lookup::LookupPath;
use nexus_reconfigurator_execution::silo_dns_name;
use nexus_types::deployment::blueprint_zone_type;
Expand Down Expand Up @@ -56,7 +57,10 @@ use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::Name;
use omicron_common::api::external::NameOrId;
use omicron_common::api::external::ResourceType;
use omicron_common::api::internal::shared::ExternalPortDiscovery;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledUuid;
use sled_agent_client::types::AddSledRequest;
use sled_agent_client::types::StartSledAgentRequest;
use sled_agent_client::types::StartSledAgentRequestBody;
Expand Down Expand Up @@ -776,7 +780,7 @@ impl super::Nexus {
&self,
opctx: &OpContext,
sled: UninitializedSledId,
) -> Result<(), Error> {
) -> Result<SledUuid, Error> {
let baseboard_id = sled.clone().into();
let hw_baseboard_id = self
.db_datastore
Expand All @@ -787,14 +791,26 @@ impl super::Nexus {
let rack_subnet =
Ipv6Subnet::<RACK_PREFIX>::from(rack_subnet(Some(subnet))?);

let allocation = self
let allocation = match self
.db_datastore
.allocate_sled_underlay_subnet_octets(
opctx,
self.rack_id,
hw_baseboard_id,
)
.await?;
.await?
{
SledUnderlayAllocationResult::New(allocation) => allocation,
SledUnderlayAllocationResult::Existing(allocation) => {
return Err(Error::ObjectAlreadyExists {
type_name: ResourceType::Sled,
object_name: format!(
"{} / {} ({})",
sled.serial, sled.part, allocation.sled_id
),
});
}
};

// Convert `UninitializedSledId` to the sled-agent type
let baseboard_id = sled_agent_client::types::BaseboardId {
Expand All @@ -809,7 +825,7 @@ impl super::Nexus {
generation: 0,
schema_version: 1,
body: StartSledAgentRequestBody {
id: allocation.sled_id,
id: allocation.sled_id.into_untyped_uuid(),
rack_id: allocation.rack_id,
use_trust_quorum: true,
is_lrtq_learner: true,
Expand Down Expand Up @@ -852,7 +868,7 @@ impl super::Nexus {
),
})?;

Ok(())
Ok(allocation.sled_id.into())
}

async fn get_any_sled_agent_url(
Expand Down
15 changes: 11 additions & 4 deletions nexus/src/external_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ use omicron_common::api::external::{
http_pagination::data_page_params_for, AggregateBgpMessageHistory,
};
use omicron_common::bail_unless;
use omicron_uuid_kinds::SledUuid;
use parse_display::Display;
use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode;
use propolis_client::support::tungstenite::protocol::{
Expand Down Expand Up @@ -5210,6 +5211,12 @@ async fn sled_list_uninitialized(
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// The unique ID of a sled.
#[derive(Clone, Debug, Serialize, JsonSchema)]
pub struct SledId {
pub id: SledUuid,
}

/// Add sled to initialized rack
//
// TODO: In the future this should really be a PUT request, once we resolve
Expand All @@ -5218,19 +5225,19 @@ async fn sled_list_uninitialized(
// we are only operating on single rack systems.
#[endpoint {
method = POST,
path = "/v1/system/hardware/sleds/",
path = "/v1/system/hardware/sleds",
tags = ["system/hardware"]
}]
async fn sled_add(
rqctx: RequestContext<Arc<ServerContext>>,
sled: TypedBody<params::UninitializedSledId>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
) -> Result<HttpResponseCreated<SledId>, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let handler = async {
let opctx = crate::context::op_context_for_external_api(&rqctx).await?;
nexus.sled_add(&opctx, sled.into_inner()).await?;
Ok(HttpResponseUpdatedNoContent())
let id = nexus.sled_add(&opctx, sled.into_inner()).await?;
Ok(HttpResponseCreated(SledId { id }))
};
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
}
Expand Down
10 changes: 5 additions & 5 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

//! Handler functions (entrypoints) for HTTP APIs internal to the control plane
use crate::ServerContext;

use super::params::{OximeterInfo, RackInitializationRequest};
use crate::external_api::http_entrypoints::SledId;
use crate::ServerContext;
use dropshot::endpoint;
use dropshot::ApiDescription;
use dropshot::FreeformBody;
Expand Down Expand Up @@ -1043,13 +1043,13 @@ async fn sled_list_uninitialized(
async fn sled_add(
rqctx: RequestContext<Arc<ServerContext>>,
sled: TypedBody<UninitializedSledId>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
) -> Result<HttpResponseCreated<SledId>, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let handler = async {
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
nexus.sled_add(&opctx, sled.into_inner()).await?;
Ok(HttpResponseUpdatedNoContent())
let id = nexus.sled_add(&opctx, sled.into_inner()).await?;
Ok(HttpResponseCreated(SledId { id }))
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}
Expand Down
23 changes: 23 additions & 0 deletions nexus/test-utils/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ use omicron_uuid_kinds::ZpoolUuid;
use oximeter_collector::Oximeter;
use oximeter_producer::LogConfig;
use oximeter_producer::Server as ProducerServer;
use sled_agent_client::types::EarlyNetworkConfig;
use sled_agent_client::types::EarlyNetworkConfigBody;
use sled_agent_client::types::RackNetworkConfigV1;
use slog::{debug, error, o, Logger};
use std::collections::BTreeMap;
use std::collections::HashMap;
Expand Down Expand Up @@ -911,6 +914,26 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
})
.await
.expect("Failed to configure sled agent with our zones");
client
.write_network_bootstore_config(&EarlyNetworkConfig {
body: EarlyNetworkConfigBody {
ntp_servers: Vec::new(),
rack_network_config: Some(RackNetworkConfigV1 {
bfd: Vec::new(),
bgp: Vec::new(),
infra_ip_first: "192.0.2.10".parse().unwrap(),
infra_ip_last: "192.0.2.100".parse().unwrap(),
ports: Vec::new(),
rack_subnet: "fd00:1122:3344:0100::/56"
.parse()
.unwrap(),
}),
},
generation: 1,
schema_version: 1,
})
.await
.expect("Failed to write early networking config to bootstore");
}

// Set up the Crucible Pantry on an existing Sled Agent.
Expand Down
Loading

0 comments on commit 472d9f7

Please sign in to comment.