Skip to content

Commit

Permalink
Don't constrain baseboard when adding sled (#4987)
Browse files Browse the repository at this point in the history
We were artificially limiting how we added sleds to a rack by forcing
them to be of type `Baseboard::Gimlet`. Instead of constructing a
`Baseboard` inside Nexus, we now send down the serial and part
numbers and use those to uniquely identify the sled. We ignore whether
it's a PC or Gimlet as long as the IDs match. This is similar to how the
inventory works and allows adding a sled to a rack on the falcon a4x2
testbed.
  • Loading branch information
andrewjstone authored Feb 6, 2024
1 parent 8f1e419 commit dd8d1aa
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 41 deletions.
32 changes: 5 additions & 27 deletions nexus/src/app/rack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::Name;
use omicron_common::api::external::NameOrId;
use omicron_common::api::external::ResourceType;
use omicron_common::api::internal::shared::ExternalPortDiscovery;
use sled_agent_client::types::AddSledRequest;
use sled_agent_client::types::EarlyNetworkConfigBody;
Expand Down Expand Up @@ -871,36 +870,15 @@ impl super::Nexus {
)
.await?;

// Grab the SPs from the last collection
let collection =
self.db_datastore.inventory_get_latest_collection(opctx).await?;

// If there isn't a collection, we don't know about the sled
let Some(collection) = collection else {
return Err(Error::unavail("no inventory data available"));
};

// Find the revision
let Some(sp) = collection.sps.get(&baseboard_id) else {
return Err(Error::ObjectNotFound {
type_name: ResourceType::Sled,
lookup_type:
omicron_common::api::external::LookupType::ByCompositeId(
format!("{sled:?}"),
),
});
};

// Convert the baseboard as necessary
let baseboard = sled_agent_client::types::Baseboard::Gimlet {
identifier: sled.serial.clone(),
model: sled.part.clone(),
revision: sp.baseboard_revision.into(),
// Convert `UninitializedSledId` to the sled-agent type
let baseboard_id = sled_agent_client::types::BaseboardId {
serial_number: sled.serial.clone(),
part_number: sled.part.clone(),
};

// Make the call to sled-agent
let req = AddSledRequest {
sled_id: baseboard,
sled_id: baseboard_id,
start_request: StartSledAgentRequest {
generation: 0,
schema_version: 1,
Expand Down
20 changes: 19 additions & 1 deletion openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -1240,7 +1240,7 @@
"type": "object",
"properties": {
"sled_id": {
"$ref": "#/components/schemas/Baseboard"
"$ref": "#/components/schemas/BaseboardId"
},
"start_request": {
"$ref": "#/components/schemas/StartSledAgentRequest"
Expand Down Expand Up @@ -1319,6 +1319,24 @@
}
]
},
"BaseboardId": {
"description": "A representation of a Baseboard ID as used in the inventory subsystem. This type is essentially the same as a `Baseboard` except it doesn't have a revision or HW type (Gimlet, PC, Unknown).",
"type": "object",
"properties": {
"part_number": {
"description": "Oxide Part Number",
"type": "string"
},
"serial_number": {
"description": "Serial number (unique for a given part number)",
"type": "string"
}
},
"required": [
"part_number",
"serial_number"
]
},
"BgpConfig": {
"type": "object",
"properties": {
Expand Down
16 changes: 12 additions & 4 deletions sled-agent/src/bootstrap/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,21 @@ impl TryFrom<UnvalidatedRackInitializeRequest> for RackInitializeRequest {
pub type Certificate = nexus_client::types::Certificate;
pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig;

/// A representation of a Baseboard ID as used in the inventory subsystem.
/// This type is essentially the same as a `Baseboard` except it doesn't have a
/// revision or HW type (Gimlet, PC, Unknown).
//
// NOTE(review): the doc comments on this type and its fields are mirrored as
// the `description` strings of the `BaseboardId` schema in
// `openapi/sled-agent.json`; presumably that file must be regenerated
// whenever this text changes -- confirm.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
pub struct BaseboardId {
/// Oxide Part Number
pub part_number: String,
/// Serial number (unique for a given part number)
pub serial_number: String,
}

/// A request to Add a given sled after rack initialization has occurred
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
pub struct AddSledRequest {
pub sled_id: Baseboard,
pub sled_id: BaseboardId,
pub start_request: StartSledAgentRequest,
}

Expand Down Expand Up @@ -255,9 +266,6 @@ pub struct StartSledAgentRequestBody {
/// true.
pub is_lrtq_learner: bool,

// Note: The order of these fields is load bearing, because we serialize
// `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it
// must come after non-table fields.
/// Portion of the IP space to be managed by the Sled Agent.
pub subnet: Ipv6Subnet<SLED_PREFIX>,
}
Expand Down
26 changes: 17 additions & 9 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT;
use crate::bootstrap::early_networking::{
EarlyNetworkConfig, EarlyNetworkSetupError,
};
use crate::bootstrap::params::StartSledAgentRequest;
use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest};
use crate::config::Config;
use crate::instance_manager::{InstanceManager, ReservoirMode};
use crate::long_running_tasks::LongRunningTaskHandles;
Expand Down Expand Up @@ -1187,8 +1187,8 @@ pub enum AddSledError {
},
#[error("Failed to connect to DDM")]
DdmAdminClient(#[source] ddm_admin_client::DdmError),
#[error("Failed to learn bootstrap ip for {0}")]
NotFound(Baseboard),
#[error("Failed to learn bootstrap ip for {0:?}")]
NotFound(BaseboardId),
#[error("Failed to initialize {sled_id}: {err}")]
BootstrapTcpClient {
sled_id: Baseboard,
Expand All @@ -1199,7 +1199,7 @@ pub enum AddSledError {
/// Add a sled to an initialized rack.
pub async fn sled_add(
log: Logger,
sled_id: Baseboard,
sled_id: BaseboardId,
request: StartSledAgentRequest,
) -> Result<(), AddSledError> {
// Get all known bootstrap addresses via DDM
Expand Down Expand Up @@ -1227,16 +1227,20 @@ pub async fn sled_add(
})
.collect::<FuturesUnordered<_>>();

// Execute the futures until we find our matching sled or done searching
// Execute the futures until we find our matching sled or are done searching
let mut target_ip = None;
let mut found_baseboard = None;
while let Some((ip, result)) = addrs_to_sleds.next().await {
match result {
Ok(baseboard) => {
// Convert from progenitor type back to `sled-hardware`
// type.
let found = baseboard.into_inner().into();
if sled_id == found {
let found: Baseboard = baseboard.into_inner().into();
if sled_id.serial_number == found.identifier()
&& sled_id.part_number == found.model()
{
target_ip = Some(ip);
found_baseboard = Some(found);
break;
}
}
Expand All @@ -1259,10 +1263,14 @@ pub async fn sled_add(
log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())),
);

// Safe to unwrap, because we would have bailed when checking target_ip
// above otherwise. baseboard and target_ip are set together.
let baseboard = found_baseboard.unwrap();

client.start_sled_agent(&request).await.map_err(|err| {
AddSledError::BootstrapTcpClient { sled_id: sled_id.clone(), err }
AddSledError::BootstrapTcpClient { sled_id: baseboard.clone(), err }
})?;

info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %sled_id);
info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %baseboard);
Ok(())
}

0 comments on commit dd8d1aa

Please sign in to comment.