Skip to content

Commit

Permalink
Implement sled-agent API for adding a sled
Browse files Browse the repository at this point in the history
This code largely reuses the existing code paths for starting sled-agent
with a small change to configure the bootstore as a learner node if
trust quorum is in use.

This code path does not attempt any retries, as it should be idempotent
as long as the same sled and rack UUIDs are used. This is a command
driven by an operator and we want to know right away if it succeeded or
not. Multiple sleds can be added by Nexus with individual calls, so that
individual errors can be returned without added complexity at the sled-
agent level. These requests can also be sent to different sled-agents,
since the sled-agent is just acting as a proxy to a remote
bootstrap agent.
  • Loading branch information
andrewjstone committed Nov 6, 2023
1 parent 52e4932 commit 0d50df0
Show file tree
Hide file tree
Showing 5 changed files with 341 additions and 1 deletion.
187 changes: 187 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,33 @@
}
}
},
"/sleds": {
"put": {
"summary": "Add a sled to a rack that was already initialized via RSS",
"operationId": "add_sled_to_initialized_rack",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/AddSledRequest"
}
}
},
"required": true
},
"responses": {
"204": {
"description": "resource updated"
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/switch-ports": {
"post": {
"operationId": "uplink_ensure",
Expand Down Expand Up @@ -938,6 +965,90 @@
}
},
"schemas": {
"AddSledRequest": {
"description": "A request to Add a given sled after rack initialization has occurred",
"type": "object",
"properties": {
"sled_id": {
"$ref": "#/components/schemas/Baseboard"
},
"start_request": {
"$ref": "#/components/schemas/StartSledAgentRequest"
}
},
"required": [
"sled_id",
"start_request"
]
},
"Baseboard": {
"description": "Describes properties that should uniquely identify a Gimlet.",
"oneOf": [
{
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"model": {
"type": "string"
},
"revision": {
"type": "integer",
"format": "int64"
},
"type": {
"type": "string",
"enum": [
"gimlet"
]
}
},
"required": [
"identifier",
"model",
"revision",
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"unknown"
]
}
},
"required": [
"type"
]
},
{
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"model": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"pc"
]
}
},
"required": [
"identifier",
"model",
"type"
]
}
]
},
"BgpConfig": {
"type": "object",
"properties": {
Expand Down Expand Up @@ -2365,6 +2476,18 @@
"type": "string",
"pattern": "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\")[/](12[0-8]|1[0-1][0-9]|[0-9]?[0-9])$"
},
"Ipv6Subnet": {
"description": "Wraps an [`Ipv6Network`] with a compile-time prefix length.",
"type": "object",
"properties": {
"net": {
"$ref": "#/components/schemas/Ipv6Net"
}
},
"required": [
"net"
]
},
"KnownArtifactKind": {
"description": "Kinds of update artifacts, as used by Nexus to determine what updates are available and by sled-agent to determine how to apply an update when asked.",
"type": "string",
Expand Down Expand Up @@ -3184,6 +3307,70 @@
"last_port"
]
},
"StartSledAgentRequest": {
"description": "Configuration information for launching a Sled Agent.",
"type": "object",
"properties": {
"body": {
"$ref": "#/components/schemas/StartSledAgentRequestBody"
},
"generation": {
"description": "The current generation number of data as stored in CRDB.\n\nThe initial generation is set during RSS time and then only mutated by Nexus. For now, we don't actually anticipate mutating this data, but we leave open the possibility.",
"type": "integer",
"format": "uint64",
"minimum": 0
},
"schema_version": {
"type": "integer",
"format": "uint32",
"minimum": 0
}
},
"required": [
"body",
"generation",
"schema_version"
]
},
"StartSledAgentRequestBody": {
"description": "This is the actual app level data of `StartSledAgentRequest`\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.",
"type": "object",
"properties": {
"id": {
"description": "Uuid of the Sled Agent to be created.",
"type": "string",
"format": "uuid"
},
"is_lrtq_learner": {
"description": "Is this node an LRTQ learner node?\n\nWe only put the node into learner mode if `use_trust_quorum` is also true.",
"type": "boolean"
},
"rack_id": {
"description": "Uuid of the rack to which this sled agent belongs.",
"type": "string",
"format": "uuid"
},
"subnet": {
"description": "Portion of the IP space to be managed by the Sled Agent.",
"allOf": [
{
"$ref": "#/components/schemas/Ipv6Subnet"
}
]
},
"use_trust_quorum": {
"description": "Use trust quorum for key generation",
"type": "boolean"
}
},
"required": [
"id",
"is_lrtq_learner",
"rack_id",
"subnet",
"use_trust_quorum"
]
},
"StorageLimit": {
"description": "The limit on space allowed for zone bundles, as a percentage of the overall dataset's quota.",
"type": "integer",
Expand Down
7 changes: 7 additions & 0 deletions sled-agent/src/bootstrap/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ impl TryFrom<UnvalidatedRackInitializeRequest> for RackInitializeRequest {
pub type Certificate = nexus_client::types::Certificate;
pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig;

/// A request to Add a given sled after rack initialization has occurred
// NOTE: the `///` line above appears verbatim as this type's `description`
// in openapi/sled-agent.json, so it is left untouched and the field notes
// below are plain `//` comments that cannot leak into that schema.
//
// This is the JSON body accepted by the sled-agent `PUT /sleds` endpoint
// (`add_sled_to_initialized_rack`).
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
pub struct AddSledRequest {
// Baseboard identity of the sled being added. Per the OpenAPI schema a
// `Baseboard` is one of the `gimlet`, `pc` or `unknown` variants.
pub sled_id: Baseboard,
// Configuration used to launch the sled agent on the new sled. The HTTP
// handler rejects requests whose body has `use_trust_quorum` set without
// `is_lrtq_learner`.
pub start_request: StartSledAgentRequest,
}

// A wrapper around StartSledAgentRequestV0 that was used
// for the ledger format.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
Expand Down
14 changes: 14 additions & 0 deletions sled-agent/src/bootstrap/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ pub enum StartError {

#[error("Failed to bind sprocket server")]
BindSprocketsServer(#[source] io::Error),

#[error("Failed to initialize lrtq node as learner: {0}")]
FailedLearnerInit(bootstore::NodeRequestError),
}

/// Server for the bootstrap agent.
Expand Down Expand Up @@ -398,6 +401,9 @@ pub enum SledAgentServerStartError {

#[error("Failed to commit sled agent request to ledger")]
CommitToLedger(#[from] ledger::Error),

#[error("Failed to initialize this lrtq node as a learner: {0}")]
FailedLearnerInit(#[from] bootstore::NodeRequestError),
}

impl From<SledAgentServerStartError> for StartError {
Expand All @@ -412,6 +418,9 @@ impl From<SledAgentServerStartError> for StartError {
SledAgentServerStartError::CommitToLedger(err) => {
Self::CommitToLedger(err)
}
SledAgentServerStartError::FailedLearnerInit(err) => {
Self::FailedLearnerInit(err)
}
}
}
}
Expand Down Expand Up @@ -441,6 +450,11 @@ async fn start_sled_agent(
LrtqOrHardcodedSecretRetriever::init_hardcoded();
}

if request.body.use_trust_quorum && request.body.is_lrtq_learner {
info!(log, "Initializing sled as learner");
bootstore.init_learner().await?;
}

// Inform the storage service that the key manager is available
managers.storage.key_manager_ready().await;

Expand Down
39 changes: 39 additions & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use super::sled_agent::SledAgent;
use crate::bootstrap::early_networking::EarlyNetworkConfig;
use crate::bootstrap::params::AddSledRequest;
use crate::params::{
CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody,
InstancePutMigrationIdsBody, InstancePutStateBody,
Expand Down Expand Up @@ -68,6 +69,7 @@ pub fn api() -> SledApiDescription {
api.register(uplink_ensure)?;
api.register(read_network_bootstore_config_cache)?;
api.register(write_network_bootstore_config)?;
api.register(add_sled_to_initialized_rack)?;

Ok(())
}
Expand Down Expand Up @@ -706,3 +708,40 @@ async fn write_network_bootstore_config(

Ok(HttpResponseUpdatedNoContent())
}

/// Add a sled to a rack that was already initialized via RSS
#[endpoint {
method = PUT,
path = "/sleds"
}]
async fn add_sled_to_initialized_rack(
rqctx: RequestContext<SledAgent>,
body: TypedBody<AddSledRequest>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let sa = rqctx.context();
let request = body.into_inner();

// Perform some minimal validation
if request.start_request.body.use_trust_quorum
&& !request.start_request.body.is_lrtq_learner
{
return Err(HttpError::for_bad_request(
None,
"New sleds must be LRTQ learners if trust quorum is in use"
.to_string(),
));
}

crate::sled_agent::add_sled_to_initialized_rack(
sa.logger().clone(),
request.sled_id,
request.start_request,
)
.await
.map_err(|e| {
HttpError::for_internal_error(format!(
"failed to add sled to rack cluster: {e}"
))
})?;
Ok(HttpResponseUpdatedNoContent())
}
Loading

0 comments on commit 0d50df0

Please sign in to comment.