Skip to content

Commit

Permalink
[nexus][sled-agent] Try harder to load firewall rules during service …
Browse files Browse the repository at this point in the history
…bringup
  • Loading branch information
smklein committed Feb 13, 2024
1 parent 30571d5 commit 9d5d238
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 14 deletions.
14 changes: 10 additions & 4 deletions nexus/src/app/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ impl super::Nexus {
// unless the DNS lookups at sled-agent are only for rack-local nexuses.
pub(crate) async fn upsert_sled(
&self,
opctx: &OpContext,
_opctx: &OpContext,
id: Uuid,
info: SledAgentStartupInfo,
) -> Result<(), Error> {
Expand Down Expand Up @@ -73,10 +73,16 @@ impl super::Nexus {
);
self.db_datastore.sled_upsert(sled).await?;

// Make sure any firewall rules for serices that may
// be running on this sled get plumbed
self.plumb_service_firewall_rules(opctx, &[id]).await?;
Ok(())
}

pub(crate) async fn sled_request_firewall_rules(
&self,
opctx: &OpContext,
id: Uuid,
) -> Result<(), Error> {
info!(self.log, "requesting firewall rules"; "sled_uuid" => id.to_string());
self.plumb_service_firewall_rules(opctx, &[id]).await?;
Ok(())
}

Expand Down
26 changes: 26 additions & 0 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type NexusApiDescription = ApiDescription<Arc<ServerContext>>;
pub(crate) fn internal_api() -> NexusApiDescription {
fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> {
api.register(sled_agent_put)?;
api.register(sled_firewall_rules_request)?;
api.register(switch_put)?;
api.register(rack_initialization_complete)?;
api.register(physical_disk_put)?;
Expand Down Expand Up @@ -126,6 +127,31 @@ async fn sled_agent_put(
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Request a new set of firewall rules for a sled.
///
/// This causes Nexus to read the latest set of rules for the sled,
/// and call a Sled endpoint which applies the rules to all OPTE ports
/// that happen to exist.
#[endpoint {
method = POST,
path = "/sled-agents/{sled_id}/firewall-rules-update",
}]
async fn sled_firewall_rules_request(
rqctx: RequestContext<Arc<ServerContext>>,
path_params: Path<SledAgentPathParam>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
let path = path_params.into_inner();
let sled_id = &path.sled_id;
let handler = async {
nexus.sled_request_firewall_rules(&opctx, *sled_id).await?;
Ok(HttpResponseUpdatedNoContent())
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Path parameters for Rack requests.
#[derive(Deserialize, JsonSchema)]
struct RackPathParam {
Expand Down
28 changes: 28 additions & 0 deletions openapi/nexus-internal.json
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,34 @@
}
}
},
"/sled-agents/{sled_id}/firewall-rules-update": {
"post": {
"summary": "Request a new set of firewall rules for a sled.",
"operationId": "sled_firewall_rules_request",
"parameters": [
{
"in": "path",
"name": "sled_id",
"required": true,
"schema": {
"type": "string",
"format": "uuid"
}
}
],
"responses": {
"204": {
"description": "resource updated"
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/sled-agents/{sled_id}/zpools/{zpool_id}": {
"put": {
"summary": "Report that a pool for a specified sled has come online.",
Expand Down
49 changes: 39 additions & 10 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ pub enum Error {
#[error("Failed to operate on underlay device: {0}")]
Underlay(#[from] underlay::Error),

#[error("Failed to request firewall rules")]
FirewallRequest(#[source] nexus_client::Error<nexus_client::types::Error>),

#[error(transparent)]
Services(#[from] crate::services::Error),

Expand Down Expand Up @@ -602,11 +605,24 @@ impl SledAgent {
retry_notify(
retry_policy_internal_service_aggressive(),
|| async {
self.inner
.services
.load_services()
// Load as many services as we can, and don't exit immediately
// upon failure...
let load_services_result =
self.inner.services.load_services().await.map_err(|err| {
BackoffError::transient(Error::from(err))
});

// ... and request firewall rule updates for as many services as
// we can. Note that we still make this request even if we only
// partially load some services.
let firewall_result = self
.request_firewall_update()
.await
.map_err(|err| BackoffError::transient(err))
.map_err(|err| BackoffError::transient(err));

// Only complete if we have loaded all services and firewall
// rules successfully.
load_services_result.and(firewall_result)
},
|err, delay| {
warn!(
Expand All @@ -618,10 +634,6 @@ impl SledAgent {
)
.await
.unwrap(); // we retry forever, so this can't fail

// Now that we've initialized the sled services, notify nexus again
// at which point it'll plumb any necessary firewall rules back to us.
self.notify_nexus_about_self(&self.log);
}

pub(crate) fn switch_zone_underlay_info(
Expand All @@ -642,7 +654,24 @@ impl SledAgent {
&self.inner.start_request
}

// Sends a request to Nexus informing it that the current sled exists.
/// Requests firewall rules from Nexus.
///
/// Does not retry upon failure.
async fn request_firewall_update(&self) -> Result<(), Error> {
let sled_id = self.inner.id;

self.inner
.nexus_client
.client()
.sled_firewall_rules_request(&sled_id)
.await
.map_err(|err| Error::FirewallRequest(err))?;
Ok(())
}

/// Sends a request to Nexus informing it that the current sled exists.
///
/// Does not block for neux being available.
pub(crate) fn notify_nexus_about_self(&self, log: &Logger) {
let sled_id = self.inner.id;
let nexus_client = self.inner.nexus_client.clone();
Expand All @@ -658,7 +687,7 @@ impl SledAgent {
let log = log.clone();
let fut = async move {
// Notify the control plane that we're up, and continue trying this
// until it succeeds. We retry with an randomized, capped
// until it succeeds. We retry with a randomized, capped
// exponential backoff.
//
// TODO-robustness if this returns a 400 error, we probably want to
Expand Down

0 comments on commit 9d5d238

Please sign in to comment.