From 9d5d2388f40ac9d77004982753af40b6465b4710 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 13 Feb 2024 12:46:46 -0800 Subject: [PATCH] [nexus][sled-agent] Try harder to load firewall rules during service bringup --- nexus/src/app/sled.rs | 14 +++++-- nexus/src/internal_api/http_entrypoints.rs | 26 ++++++++++++ openapi/nexus-internal.json | 28 +++++++++++++ sled-agent/src/sled_agent.rs | 49 +++++++++++++++++----- 4 files changed, 103 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 943490ac04..79e1c05408 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -44,7 +44,7 @@ impl super::Nexus { // unless the DNS lookups at sled-agent are only for rack-local nexuses. pub(crate) async fn upsert_sled( &self, - opctx: &OpContext, + _opctx: &OpContext, id: Uuid, info: SledAgentStartupInfo, ) -> Result<(), Error> { @@ -73,10 +73,16 @@ impl super::Nexus { ); self.db_datastore.sled_upsert(sled).await?; - // Make sure any firewall rules for serices that may - // be running on this sled get plumbed - self.plumb_service_firewall_rules(opctx, &[id]).await?; + Ok(()) + } + pub(crate) async fn sled_request_firewall_rules( + &self, + opctx: &OpContext, + id: Uuid, + ) -> Result<(), Error> { + info!(self.log, "requesting firewall rules"; "sled_uuid" => id.to_string()); + self.plumb_service_firewall_rules(opctx, &[id]).await?; Ok(()) } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 0122d9b439..eddc834a2a 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -57,6 +57,7 @@ type NexusApiDescription = ApiDescription>; pub(crate) fn internal_api() -> NexusApiDescription { fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { api.register(sled_agent_put)?; + api.register(sled_firewall_rules_request)?; api.register(switch_put)?; api.register(rack_initialization_complete)?; api.register(physical_disk_put)?; @@ -126,6 +127,31 @@ async fn sled_agent_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Request a new set of firewall rules for a sled. +/// +/// This causes Nexus to read the latest set of rules for the sled, +/// and call a Sled endpoint which applies the rules to all OPTE ports +/// that happen to exist. +#[endpoint { + method = POST, + path = "/sled-agents/{sled_id}/firewall-rules-update", + }] +async fn sled_firewall_rules_request( + rqctx: RequestContext>, + path_params: Path, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// Path parameters for Rack requests. #[derive(Deserialize, JsonSchema)] struct RackPathParam { diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 4714b64c52..569a934c39 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -835,6 +835,34 @@ } } }, + "/sled-agents/{sled_id}/firewall-rules-update": { + "post": { + "summary": "Request a new set of firewall rules for a sled.", + "operationId": "sled_firewall_rules_request", + "parameters": [ + { + "in": "path", + "name": "sled_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sled-agents/{sled_id}/zpools/{zpool_id}": { "put": { "summary": "Report that a pool for a specified sled has come online.", diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index bcc354232e..a21fc79f9b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -105,6 +105,9 @@ pub enum Error { #[error("Failed to operate on underlay device: {0}")] Underlay(#[from] underlay::Error), + #[error("Failed to request firewall rules")] + FirewallRequest(#[source] nexus_client::Error), + #[error(transparent)] Services(#[from] crate::services::Error), @@ -602,11 +605,24 @@ impl SledAgent { retry_notify( retry_policy_internal_service_aggressive(), || async { - self.inner - .services - .load_services() + // Load as many services as we can, and don't exit immediately + // upon failure... + let load_services_result = + self.inner.services.load_services().await.map_err(|err| { + BackoffError::transient(Error::from(err)) + }); + + // ... and request firewall rule updates for as many services as + // we can. Note that we still make this request even if we only + // partially load some services. + let firewall_result = self + .request_firewall_update() .await - .map_err(|err| BackoffError::transient(err)) + .map_err(|err| BackoffError::transient(err)); + + // Only complete if we have loaded all services and firewall + // rules successfully. + load_services_result.and(firewall_result) }, |err, delay| { warn!( @@ -618,10 +634,6 @@ impl SledAgent { ) .await .unwrap(); // we retry forever, so this can't fail - - // Now that we've initialized the sled services, notify nexus again - // at which point it'll plumb any necessary firewall rules back to us. - self.notify_nexus_about_self(&self.log); } pub(crate) fn switch_zone_underlay_info( @@ -642,7 +654,24 @@ impl SledAgent { &self.inner.start_request } - // Sends a request to Nexus informing it that the current sled exists. + /// Requests firewall rules from Nexus. + /// + /// Does not retry upon failure. + async fn request_firewall_update(&self) -> Result<(), Error> { + let sled_id = self.inner.id; + + self.inner + .nexus_client + .client() + .sled_firewall_rules_request(&sled_id) + .await + .map_err(|err| Error::FirewallRequest(err))?; + Ok(()) + } + + /// Sends a request to Nexus informing it that the current sled exists. + /// + /// Does not block for neux being available. pub(crate) fn notify_nexus_about_self(&self, log: &Logger) { let sled_id = self.inner.id; let nexus_client = self.inner.nexus_client.clone(); @@ -658,7 +687,7 @@ impl SledAgent { let log = log.clone(); let fut = async move { // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped + // until it succeeds. We retry with a randomized, capped // exponential backoff. // // TODO-robustness if this returns a 400 error, we probably want to