From 083d2b61fadf1843533314fce884087aee026643 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 24 Aug 2024 11:58:40 -0700 Subject: [PATCH] [gateway] Add Oximeter HTTP service metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that #6354 has added an Oximeter producer endpoint to MGS for publishing SP sensor metrics, it seemed like a nice idea to also instrument the MGS HTTP server, similar to the existing instrumentation for other control plane services. I don't think we'll be doing a lot of tuning of MGS performance, but the metrics seem like they could still be useful because they also include the distribution of HTTP status codes, and in many cases, the latency measurements also serve as a proxy for how long it takes the *SP* to perform a certain operation, which could be a valuable signal. This commit adds an `oximeter_instruments::http::LatencyTracker` to the MGS HTTP servers. To test that it works, I started a local Clickhouse and a standalone Oximeter, and ran MGS and the SP simulator using `cargo xtask mgs-dev run`. Then, I made a few HTTP requests to various MGS APIs using `curl`; most of which were expected to succeed, and a few for SP slots that the simulator wasn't configured to simulate a SP in (to ensure that the request would fail). We can see the new metrics in OxQL: ``` 0x〉\d hardware_component:current hardware_component:fan_speed hardware_component:sensor_error_count hardware_component:temperature hardware_component:voltage http_service:request_latency_histogram oximeter_collector:collections 0x〉get http_service:request_latency_histogram | last 1 http_service:request_latency_histogram id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263 name: management-gateway-service operation_id: sp_get status_code: 200 [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.125731231]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 0, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 0, 0.0005: 1, 0.0006000000000000001: 1, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000556233, max: 0.000603704, mean: 0.0005799685000000001, std_dev: 0.00002373549999999997, p50: 0, p90: 0.000603704, p99: 0.000603704] id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263 name: management-gateway-service operation_id: ignition_list status_code: 200 [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.125290346]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 0, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 1, 0.0005: 0, 0.0006000000000000001: 0, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000427249, max: 0.000427249, mean: 0.000427249, std_dev: 0, p50: 0, p90: 0.000427249, p99: 0.000427249] id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263 name: management-gateway-service operation_id: sp_get status_code: 400 [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.126114126]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 2, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 0, 0.0005: 0, 0.0006000000000000001: 0, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000020368, max: 0.000021581, mean: 0.0000209745, std_dev: 0.0000006064999999999992, p50: 0, p90: 0.000021581, p99: 0.000021581] 0x〉exit ``` --- Cargo.lock | 1 + gateway/Cargo.toml | 1 + gateway/src/context.rs | 16 + gateway/src/http_entrypoints.rs | 784 ++++++++++++++++++-------------- gateway/src/lib.rs | 1 + gateway/src/metrics.rs | 10 + 6 files changed, 478 insertions(+), 335 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 249b7c5cea..7074e40993 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5983,6 +5983,7 @@ dependencies = [ "omicron-workspace-hack", "once_cell", "oximeter", + "oximeter-instruments", "oximeter-producer", "schemars", "serde", diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 2dce15892d..724c61fad9 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -42,6 +42,7 @@ uuid.workspace = true omicron-workspace-hack.workspace = true oximeter.workspace = true oximeter-producer.workspace = true +oximeter-instruments.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/gateway/src/context.rs b/gateway/src/context.rs index 939bb9b6b9..15592145cf 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -16,11 +16,13 @@ pub struct ServerContext { pub mgmt_switch: ManagementSwitch, pub host_phase2_provider: Arc, pub rack_id: OnceLock, + pub latencies: oximeter_instruments::http::LatencyTracker, pub log: Logger, } impl ServerContext { pub async fn new( + id: Uuid, host_phase2_provider: Arc, switch_config: SwitchConfig, rack_id_config: Option, @@ -37,7 +39,21 @@ impl ServerContext { OnceLock::new() }; + const START_LATENCY_DECADE: i16 = -6; + const END_LATENCY_DECADE: i16 = 3; + let latencies = + oximeter_instruments::http::LatencyTracker::with_latency_decades( + oximeter_instruments::http::HttpService { + name: "management-gateway-service".into(), + id, + }, + START_LATENCY_DECADE, + END_LATENCY_DECADE, + ) + .expect("start and end decades are hardcoded and should be valid"); + Ok(Arc::new(ServerContext { + latencies, mgmt_switch, host_phase2_provider, rack_id, diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 332f50ed8a..c10e71ad61 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -81,18 +81,22 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let state = sp.state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + let rot_state = sp + .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) + .await; - let rot_state = sp - .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) - .await; + let final_state = sp_state_from_comms(state, rot_state); - let final_state = sp_state_from_comms(state, rot_state); - Ok(HttpResponseOk(final_state)) + Ok(HttpResponseOk(final_state)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_get( @@ -100,15 +104,18 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let mgmt_switch = &apictx.mgmt_switch; - let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let mgmt_switch = &apictx.mgmt_switch; + let sp_id = path.into_inner().sp.into(); + let sp = mgmt_switch.sp(sp_id)?; - let options = sp.get_startup_options().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let options = sp.get_startup_options().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(options.into())) + Ok(HttpResponseOk(options.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_set( @@ -119,13 +126,16 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = mgmt_switch.sp(sp_id)?; - sp.set_startup_options(body.into_inner().into()).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; + sp.set_startup_options(body.into_inner().into()).await.map_err( + |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, + )?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_sensor_read_value( @@ -135,12 +145,17 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpSensorId { sp, sensor_id } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let value = sp.read_sensor_value(sensor_id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let value = + sp.read_sensor_value(sensor_id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk(value.into())) + }; - Ok(HttpResponseOk(value.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_list( @@ -149,12 +164,15 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let inventory = sp.inventory().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let inventory = sp.inventory().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_get( @@ -164,16 +182,21 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - let details = sp.component_details(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + let details = + sp.component_details(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk( + details.entries.into_iter().map(Into::into).collect(), + )) + }; - Ok(HttpResponseOk( - details.entries.into_iter().map(Into::into).collect(), - )) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } // Implementation notes: @@ -198,66 +221,79 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let ComponentCabooseSlot { firmware_slot } = query_params.into_inner(); - let component = component_from_str(&component)?; - let from_utf8 = |key: &[u8], bytes| { - // This helper closure is only called with the ascii-printable [u8; 4] - // key constants we define above, so we can unwrap this conversion. - let key = str::from_utf8(key).unwrap(); - String::from_utf8(bytes).map_err(|_| { - http_err_with_message( - http::StatusCode::SERVICE_UNAVAILABLE, - "InvalidCaboose", - format!("non-utf8 data returned for caboose key {key}"), + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let ComponentCabooseSlot { firmware_slot } = + query_params.into_inner(); + let component = component_from_str(&component)?; + + let from_utf8 = |key: &[u8], bytes| { + // This helper closure is only called with the ascii-printable [u8; 4] + // key constants we define above, so we can unwrap this conversion. + let key = str::from_utf8(key).unwrap(); + String::from_utf8(bytes).map_err(|_| { + http_err_with_message( + http::StatusCode::SERVICE_UNAVAILABLE, + "InvalidCaboose", + format!("non-utf8 data returned for caboose key {key}"), + ) + }) + }; + + let git_commit = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_GIT_COMMIT, ) - }) - }; + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let board = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_BOARD, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let name = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_NAME, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let version = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_VERSION, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let git_commit = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_GIT_COMMIT, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - let board = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_BOARD) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let name = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let version = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_VERSION, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; + let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; + let name = from_utf8(&CABOOSE_KEY_NAME, name)?; + let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; - let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; - let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; - let name = from_utf8(&CABOOSE_KEY_NAME, name)?; - let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; + let caboose = + SpComponentCaboose { git_commit, board, name, version }; - let caboose = SpComponentCaboose { git_commit, board, name, version }; + Ok(HttpResponseOk(caboose)) + }; - Ok(HttpResponseOk(caboose)) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_clear_status( @@ -267,14 +303,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - sp.component_clear_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.component_clear_status(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; - Ok(HttpResponseUpdatedNoContent {}) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_get( @@ -284,15 +324,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let slot = - sp.component_active_slot(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let slot = + sp.component_active_slot(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_set( @@ -304,16 +347,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let slot = body.into_inner().slot; - let persist = query_params.into_inner().persist; - - sp.set_component_active_slot(component, slot, persist).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let slot = body.into_inner().slot; + let persist = query_params.into_inner().persist; + + sp.set_component_active_slot(component, slot, persist) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_serial_console_attach( @@ -321,6 +370,10 @@ impl GatewayApi for GatewayImpl { path: Path, websocket: WebsocketUpgrade, ) -> WebsocketEndpointResult { + // TODO(eliza): I'm not sure whether there's a way to make + // `oximeter_instruments`'s HTTP latency tracker work with websockets + // requests? It would be nice to get the latency and any error returned + // prior to actually returning the websocket stream... let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); @@ -356,13 +409,15 @@ impl GatewayApi for GatewayImpl { // we don't use it at all to detach. let PathSpComponent { sp, component: _ } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + sp.serial_console_detach().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - sp.serial_console_detach().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_reset( @@ -372,20 +427,23 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - sp.reset_component_prepare(component) - // We always want to run with the watchdog when resetting as - // disabling the watchdog should be considered a debug only feature - .and_then(|()| sp.reset_component_trigger(component, false)) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + sp.reset_component_prepare(component) + // We always want to run with the watchdog when resetting as + // disabling the watchdog should be considered a debug only feature + .and_then(|()| sp.reset_component_trigger(component, false)) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update( @@ -398,19 +456,22 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let ComponentUpdateIdSlot { id, firmware_slot } = - query_params.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let ComponentUpdateIdSlot { id, firmware_slot } = + query_params.into_inner(); - // TODO-performance: this makes a full copy of the uploaded data - let image = body.as_bytes().to_vec(); + // TODO-performance: this makes a full copy of the uploaded data + let image = body.as_bytes().to_vec(); - sp.start_update(component, id, firmware_slot, image) - .await - .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; + sp.start_update(component, id, firmware_slot, image) + .await + .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_status( @@ -421,14 +482,17 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let status = sp.update_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let status = sp.update_status(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(status.into())) + Ok(HttpResponseOk(status.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_abort( @@ -440,15 +504,18 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let UpdateAbortBody { id } = body.into_inner(); - sp.update_abort(component, id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let UpdateAbortBody { id } = body.into_inner(); + sp.update_abort(component, id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cmpa_get( @@ -459,24 +526,26 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = sp.read_rot_cmpa().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } - - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = sp.read_rot_cmpa().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - Ok(HttpResponseOk(RotCmpa { base64_data })) + Ok(HttpResponseOk(RotCmpa { base64_data })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cfpa_get( @@ -490,29 +559,32 @@ impl GatewayApi for GatewayImpl { let GetCfpaParams { slot } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = match slot { + RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, + RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, + RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, + } + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = match slot { - RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, - RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, - RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, - } - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + }; - Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_boot_info( @@ -526,20 +598,24 @@ impl GatewayApi for GatewayImpl { let GetRotBootInfoParams { version } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "rot_boot_info only makes sent for a RoT".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "rot_boot_info only makes sent for a RoT".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let state = sp.rot_state(version).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.rot_state(version).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + Ok(HttpResponseOk(state.into())) + }; - Ok(HttpResponseOk(state.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_list( @@ -547,17 +623,19 @@ impl GatewayApi for GatewayImpl { ) -> Result>, HttpError> { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; - - let out = mgmt_switch - .bulk_ignition_state() - .await? - .map(|(id, state)| SpIgnitionInfo { - id: id.into(), - details: state.into(), - }) - .collect(); - - Ok(HttpResponseOk(out)) + let handler = async { + let out = mgmt_switch + .bulk_ignition_state() + .await? + .map(|(id, state)| SpIgnitionInfo { + id: id.into(), + details: state.into(), + }) + .collect(); + + Ok(HttpResponseOk(out)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_get( @@ -568,19 +646,23 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - - let state = mgmt_switch - .ignition_controller() - .ignition_state(ignition_target) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - let info = SpIgnitionInfo { id: sp_id.into(), details: state.into() }; - Ok(HttpResponseOk(info)) + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; + + let state = mgmt_switch + .ignition_controller() + .ignition_state(ignition_target) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + let info = + SpIgnitionInfo { id: sp_id.into(), details: state.into() }; + Ok(HttpResponseOk(info)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_command( @@ -591,18 +673,22 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let PathSpIgnitionCommand { sp, command } = path.into_inner(); let sp_id = sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - mgmt_switch - .ignition_controller() - .ignition_command(ignition_target, command.into()) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; - Ok(HttpResponseUpdatedNoContent {}) + mgmt_switch + .ignition_controller() + .ignition_command(ignition_target, command.into()) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_get( @@ -611,13 +697,16 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = sp.power_state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let power_state = sp.power_state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(power_state.into())) + Ok(HttpResponseOk(power_state.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_set( @@ -627,14 +716,17 @@ impl GatewayApi for GatewayImpl { ) -> Result { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = body.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let power_state = body.into_inner(); - sp.set_power_state(power_state.into()).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.set_power_state(power_state.into()).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_set( @@ -646,21 +738,23 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); + let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - image_id.serialize(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + image_id.serialize(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_delete( @@ -671,20 +765,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - // We clear the image ID by setting it to a 0-length vec. - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - Vec::new(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + // We clear the image ID by setting it to a 0-length vec. + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + Vec::new(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_get( @@ -692,37 +788,41 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - - let Some(progress) = sp.most_recent_host_phase2_request().await else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - // Our `host_phase2_provider` is using an in-memory cache, so the only way - // we can fail to get the total size is if we no longer have the image that - // this SP most recently requested. We'll treat that as "no progress - // information", since it almost certainly means our progress info on this - // SP is very stale. - let Ok(total_size) = - apictx.host_phase2_provider.total_size(progress.hash).await - else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - let image_id = HostPhase2RecoveryImageId { - sha256_hash: ArtifactHash(progress.hash), + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + + let Some(progress) = sp.most_recent_host_phase2_request().await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + // Our `host_phase2_provider` is using an in-memory cache, so the only way + // we can fail to get the total size is if we no longer have the image that + // this SP most recently requested. We'll treat that as "no progress + // information", since it almost certainly means our progress info on this + // SP is very stale. + let Ok(total_size) = + apictx.host_phase2_provider.total_size(progress.hash).await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + let image_id = HostPhase2RecoveryImageId { + sha256_hash: ArtifactHash(progress.hash), + }; + + // `progress` tells us the offset the SP requested and the amount of data we + // sent starting at that offset; report the end of that chunk to our caller. + let offset = progress.offset.saturating_add(progress.data_sent); + + Ok(HttpResponseOk(HostPhase2Progress::Available { + image_id, + offset, + total_size, + age: progress.received.elapsed(), + })) }; - - // `progress` tells us the offset the SP requested and the amount of data we - // sent starting at that offset; report the end of that chunk to our caller. - let offset = progress.offset.saturating_add(progress.data_sent); - - Ok(HttpResponseOk(HostPhase2Progress::Available { - image_id, - offset, - total_size, - age: progress.received.elapsed(), - })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_delete( @@ -730,11 +830,14 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - sp.clear_most_recent_host_phase2_request().await; + sp.clear_most_recent_host_phase2_request().await; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn recovery_host_phase2_upload( @@ -742,44 +845,55 @@ impl GatewayApi for GatewayImpl { body: UntypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); - - // TODO: this makes a full copy of the host image, potentially unnecessarily - // if it's malformed. - let image = body.as_bytes().to_vec(); - - let sha256_hash = - apictx.host_phase2_provider.insert(image).await.map_err(|err| { - // Any cache-insertion failure indicates a malformed image; map them - // to bad requests. - HttpError::for_bad_request( - Some("BadHostPhase2Image".to_string()), - err.to_string(), - ) - })?; - let sha256_hash = ArtifactHash(sha256_hash); - - Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + let handler = async { + // TODO: this makes a full copy of the host image, potentially unnecessarily + // if it's malformed. + let image = body.as_bytes().to_vec(); + + let sha256_hash = + apictx.host_phase2_provider.insert(image).await.map_err( + |err| { + // Any cache-insertion failure indicates a malformed image; map them + // to bad requests. + HttpError::for_bad_request( + Some("BadHostPhase2Image".to_string()), + err.to_string(), + ) + }, + )?; + let sha256_hash = ArtifactHash(sha256_hash); + + Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_local_switch_id( rqctx: RequestContext, ) -> Result, HttpError> { let apictx = rqctx.context(); + let handler = async { + let id = apictx.mgmt_switch.local_switch()?; - let id = apictx.mgmt_switch.local_switch()?; - - Ok(HttpResponseOk(id.into())) + Ok(HttpResponseOk(id.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_all_ids( rqctx: RequestContext, ) -> Result>, HttpError> { let apictx = rqctx.context(); - - let all_ids = - apictx.mgmt_switch.all_sps()?.map(|(id, _)| id.into()).collect(); - - Ok(HttpResponseOk(all_ids)) + let handler = async { + let all_ids = apictx + .mgmt_switch + .all_sps()? + .map(|(id, _)| id.into()) + .collect(); + + Ok(HttpResponseOk(all_ids)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } } diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 8e764dc63f..9c0afa6d77 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -143,6 +143,7 @@ impl Server { config.host_phase2_recovery_image_cache_max_images, )); let apictx = ServerContext::new( + args.id, host_phase2_provider, config.switch, args.rack_id, diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index d4e0795ae0..7c133f5d97 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -242,6 +242,7 @@ impl Metrics { let server = { let log = log.new(slog::o!("component" => "producer-server")); let registry = ProducerRegistry::with_id(id); + // Register the producer for SP sensor metrics. registry .register_producer(Producer { sample_rx, log: log.clone() }) // TODO(ben): when you change `register_producer` to not return @@ -251,6 +252,15 @@ impl Metrics { actually return an `Err`, so this shouldn't ever \ happen...", ); + // Also, register the producer for the HTTP API metrics. + registry + .register_producer(apictx.latencies.clone()) + // TODO(ben): do this one too pls + .expect( + "`ProducerRegistry::register_producer()` will never \ + actually return an `Err`, so this shouldn't ever \ + happen...", + ); tokio::spawn( ServerManager { log, addrs: addrs_rx, registry }.run(cfg),