From 811d857dc975c771fb45c1700ad54447165a8c07 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker <ben@oxide.computer> Date: Thu, 4 Jul 2024 21:19:02 +0000 Subject: [PATCH] Don't fail requests if we cannot track their latencies - Fixes #5998 - Add a helper trait to extract the status code from a generic kind of response, so that we can use the latency tracker for methods that return a `http::Response` directly. Implement this for the extant response types we need to instrument. - Add latency tracking to the device OAuth endpoints that were previously untracked. --- Cargo.lock | 3 ++ nexus/src/external_api/device_auth.rs | 16 +++--- oximeter/instruments/Cargo.toml | 6 +++ oximeter/instruments/src/http.rs | 78 +++++++++++++++++++++++---- 4 files changed, 87 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d79933cd87f..9d83c6cee7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6404,11 +6404,14 @@ dependencies = [ "dropshot", "futures", "http 0.2.12", + "hyper 0.14.28", "kstat-rs", "libc", "omicron-workspace-hack", "oximeter", "rand 0.8.5", + "schemars", + "serde", "slog", "slog-async", "slog-term", diff --git a/nexus/src/external_api/device_auth.rs b/nexus/src/external_api/device_auth.rs index 2aa1965e791..883dbf4e198 100644 --- a/nexus/src/external_api/device_auth.rs +++ b/nexus/src/external_api/device_auth.rs @@ -92,9 +92,11 @@ pub(crate) async fn device_auth_request( &model.into_response(rqctx.server.using_tls(), host), ) }; - // TODO: instrumentation doesn't work because we use `Response<Body>` - //apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await - handler.await + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await } #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] @@ -250,7 +252,9 @@ pub(crate) async fn device_access_token( ), } }; - // TODO: instrumentation doesn't work because we use `Response<Body>` - //apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await - handler.await + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await } diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index f51278f7943..d5dcac411a3 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -13,9 +13,12 @@ chrono = { workspace = true, optional = true } dropshot = { workspace = true, optional = true } futures = { workspace = true, optional = true } http = { workspace = true, optional = true } +hyper = { workspace = true, optional = true } kstat-rs = { workspace = true, optional = true } libc = { workspace = true, optional = true } oximeter = { workspace = true, optional = true } +schemars = { workspace = true, optional = true } +serde = { workspace = true, optional = true } slog = { workspace = true, optional = true } tokio = { workspace = true, optional = true } thiserror = { workspace = true, optional = true } @@ -29,7 +32,10 @@ http-instruments = [ "dep:dropshot", "dep:futures", "dep:http", + "dep:hyper", "dep:oximeter", + "dep:schemars", + "dep:serde", "dep:uuid" ] kstat = [ diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 4bc6cf86771..e48008fda90 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -4,14 +4,16 @@ //! Instrumentation tools for HTTP services. -// Copyright 2021 Oxide Computer Company +// Copyright 2024 Oxide Computer Company use dropshot::{ HttpError, HttpResponse, RequestContext, RequestInfo, ServerContext, }; use futures::Future; +use http::Response; use http::StatusCode; use http::Uri; +use hyper::body::Body; use oximeter::{ histogram::Histogram, histogram::Record, Metric, MetricsError, Producer, Sample, Target, @@ -93,6 +95,56 @@ impl RequestLatencyHistogram { } } +/// A helper trait to extract a status code from a Dropshot response. +pub trait TrackableResponse: HttpResponse { + fn status_code(&self) -> StatusCode; +} + +macro_rules! impl_generic_response { + ($dropshot_type:ty) => { + impl<T> TrackableResponse for $dropshot_type + where + T: serde::Serialize + schemars::JsonSchema + Send + Sync + 'static, + { + fn status_code(&self) -> StatusCode { + <Self as dropshot::HttpCodedResponse>::STATUS_CODE + } + } + }; +} + +macro_rules! impl_response { + ($dropshot_type:ty) => { + impl TrackableResponse for $dropshot_type { + fn status_code(&self) -> StatusCode { + <Self as dropshot::HttpCodedResponse>::STATUS_CODE + } + } + }; +} + +impl_generic_response!(dropshot::HttpResponseOk<T>); +impl_generic_response!(dropshot::HttpResponseAccepted<T>); +impl_generic_response!(dropshot::HttpResponseCreated<T>); +impl_response!(dropshot::HttpResponseUpdatedNoContent); +impl_response!(dropshot::HttpResponseDeleted); + +impl<T, H> TrackableResponse for dropshot::HttpResponseHeaders<T, H> +where + T: dropshot::HttpCodedResponse, + H: serde::Serialize + schemars::JsonSchema + Send + Sync + 'static, +{ + fn status_code(&self) -> StatusCode { + <T as dropshot::HttpCodedResponse>::STATUS_CODE + } +} + +impl TrackableResponse for Response<Body> { + fn status_code(&self) -> StatusCode { + self.status() + } +} + /// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an /// HTTP service, in seconds. /// @@ -171,23 +223,29 @@ impl LatencyTracker { handler: H, ) -> Result<R, HttpError> where - R: HttpResponse, + R: TrackableResponse, H: Future<Output = Result<R, HttpError>>, T: ServerContext, { let start = Instant::now(); let result = handler.await; let latency = start.elapsed(); - let status_code = match result { - Ok(_) => R::response_metadata().success.unwrap(), + let status_code = match &result { + Ok(response) => response.status_code(), Err(ref e) => e.status_code, }; - self.update(&context.request, status_code, latency).map_err(|e| { - HttpError::for_internal_error(format!( - "error instrumenting dropshot request handler: {}", - e - )) - })?; + if let Err(e) = self.update(&context.request, status_code, latency) { + slog::error!( + &context.log, + "error instrumenting dropshot handler"; + "error" => ?e, + "status_code" => status_code.as_u16(), + "method" => %context.request.method(), + "uri" => %context.request.uri(), + "remote_addr" => context.request.remote_addr(), + "latency" => ?latency, + ); + } result } }