From 811d857dc975c771fb45c1700ad54447165a8c07 Mon Sep 17 00:00:00 2001
From: Benjamin Naecker <ben@oxide.computer>
Date: Thu, 4 Jul 2024 21:19:02 +0000
Subject: [PATCH] Don't fail requests if we cannot track their latencies

- Fixes #5998
- Add a helper trait to extract the status code from a generic kind of
  response, so that we can use the latency tracker for methods that
  return a `http::Response` directly. Implement this for the extant
  response types we need to instrument.
- Add latency tracking to the device OAuth endpoints that were
  previously untracked.
---
 Cargo.lock                            |  3 ++
 nexus/src/external_api/device_auth.rs | 16 +++---
 oximeter/instruments/Cargo.toml       |  6 +++
 oximeter/instruments/src/http.rs      | 78 +++++++++++++++++++++++----
 4 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d79933cd87f..9d83c6cee7e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6404,11 +6404,14 @@ dependencies = [
  "dropshot",
  "futures",
  "http 0.2.12",
+ "hyper 0.14.28",
  "kstat-rs",
  "libc",
  "omicron-workspace-hack",
  "oximeter",
  "rand 0.8.5",
+ "schemars",
+ "serde",
  "slog",
  "slog-async",
  "slog-term",
diff --git a/nexus/src/external_api/device_auth.rs b/nexus/src/external_api/device_auth.rs
index 2aa1965e791..883dbf4e198 100644
--- a/nexus/src/external_api/device_auth.rs
+++ b/nexus/src/external_api/device_auth.rs
@@ -92,9 +92,11 @@ pub(crate) async fn device_auth_request(
             &model.into_response(rqctx.server.using_tls(), host),
         )
     };
-    // TODO: instrumentation doesn't work because we use `Response<Body>`
-    //apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
-    handler.await
+    apictx
+        .context
+        .external_latencies
+        .instrument_dropshot_handler(&rqctx, handler)
+        .await
 }
 
 #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
@@ -250,7 +252,9 @@ pub(crate) async fn device_access_token(
             ),
         }
     };
-    // TODO: instrumentation doesn't work because we use `Response<Body>`
-    //apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
-    handler.await
+    apictx
+        .context
+        .external_latencies
+        .instrument_dropshot_handler(&rqctx, handler)
+        .await
 }
diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml
index f51278f7943..d5dcac411a3 100644
--- a/oximeter/instruments/Cargo.toml
+++ b/oximeter/instruments/Cargo.toml
@@ -13,9 +13,12 @@ chrono = { workspace = true, optional = true }
 dropshot = { workspace = true, optional = true }
 futures = { workspace = true, optional = true }
 http = { workspace = true, optional = true }
+hyper = { workspace = true, optional = true }
 kstat-rs = { workspace = true, optional = true }
 libc = { workspace = true, optional = true }
 oximeter = { workspace = true, optional = true }
+schemars = { workspace = true, optional = true }
+serde = { workspace = true, optional = true }
 slog = { workspace = true, optional = true }
 tokio = { workspace = true, optional = true }
 thiserror = { workspace = true, optional = true }
@@ -29,7 +32,10 @@ http-instruments = [
     "dep:dropshot",
     "dep:futures",
     "dep:http",
+    "dep:hyper",
     "dep:oximeter",
+    "dep:schemars",
+    "dep:serde",
     "dep:uuid"
 ]
 kstat = [
diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs
index 4bc6cf86771..e48008fda90 100644
--- a/oximeter/instruments/src/http.rs
+++ b/oximeter/instruments/src/http.rs
@@ -4,14 +4,16 @@
 
 //! Instrumentation tools for HTTP services.
 
-// Copyright 2021 Oxide Computer Company
+// Copyright 2024 Oxide Computer Company
 
 use dropshot::{
     HttpError, HttpResponse, RequestContext, RequestInfo, ServerContext,
 };
 use futures::Future;
+use http::Response;
 use http::StatusCode;
 use http::Uri;
+use hyper::body::Body;
 use oximeter::{
     histogram::Histogram, histogram::Record, Metric, MetricsError, Producer,
     Sample, Target,
@@ -93,6 +95,56 @@ impl RequestLatencyHistogram {
     }
 }
 
+/// A helper trait to extract a status code from a Dropshot response.
+pub trait TrackableResponse: HttpResponse {
+    fn status_code(&self) -> StatusCode;
+}
+
+macro_rules! impl_generic_response {
+    ($dropshot_type:ty) => {
+        impl<T> TrackableResponse for $dropshot_type
+        where
+            T: serde::Serialize + schemars::JsonSchema + Send + Sync + 'static,
+        {
+            fn status_code(&self) -> StatusCode {
+                <Self as dropshot::HttpCodedResponse>::STATUS_CODE
+            }
+        }
+    };
+}
+
+macro_rules! impl_response {
+    ($dropshot_type:ty) => {
+        impl TrackableResponse for $dropshot_type {
+            fn status_code(&self) -> StatusCode {
+                <Self as dropshot::HttpCodedResponse>::STATUS_CODE
+            }
+        }
+    };
+}
+
+impl_generic_response!(dropshot::HttpResponseOk<T>);
+impl_generic_response!(dropshot::HttpResponseAccepted<T>);
+impl_generic_response!(dropshot::HttpResponseCreated<T>);
+impl_response!(dropshot::HttpResponseUpdatedNoContent);
+impl_response!(dropshot::HttpResponseDeleted);
+
+impl<T, H> TrackableResponse for dropshot::HttpResponseHeaders<T, H>
+where
+    T: dropshot::HttpCodedResponse,
+    H: serde::Serialize + schemars::JsonSchema + Send + Sync + 'static,
+{
+    fn status_code(&self) -> StatusCode {
+        <T as dropshot::HttpCodedResponse>::STATUS_CODE
+    }
+}
+
+impl TrackableResponse for Response<Body> {
+    fn status_code(&self) -> StatusCode {
+        self.status()
+    }
+}
+
 /// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an
 /// HTTP service, in seconds.
 ///
@@ -171,23 +223,29 @@ impl LatencyTracker {
         handler: H,
     ) -> Result<R, HttpError>
     where
-        R: HttpResponse,
+        R: TrackableResponse,
         H: Future<Output = Result<R, HttpError>>,
         T: ServerContext,
     {
         let start = Instant::now();
         let result = handler.await;
         let latency = start.elapsed();
-        let status_code = match result {
-            Ok(_) => R::response_metadata().success.unwrap(),
+        let status_code = match &result {
+            Ok(response) => response.status_code(),
             Err(ref e) => e.status_code,
         };
-        self.update(&context.request, status_code, latency).map_err(|e| {
-            HttpError::for_internal_error(format!(
-                "error instrumenting dropshot request handler: {}",
-                e
-            ))
-        })?;
+        if let Err(e) = self.update(&context.request, status_code, latency) {
+            slog::error!(
+                &context.log,
+                "error instrumenting dropshot handler";
+                "error" => ?e,
+                "status_code" => status_code.as_u16(),
+                "method" => %context.request.method(),
+                "uri" => %context.request.uri(),
+                "remote_addr" => context.request.remote_addr(),
+                "latency" => ?latency,
+            );
+        }
         result
     }
 }