From c9dbd1573873c2989b1f70a048320a67f3722e80 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 27 Aug 2024 20:34:14 +0000 Subject: [PATCH 1/4] Track HTTP request latencies in nanoseconds - Change the `http_service:request_latency_histogram` to express latencies in nanoseconds, as a histogram of u64s. - Update the latency tracking types in `oximeter_instruments::http` to use new bin type, and clean up some of the documentation. - Expunged old timeseries schema - Closes #6443 --- gateway/src/context.rs | 8 +- nexus/src/context.rs | 8 +- .../replicated/12/timeseries-to-delete.txt | 1 + .../single-node/12/timeseries-to-delete.txt | 1 + oximeter/db/src/model.rs | 2 +- oximeter/instruments/src/http.rs | 60 +++++++-------- oximeter/oximeter/schema/http-service.toml | 4 +- oximeter/types/src/histogram.rs | 73 +++++++++++++++++++ 8 files changed, 119 insertions(+), 38 deletions(-) create mode 100644 oximeter/db/schema/replicated/12/timeseries-to-delete.txt create mode 100644 oximeter/db/schema/single-node/12/timeseries-to-delete.txt diff --git a/gateway/src/context.rs b/gateway/src/context.rs index 15592145cf..a0f9c3327f 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -39,10 +39,12 @@ impl ServerContext { OnceLock::new() }; - const START_LATENCY_DECADE: i16 = -6; - const END_LATENCY_DECADE: i16 = 3; + // Track from 1 microsecond == 1e3 nanoseconds + const START_LATENCY_DECADE: u16 = 3; + // To 1000s == 1e9 * 1e3 == 1e12 nanoseconds + const END_LATENCY_DECADE: u16 = 12; let latencies = - oximeter_instruments::http::LatencyTracker::with_latency_decades( + oximeter_instruments::http::LatencyTracker::with_log_linear_bins( oximeter_instruments::http::HttpService { name: "management-gateway-service".into(), id, diff --git a/nexus/src/context.rs b/nexus/src/context.rs index 8cb696c62f..ea8fe3d463 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -146,9 +146,11 @@ impl ServerContext { name: name.to_string().into(), id: config.deployment.id, }; - const START_LATENCY_DECADE: i16 = -6; - const END_LATENCY_DECADE: i16 = 3; - LatencyTracker::with_latency_decades( + // Start at 1 microsecond == 1e3 nanoseconds. + const START_LATENCY_DECADE: u16 = 3; + // End at 1000s == (1e9 * 1e3) == 1e12 nanoseconds. + const END_LATENCY_DECADE: u16 = 12; + LatencyTracker::with_log_linear_bins( target, START_LATENCY_DECADE, END_LATENCY_DECADE, diff --git a/oximeter/db/schema/replicated/12/timeseries-to-delete.txt b/oximeter/db/schema/replicated/12/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/replicated/12/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/schema/single-node/12/timeseries-to-delete.txt b/oximeter/db/schema/single-node/12/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/single-node/12/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index a3e9d109ff..d57819b0d0 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 11; +pub const OXIMETER_VERSION: u64 = 12; // Wrapper type to represent a boolean in the database. // diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 2eef327d02..feb0545edc 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -24,11 +24,11 @@ pub use http_service::RequestLatencyHistogram; impl RequestLatencyHistogram { /// Build a new `RequestLatencyHistogram` with a specified histogram. /// - /// Latencies are expressed in seconds. + /// Latencies are expressed in nanoseconds. pub fn new( operation_id: &str, status_code: StatusCode, - histogram: Histogram, + histogram: Histogram, ) -> Self { Self { operation_id: operation_id.to_string().into(), @@ -37,24 +37,26 @@ impl RequestLatencyHistogram { } } - /// Build a `RequestLatencyHistogram` with a histogram whose bins span the given decades. + /// Build a histogram whose bins span the given powers of ten. /// - /// `start_decade` and `end_decade` specify the lower and upper limits of the histogram's - /// range, as a power of 10. For example, passing `-3` and `2` results in a histogram with bins - /// spanning `[10 ** -3, 10 ** 2)`. There are 10 bins in each decade. See the - /// [`Histogram::span_decades`] method for more details. + /// `start_power` and `end_power` specify the lower and upper limits of + /// the histogram's range, as powers of 10. For example, passing 2 and 4 + /// results in a histogram with bins from `[10 ** 2, 10 ** 4)`. There are 10 + /// bins in each power of 10. /// - /// Latencies are expressed as seconds. - pub fn with_latency_decades( + /// See the [`Histogram::span_decades`] method for more details. + /// + /// Latencies are expressed in nanoseconds. + pub fn with_log_linear_bins( operation_id: &str, status_code: StatusCode, - start_decade: i16, - end_decade: i16, + start_power: u16, + end_power: u16, ) -> Result { Ok(Self::new( operation_id, status_code, - Histogram::span_decades(start_decade, end_decade)?, + Histogram::span_decades(start_power, end_power)?, )) } @@ -71,7 +73,7 @@ impl RequestLatencyHistogram { } /// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an -/// HTTP service, in seconds. +/// HTTP service, in nanoseconds. /// /// Consumers should construct one `LatencyTracker` for each HTTP service they wish to instrument. /// As requests are received, the [`LatencyTracker::update`] method can be called with the @@ -94,14 +96,14 @@ pub struct LatencyTracker { /// The histogram used to track each request. /// /// We store it here to clone as we see new requests. - histogram: Histogram, + histogram: Histogram, } impl LatencyTracker { /// Build a new tracker for the given `service`, using `histogram` to track latencies. /// /// Note that the same histogram is used for each tracked timeseries. - pub fn new(service: HttpService, histogram: Histogram) -> Self { + pub fn new(service: HttpService, histogram: Histogram) -> Self { Self { service, latencies: Arc::new(Mutex::new(HashMap::new())), @@ -109,18 +111,18 @@ impl LatencyTracker { } } - /// Build a new tracker for the given `service`, with a histogram that spans the given decades - /// (powers of 10). See [`RequestLatencyHistogram::with_latency_decades`] for details on the - /// arguments. - pub fn with_latency_decades( + /// Build a new tracker with log-linear bins. + /// + /// This creates a tracker for the `service`, using 10 bins per power of 10, + /// from `[10 ** start_power, 10 ** end_power)`. + /// + /// [`RequestLatencyHistogram::with_latency_decades`] for details on the arguments. + pub fn with_log_linear_bins( service: HttpService, - start_decade: i16, - end_decade: i16, + start_power: u16, + end_power: u16, ) -> Result { - Ok(Self::new( - service, - Histogram::span_decades(start_decade, end_decade)?, - )) + Ok(Self::new(service, Histogram::span_decades(start_power, end_power)?)) } /// Update (or create) a timeseries in response to a new request. @@ -142,7 +144,7 @@ impl LatencyTracker { self.histogram.clone(), ) }); - entry.datum.sample(latency.as_secs_f64()).map_err(MetricsError::from) + entry.datum.sample(latency.as_nanos() as _).map_err(MetricsError::from) } /// Instrument the given Dropshot endpoint handler function. @@ -218,16 +220,16 @@ mod tests { fn test_latency_tracker() { let service = HttpService { name: "my-service".into(), id: ID.parse().unwrap() }; - let hist = Histogram::new(&[0.0, 1.0]).unwrap(); + let hist = Histogram::new(&[100, 1000]).unwrap(); let tracker = LatencyTracker::new(service, hist); let status_code0 = StatusCode::OK; let status_code1 = StatusCode::NOT_FOUND; let operation_id = "some_operation_id"; tracker - .update(operation_id, status_code0, Duration::from_secs_f64(0.5)) + .update(operation_id, status_code0, Duration::from_nanos(200)) .unwrap(); tracker - .update(operation_id, status_code1, Duration::from_secs_f64(0.5)) + .update(operation_id, status_code1, Duration::from_nanos(200)) .unwrap(); let key0 = RequestLatencyHistogram::key_for(operation_id, status_code0); let key1 = RequestLatencyHistogram::key_for(operation_id, status_code1); diff --git a/oximeter/oximeter/schema/http-service.toml b/oximeter/oximeter/schema/http-service.toml index 5270f6942c..2e2e6fb359 100644 --- a/oximeter/oximeter/schema/http-service.toml +++ b/oximeter/oximeter/schema/http-service.toml @@ -11,8 +11,8 @@ versions = [ [[metrics]] name = "request_latency_histogram" description = "Duration for the server to handle a request" -units = "seconds" -datum_type = "histogram_f64" +units = "nanoseconds" +datum_type = "histogram_u64" versions = [ { added_in = 1, fields = [ "operation_id", "status_code" ] } ] diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index 2a4feab382..0eef30d30d 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1191,6 +1191,65 @@ where } } +pub trait Bits: Integer { + const BITS: u32; + fn next_power(self) -> Option; +} + +macro_rules! impl_bits { + ($type_:ty) => { + impl Bits for $type_ { + const BITS: u32 = Self::BITS; + + fn next_power(self) -> Option { + self.checked_mul(2) + } + } + }; +} + +impl_bits!(u8); +impl_bits!(u16); +impl_bits!(u32); +impl_bits!(u64); + +/// A trait for generating logarithmically-spaced bins. +pub trait LogBins: Bits { + /// Compute the left bin edges for a histogram with power-of-2 bins. + /// + /// Bins start at 1, and increase in powers-of-2 until the maximum of the + /// support type. + fn power_of_two() -> Vec; +} + +impl LogBins for T +where + T: HistogramSupport + Bits, +{ + fn power_of_two() -> Vec { + let mut out = Vec::with_capacity(T::BITS as _); + let mut x = T::one(); + out.push(x); + while let Some(next) = x.next_power() { + out.push(next); + x = next; + } + out + } +} + +impl Histogram +where + T: LogBins + HistogramSupport, +{ + /// Create a histogram with logarithmically spaced bins at each power of 2. + /// + /// This is only available for unsigned integer support types. + pub fn power_of_two() -> Result { + Self::new(&T::power_of_two()) + } +} + // Helper to ensure all values are comparable, i.e., not NaN. fn ensure_finite(value: T) -> Result<(), HistogramError> where @@ -1797,4 +1856,18 @@ mod tests { "expected to overflow a u8, since support type is not wide enough", ); } + + #[test] + fn test_log_bins_u8() { + let bins = u8::power_of_two(); + assert_eq!(bins, [1, 2, 4, 8, 16, 32, 64, 128],); + } + + #[test] + fn test_log_bins_u64() { + let bins = u64::power_of_two(); + for (i, bin) in bins.iter().enumerate() { + assert_eq!(*bin, 1u64 << i); + } + } } From da16edf4fb88be7b630c8894364cb336910a16aa Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 28 Aug 2024 18:06:09 +0000 Subject: [PATCH 2/4] Fixup rustdoc link --- oximeter/instruments/src/http.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index feb0545edc..efd053ad66 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -116,7 +116,8 @@ impl LatencyTracker { /// This creates a tracker for the `service`, using 10 bins per power of 10, /// from `[10 ** start_power, 10 ** end_power)`. /// - /// [`RequestLatencyHistogram::with_latency_decades`] for details on the arguments. + /// [`RequestLatencyHistogram::with_log_linear_bins`] for details on the + /// arguments. pub fn with_log_linear_bins( service: HttpService, start_power: u16, From bc6f0d01010ad9456cc9f38c527a469f1e1f20b5 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 28 Aug 2024 23:20:35 +0000 Subject: [PATCH 3/4] Review feedback - s/decade/power in more places - Remove intermediate `LogBins` trait, just use the Histogram constructor --- gateway/src/context.rs | 8 +++---- nexus/src/context.rs | 8 +++---- oximeter/types/src/histogram.rs | 42 ++++++++++----------------------- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/gateway/src/context.rs b/gateway/src/context.rs index a0f9c3327f..dc5717604b 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -40,17 +40,17 @@ impl ServerContext { }; // Track from 1 microsecond == 1e3 nanoseconds - const START_LATENCY_DECADE: u16 = 3; + const LATENCY_START_POWER: u16 = 3; // To 1000s == 1e9 * 1e3 == 1e12 nanoseconds - const END_LATENCY_DECADE: u16 = 12; + const LATENCY_END_POWER: u16 = 12; let latencies = oximeter_instruments::http::LatencyTracker::with_log_linear_bins( oximeter_instruments::http::HttpService { name: "management-gateway-service".into(), id, }, - START_LATENCY_DECADE, - END_LATENCY_DECADE, + LATENCY_START_POWER, + LATENCY_END_POWER, ) .expect("start and end decades are hardcoded and should be valid"); diff --git a/nexus/src/context.rs b/nexus/src/context.rs index ea8fe3d463..9620a3937a 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -147,13 +147,13 @@ impl ServerContext { id: config.deployment.id, }; // Start at 1 microsecond == 1e3 nanoseconds. - const START_LATENCY_DECADE: u16 = 3; + const LATENCY_START_POWER: u16 = 3; // End at 1000s == (1e9 * 1e3) == 1e12 nanoseconds. - const END_LATENCY_DECADE: u16 = 12; + const LATENCY_END_POWER: u16 = 12; LatencyTracker::with_log_linear_bins( target, - START_LATENCY_DECADE, - END_LATENCY_DECADE, + LATENCY_START_POWER, + LATENCY_END_POWER, ) .unwrap() }; diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index 0eef30d30d..c91ab9f966 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1213,40 +1213,22 @@ impl_bits!(u16); impl_bits!(u32); impl_bits!(u64); -/// A trait for generating logarithmically-spaced bins. -pub trait LogBins: Bits { - /// Compute the left bin edges for a histogram with power-of-2 bins. - /// - /// Bins start at 1, and increase in powers-of-2 until the maximum of the - /// support type. - fn power_of_two() -> Vec; -} - -impl LogBins for T -where - T: HistogramSupport + Bits, -{ - fn power_of_two() -> Vec { - let mut out = Vec::with_capacity(T::BITS as _); - let mut x = T::one(); - out.push(x); - while let Some(next) = x.next_power() { - out.push(next); - x = next; - } - out - } -} - impl Histogram where - T: LogBins + HistogramSupport, + T: Bits + HistogramSupport, { /// Create a histogram with logarithmically spaced bins at each power of 2. /// /// This is only available for unsigned integer support types. - pub fn power_of_two() -> Result { - Self::new(&T::power_of_two()) + pub fn power_of_two() -> Self { + let mut bins = Vec::with_capacity(T::BITS as _); + let mut x = T::one(); + bins.push(x); + while let Some(next) = x.next_power() { + bins.push(next); + x = next; + } + Self::new(&bins).expect("Bits is statically known") } } @@ -1859,13 +1841,13 @@ mod tests { #[test] fn test_log_bins_u8() { - let bins = u8::power_of_two(); + let (bins, _) = Histogram::::power_of_two().bins_and_counts(); assert_eq!(bins, [1, 2, 4, 8, 16, 32, 64, 128],); } #[test] fn test_log_bins_u64() { - let bins = u64::power_of_two(); + let (bins, _) = Histogram::::power_of_two().bins_and_counts(); for (i, bin) in bins.iter().enumerate() { assert_eq!(*bin, 1u64 << i); } From 0c05928e14e5ee2ae03566d7bb51e1bfd5c0f281 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Thu, 29 Aug 2024 01:13:04 +0000 Subject: [PATCH 4/4] Histogram adds a leftmost bin --- oximeter/types/src/histogram.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index c91ab9f966..2507f2f5c6 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1842,13 +1842,14 @@ mod tests { #[test] fn test_log_bins_u8() { let (bins, _) = Histogram::::power_of_two().bins_and_counts(); - assert_eq!(bins, [1, 2, 4, 8, 16, 32, 64, 128],); + assert_eq!(bins, [0, 1, 2, 4, 8, 16, 32, 64, 128],); } #[test] fn test_log_bins_u64() { let (bins, _) = Histogram::::power_of_two().bins_and_counts(); - for (i, bin) in bins.iter().enumerate() { + assert_eq!(bins[0], 0); + for (i, bin) in bins.iter().skip(1).enumerate() { assert_eq!(*bin, 1u64 << i); } }