diff --git a/gateway/src/context.rs b/gateway/src/context.rs index 15592145cf..dc5717604b 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -39,16 +39,18 @@ impl ServerContext { OnceLock::new() }; - const START_LATENCY_DECADE: i16 = -6; - const END_LATENCY_DECADE: i16 = 3; + // Track from 1 microsecond == 1e3 nanoseconds + const LATENCY_START_POWER: u16 = 3; + // To 1000s == 1e9 * 1e3 == 1e12 nanoseconds + const LATENCY_END_POWER: u16 = 12; let latencies = - oximeter_instruments::http::LatencyTracker::with_latency_decades( + oximeter_instruments::http::LatencyTracker::with_log_linear_bins( oximeter_instruments::http::HttpService { name: "management-gateway-service".into(), id, }, - START_LATENCY_DECADE, - END_LATENCY_DECADE, + LATENCY_START_POWER, + LATENCY_END_POWER, ) .expect("start and end decades are hardcoded and should be valid"); diff --git a/nexus/src/context.rs b/nexus/src/context.rs index 8cb696c62f..9620a3937a 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -146,12 +146,14 @@ impl ServerContext { name: name.to_string().into(), id: config.deployment.id, }; - const START_LATENCY_DECADE: i16 = -6; - const END_LATENCY_DECADE: i16 = 3; - LatencyTracker::with_latency_decades( + // Start at 1 microsecond == 1e3 nanoseconds. + const LATENCY_START_POWER: u16 = 3; + // End at 1000s == (1e9 * 1e3) == 1e12 nanoseconds. + const LATENCY_END_POWER: u16 = 12; + LatencyTracker::with_log_linear_bins( target, - START_LATENCY_DECADE, - END_LATENCY_DECADE, + LATENCY_START_POWER, + LATENCY_END_POWER, ) .unwrap() }; diff --git a/oximeter/db/schema/replicated/12/timeseries-to-delete.txt b/oximeter/db/schema/replicated/12/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/replicated/12/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/schema/single-node/12/timeseries-to-delete.txt b/oximeter/db/schema/single-node/12/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/single-node/12/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index a3e9d109ff..d57819b0d0 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 11; +pub const OXIMETER_VERSION: u64 = 12; // Wrapper type to represent a boolean in the database. // diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 2eef327d02..efd053ad66 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -24,11 +24,11 @@ pub use http_service::RequestLatencyHistogram; impl RequestLatencyHistogram { /// Build a new `RequestLatencyHistogram` with a specified histogram. /// - /// Latencies are expressed in seconds. + /// Latencies are expressed in nanoseconds. pub fn new( operation_id: &str, status_code: StatusCode, - histogram: Histogram, + histogram: Histogram, ) -> Self { Self { operation_id: operation_id.to_string().into(), @@ -37,24 +37,26 @@ impl RequestLatencyHistogram { } } - /// Build a `RequestLatencyHistogram` with a histogram whose bins span the given decades. + /// Build a histogram whose bins span the given powers of ten. /// - /// `start_decade` and `end_decade` specify the lower and upper limits of the histogram's - /// range, as a power of 10. For example, passing `-3` and `2` results in a histogram with bins - /// spanning `[10 ** -3, 10 ** 2)`. There are 10 bins in each decade. See the - /// [`Histogram::span_decades`] method for more details. + /// `start_power` and `end_power` specify the lower and upper limits of + /// the histogram's range, as powers of 10. For example, passing 2 and 4 + /// results in a histogram with bins from `[10 ** 2, 10 ** 4)`. There are 10 + /// bins in each power of 10. /// - /// Latencies are expressed as seconds. - pub fn with_latency_decades( + /// See the [`Histogram::span_decades`] method for more details. + /// + /// Latencies are expressed in nanoseconds. + pub fn with_log_linear_bins( operation_id: &str, status_code: StatusCode, - start_decade: i16, - end_decade: i16, + start_power: u16, + end_power: u16, ) -> Result { Ok(Self::new( operation_id, status_code, - Histogram::span_decades(start_decade, end_decade)?, + Histogram::span_decades(start_power, end_power)?, )) } @@ -71,7 +73,7 @@ impl RequestLatencyHistogram { } /// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an -/// HTTP service, in seconds. +/// HTTP service, in nanoseconds. /// /// Consumers should construct one `LatencyTracker` for each HTTP service they wish to instrument. /// As requests are received, the [`LatencyTracker::update`] method can be called with the @@ -94,14 +96,14 @@ pub struct LatencyTracker { /// The histogram used to track each request. /// /// We store it here to clone as we see new requests. - histogram: Histogram, + histogram: Histogram, } impl LatencyTracker { /// Build a new tracker for the given `service`, using `histogram` to track latencies. /// /// Note that the same histogram is used for each tracked timeseries. - pub fn new(service: HttpService, histogram: Histogram) -> Self { + pub fn new(service: HttpService, histogram: Histogram) -> Self { Self { service, latencies: Arc::new(Mutex::new(HashMap::new())), @@ -109,18 +111,19 @@ impl LatencyTracker { } } - /// Build a new tracker for the given `service`, with a histogram that spans the given decades - /// (powers of 10). See [`RequestLatencyHistogram::with_latency_decades`] for details on the + /// Build a new tracker with log-linear bins. + /// + /// This creates a tracker for the `service`, using 10 bins per power of 10, + /// from `[10 ** start_power, 10 ** end_power)`. + /// + /// [`RequestLatencyHistogram::with_log_linear_bins`] for details on the /// arguments. - pub fn with_latency_decades( + pub fn with_log_linear_bins( service: HttpService, - start_decade: i16, - end_decade: i16, + start_power: u16, + end_power: u16, ) -> Result { - Ok(Self::new( - service, - Histogram::span_decades(start_decade, end_decade)?, - )) + Ok(Self::new(service, Histogram::span_decades(start_power, end_power)?)) } /// Update (or create) a timeseries in response to a new request. @@ -142,7 +145,7 @@ impl LatencyTracker { self.histogram.clone(), ) }); - entry.datum.sample(latency.as_secs_f64()).map_err(MetricsError::from) + entry.datum.sample(latency.as_nanos() as _).map_err(MetricsError::from) } /// Instrument the given Dropshot endpoint handler function. @@ -218,16 +221,16 @@ mod tests { fn test_latency_tracker() { let service = HttpService { name: "my-service".into(), id: ID.parse().unwrap() }; - let hist = Histogram::new(&[0.0, 1.0]).unwrap(); + let hist = Histogram::new(&[100, 1000]).unwrap(); let tracker = LatencyTracker::new(service, hist); let status_code0 = StatusCode::OK; let status_code1 = StatusCode::NOT_FOUND; let operation_id = "some_operation_id"; tracker - .update(operation_id, status_code0, Duration::from_secs_f64(0.5)) + .update(operation_id, status_code0, Duration::from_nanos(200)) .unwrap(); tracker - .update(operation_id, status_code1, Duration::from_secs_f64(0.5)) + .update(operation_id, status_code1, Duration::from_nanos(200)) .unwrap(); let key0 = RequestLatencyHistogram::key_for(operation_id, status_code0); let key1 = RequestLatencyHistogram::key_for(operation_id, status_code1); diff --git a/oximeter/oximeter/schema/http-service.toml b/oximeter/oximeter/schema/http-service.toml index 5270f6942c..2e2e6fb359 100644 --- a/oximeter/oximeter/schema/http-service.toml +++ b/oximeter/oximeter/schema/http-service.toml @@ -11,8 +11,8 @@ versions = [ [[metrics]] name = "request_latency_histogram" description = "Duration for the server to handle a request" -units = "seconds" -datum_type = "histogram_f64" +units = "nanoseconds" +datum_type = "histogram_u64" versions = [ { added_in = 1, fields = [ "operation_id", "status_code" ] } ] diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index 2a4feab382..2507f2f5c6 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1191,6 +1191,47 @@ where } } +pub trait Bits: Integer { + const BITS: u32; + fn next_power(self) -> Option; +} + +macro_rules! impl_bits { + ($type_:ty) => { + impl Bits for $type_ { + const BITS: u32 = Self::BITS; + + fn next_power(self) -> Option { + self.checked_mul(2) + } + } + }; +} + +impl_bits!(u8); +impl_bits!(u16); +impl_bits!(u32); +impl_bits!(u64); + +impl Histogram +where + T: Bits + HistogramSupport, +{ + /// Create a histogram with logarithmically spaced bins at each power of 2. + /// + /// This is only available for unsigned integer support types. + pub fn power_of_two() -> Self { + let mut bins = Vec::with_capacity(T::BITS as _); + let mut x = T::one(); + bins.push(x); + while let Some(next) = x.next_power() { + bins.push(next); + x = next; + } + Self::new(&bins).expect("Bits is statically known") + } +} + // Helper to ensure all values are comparable, i.e., not NaN. fn ensure_finite(value: T) -> Result<(), HistogramError> where @@ -1797,4 +1838,19 @@ mod tests { "expected to overflow a u8, since support type is not wide enough", ); } + + #[test] + fn test_log_bins_u8() { + let (bins, _) = Histogram::::power_of_two().bins_and_counts(); + assert_eq!(bins, [0, 1, 2, 4, 8, 16, 32, 64, 128],); + } + + #[test] + fn test_log_bins_u64() { + let (bins, _) = Histogram::::power_of_two().bins_and_counts(); + assert_eq!(bins[0], 0); + for (i, bin) in bins.iter().skip(1).enumerate() { + assert_eq!(*bin, 1u64 << i); + } + } }