Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track HTTP request latencies in nanoseconds #6470

Merged
merged 4 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions gateway/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,18 @@ impl ServerContext {
OnceLock::new()
};

const START_LATENCY_DECADE: i16 = -6;
const END_LATENCY_DECADE: i16 = 3;
// Track from 1 microsecond == 1e3 nanoseconds
const LATENCY_START_POWER: u16 = 3;
// To 1000s == 1e9 * 1e3 == 1e12 nanoseconds
const LATENCY_END_POWER: u16 = 12;
let latencies =
oximeter_instruments::http::LatencyTracker::with_latency_decades(
oximeter_instruments::http::LatencyTracker::with_log_linear_bins(
oximeter_instruments::http::HttpService {
name: "management-gateway-service".into(),
id,
},
START_LATENCY_DECADE,
END_LATENCY_DECADE,
LATENCY_START_POWER,
LATENCY_END_POWER,
)
.expect("start and end decades are hardcoded and should be valid");

Expand Down
12 changes: 7 additions & 5 deletions nexus/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,14 @@ impl ServerContext {
name: name.to_string().into(),
id: config.deployment.id,
};
const START_LATENCY_DECADE: i16 = -6;
const END_LATENCY_DECADE: i16 = 3;
LatencyTracker::with_latency_decades(
// Start at 1 microsecond == 1e3 nanoseconds.
const LATENCY_START_POWER: u16 = 3;
// End at 1000s == (1e9 * 1e3) == 1e12 nanoseconds.
const LATENCY_END_POWER: u16 = 12;
LatencyTracker::with_log_linear_bins(
target,
START_LATENCY_DECADE,
END_LATENCY_DECADE,
LATENCY_START_POWER,
LATENCY_END_POWER,
)
.unwrap()
};
Expand Down
1 change: 1 addition & 0 deletions oximeter/db/schema/replicated/12/timeseries-to-delete.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http_service:request_latency_histogram
1 change: 1 addition & 0 deletions oximeter/db/schema/single-node/12/timeseries-to-delete.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http_service:request_latency_histogram
2 changes: 1 addition & 1 deletion oximeter/db/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use uuid::Uuid;
/// - [`crate::Client::initialize_db_with_version`]
/// - [`crate::Client::ensure_schema`]
/// - The `clickhouse-schema-updater` binary in this crate
pub const OXIMETER_VERSION: u64 = 11;
pub const OXIMETER_VERSION: u64 = 12;

// Wrapper type to represent a boolean in the database.
//
Expand Down
59 changes: 31 additions & 28 deletions oximeter/instruments/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ pub use http_service::RequestLatencyHistogram;
impl RequestLatencyHistogram {
/// Build a new `RequestLatencyHistogram` with a specified histogram.
///
/// Latencies are expressed in seconds.
/// Latencies are expressed in nanoseconds.
pub fn new(
operation_id: &str,
status_code: StatusCode,
histogram: Histogram<f64>,
histogram: Histogram<u64>,
) -> Self {
Self {
operation_id: operation_id.to_string().into(),
Expand All @@ -37,24 +37,26 @@ impl RequestLatencyHistogram {
}
}

/// Build a `RequestLatencyHistogram` with a histogram whose bins span the given decades.
/// Build a histogram whose bins span the given powers of ten.
///
/// `start_decade` and `end_decade` specify the lower and upper limits of the histogram's
/// range, as a power of 10. For example, passing `-3` and `2` results in a histogram with bins
/// spanning `[10 ** -3, 10 ** 2)`. There are 10 bins in each decade. See the
/// [`Histogram::span_decades`] method for more details.
/// `start_power` and `end_power` specify the lower and upper limits of
/// the histogram's range, as powers of 10. For example, passing 2 and 4
/// results in a histogram with bins from `[10 ** 2, 10 ** 4)`. There are 10
/// bins in each power of 10.
///
/// Latencies are expressed as seconds.
pub fn with_latency_decades(
/// See the [`Histogram::span_decades`] method for more details.
///
/// Latencies are expressed in nanoseconds.
pub fn with_log_linear_bins(
operation_id: &str,
status_code: StatusCode,
start_decade: i16,
end_decade: i16,
start_power: u16,
end_power: u16,
) -> Result<Self, MetricsError> {
Ok(Self::new(
operation_id,
status_code,
Histogram::span_decades(start_decade, end_decade)?,
Histogram::span_decades(start_power, end_power)?,
))
}

Expand All @@ -71,7 +73,7 @@ impl RequestLatencyHistogram {
}

/// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an
/// HTTP service, in seconds.
/// HTTP service, in nanoseconds.
///
/// Consumers should construct one `LatencyTracker` for each HTTP service they wish to instrument.
/// As requests are received, the [`LatencyTracker::update`] method can be called with the
Expand All @@ -94,33 +96,34 @@ pub struct LatencyTracker {
/// The histogram used to track each request.
///
/// We store it here to clone as we see new requests.
histogram: Histogram<f64>,
histogram: Histogram<u64>,
}

impl LatencyTracker {
/// Build a new tracker for the given `service`, using `histogram` to track latencies.
///
/// Note that the same histogram is used for each tracked timeseries.
pub fn new(service: HttpService, histogram: Histogram<f64>) -> Self {
pub fn new(service: HttpService, histogram: Histogram<u64>) -> Self {
Self {
service,
latencies: Arc::new(Mutex::new(HashMap::new())),
histogram,
}
}

/// Build a new tracker for the given `service`, with a histogram that spans the given decades
/// (powers of 10). See [`RequestLatencyHistogram::with_latency_decades`] for details on the
/// Build a new tracker with log-linear bins.
///
/// This creates a tracker for the `service`, using 10 bins per power of 10,
/// from `[10 ** start_power, 10 ** end_power)`.
///
/// [`RequestLatencyHistogram::with_log_linear_bins`] for details on the
/// arguments.
pub fn with_latency_decades(
pub fn with_log_linear_bins(
service: HttpService,
start_decade: i16,
end_decade: i16,
start_power: u16,
end_power: u16,
) -> Result<Self, MetricsError> {
Ok(Self::new(
service,
Histogram::span_decades(start_decade, end_decade)?,
))
Ok(Self::new(service, Histogram::span_decades(start_power, end_power)?))
}

/// Update (or create) a timeseries in response to a new request.
Expand All @@ -142,7 +145,7 @@ impl LatencyTracker {
self.histogram.clone(),
)
});
entry.datum.sample(latency.as_secs_f64()).map_err(MetricsError::from)
entry.datum.sample(latency.as_nanos() as _).map_err(MetricsError::from)
}

/// Instrument the given Dropshot endpoint handler function.
Expand Down Expand Up @@ -218,16 +221,16 @@ mod tests {
fn test_latency_tracker() {
let service =
HttpService { name: "my-service".into(), id: ID.parse().unwrap() };
let hist = Histogram::new(&[0.0, 1.0]).unwrap();
let hist = Histogram::new(&[100, 1000]).unwrap();
let tracker = LatencyTracker::new(service, hist);
let status_code0 = StatusCode::OK;
let status_code1 = StatusCode::NOT_FOUND;
let operation_id = "some_operation_id";
tracker
.update(operation_id, status_code0, Duration::from_secs_f64(0.5))
.update(operation_id, status_code0, Duration::from_nanos(200))
.unwrap();
tracker
.update(operation_id, status_code1, Duration::from_secs_f64(0.5))
.update(operation_id, status_code1, Duration::from_nanos(200))
.unwrap();
let key0 = RequestLatencyHistogram::key_for(operation_id, status_code0);
let key1 = RequestLatencyHistogram::key_for(operation_id, status_code1);
Expand Down
4 changes: 2 additions & 2 deletions oximeter/oximeter/schema/http-service.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ versions = [
[[metrics]]
name = "request_latency_histogram"
description = "Duration for the server to handle a request"
units = "seconds"
datum_type = "histogram_f64"
units = "nanoseconds"
datum_type = "histogram_u64"
versions = [
{ added_in = 1, fields = [ "operation_id", "status_code" ] }
]
Expand Down
56 changes: 56 additions & 0 deletions oximeter/types/src/histogram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,47 @@ where
}
}

pub trait Bits: Integer {
const BITS: u32;
fn next_power(self) -> Option<Self>;
}

macro_rules! impl_bits {
($type_:ty) => {
impl Bits for $type_ {
const BITS: u32 = Self::BITS;

fn next_power(self) -> Option<Self> {
self.checked_mul(2)
}
}
};
}

impl_bits!(u8);
impl_bits!(u16);
impl_bits!(u32);
impl_bits!(u64);

impl<T> Histogram<T>
where
T: Bits + HistogramSupport,
{
/// Create a histogram with logarithmically spaced bins at each power of 2.
///
/// This is only available for unsigned integer support types.
pub fn power_of_two() -> Self {
let mut bins = Vec::with_capacity(T::BITS as _);
let mut x = T::one();
bins.push(x);
while let Some(next) = x.next_power() {
bins.push(next);
x = next;
}
Self::new(&bins).expect("Bits is statically known")
}
}

// Helper to ensure all values are comparable, i.e., not NaN.
fn ensure_finite<T>(value: T) -> Result<(), HistogramError>
where
Expand Down Expand Up @@ -1797,4 +1838,19 @@ mod tests {
"expected to overflow a u8, since support type is not wide enough",
);
}

#[test]
fn test_log_bins_u8() {
let (bins, _) = Histogram::<u8>::power_of_two().bins_and_counts();
assert_eq!(bins, [0, 1, 2, 4, 8, 16, 32, 64, 128],);
}

#[test]
fn test_log_bins_u64() {
let (bins, _) = Histogram::<u64>::power_of_two().bins_and_counts();
assert_eq!(bins[0], 0);
for (i, bin) in bins.iter().skip(1).enumerate() {
assert_eq!(*bin, 1u64 << i);
}
}
}
Loading