From 9c5a2ec47ff80973d5a436eac4dc52bdb4a76f26 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 10 Oct 2023 21:28:06 -0700 Subject: [PATCH 1/4] Use stable hash, stable format, for deriving timeseries_key --- Cargo.lock | 21 +++++ Cargo.toml | 1 + oximeter/db/Cargo.toml | 4 + oximeter/db/src/lib.rs | 94 ++++++++++++++++++++- oximeter/db/test-output/timeseries-keys.txt | 12 +++ oximeter/oximeter/Cargo.toml | 1 + oximeter/oximeter/src/types.rs | 10 ++- 7 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 oximeter/db/test-output/timeseries-keys.txt diff --git a/Cargo.lock b/Cargo.lock index 306e953049..efdd7e2ce1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,6 +493,16 @@ dependencies = [ "sha2", ] +[[package]] +name = "bcs" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd3ffe8b19a604421a5d461d4a70346223e535903fbc3067138bddbebddcf77" +dependencies = [ + "serde", + "thiserror", +] + [[package]] name = "bhyve_api" version = "0.0.0" @@ -3098,6 +3108,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "highway" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ba82c000837f4e74df01a5520f0dc48735d4aed955a99eae4428bab7cf3acd" + [[package]] name = "hkdf" version = "0.12.3" @@ -5732,6 +5748,7 @@ dependencies = [ "rstest", "schemars", "serde", + "strum", "thiserror", "trybuild", "uuid", @@ -5810,10 +5827,13 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "bcs", "bytes", "chrono", "clap 4.4.3", "dropshot", + "expectorate", + "highway", "itertools 0.11.0", "omicron-test-utils", "omicron-workspace-hack", @@ -5827,6 +5847,7 @@ dependencies = [ "slog-async", "slog-dtrace", "slog-term", + "strum", "thiserror", "tokio", "usdt", diff --git a/Cargo.toml b/Cargo.toml index da7b582fe3..25e33803da 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -199,6 +199,7 @@ headers = "0.3.9" heck = "0.4" hex = "0.4.3" hex-literal = "0.4.1" +highway = "1.1.0" hkdf = "0.12.3" http = "0.2.9" httptest = "0.15.4" diff --git a/oximeter/db/Cargo.toml b/oximeter/db/Cargo.toml index ad6d584b1b..d37c57ccce 100644 --- a/oximeter/db/Cargo.toml +++ b/oximeter/db/Cargo.toml @@ -8,10 +8,12 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true async-trait.workspace = true +bcs.workspace = true bytes = { workspace = true, features = [ "serde" ] } chrono.workspace = true clap.workspace = true dropshot.workspace = true +highway.workspace = true oximeter.workspace = true regex.workspace = true reqwest = { workspace = true, features = [ "json" ] } @@ -28,9 +30,11 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [dev-dependencies] +expectorate.workspace = true itertools.workspace = true omicron-test-utils.workspace = true slog-dtrace.workspace = true +strum.workspace = true [[bin]] name = "oxdb" diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index 9720d6914d..3aec217f26 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -335,11 +335,13 @@ pub(crate) fn timeseries_key_for<'a>( target_fields: impl Iterator, metric_fields: impl Iterator, ) -> TimeseriesKey { - use std::collections::hash_map::DefaultHasher; + use highway::HighwayHasher; use std::hash::{Hash, Hasher}; - let mut hasher = DefaultHasher::new(); + let mut hasher = HighwayHasher::default(); for field in target_fields.chain(metric_fields) { - field.hash(&mut hasher); + bcs::to_bytes(&field) + .expect("Failed to serialized field to bytes") + .hash(&mut hasher); } hasher.finish() } @@ -393,4 +395,90 @@ mod tests { assert!(TimeseriesName::try_from("a:").is_err()); assert!(TimeseriesName::try_from("123").is_err()); } + + #[test] + fn test_timeseries_key_generation_hashes_fields_sequentially() { + use super::timeseries_key_for; + use oximeter::{Field, FieldValue}; + + let f = |name: &str, value| Field { name: name.to_string(), value }; + + // Confirm that "targets" and "metrics" are interchangeable, + // we just hash everything sequentially. + assert_eq!( + timeseries_key_for( + [&f("a", FieldValue::String("a".to_string()))].into_iter(), + [&f("b", FieldValue::String("b".to_string()))].into_iter(), + ), + timeseries_key_for( + [ + &f("a", FieldValue::String("a".to_string())), + &f("b", FieldValue::String("b".to_string())), + ] + .into_iter(), + [].into_iter(), + ), + ); + + // However, order still matters ("a, b" != "b, a") + assert_ne!( + timeseries_key_for( + [&f("a", FieldValue::String("a".to_string()))].into_iter(), + [&f("b", FieldValue::String("b".to_string()))].into_iter(), + ), + timeseries_key_for( + [&f("b", FieldValue::String("b".to_string()))].into_iter(), + [&f("a", FieldValue::String("a".to_string()))].into_iter(), + ), + ); + } + + #[test] + fn test_timeseries_key_stability() { + use super::timeseries_key_for; + use oximeter::{Field, FieldValue}; + use strum::EnumCount; + + let values = [ + ("string", FieldValue::String(String::default())), + ("i8", FieldValue::I8(-0x0A)), + ("u8", FieldValue::U8(0x0A)), + ("i16", FieldValue::I16(-0x0ABC)), + ("u16", FieldValue::U16(0x0ABC)), + ("i32", FieldValue::I32(-0x0ABC_0000)), + ("u32", FieldValue::U32(0x0ABC_0000)), + ("i64", FieldValue::I64(-0x0ABC_0000_0000_0000)), + ("u64", FieldValue::U64(0x0ABC_0000_0000_0000)), + ( + "ipaddr", + FieldValue::IpAddr(std::net::IpAddr::V4( + std::net::Ipv4Addr::LOCALHOST, + )), + ), + ("uuid", FieldValue::Uuid(uuid::Uuid::nil())), + ("bool", FieldValue::Bool(true)), + ]; + + // Exhaustively testing enums is a bit tricky. Although it's easy to + // check "all variants of an enum are matched", it harder to test "all + // variants of an enum have been supplied". + // + // We use this as a proxy, confirming that each variant is represented + // here for the purposes of tracking stability. + assert_eq!(values.len(), FieldValue::COUNT); + + let mut output = vec![]; + for (name, value) in values { + let key = timeseries_key_for( + [&Field { name: name.to_string(), value }].into_iter(), + [].into_iter(), + ); + output.push(format!("{name} -> {key}")); + } + + expectorate::assert_contents( + "test-output/timeseries-keys.txt", + &output.join("\n"), + ); + } } diff --git a/oximeter/db/test-output/timeseries-keys.txt b/oximeter/db/test-output/timeseries-keys.txt new file mode 100644 index 0000000000..dc94aac545 --- /dev/null +++ b/oximeter/db/test-output/timeseries-keys.txt @@ -0,0 +1,12 @@ +string -> 11027713821408113420 +i8 -> 11040867383141339877 +u8 -> 15607688254753473406 +i16 -> 13295860386975871341 +u16 -> 2145565374036862015 +i32 -> 10931977742674674182 +u32 -> 6138968585984979982 +i64 -> 981603688282082647 +u64 -> 4754979326506678930 +ipaddr -> 17297273002898199682 +uuid -> 9564253805848631232 +bool -> 13392625023595096799 \ No newline at end of file diff --git a/oximeter/oximeter/Cargo.toml b/oximeter/oximeter/Cargo.toml index 7d01b8f8be..8a69494d5a 100644 --- a/oximeter/oximeter/Cargo.toml +++ b/oximeter/oximeter/Cargo.toml @@ -13,6 +13,7 @@ omicron-common.workspace = true oximeter-macro-impl.workspace = true schemars = { workspace = true, features = [ "uuid1", "bytes", "chrono" ] } serde.workspace = true +strum.workspace = true thiserror.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/oximeter/oximeter/src/types.rs b/oximeter/oximeter/src/types.rs index aa61a426e3..1c8af6f83e 100644 --- a/oximeter/oximeter/src/types.rs +++ b/oximeter/oximeter/src/types.rs @@ -90,7 +90,15 @@ impl_field_type_from! { bool, FieldType::Bool } /// The `FieldValue` contains the value of a target or metric field. #[derive( - Clone, Debug, Hash, PartialEq, Eq, JsonSchema, Serialize, Deserialize, + Clone, + Debug, + Hash, + PartialEq, + Eq, + JsonSchema, + Serialize, + Deserialize, + strum::EnumCount, )] #[serde(tag = "type", content = "value", rename_all = "snake_case")] pub enum FieldValue { From 929e761c9210b501fe83126936eca85bbddc83de Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 12 Oct 2023 11:08:15 -0700 Subject: [PATCH 2/4] Better tests --- oximeter/db/src/lib.rs | 59 ++++++++----------- ...ies-keys.txt => field-timeseries-keys.txt} | 0 .../db/test-output/sample-timeseries-key.txt | 1 + 3 files changed, 26 insertions(+), 34 deletions(-) rename oximeter/db/test-output/{timeseries-keys.txt => field-timeseries-keys.txt} (100%) create mode 100644 oximeter/db/test-output/sample-timeseries-key.txt diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index 3aec217f26..d57fcdc525 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -372,8 +372,9 @@ const TIMESERIES_NAME_REGEX: &str = #[cfg(test)] mod tests { - use super::TimeseriesName; + use super::*; use std::convert::TryFrom; + use uuid::Uuid; #[test] fn test_timeseries_name() { @@ -396,46 +397,36 @@ mod tests { assert!(TimeseriesName::try_from("123").is_err()); } + // Validates that the timeseries_key stability for a sample is stable. #[test] - fn test_timeseries_key_generation_hashes_fields_sequentially() { - use super::timeseries_key_for; - use oximeter::{Field, FieldValue}; + fn test_timeseries_key_sample_stability() { + #[derive(oximeter::Target)] + pub struct TestTarget { + pub name: String, + pub num: i64, + } - let f = |name: &str, value| Field { name: name.to_string(), value }; + #[derive(oximeter::Metric)] + pub struct TestMetric { + pub id: Uuid, + pub datum: i64, + } - // Confirm that "targets" and "metrics" are interchangeable, - // we just hash everything sequentially. - assert_eq!( - timeseries_key_for( - [&f("a", FieldValue::String("a".to_string()))].into_iter(), - [&f("b", FieldValue::String("b".to_string()))].into_iter(), - ), - timeseries_key_for( - [ - &f("a", FieldValue::String("a".to_string())), - &f("b", FieldValue::String("b".to_string())), - ] - .into_iter(), - [].into_iter(), - ), - ); + let target = TestTarget { name: String::from("Hello"), num: 1337 }; + let metric = TestMetric { id: Uuid::nil(), datum: 0x1de }; + let sample = Sample::new(&target, &metric).unwrap(); + let key = super::timeseries_key(&sample); - // However, order still matters ("a, b" != "b, a") - assert_ne!( - timeseries_key_for( - [&f("a", FieldValue::String("a".to_string()))].into_iter(), - [&f("b", FieldValue::String("b".to_string()))].into_iter(), - ), - timeseries_key_for( - [&f("b", FieldValue::String("b".to_string()))].into_iter(), - [&f("a", FieldValue::String("a".to_string()))].into_iter(), - ), + expectorate::assert_contents( + "test-output/sample-timeseries-key.txt", + &format!("{key}"), ); } + // Validates that the timeseries_key stability for specific fields is + // stable. #[test] - fn test_timeseries_key_stability() { - use super::timeseries_key_for; + fn test_timeseries_key_field_stability() { use oximeter::{Field, FieldValue}; use strum::EnumCount; @@ -477,7 +468,7 @@ mod tests { } expectorate::assert_contents( - "test-output/timeseries-keys.txt", + "test-output/field-timeseries-keys.txt", &output.join("\n"), ); } diff --git a/oximeter/db/test-output/timeseries-keys.txt b/oximeter/db/test-output/field-timeseries-keys.txt similarity index 100% rename from oximeter/db/test-output/timeseries-keys.txt rename to oximeter/db/test-output/field-timeseries-keys.txt diff --git a/oximeter/db/test-output/sample-timeseries-key.txt b/oximeter/db/test-output/sample-timeseries-key.txt new file mode 100644 index 0000000000..2537247457 --- /dev/null +++ b/oximeter/db/test-output/sample-timeseries-key.txt @@ -0,0 +1 @@ +13614541394457760444 \ No newline at end of file From 2e91139bb7699a1ca3e5e54306e8801a3aa23440 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 12 Oct 2023 12:39:36 -0700 Subject: [PATCH 3/4] Sorted fields --- oximeter/db/src/lib.rs | 23 ++++++++++++++--------- oximeter/oximeter/src/types.rs | 10 ++++++++++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index d57fcdc525..4fa241af74 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -328,17 +328,20 @@ pub struct TimeseriesPageSelector { pub(crate) type TimeseriesKey = u64; pub(crate) fn timeseries_key(sample: &Sample) -> TimeseriesKey { - timeseries_key_for(sample.target_fields(), sample.metric_fields()) + timeseries_key_for( + sample.sorted_target_fields(), + sample.sorted_metric_fields(), + ) } -pub(crate) fn timeseries_key_for<'a>( - target_fields: impl Iterator, - metric_fields: impl Iterator, +fn timeseries_key_for<'a>( + target_fields: &BTreeMap, + metric_fields: &BTreeMap, ) -> TimeseriesKey { use highway::HighwayHasher; use std::hash::{Hash, Hasher}; let mut hasher = HighwayHasher::default(); - for field in target_fields.chain(metric_fields) { + for field in target_fields.values().chain(metric_fields.values()) { bcs::to_bytes(&field) .expect("Failed to serialized field to bytes") .hash(&mut hasher); @@ -460,10 +463,12 @@ mod tests { let mut output = vec![]; for (name, value) in values { - let key = timeseries_key_for( - [&Field { name: name.to_string(), value }].into_iter(), - [].into_iter(), - ); + let target_fields = BTreeMap::from([( + "field".to_string(), + Field { name: name.to_string(), value }, + )]); + let metric_fields = BTreeMap::new(); + let key = timeseries_key_for(&target_fields, &metric_fields); output.push(format!("{name} -> {key}")); } diff --git a/oximeter/oximeter/src/types.rs b/oximeter/oximeter/src/types.rs index 1c8af6f83e..d3f1b9e746 100644 --- a/oximeter/oximeter/src/types.rs +++ b/oximeter/oximeter/src/types.rs @@ -769,6 +769,11 @@ impl Sample { self.target.fields.values() } + /// Return the sorted fields of this sample's target. + pub fn sorted_target_fields(&self) -> &BTreeMap { + &self.target.fields + } + /// Return the name of this sample's metric. pub fn metric_name(&self) -> &str { &self.metric.name @@ -779,6 +784,11 @@ impl Sample { self.metric.fields.values() } + /// Return the sorted fields of this sample's metric + pub fn sorted_metric_fields(&self) -> &BTreeMap { + &self.metric.fields + } + // Check validity of field names for the target and metric. Currently this // just verifies there are no duplicate names between them. fn verify_field_names( From c0110b7e94c4972157b6260474f06fcea2ea8545 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 12 Oct 2023 13:09:47 -0700 Subject: [PATCH 4/4] Hash party --- oximeter/db/src/lib.rs | 40 ++++++++++++++++--- .../db/test-output/field-timeseries-keys.txt | 24 +++++------ .../db/test-output/sample-timeseries-key.txt | 2 +- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index 4fa241af74..c878b8ff2a 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -329,23 +329,45 @@ pub(crate) type TimeseriesKey = u64; pub(crate) fn timeseries_key(sample: &Sample) -> TimeseriesKey { timeseries_key_for( + &sample.timeseries_name, sample.sorted_target_fields(), sample.sorted_metric_fields(), + sample.measurement.datum_type(), ) } -fn timeseries_key_for<'a>( +// It's critical that values used for derivation of the timeseries_key are stable. +// We use "bcs" to ensure stability of the derivation across hardware and rust toolchain revisions. +fn canonicalize(what: &str, value: &T) -> Vec { + bcs::to_bytes(value) + .unwrap_or_else(|_| panic!("Failed to serialize {what}")) +} + +fn timeseries_key_for( + timeseries_name: &str, target_fields: &BTreeMap, metric_fields: &BTreeMap, + datum_type: DatumType, ) -> TimeseriesKey { + // We use HighwayHasher primarily for stability - it should provide a stable + // hash for the values used to derive the timeseries_key. use highway::HighwayHasher; use std::hash::{Hash, Hasher}; + + // NOTE: The order of these ".hash" calls matters, changing them will change + // the derivation of the "timeseries_key". We have change-detector tests for + // modifications like this, but be cautious, making such a change will + // impact all currently-provisioned databases. let mut hasher = HighwayHasher::default(); - for field in target_fields.values().chain(metric_fields.values()) { - bcs::to_bytes(&field) - .expect("Failed to serialized field to bytes") - .hash(&mut hasher); + canonicalize("timeseries name", timeseries_name).hash(&mut hasher); + for field in target_fields.values() { + canonicalize("target field", &field).hash(&mut hasher); } + for field in metric_fields.values() { + canonicalize("metric field", &field).hash(&mut hasher); + } + canonicalize("datum type", &datum_type).hash(&mut hasher); + hasher.finish() } @@ -468,7 +490,13 @@ mod tests { Field { name: name.to_string(), value }, )]); let metric_fields = BTreeMap::new(); - let key = timeseries_key_for(&target_fields, &metric_fields); + let key = timeseries_key_for( + "timeseries name", + &target_fields, + &metric_fields, + // ... Not actually, but we are only trying to compare fields here. + DatumType::Bool, + ); output.push(format!("{name} -> {key}")); } diff --git a/oximeter/db/test-output/field-timeseries-keys.txt b/oximeter/db/test-output/field-timeseries-keys.txt index dc94aac545..d82c143600 100644 --- a/oximeter/db/test-output/field-timeseries-keys.txt +++ b/oximeter/db/test-output/field-timeseries-keys.txt @@ -1,12 +1,12 @@ -string -> 11027713821408113420 -i8 -> 11040867383141339877 -u8 -> 15607688254753473406 -i16 -> 13295860386975871341 -u16 -> 2145565374036862015 -i32 -> 10931977742674674182 -u32 -> 6138968585984979982 -i64 -> 981603688282082647 -u64 -> 4754979326506678930 -ipaddr -> 17297273002898199682 -uuid -> 9564253805848631232 -bool -> 13392625023595096799 \ No newline at end of file +string -> 5554437373902071418 +i8 -> 8051527130089763326 +u8 -> 1403385090410880239 +i16 -> 4425083437960417917 +u16 -> 13883626507745758865 +i32 -> 14729289749324644435 +u32 -> 12103188004421096629 +i64 -> 961258395613152243 +u64 -> 15804125619400967189 +ipaddr -> 14737150884237616680 +uuid -> 16911606541498230091 +bool -> 10983724023695040909 \ No newline at end of file diff --git a/oximeter/db/test-output/sample-timeseries-key.txt b/oximeter/db/test-output/sample-timeseries-key.txt index 2537247457..aeb515c78e 100644 --- a/oximeter/db/test-output/sample-timeseries-key.txt +++ b/oximeter/db/test-output/sample-timeseries-key.txt @@ -1 +1 @@ -13614541394457760444 \ No newline at end of file +365003276793586811 \ No newline at end of file