diff --git a/.changes/unreleased/Under the Hood-20220816-122032.yaml b/.changes/unreleased/Under the Hood-20220816-122032.yaml new file mode 100644 index 00000000000..859b79ce882 --- /dev/null +++ b/.changes/unreleased/Under the Hood-20220816-122032.yaml @@ -0,0 +1,7 @@ +kind: Under the Hood +body: add more information to log line interop test failures +time: 2022-08-16T12:20:32.119588+01:00 +custom: + Author: nathaniel-may + Issue: "5658" + PR: "5659" diff --git a/test/interop/log_parsing/src/main.rs b/test/interop/log_parsing/src/main.rs index 07e88157fb6..5764a5ed21e 100644 --- a/test/interop/log_parsing/src/main.rs +++ b/test/interop/log_parsing/src/main.rs @@ -1,4 +1,3 @@ - use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::env; @@ -7,15 +6,15 @@ use std::fs::File; use std::io::{self, BufRead}; use walkdir::WalkDir; - // Applies schema tests to file input // if these fail, we either have a problem in dbt that needs to be resolved // or we have changed our interface and the log_version should be bumped in dbt, // modeled appropriately here, and publish new docs for the new log_version. 
fn main() -> Result<(), Box<dyn std::error::Error>> { let log_name = "dbt.log"; - let path = env::var("LOG_DIR").expect("must pass absolute log path to tests with env var `LOG_DIR=/logs/live/here/`"); - + let path = env::var("LOG_DIR") + .expect("must pass absolute log path to tests with env var `LOG_DIR=/logs/live/here/`"); + println!("Looking for files named `{}` in {}", log_name, path); let lines: Vec<String> = get_input(&path, log_name)?; println!("collected {} log lines.", lines.len()); @@ -23,12 +22,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { println!(""); println!("testing type-level schema compliance by deserializing each line..."); - let log_lines: Vec<LogLine> = deserialized_input(&lines) - .map_err(|e| format!("schema test failure: json doesn't match type definition\n{}", e))?; + let log_lines: Vec<LogLine> = deserialized_input(&lines).map_err(|e| { + format!( + "schema test failure: json doesn't match type definition\n{}", + e + ) + })?; println!("Done."); println!(""); - println!("because we skip non-json log lines, there are {} collected values to test.", log_lines.len()); + println!( + "because we skip non-json log lines, there are {} collected values to test.", + log_lines.len() + ); println!(""); // make sure when we read a string in then output it back to a string the two strings @@ -49,7 +55,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { Ok(()) } - // each nested type of LogLine should define its own value_test function // that asserts values are within an expected set of values when possible. trait ValueTest { @@ -68,11 +73,11 @@ struct LogLine { level: String, invocation_id: String, thread_name: String, - data: serde_json::Value, // TODO be more specific + data: serde_json::Value, // TODO be more specific } impl ValueTest for LogLine { - fn value_test(&self){ + fn value_test(&self) { assert_eq!( self.log_version, 2, "The log version changed. Be sure this was intentional." 
@@ -99,7 +104,7 @@ impl ValueTest for LogLine { // this requires handling the date with "%Y-%m-%dT%H:%M:%S%.6f" which requires this // boilerplate-looking module. mod custom_date_format { - use chrono::{NaiveDateTime, DateTime, Utc}; + use chrono::{DateTime, NaiveDateTime, Utc}; use serde::{self, Deserialize, Deserializer, Serializer}; const FORMAT: &'static str = "%Y-%m-%dT%H:%M:%S%.6fZ"; @@ -117,7 +122,10 @@ mod custom_date_format { D: Deserializer<'de>, { let s = String::deserialize(deserializer)?; - Ok(DateTime::<Utc>::from_utc(NaiveDateTime::parse_from_str(&s, FORMAT).map_err(serde::de::Error::custom)?, Utc)) + Ok(DateTime::<Utc>::from_utc( + NaiveDateTime::parse_from_str(&s, FORMAT).map_err(serde::de::Error::custom)?, + Utc, + )) } } @@ -134,16 +142,26 @@ fn get_input(path: &str, file_name: &str) -> Result<Vec<String>, String> { let f_name = e.file_name().to_string_lossy(); if f_name.ends_with(file_name) { let contents = File::open(e.path()) - .map_err(|e| format!("Something went wrong opening the log file {}\n{}", f_name, e)) + .map_err(|e| { + format!( + "Something went wrong opening the log file {}\n{}", + f_name, e + ) + }) .and_then(|file| { io::BufReader::new(file) .lines() .map(|l| { - l.map_err(|e| format!("Something went wrong reading lines of the log file {}\n{}", f_name, e)) + l.map_err(|e| { + format!( + "Something went wrong reading lines of the log file {}\n{}", + f_name, e + ) + }) }) .collect::<Result<Vec<String>, String>>() }); - + Some(contents) } else { None @@ -166,69 +184,91 @@ fn deserialized_input(log_lines: &[String]) -> serde_json::Result<Vec<LogLine>> .collect() } -// turn a String into a LogLine and back into a String returning both Strings so -// they can be compared -fn deserialize_serialize_loop( - log_lines: &[String], -) -> serde_json::Result<Vec<(String, String)>> { - log_lines - .into_iter() - .map(|log_line| { - serde_json::from_str::<LogLine>(log_line).and_then(|parsed| { - serde_json::to_string(&parsed).map(|json| (log_line.clone(), json)) - }) - }) - .collect() +// Take a json string, deserialize it to a 
known value, and a generic json value. +// return the two json values so they can be compared. +// this helps to check if the deserialize-serialize loop for the known value drops necessary information. +// +// This function is used as a helper to check every json logline that dbt outputs for expected values. +// In practice, this often returns an Err value if the logs have unexpected non-json values such as logged +// exceptions, warnings, or printed statements. +fn deserialize_twice<'a, T: Serialize + Deserialize<'a>>( + json_str: &'a str, +) -> Result<(serde_json::Value, serde_json::Value), String> { + // deserialize the string into a JSON value with no knowledge of T's structure + let deserialized_json = + serde_json::from_str::<serde_json::Value>(json_str).map_err(|_| json_str)?; + + // deserialize the string into a T + let deserialized_t = serde_json::from_str::<'a, T>(json_str).map_err(|_| json_str)?; + + // serialize the T value into a string again + let serialized_t = serde_json::to_string(&deserialized_t).map_err(|_| json_str)?; + + // deserialize the string into a JSON value + let deserialized_t_json = + serde_json::from_str::<serde_json::Value>(&serialized_t).map_err(|_| json_str)?; + + Ok((deserialized_json, deserialized_t_json)) +} + +// This is only used to make more useful error messages and avoid assert statements that say that two massive +// json objects are not the same. this will dig into the values to find an inner json value where they differ. 
+fn compare_json(x: &serde_json::Value, y: &serde_json::Value) -> Result<(), serde_json::Value> { + if x == y { + return Ok(()); + } + + match (x, y) { + // check for object key mismatches + (serde_json::Value::Object(mx), serde_json::Value::Object(my)) => { + // check if the second map is missing keys from the first + for (xk, xv) in mx { + if !my.contains_key(xk) { + let mut m_err = serde_json::Map::new(); + m_err.insert(xk.clone(), xv.clone()); + return Err(serde_json::Value::Object(m_err)); + } + } + + // check if the first map is missing keys from the second + for (yk, yv) in my { + if !mx.contains_key(yk) { + let mut m_err = serde_json::Map::new(); + m_err.insert(yk.clone(), yv.clone()); + return Err(serde_json::Value::Object(m_err)); + } + } + + // all the keys are the same, so compare each value recursively. + for k in mx.keys() { + // unwrapping (which panics) since we know all these keys exist in both maps. + let xv = mx.get(k).unwrap(); + let yv = my.get(k).unwrap(); + + compare_json(xv, yv)?; + } + + // only reaches here if all the keys are the same and all of the values are the same + // and the top-level equality check on the map failed. + Ok(()) + } + + // must be a non-object json value. since there are no keys to specify, return the first value. + (x, _) => Err(x.clone()), + } } // make sure when we read a string in then output it back to a string the two strings // contain all the same key-value pairs. 
fn test_deserialize_serialize_is_unchanged(lines: &[String]) { - let objects: Result<Vec<(serde_json::Value, serde_json::Value)>, serde_json::Error> = - deserialize_serialize_loop(lines).and_then(|v| { - v.into_iter() - .map(|(s0, s1)| { - serde_json::from_str::<serde_json::Value>(&s0).and_then(|s0v| { - serde_json::from_str::<serde_json::Value>(&s1).map(|s1v| (s0v, s1v)) - }) - }) - .collect() - }); - - match objects { - Err(e) => assert!(false, "{}", e), - Ok(v) => { - for pair in v { - match pair { - ( - serde_json::Value::Object(original), - serde_json::Value::Object(looped), - ) => { - // looping through each key of each json value gives us meaningful failure messages - // instead of "this big string" != "this other big string" - for (key, value) in original.clone() { - let looped_val = looped.get(&key); - assert_eq!( - looped_val, - Some(&value), - "original key value ({}, {}) expected in re-serialized result", - key, - value - ) - } - for (key, value) in looped.clone() { - let original_val = original.get(&key); - assert_eq!( - original_val, - Some(&value), - "looped key value ({}, {}) not found in original result", - key, - value - ) - } - } - _ => assert!(false, "not comparing json objects"), - } + for line in lines { + match deserialize_twice::<LogLine>(line) { + // error if there are not two values to compare + Err(log_line) => assert!(false, "Logline cannot be deserialized into a json LogLine twice for value comparison:\n{}\n", log_line), + // if there are two values to compare, assert they are the same or find the key-value pair where they differ. 
+ Ok((x, y)) => match compare_json(&x, &y) { + Err(json_value) => assert!(false, "LogLine values were inconsistent.\nSpecific difference:\n{}\nWhole log line:\n{}", json_value, line), + Ok(()) => () } } } @@ -247,9 +287,9 @@ mod tests { #[test] fn test_values() { - assert!(deserialized_input(&[LOG_LINE.to_owned()]).map(|v| { - v.into_iter().map(|ll| ll.value_test()) - }).is_ok()) + assert!(deserialized_input(&[LOG_LINE.to_owned()]) + .map(|v| { v.into_iter().map(|ll| ll.value_test()) }) + .is_ok()) } #[test]