Print more information on log line interop test failures (#5659)

Print more information on log line interop test failures (cherry picked from commit 739fb98)
dbt-labs · Feb 6, 2024 · c11a26b · c11a26b
1 parent fb6075d
commit c11a26b
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 76 deletions.
diff --git a/.changes/unreleased/Under the Hood-20220816-122032.yaml b/.changes/unreleased/Under the Hood-20220816-122032.yaml
@@ -0,0 +1,7 @@
+kind: Under the Hood
+body: add more information to log line interop test failures
+time: 2022-08-16T12:20:32.119588+01:00
+custom:
+  Author: nathaniel-may
+  Issue: "5658"
+  PR: "5659"
diff --git a/test/interop/log_parsing/src/main.rs b/test/interop/log_parsing/src/main.rs
@@ -1,4 +1,3 @@
-
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::env;
@@ -7,28 +6,35 @@ use std::fs::File;
 use std::io::{self, BufRead};
 use walkdir::WalkDir;
 
-
 // Applies schema tests to file input
 // if these fail, we either have a problem in dbt that needs to be resolved
 // or we have changed our interface and the log_version should be bumped in dbt,
 // modeled appropriately here, and publish new docs for the new log_version.
 fn main() -> Result<(), Box<dyn Error>> {
     let log_name = "dbt.log";
-    let path = env::var("LOG_DIR").expect("must pass absolute log path to tests with env var `LOG_DIR=/logs/live/here/`");
-
+    let path = env::var("LOG_DIR")
+        .expect("must pass absolute log path to tests with env var `LOG_DIR=/logs/live/here/`");
+
     println!("Looking for files named `{}` in {}", log_name, path);
     let lines: Vec<String> = get_input(&path, log_name)?;
     println!("collected {} log lines.", lines.len());
 
     println!("");
 
     println!("testing type-level schema compliance by deserializing each line...");
-    let log_lines: Vec<LogLine> = deserialized_input(&lines)
-        .map_err(|e| format!("schema test failure: json doesn't match type definition\n{}", e))?;
+    let log_lines: Vec<LogLine> = deserialized_input(&lines).map_err(|e| {
+        format!(
+            "schema test failure: json doesn't match type definition\n{}",
+            e
+        )
+    })?;
     println!("Done.");
 
     println!("");
-    println!("because we skip non-json log lines, there are {} collected values to test.", log_lines.len());
+    println!(
+        "because we skip non-json log lines, there are {} collected values to test.",
+        log_lines.len()
+    );
     println!("");
 
     // make sure when we read a string in then output it back to a string the two strings
@@ -49,7 +55,6 @@ fn main() -> Result<(), Box<dyn Error>> {
     Ok(())
 }
 
-
 // each nested type of LogLine should define its own value_test function
 // that asserts values are within an expected set of values when possible.
 trait ValueTest {
@@ -68,11 +73,11 @@ struct LogLine {
     level: String,
     invocation_id: String,
     thread_name: String,
-    data: serde_json::Value,      // TODO be more specific
+    data: serde_json::Value, // TODO be more specific
 }
 
 impl ValueTest for LogLine {
-    fn value_test(&self){
+    fn value_test(&self) {
         assert_eq!(
             self.log_version, 2,
             "The log version changed. Be sure this was intentional."
@@ -99,7 +104,7 @@ impl ValueTest for LogLine {
 // this requires handling the date with "%Y-%m-%dT%H:%M:%S%.6f" which requires this
 // boilerplate-looking module.
 mod custom_date_format {
-    use chrono::{NaiveDateTime, DateTime, Utc};
+    use chrono::{DateTime, NaiveDateTime, Utc};
     use serde::{self, Deserialize, Deserializer, Serializer};
 
     const FORMAT: &'static str = "%Y-%m-%dT%H:%M:%S%.6fZ";
@@ -117,7 +122,10 @@ mod custom_date_format {
         D: Deserializer<'de>,
     {
         let s = String::deserialize(deserializer)?;
-        Ok(DateTime::<Utc>::from_utc(NaiveDateTime::parse_from_str(&s, FORMAT).map_err(serde::de::Error::custom)?, Utc))
+        Ok(DateTime::<Utc>::from_utc(
+            NaiveDateTime::parse_from_str(&s, FORMAT).map_err(serde::de::Error::custom)?,
+            Utc,
+        ))
     }
 }
 
@@ -134,16 +142,26 @@ fn get_input(path: &str, file_name: &str) -> Result<Vec<String>, String> {
             let f_name = e.file_name().to_string_lossy();
             if f_name.ends_with(file_name) {
                 let contents = File::open(e.path())
-                    .map_err(|e| format!("Something went wrong opening the log file {}\n{}", f_name, e))
+                    .map_err(|e| {
+                        format!(
+                            "Something went wrong opening the log file {}\n{}",
+                            f_name, e
+                        )
+                    })
                     .and_then(|file| {
                         io::BufReader::new(file)
                             .lines()
                             .map(|l| {
-                                l.map_err(|e| format!("Something went wrong reading lines of the log file {}\n{}", f_name, e))
+                                l.map_err(|e| {
+                                    format!(
+                                        "Something went wrong reading lines of the log file {}\n{}",
+                                        f_name, e
+                                    )
+                                })
                             })
                             .collect::<Result<Vec<String>, String>>()
                     });
-                    
+
                 Some(contents)
             } else {
                 None
@@ -166,69 +184,91 @@ fn deserialized_input(log_lines: &[String]) -> serde_json::Result<Vec<LogLine>>
         .collect()
 }
 
-// turn a String into a LogLine and back into a String returning both Strings so
-// they can be compared
-fn deserialize_serialize_loop(
-    log_lines: &[String],
-) -> serde_json::Result<Vec<(String, String)>> {
-    log_lines
-        .into_iter()
-        .map(|log_line| {
-            serde_json::from_str::<LogLine>(log_line).and_then(|parsed| {
-                serde_json::to_string(&parsed).map(|json| (log_line.clone(), json))
-            })
-        })
-        .collect()
+// Take a json string, deserialize it to a known value, and a generic json value.
+// return the two json values so they can be compared.
+// this helps to check if the deserialize-serialize loop for the known value drops necessary information.
+//
+// This function is used as a helper to check every json logline that dbt outputs for expected values.
+// In practice, this often returns an Err value if the logs have unexpected non-json values such as logged
+// exceptions, warnings, or printed statements.
+fn deserialize_twice<'a, T: Serialize + Deserialize<'a>>(
+    json_str: &'a str,
+) -> Result<(serde_json::Value, serde_json::Value), String> {
+    // deserialize the string into a JSON value with no knowledge of T's structure
+    let deserialized_json =
+        serde_json::from_str::<serde_json::Value>(json_str).map_err(|_| json_str)?;
+
+    // deserialize the string into a T
+    let deserialized_t = serde_json::from_str::<'a, T>(json_str).map_err(|_| json_str)?;
+
+    // serialize the T value into a string again
+    let serialized_t = serde_json::to_string(&deserialized_t).map_err(|_| json_str)?;
+
+    // deserialize the string into a JSON value
+    let deserialized_t_json =
+        serde_json::from_str::<serde_json::Value>(&serialized_t).map_err(|_| json_str)?;
+
+    Ok((deserialized_json, deserialized_t_json))
+}
+
+// This is only used to make more useful error messages and avoid assert statements that say that two massive
+// json objects are not the same. this will dig into the values to find an inner json value where they differ.
+fn compare_json(x: &serde_json::Value, y: &serde_json::Value) -> Result<(), serde_json::Value> {
+    if x == y {
+        return Ok(());
+    }
+
+    match (x, y) {
+        // check for object key mismatches
+        (serde_json::Value::Object(mx), serde_json::Value::Object(my)) => {
+            // check if the second map is missing keys from the first
+            for (xk, xv) in mx {
+                if !my.contains_key(xk) {
+                    let mut m_err = serde_json::Map::new();
+                    m_err.insert(xk.clone(), xv.clone());
+                    return Err(serde_json::Value::Object(m_err));
+                }
+            }
+
+            // check if the first map is missing keys from the second
+            for (yk, yv) in my {
+                if !mx.contains_key(yk) {
+                    let mut m_err = serde_json::Map::new();
+                    m_err.insert(yk.clone(), yv.clone());
+                    return Err(serde_json::Value::Object(m_err));
+                }
+            }
+
+            // all the keys are the same, so compare each value recursively.
+            for k in mx.keys() {
+                // unwrapping (which panics) since we know all these keys exist in both maps.
+                let xv = mx.get(k).unwrap();
+                let yv = my.get(k).unwrap();
+
+                compare_json(xv, yv)?;
+            }
+
+            // only reaches here if all the keys are the same and all of the values are the same
+            // and the top-level equality check on the map failed.
+            Ok(())
+        }
+
+        // must be a non-object json value. since there are no keys to specify, return the first value.
+        (x, _) => Err(x.clone()),
+    }
 }
 
 // make sure when we read a string in then output it back to a string the two strings
 // contain all the same key-value pairs.
 fn test_deserialize_serialize_is_unchanged(lines: &[String]) {
-    let objects: Result<Vec<(serde_json::Value, serde_json::Value)>, serde_json::Error> =
-        deserialize_serialize_loop(lines).and_then(|v| {
-            v.into_iter()
-                .map(|(s0, s1)| {
-                    serde_json::from_str::<serde_json::Value>(&s0).and_then(|s0v| {
-                        serde_json::from_str::<serde_json::Value>(&s1).map(|s1v| (s0v, s1v))
-                    })
-                })
-                .collect()
-        });
-
-    match objects {
-        Err(e) => assert!(false, "{}", e),
-        Ok(v) => {
-            for pair in v {
-                match pair {
-                    (
-                        serde_json::Value::Object(original),
-                        serde_json::Value::Object(looped),
-                    ) => {
-                        // looping through each key of each json value gives us meaningful failure messages
-                        // instead of "this big string" != "this other big string"
-                        for (key, value) in original.clone() {
-                            let looped_val = looped.get(&key);
-                            assert_eq!(
-                                looped_val,
-                                Some(&value),
-                                "original key value ({}, {}) expected in re-serialized result",
-                                key,
-                                value
-                            )
-                        }
-                        for (key, value) in looped.clone() {
-                            let original_val = original.get(&key);
-                            assert_eq!(
-                                original_val,
-                                Some(&value),
-                                "looped key value ({}, {}) not found in original result",
-                                key,
-                                value
-                            )
-                        }
-                    }
-                    _ => assert!(false, "not comparing json objects"),
-                }
+    for line in lines {
+        match deserialize_twice::<LogLine>(line) {
+            // error if there are not two values to compare
+            Err(log_line) => assert!(false, "Logline cannot be deserialized into a json LogLine twice for value comparison:\n{}\n", log_line),
+            // if there are two values to compare, assert they are the same or find the key-value pair where they differ.
+            Ok((x, y)) => match compare_json(&x, &y) {
+                Err(json_value) => assert!(false, "LogLine values were inconsistent.\nSpecific difference:\n{}\nWhole log line:\n{}", json_value, line),
+                Ok(()) => ()
             }
         }
     }
@@ -247,9 +287,9 @@ mod tests {
 
     #[test]
     fn test_values() {
-        assert!(deserialized_input(&[LOG_LINE.to_owned()]).map(|v| {
-            v.into_iter().map(|ll| ll.value_test())
-        }).is_ok())
+        assert!(deserialized_input(&[LOG_LINE.to_owned()])
+            .map(|v| { v.into_iter().map(|ll| ll.value_test()) })
+            .is_ok())
     }
 
     #[test]