From d4163f05b3e84a895e60cf87da190eb339f064d1 Mon Sep 17 00:00:00 2001 From: Blaise Bruer Date: Mon, 2 Sep 2024 14:15:56 -0500 Subject: [PATCH] Improve debugging on some error messages In debugging Scheduler errors when developing the Redis scheduler, adding these messages to the hints helped find the bugs. --- nativelink-scheduler/src/simple_scheduler_state_manager.rs | 4 +++- nativelink-scheduler/tests/simple_scheduler_test.rs | 4 ++-- nativelink-store/src/ac_utils.rs | 5 +++-- nativelink-util/src/action_messages.rs | 4 +++- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/nativelink-scheduler/src/simple_scheduler_state_manager.rs b/nativelink-scheduler/src/simple_scheduler_state_manager.rs index e988623825..0d002f984c 100644 --- a/nativelink-scheduler/src/simple_scheduler_state_manager.rs +++ b/nativelink-scheduler/src/simple_scheduler_state_manager.rs @@ -506,7 +506,9 @@ where }, error: Some(err.clone().merge(make_err!( Code::Internal, - "Job cancelled because it attempted to execute too many times and failed {}", + "Job cancelled because it attempted to execute too many times {} > {} times {}", + awaited_action.attempts, + self.max_job_retries, format!("for operation_id: {operation_id}, maybe_worker_id: {maybe_worker_id:?}"), ))), ..ActionResult::default() diff --git a/nativelink-scheduler/tests/simple_scheduler_test.rs b/nativelink-scheduler/tests/simple_scheduler_test.rs index d747707823..f9cc56505f 100644 --- a/nativelink-scheduler/tests/simple_scheduler_test.rs +++ b/nativelink-scheduler/tests/simple_scheduler_test.rs @@ -2008,8 +2008,8 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), Error> if let ActionStage::Completed(stage) = &mut received_state.stage { if let Some(real_err) = &mut stage.error { assert!( - real_err.to_string().contains("Job cancelled because it attempted to execute too many times and failed"), - "{real_err} did not contain 'Job cancelled because it attempted to execute too many times and failed'", + real_err.to_string().contains("Job cancelled because it attempted to execute too many times"), + "{real_err} did not contain 'Job cancelled because it attempted to execute too many times'", ); *real_err = err; } diff --git a/nativelink-store/src/ac_utils.rs b/nativelink-store/src/ac_utils.rs index 5137a27ce9..2915fb55d5 100644 --- a/nativelink-store/src/ac_utils.rs +++ b/nativelink-store/src/ac_utils.rs @@ -51,13 +51,14 @@ pub async fn get_size_and_decode_digest( store: &impl StoreLike, key: impl Into>, ) -> Result<(T, usize), Error> { + let key = key.into(); // Note: For unknown reasons we appear to be hitting: // https://github.com/rust-lang/rust/issues/92096 // or a smiliar issue if we try to use the non-store driver function, so we // are using the store driver function here. let mut store_data_resp = store .as_store_driver_pin() - .get_part_unchunked(key.into(), 0, Some(MAX_ACTION_MSG_SIZE)) + .get_part_unchunked(key.borrow(), 0, Some(MAX_ACTION_MSG_SIZE)) .await; if let Err(err) = &mut store_data_resp { if err.code == Code::NotFound { @@ -74,7 +75,7 @@ pub async fn get_size_and_decode_digest( .err_tip_with_code(|e| { ( Code::NotFound, - format!("Stored value appears to be corrupt: {e}"), + format!("Stored value appears to be corrupt: {e} - {key:?}"), ) }) .map(|v| (v, store_data_len)) diff --git a/nativelink-util/src/action_messages.rs b/nativelink-util/src/action_messages.rs index 5c483a4fe2..a010d73f2e 100644 --- a/nativelink-util/src/action_messages.rs +++ b/nativelink-util/src/action_messages.rs @@ -213,7 +213,9 @@ impl std::fmt::Display for ActionUniqueQualifier { Self::Uncachable(action) => (false, action), }; f.write_fmt(format_args!( - "{}/{}/{}-{}/{}", + // Note: We use underscores because it makes escaping easier + // for redis. + "{}_{}_{}_{}_{}", unique_key.instance_name, unique_key.digest_function, unique_key.digest.hash_str(),