Skip to content

Commit

Permalink
Improve debugging on some error messages
Browse files Browse the repository at this point in the history
In debugging Scheduler errors when developing the Redis scheduler,
adding these messages to the hints helped find the bugs.
  • Loading branch information
allada committed Sep 2, 2024
1 parent a559ca1 commit f053a0c
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 5 deletions.
4 changes: 3 additions & 1 deletion nativelink-scheduler/src/simple_scheduler_state_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,9 @@ where
},
error: Some(err.clone().merge(make_err!(
Code::Internal,
"Job cancelled because it attempted to execute too many times and failed {}",
"Job cancelled because it attempted to execute too many times {} > {} times {}",
awaited_action.attempts,
self.max_job_retries,
format!("for operation_id: {operation_id}, maybe_worker_id: {maybe_worker_id:?}"),
))),
..ActionResult::default()
Expand Down
4 changes: 2 additions & 2 deletions nativelink-scheduler/tests/simple_scheduler_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2008,8 +2008,8 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), Error>
if let ActionStage::Completed(stage) = &mut received_state.stage {
if let Some(real_err) = &mut stage.error {
assert!(
real_err.to_string().contains("Job cancelled because it attempted to execute too many times and failed"),
"{real_err} did not contain 'Job cancelled because it attempted to execute too many times and failed'",
real_err.to_string().contains("Job cancelled because it attempted to execute too many times"),
"{real_err} did not contain 'Job cancelled because it attempted to execute too many times'",
);
*real_err = err;
}
Expand Down
5 changes: 3 additions & 2 deletions nativelink-store/src/ac_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@ pub async fn get_size_and_decode_digest<T: Message + Default + 'static>(
store: &impl StoreLike,
key: impl Into<StoreKey<'_>>,
) -> Result<(T, usize), Error> {
let key = key.into();
// Note: For unknown reasons we appear to be hitting:
// https://github.com/rust-lang/rust/issues/92096
// or a similar issue if we try to use the non-store driver function, so we
// are using the store driver function here.
let mut store_data_resp = store
.as_store_driver_pin()
.get_part_unchunked(key.into(), 0, Some(MAX_ACTION_MSG_SIZE))
.get_part_unchunked(key.borrow(), 0, Some(MAX_ACTION_MSG_SIZE))
.await;
if let Err(err) = &mut store_data_resp {
if err.code == Code::NotFound {
Expand All @@ -74,7 +75,7 @@ pub async fn get_size_and_decode_digest<T: Message + Default + 'static>(
.err_tip_with_code(|e| {
(
Code::NotFound,
format!("Stored value appears to be corrupt: {e}"),
format!("Stored value appears to be corrupt: {e} - {key:?}"),
)
})
.map(|v| (v, store_data_len))
Expand Down
2 changes: 2 additions & 0 deletions nativelink-util/src/action_messages.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ impl std::fmt::Display for ActionUniqueQualifier {
Self::Uncachable(action) => (false, action),
};
f.write_fmt(format_args!(
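// Note: We use underscores because it makes escaping easier
// for redis.
// NOTE(review): the format string below separates fields with '/' and
// '-', not underscores - confirm whether this comment or the format
// string reflects the intended separator.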
"{}/{}/{}-{}/{}",
unique_key.instance_name,
unique_key.digest_function,
Expand Down

0 comments on commit f053a0c

Please sign in to comment.