From f9c78265f4f28a9df6e1f595c3a60029596aeb11 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Fri, 6 Oct 2023 20:00:22 +0000 Subject: [PATCH 1/5] Don't unwrap when we can't create a region If we can't create a dataset for a region, don't panic, just fail. --- agent/src/main.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index 0e2a37bf6..46529d6a4 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -1553,7 +1553,7 @@ fn worker( * then we finish up destroying the region. */ match &r.state { - State::Requested => { + State::Requested => 'requested: { /* * Compute the actual size required for a full region, * then add our metadata overhead to that. @@ -1574,14 +1574,25 @@ fn worker( ); // if regions need to be created, do that before apply_smf. - let region_dataset = regions_dataset + let region_dataset = match regions_dataset .ensure_child_dataset( &r.id.0, Some(reservation), Some(quota), &log, - ) - .unwrap(); + ) { + Ok(region_dataset) => region_dataset, + Err(e) => { + error!( + log, + "Dataset {} creation failed: {}", + &r.id.0, + e, + ); + df.fail(&r.id); + break 'requested; + } + }; // It's important that a region transition to "Created" // only after it has been created as a dataset: From beef95c73ce7ae9fc5cd6c20da4dc5c2f50f5d27 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Mon, 9 Oct 2023 17:31:16 +0000 Subject: [PATCH 2/5] Fewer unwraps --- agent/src/main.rs | 59 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index 46529d6a4..6d07ea25b 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -1573,7 +1573,8 @@ fn worker( quota, ); - // if regions need to be created, do that before apply_smf. + // If regions need to be created, do that before + // apply_smf. let region_dataset = match regions_dataset .ensure_child_dataset( &r.id.0, @@ -1594,6 +1595,20 @@ fn worker( } }; + let dataset_path = match region_dataset.path() { + Ok(dataset_path) => dataset_path, + Err(e) => { + error!( + log, + "Failed to find path for dataset {}: {}", + &r.id.0, + e, + ); + df.fail(&r.id); + break 'requested; + } + }; + // It's important that a region transition to "Created" // only after it has been created as a dataset: // after the crucible agent restarts, `apply_smf` will @@ -1607,7 +1622,7 @@ fn worker( &log, &downstairs_program, &r, - ®ion_dataset.path().unwrap(), + &dataset_path, ) .and_then(|_| df.created(&r.id)); @@ -1623,7 +1638,7 @@ fn worker( let result = apply_smf( &log, &df, - regions_dataset.path().unwrap(), + dataset_path, &downstairs_prefix, &snapshot_prefix, ); @@ -1635,12 +1650,25 @@ fn worker( } } - State::Tombstoned => { + State::Tombstoned => 'tombstoned: { + let dataset_path = match regions_dataset.path() { + Ok(dataset_path) => dataset_path, + Err(e) => { + error!( + log, + "Cannot get path on tombstoned dataset {}: {}", + &r.id.0, + e, + ); + df.fail(&r.id); + break 'tombstoned; + } + }; info!(log, "applying SMF actions before removal..."); let result = apply_smf( &log, &df, - regions_dataset.path().unwrap(), + dataset_path, &downstairs_prefix, &snapshot_prefix, ); @@ -1651,11 +1679,22 @@ fn worker( info!(log, "SMF ok!"); } - // After SMF successfully shuts off downstairs, remove zfs - // dataset. - let region_dataset = regions_dataset - .from_child_dataset(&r.id.0) - .unwrap(); + // After SMF successfully shuts off downstairs, remove + // zfs dataset. + let region_dataset = + match regions_dataset.from_child_dataset(&r.id.0) { + Ok(region_dataset) => region_dataset, + Err(e) => { + error!( + log, + "Cannot find region {:?} to remove: {}", + r.id.0, + e, + ); + df.fail(&r.id); + break 'tombstoned; + } + }; let res = worker_region_destroy(&log, &r, region_dataset) From 56704b0a918432ccf89ad7844e7993892d411dd1 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Wed, 11 Oct 2023 00:43:52 +0000 Subject: [PATCH 3/5] Try Tombstoned instead of Failed --- agent/src/datafile.rs | 3 +-- agent/src/main.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs index a57c2b467..24eef0e18 100644 --- a/agent/src/datafile.rs +++ b/agent/src/datafile.rs @@ -453,11 +453,10 @@ impl DataFile { /** * Mark a particular region as failed to provision. */ - pub fn fail(&self, id: &RegionId) { + pub fn fail(&self, id: &RegionId, nstate: State) { let mut inner = self.inner.lock().unwrap(); let r = inner.regions.get_mut(id).unwrap(); - let nstate = State::Failed; if r.state == nstate { return; } diff --git a/agent/src/main.rs b/agent/src/main.rs index 6d07ea25b..84f9a7f31 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -1128,7 +1128,7 @@ mod test { })?; // Pretend creating the region failed - harness.df.fail(®ion_id); + harness.df.fail(®ion_id, State::Failed); // Now call apply_smf harness.apply_smf()?; @@ -1590,7 +1590,7 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id); + df.fail(&r.id, State::Tombstoned); break 'requested; } }; @@ -1604,7 +1604,7 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id); + df.fail(&r.id, State::Failed); break 'requested; } }; @@ -1631,7 +1631,7 @@ fn worker( log, "region {:?} create failed: {:?}", r.id.0, e ); - df.fail(&r.id); + df.fail(&r.id, State::Failed); } info!(log, "applying SMF actions post create..."); @@ -1660,7 +1660,7 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id); + df.fail(&r.id, State::Failed); break 'tombstoned; } }; @@ -1691,7 +1691,7 @@ fn worker( r.id.0, e, ); - df.fail(&r.id); + df.fail(&r.id, State::Failed); break 'tombstoned; } }; @@ -1705,7 +1705,7 @@ fn worker( log, "region {:?} destroy failed: {:?}", r.id.0, e ); - df.fail(&r.id); + df.fail(&r.id, State::Failed); } } _ => { From 1a56db99f0a40a4d4798de9f4b037e0e248602a2 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Wed, 11 Oct 2023 22:20:58 +0000 Subject: [PATCH 4/5] Requested -> Destroyed --- agent/src/datafile.rs | 4 +++- agent/src/main.rs | 25 ++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs index 24eef0e18..15c1df932 100644 --- a/agent/src/datafile.rs +++ b/agent/src/datafile.rs @@ -453,10 +453,11 @@ impl DataFile { /** * Mark a particular region as failed to provision. */ - pub fn fail(&self, id: &RegionId, nstate: State) { + pub fn fail(&self, id: &RegionId) { let mut inner = self.inner.lock().unwrap(); let r = inner.regions.get_mut(id).unwrap(); + let nstate = State::Failed; if r.state == nstate { return; } @@ -612,6 +613,7 @@ impl DataFile { let r = inner.regions.get_mut(id).unwrap(); let nstate = State::Destroyed; match &r.state { + State::Requested => (), State::Tombstoned => (), x => bail!("region to destroy in weird state {:?}", x), } diff --git a/agent/src/main.rs b/agent/src/main.rs index 84f9a7f31..8be2507ae 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -1128,7 +1128,7 @@ mod test { })?; // Pretend creating the region failed - harness.df.fail(®ion_id, State::Failed); + harness.df.fail(®ion_id); // Now call apply_smf harness.apply_smf()?; @@ -1527,7 +1527,6 @@ fn worker( downstairs_prefix: String, snapshot_prefix: String, ) { - // XXX unwraps here ok? loop { /* * This loop fires whenever there's work to do. This work may be: @@ -1590,7 +1589,15 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id, State::Tombstoned); + if let Err(e) = df.destroyed(&r.id) { + error!( + log, + "Dataset {} destruction failed: {}", + &r.id.0, + e, + ); + df.fail(&r.id); + } break 'requested; } }; @@ -1604,7 +1611,7 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id, State::Failed); + df.fail(&r.id); break 'requested; } }; @@ -1631,7 +1638,8 @@ fn worker( log, "region {:?} create failed: {:?}", r.id.0, e ); - df.fail(&r.id, State::Failed); + df.fail(&r.id); + break 'requested; } info!(log, "applying SMF actions post create..."); @@ -1660,7 +1668,7 @@ fn worker( &r.id.0, e, ); - df.fail(&r.id, State::Failed); + df.fail(&r.id); break 'tombstoned; } }; @@ -1691,11 +1699,10 @@ fn worker( r.id.0, e, ); - df.fail(&r.id, State::Failed); + df.fail(&r.id); break 'tombstoned; } }; - let res = worker_region_destroy(&log, &r, region_dataset) .and_then(|_| df.destroyed(&r.id)); @@ -1705,7 +1712,7 @@ fn worker( log, "region {:?} destroy failed: {:?}", r.id.0, e ); - df.fail(&r.id, State::Failed); + df.fail(&r.id); } } _ => { From d65da02f98ab6041c8d00af2b79e9d9aeb5c6950 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Fri, 13 Oct 2023 00:15:48 +0000 Subject: [PATCH 5/5] dont fail at failing --- agent/src/datafile.rs | 22 +++++----------------- agent/src/main.rs | 14 +++----------- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs index 15c1df932..d9dca74c9 100644 --- a/agent/src/datafile.rs +++ b/agent/src/datafile.rs @@ -682,7 +682,7 @@ impl DataFile { * - Already destroyed; no more work to do. */ } - State::Requested | State::Created => { + State::Requested | State::Created | State::Failed => { /* * Schedule the destruction of this region. */ @@ -697,17 +697,6 @@ impl DataFile { self.bell.notify_all(); self.store(inner); } - State::Failed => { - /* - * For now, this terminal state will preserve evidence for - * investigation. - */ - bail!( - "region {} failed to provision and cannot be \ - destroyed", - r.id.0 - ); - } } Ok(()) @@ -773,7 +762,10 @@ impl DataFile { let region = region.unwrap(); match region.state { - State::Requested | State::Destroyed | State::Tombstoned => { + State::Requested + | State::Destroyed + | State::Tombstoned + | State::Failed => { // Either the region hasn't been created yet, or it has been // destroyed or marked to be destroyed (both of which require // that no snapshots exist). Return an empty list. @@ -783,10 +775,6 @@ impl DataFile { State::Created => { // proceed to next section } - - State::Failed => { - bail!("region.state is failed!"); - } } let mut path = self.base_path.to_path_buf(); diff --git a/agent/src/main.rs b/agent/src/main.rs index 8be2507ae..879c4ca3b 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -206,7 +206,7 @@ impl ZFSDataset { if i == 4 { bail!( - "zfs list mountpoint failed! out:{} err:{}", + "zfs destroy dataset failed! out:{} err:{}", out, err ); @@ -1589,15 +1589,7 @@ fn worker( &r.id.0, e, ); - if let Err(e) = df.destroyed(&r.id) { - error!( - log, - "Dataset {} destruction failed: {}", - &r.id.0, - e, - ); - df.fail(&r.id); - } + df.fail(&r.id); break 'requested; } }; @@ -1699,7 +1691,7 @@ fn worker( r.id.0, e, ); - df.fail(&r.id); + let _ = df.destroyed(&r.id); break 'tombstoned; } };