From a07fd1d13fa5bd89606bf6f2ae23574cbd5db08b Mon Sep 17 00:00:00 2001
From: Michael Constant
Date: Mon, 18 Nov 2024 02:38:36 -0800
Subject: [PATCH 1/3] Compaction also needs to use allocate_non_transactional()

If we relocate the region tracker page during compaction and then the
transaction aborts, the allocation of the new tracker page gets rolled
back, but the free of the old tracker page doesn't! This leaves us in
an inconsistent state with no tracker page allocated. So we need to use
allocate_non_transactional() here, which guarantees the page will
remain allocated even if the rest of the transaction rolls back.
---
 src/tree_store/page_store/page_manager.rs | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs
index b441602b..f0dd6631 100644
--- a/src/tree_store/page_store/page_manager.rs
+++ b/src/tree_store/page_store/page_manager.rs
@@ -539,7 +539,7 @@ impl TransactionalMemory {
             // Allocate a larger tracker page
             self.free(tracker_page);
             tracker_page = self
-                .allocate_non_transactional(tracker_len)?
+                .allocate_non_transactional(tracker_len, false)?
                 .get_page_number();
             let mut state = self.state.lock().unwrap();
             state.header.set_region_tracker(tracker_page);
@@ -559,10 +559,8 @@ impl TransactionalMemory {
         let old_tracker_page = state.header.region_tracker();
         // allocate acquires this lock, so we need to drop it
         drop(state);
-        // Allocate the new page. Unlike other region-tracker allocations, this happens inside
-        // a transaction, so we use an ordinary allocation (which gets committed or rolled back
-        // along with the rest of the transaction) rather than allocate_non_transactional()
-        let new_page = self.allocate_lowest(region_tracker_size.try_into().unwrap())?;
+        let new_page =
+            self.allocate_non_transactional(region_tracker_size.try_into().unwrap(), true)?;
         if new_page.get_page_number().is_before(old_tracker_page) {
             let mut state = self.state.lock().unwrap();
             state.header.set_region_tracker(new_page.get_page_number());
@@ -1110,8 +1108,8 @@ impl TransactionalMemory {
 
     // Allocate a page not associated with any transaction. The page is immediately considered committed,
     // and won't be rolled back if an abort happens. This is only used for the region tracker
-    fn allocate_non_transactional(&self, allocation_size: usize) -> Result<PageMut> {
-        self.allocate_helper(allocation_size, false, false)
+    fn allocate_non_transactional(&self, allocation_size: usize, lowest: bool) -> Result<PageMut> {
+        self.allocate_helper(allocation_size, lowest, false)
     }
 
     pub(crate) fn count_allocated_pages(&self) -> Result<u64> {
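Note on PATCH 1/3: the failure mode is easy to reproduce in miniature. The
sketch below is a standalone toy model, not redb's actual API: it keeps a set
of allocated pages plus an undo set for the current transaction, and shows
that freeing the old tracker page and allocating the new one transactionally
leaves no tracker page at all after an abort, while a non-transactional
allocation survives.

    use std::collections::HashSet;

    struct Allocator {
        allocated: HashSet<u64>,
        // Pages allocated inside the current transaction; undone on abort.
        uncommitted: HashSet<u64>,
    }

    impl Allocator {
        // Ordinary allocation: rolled back if the transaction aborts.
        fn allocate(&mut self, page: u64) {
            self.allocated.insert(page);
            self.uncommitted.insert(page);
        }

        // Immediately considered committed; never rolled back.
        fn allocate_non_transactional(&mut self, page: u64) {
            self.allocated.insert(page);
        }

        // Freeing a pre-existing page is NOT undone on abort.
        fn free(&mut self, page: u64) {
            self.allocated.remove(&page);
            self.uncommitted.remove(&page);
        }

        fn abort(&mut self) {
            for page in std::mem::take(&mut self.uncommitted) {
                self.allocated.remove(&page);
            }
        }
    }

    fn main() {
        let mut a = Allocator {
            allocated: HashSet::from([1]),
            uncommitted: HashSet::new(),
        };

        // Relocate the tracker page (1 -> 2) with a transactional allocation:
        a.free(1);
        a.allocate(2);
        a.abort();
        assert!(a.allocated.is_empty()); // the bug: no tracker page at all

        // A non-transactional allocation survives the abort:
        a.allocate_non_transactional(3);
        a.abort();
        assert!(a.allocated.contains(&3));
    }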
From af49d40962d44c08d25d3d7454ca5876df2546c7 Mon Sep 17 00:00:00 2001
From: Michael Constant
Date: Mon, 18 Nov 2024 16:48:11 -0800
Subject: [PATCH 2/3] Don't flush allocator state after a failed repair

If repair fails, our allocator state is inconsistent; don't try to
write it to disk
---
 src/tree_store/page_store/page_manager.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs
index f0dd6631..3aeac228 100644
--- a/src/tree_store/page_store/page_manager.rs
+++ b/src/tree_store/page_store/page_manager.rs
@@ -1129,7 +1129,7 @@ impl TransactionalMemory {
 
 impl Drop for TransactionalMemory {
     fn drop(&mut self) {
-        if thread::panicking() {
+        if thread::panicking() || self.needs_recovery.load(Ordering::Acquire) {
             return;
         }
 
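Note on PATCH 2/3: the change is the standard early-return guard at the top
of Drop. A condensed sketch of the resulting shape (the needs_recovery field
name follows the patch; the surrounding Store type and the flush step are
stand-ins, not redb's real code):

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::thread;

    struct Store {
        needs_recovery: AtomicBool,
    }

    impl Drop for Store {
        fn drop(&mut self) {
            // If we're unwinding, or a failed repair left the in-memory
            // allocator state inconsistent, writing that state out would
            // persist the damage. Skipping the flush just means repair
            // runs again on the next open.
            if thread::panicking() || self.needs_recovery.load(Ordering::Acquire) {
                return;
            }
            // ...flush allocator state to disk here...
        }
    }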
From 420791eb83f69f794ef437120660673148ad74a5 Mon Sep 17 00:00:00 2001
From: Michael Constant
Date: Mon, 18 Nov 2024 02:38:36 -0800
Subject: [PATCH 3/3] Don't mark the region tracker page as allocated

When writing the allocator state table, it's better not to include the
region tracker page as one of the allocated pages. Instead, we should
allocate the region tracker page during quick-repair after loading the
rest of the allocator state, exactly like we do for full repair.

This avoids an unlikely corruption that could happen for databases
larger than 4 TB, if we crash on shutdown after writing out a new
region tracker page number but before clearing the recovery_required
bit.

It also means that in the future, when we drop support for the old
allocator state format, we'll be able to get rid of the region tracker
page entirely instead of having to keep around some of the allocation
code for compatibility.
---
 src/tree_store/page_store/page_manager.rs | 88 +++++++++++++----------
 1 file changed, 50 insertions(+), 38 deletions(-)

diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs
index 3aeac228..fc653815 100644
--- a/src/tree_store/page_store/page_manager.rs
+++ b/src/tree_store/page_store/page_manager.rs
@@ -361,26 +361,7 @@ impl TransactionalMemory {
     }
 
     pub(crate) fn end_repair(&self) -> Result<()> {
-        let mut state = self.state.lock().unwrap();
-        let tracker_len = state.allocators.region_tracker.to_vec().len();
-        let tracker_page = state.header.region_tracker();
-
-        let allocator = state.get_region_mut(tracker_page.region);
-        // Allocate a new tracker page, if the old one was overwritten or is too small
-        if allocator.is_allocated(tracker_page.page_index, tracker_page.page_order)
-            || tracker_page.page_size_bytes(self.page_size) < tracker_len as u64
-        {
-            drop(state);
-            let new_tracker_page = self.allocate(tracker_len)?.get_page_number();
-
-            let mut state = self.state.lock().unwrap();
-            state.header.set_region_tracker(new_tracker_page);
-            self.write_header(&state.header)?;
-            self.storage.flush(false)?;
-        } else {
-            allocator.record_alloc(tracker_page.page_index, tracker_page.page_order);
-            drop(state);
-        }
+        self.allocate_region_tracker_page()?;
 
         let mut state = self.state.lock().unwrap();
         let tracker_page = state.header.region_tracker();
@@ -436,10 +417,30 @@ impl TransactionalMemory {
         num_regions: u32,
     ) -> Result<bool> {
         // Has the number of regions changed since reserve_allocator_state() was called?
-        if num_regions != self.state.lock().unwrap().header.layout().num_regions() {
+        let state = self.state.lock().unwrap();
+        if num_regions != state.header.layout().num_regions() {
             return Ok(false);
         }
 
+        // Temporarily free the region tracker page, because we don't want to include it in our
+        // recorded allocations
+        let tracker_page = state.header.region_tracker();
+        drop(state);
+        self.free(tracker_page);
+
+        let result = self.try_save_allocator_state_inner(tree, num_regions);
+
+        // Restore the region tracker page
+        self.mark_page_allocated(tracker_page);
+
+        result
+    }
+
+    fn try_save_allocator_state_inner(
+        &self,
+        tree: &mut AllocatorStateTree,
+        num_regions: u32,
+    ) -> Result<bool> {
         for i in 0..num_regions {
             let region_bytes =
                 &self.state.lock().unwrap().allocators.region_allocators[i as usize].to_vec();
@@ -509,10 +510,8 @@ impl TransactionalMemory {
         state.allocators.resize_to(layout);
         drop(state);
 
-        // Allocate a larger region tracker page if necessary. This also happens automatically on
-        // shutdown, but we do it here because we want our allocator state to exactly match the
-        // result of running a full repair
-        self.ensure_region_tracker_page()?;
+        // Allocate a page for the region tracker
+        self.allocate_region_tracker_page()?;
 
         self.state.lock().unwrap().header.recovery_required = false;
         self.needs_recovery.store(false, Ordering::Release);
@@ -527,22 +526,33 @@ impl TransactionalMemory {
         allocator.is_allocated(page.page_index, page.page_order)
     }
 
-    // Make sure we have a large enough region-tracker page. This uses allocate_non_transactional(),
-    // so it should only be called from outside a transaction
-    fn ensure_region_tracker_page(&self) -> Result {
-        let state = self.state.lock().unwrap();
+    // Allocate a page for the region tracker. If possible, this will pick the same page that
+    // was used last time; otherwise it'll pick a new page and update the database header to
+    // match
+    fn allocate_region_tracker_page(&self) -> Result {
+        let mut state = self.state.lock().unwrap();
         let tracker_len = state.allocators.region_tracker.to_vec().len();
-        let mut tracker_page = state.header.region_tracker();
-        drop(state);
+        let tracker_page = state.header.region_tracker();
+
+        let allocator = state.get_region_mut(tracker_page.region);
+        // Pick a new tracker page, if the old one was overwritten or is too small
+        if allocator.is_allocated(tracker_page.page_index, tracker_page.page_order)
+            || tracker_page.page_size_bytes(self.page_size) < tracker_len as u64
+        {
+            drop(state);
 
-        if tracker_page.page_size_bytes(self.page_size) < (tracker_len as u64) {
-            // Allocate a larger tracker page
-            self.free(tracker_page);
-            tracker_page = self
+            let new_tracker_page = self
                 .allocate_non_transactional(tracker_len, false)?
                 .get_page_number();
+
             let mut state = self.state.lock().unwrap();
-            state.header.set_region_tracker(tracker_page);
+            state.header.set_region_tracker(new_tracker_page);
+            self.write_header(&state.header)?;
+            self.storage.flush(false)?;
+        } else {
+            // The old page is available, so just mark it as allocated
+            allocator.record_alloc(tracker_page.page_index, tracker_page.page_order);
+            drop(state);
         }
 
         Ok(())
@@ -1133,8 +1143,10 @@ impl Drop for TransactionalMemory {
             return;
         }
 
-        // Allocate a larger region-tracker page if necessary
-        if self.ensure_region_tracker_page().is_err() {
+        // Reallocate the region tracker page, which will grow it if necessary
+        let tracker_page = self.state.lock().unwrap().header.region_tracker();
+        self.free(tracker_page);
+        if self.allocate_region_tracker_page().is_err() {
            #[cfg(feature = "logging")]
            warn!("Failure while flushing allocator state. Repair required at restart.");
            return;
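Note on PATCH 3/3: the save path boils down to "temporarily free the tracker
page, record the remaining allocations, then re-mark it allocated", while the
load path allocates a tracker page afresh, exactly as full repair does. A toy
model of that round trip (hypothetical types, not redb's AllocatorStateTree):

    use std::collections::HashSet;

    struct Allocators {
        allocated: HashSet<u64>,
        tracker_page: u64,
    }

    impl Allocators {
        // Mirrors try_save_allocator_state(): exclude the tracker page from
        // the recorded allocations, then restore it.
        fn save(&mut self) -> Vec<u64> {
            self.allocated.remove(&self.tracker_page);
            let snapshot: Vec<u64> = self.allocated.iter().copied().collect();
            self.allocated.insert(self.tracker_page); // restore
            snapshot
        }

        // Mirrors the quick-repair load path: restore the saved allocations,
        // then allocate a page for the region tracker.
        fn load(snapshot: &[u64], tracker_page: u64) -> Self {
            let mut allocated: HashSet<u64> = snapshot.iter().copied().collect();
            allocated.insert(tracker_page);
            Allocators {
                allocated,
                tracker_page,
            }
        }
    }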