From 6dd1e1d8207564a44e46e6566649d2e05b3a3415 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Sat, 31 Aug 2024 19:02:04 -0700 Subject: [PATCH 1/4] Refactor visit_all_pages() to be more reusable --- src/db.rs | 8 +-- src/multimap_table.rs | 73 ++++++++++++++++++-- src/tree_store/btree.rs | 110 ++++++++++++++++++++++++++++++ src/tree_store/mod.rs | 4 +- src/tree_store/table_tree.rs | 40 +++++------ src/tree_store/table_tree_base.rs | 63 +++-------------- 6 files changed, 215 insertions(+), 83 deletions(-) diff --git a/src/db.rs b/src/db.rs index c25143e2..55179393 100644 --- a/src/db.rs +++ b/src/db.rs @@ -568,8 +568,8 @@ impl Database { // Chain all the other tables to the master table iter for entry in iter { let definition = entry?.value(); - definition.visit_all_pages(mem.clone(), |page_number| { - assert!(mem.is_allocated(page_number)); + definition.visit_all_pages(mem.clone(), |path| { + assert!(mem.is_allocated(path.page_number())); Ok(()) })?; } @@ -594,9 +594,9 @@ impl Database { // Chain all the other tables to the master table iter for entry in iter { let definition = entry?.value(); - definition.visit_all_pages(mem.clone(), |page_number| { + definition.visit_all_pages(mem.clone(), |path| { // TODO: simplify mark_pages_allocated() - mem.mark_pages_allocated([Ok(page_number)].into_iter(), allow_duplicates)?; + mem.mark_pages_allocated([Ok(path.page_number())].into_iter(), allow_duplicates)?; Ok(()) })?; } diff --git a/src/multimap_table.rs b/src/multimap_table.rs index 3985867b..34cf5be8 100644 --- a/src/multimap_table.rs +++ b/src/multimap_table.rs @@ -5,8 +5,8 @@ use crate::table::{ReadableTableMetadata, TableStats}; use crate::tree_store::{ btree_stats, AllPageNumbersBtreeIter, BranchAccessor, BranchMutator, Btree, BtreeHeader, BtreeMut, BtreeRangeIter, BtreeStats, CachePriority, Checksum, LeafAccessor, LeafMutator, Page, - PageHint, PageNumber, RawBtree, RawLeafBuilder, TransactionalMemory, UntypedBtreeMut, BRANCH, - DEFERRED, LEAF, 
MAX_PAIR_LENGTH, MAX_VALUE_LENGTH, + PageHint, PageNumber, PagePath, RawBtree, RawLeafBuilder, TransactionalMemory, UntypedBtree, + UntypedBtreeMut, BRANCH, DEFERRED, LEAF, MAX_PAIR_LENGTH, MAX_VALUE_LENGTH, }; use crate::types::{Key, TypeName, Value}; use crate::{AccessGuard, MultimapTableHandle, Result, StorageError, WriteTransaction}; @@ -342,7 +342,7 @@ pub(crate) fn finalize_tree_and_subtree_checksums( Ok(tree.get_root()) } -pub(crate) fn parse_subtree_roots( +fn parse_subtree_roots( page: &T, fixed_key_size: Option, fixed_value_size: Option, @@ -372,6 +372,69 @@ pub(crate) fn parse_subtree_roots( } } +pub(crate) struct UntypedMultiBtree { + mem: Arc, + root: Option, + key_width: Option, + value_width: Option, +} + +impl UntypedMultiBtree { + pub(crate) fn new( + root: Option, + mem: Arc, + key_width: Option, + value_width: Option, + ) -> Self { + Self { + mem, + root, + key_width, + value_width, + } + } + + // Applies visitor to pages in the tree + pub(crate) fn visit_all_pages(&self, mut visitor: F) -> Result + where + F: FnMut(&PagePath) -> Result, + { + let tree = UntypedBtree::new( + self.root, + self.mem.clone(), + self.key_width, + UntypedDynamicCollection::fixed_width_with(self.value_width), + ); + tree.visit_all_pages(|path| { + visitor(path)?; + let page = self.mem.get_page(path.page_number())?; + match page.memory()[0] { + LEAF => { + for header in parse_subtree_roots(&page, self.key_width, self.value_width) { + let subtree = UntypedBtree::new( + Some(header), + self.mem.clone(), + self.value_width, + <() as Value>::fixed_width(), + ); + subtree.visit_all_pages(|subpath| { + let full_path = path.with_subpath(subpath); + visitor(&full_path) + })?; + } + } + BRANCH => { + // No-op. 
The tree.visit_pages() call will process this sub-tree + } + _ => unreachable!(), + } + Ok(()) + })?; + + Ok(()) + } +} + pub(crate) struct LeafKeyIter<'a, V: Key + 'static> { inline_collection: AccessGuard<'a, &'static DynamicCollection>, fixed_key_size: Option, @@ -474,7 +537,7 @@ impl Into for DynamicCollectionType { /// See [Exotically Sized Types](https://doc.rust-lang.org/nomicon/exotic-sizes.html#dynamically-sized-types-dsts) /// section of the Rustonomicon for more details. #[repr(transparent)] -pub(crate) struct DynamicCollection { +struct DynamicCollection { _value_type: PhantomData, data: [u8], } @@ -637,7 +700,7 @@ impl DynamicCollection { } #[repr(transparent)] -pub(crate) struct UntypedDynamicCollection { +struct UntypedDynamicCollection { data: [u8], } diff --git a/src/tree_store/btree.rs b/src/tree_store/btree.rs index ae1eb649..17f655f2 100644 --- a/src/tree_store/btree.rs +++ b/src/tree_store/btree.rs @@ -28,6 +28,102 @@ pub(crate) struct BtreeStats { pub(crate) fragmented_bytes: u64, } +pub(crate) struct PagePath { + path: Vec, +} + +impl PagePath { + pub(crate) fn new_root(page_number: PageNumber) -> Self { + Self { + path: vec![page_number], + } + } + + pub(crate) fn with_child(&self, page_number: PageNumber) -> Self { + let mut path = self.path.clone(); + path.push(page_number); + Self { path } + } + + pub(crate) fn with_subpath(&self, other: &Self) -> Self { + let mut path = self.path.clone(); + path.extend(&other.path); + Self { path } + } + + pub(crate) fn n_parent(&self, n: usize) -> Option { + if n > self.path.len() - 1 { + None + } else { + Some(self.path[self.path.len() - 1 - n]) + } + } + + pub(crate) fn page_number(&self) -> PageNumber { + self.path[self.path.len() - 1] + } +} + +pub(crate) struct UntypedBtree { + mem: Arc, + root: Option, + key_width: Option, + _value_width: Option, +} + +impl UntypedBtree { + pub(crate) fn new( + root: Option, + mem: Arc, + key_width: Option, + value_width: Option, + ) -> Self { + Self { + mem, + 
root, + key_width, + _value_width: value_width, + } + } + + // Applies visitor to pages in the tree + pub(crate) fn visit_all_pages(&self, mut visitor: F) -> Result + where + F: FnMut(&PagePath) -> Result, + { + if let Some(page_number) = self.root.map(|x| x.root) { + self.visit_pages_helper(PagePath::new_root(page_number), &mut visitor)?; + } + + Ok(()) + } + + fn visit_pages_helper(&self, path: PagePath, visitor: &mut F) -> Result + where + F: FnMut(&PagePath) -> Result, + { + visitor(&path)?; + let page = self.mem.get_page(path.page_number())?; + + match page.memory()[0] { + LEAF => { + // No-op + } + BRANCH => { + let accessor = BranchAccessor::new(&page, self.key_width); + for i in 0..accessor.count_children() { + let child_page = accessor.child_page(i).unwrap(); + let child_path = path.with_child(child_page); + self.visit_pages_helper(child_path, visitor)?; + } + } + _ => unreachable!(), + } + + Ok(()) + } +} + pub(crate) struct UntypedBtreeMut { mem: Arc, root: Option, @@ -276,6 +372,7 @@ impl BtreeMut<'_, K, V> { Ok(()) } + #[allow(dead_code)] pub(crate) fn all_pages_iter(&self) -> Result> { if let Some(root) = self.root.map(|x| x.root) { Ok(Some(AllPageNumbersBtreeIter::new( @@ -289,6 +386,19 @@ impl BtreeMut<'_, K, V> { } } + pub(crate) fn visit_all_pages(&self, visitor: F) -> Result + where + F: FnMut(&PagePath) -> Result, + { + let tree = UntypedBtree::new( + self.root, + self.mem.clone(), + K::fixed_width(), + V::fixed_width(), + ); + tree.visit_all_pages(visitor) + } + pub(crate) fn get_root(&self) -> Option { self.root } diff --git a/src/tree_store/mod.rs b/src/tree_store/mod.rs index b79bc79d..2fbf524b 100644 --- a/src/tree_store/mod.rs +++ b/src/tree_store/mod.rs @@ -6,7 +6,9 @@ mod page_store; mod table_tree; mod table_tree_base; -pub(crate) use btree::{btree_stats, Btree, BtreeMut, BtreeStats, RawBtree, UntypedBtreeMut}; +pub(crate) use btree::{ + btree_stats, Btree, BtreeMut, BtreeStats, PagePath, RawBtree, UntypedBtree, UntypedBtreeMut, +}; pub 
use btree_base::{AccessGuard, AccessGuardMut}; pub(crate) use btree_base::{ BranchAccessor, BranchMutator, BtreeHeader, Checksum, LeafAccessor, LeafMutator, diff --git a/src/tree_store/table_tree.rs b/src/tree_store/table_tree.rs index d4b4e3b2..e390d02a 100644 --- a/src/tree_store/table_tree.rs +++ b/src/tree_store/table_tree.rs @@ -7,8 +7,8 @@ use crate::tree_store::btree::{btree_stats, UntypedBtreeMut}; use crate::tree_store::btree_base::BtreeHeader; use crate::tree_store::page_store::{new_allocators, BuddyAllocator}; use crate::tree_store::{ - Btree, BtreeMut, BtreeRangeIter, InternalTableDefinition, PageHint, PageNumber, RawBtree, - TableType, TransactionalMemory, + Btree, BtreeMut, BtreeRangeIter, InternalTableDefinition, PageHint, PageNumber, PagePath, + RawBtree, TableType, TransactionalMemory, }; use crate::types::{Key, MutInPlaceValue, TypeName, Value}; use crate::{DatabaseStats, Result}; @@ -291,13 +291,21 @@ impl<'txn> TableTreeMut<'txn> { pub(crate) fn all_referenced_pages(&self) -> Result> { let mut result = new_allocators(self.mem.get_layout()); + self.visit_all_pages(|path| { + let page = path.page_number(); + result[page.region as usize].record_alloc(page.page_index, page.page_order); + Ok(()) + })?; + + Ok(result) + } + + pub(crate) fn visit_all_pages(&self, mut visitor: F) -> Result + where + F: FnMut(&PagePath) -> Result, + { // All the pages in the table tree itself - if let Some(iter) = self.tree.all_pages_iter()? { - for page in iter { - let page = page?; - result[page.region as usize].record_alloc(page.page_index, page.page_order); - } - } + self.tree.visit_all_pages(&mut visitor)?; // All the normal tables for entry in self.list_tables(TableType::Normal)? { @@ -305,10 +313,7 @@ impl<'txn> TableTreeMut<'txn> { .get_table_untyped(&entry, TableType::Normal) .map_err(|e| e.into_storage_error_or_corrupted("Internal corruption"))? 
.unwrap(); - definition.visit_all_pages(self.mem.clone(), |page| { - result[page.region as usize].record_alloc(page.page_index, page.page_order); - Ok(()) - })?; + definition.visit_all_pages(self.mem.clone(), |path| visitor(path))?; } for entry in self.list_tables(TableType::Multimap)? { @@ -316,13 +321,10 @@ impl<'txn> TableTreeMut<'txn> { .get_table_untyped(&entry, TableType::Multimap) .map_err(|e| e.into_storage_error_or_corrupted("Internal corruption"))? .unwrap(); - definition.visit_all_pages(self.mem.clone(), |page| { - result[page.region as usize].record_alloc(page.page_index, page.page_order); - Ok(()) - })?; + definition.visit_all_pages(self.mem.clone(), |path| visitor(path))?; } - Ok(result) + Ok(()) } // Queues an update to the table root @@ -509,8 +511,8 @@ impl<'txn> TableTreeMut<'txn> { ) -> Result { if let Some(definition) = self.get_table_untyped(name, table_type)? { let mut freed_pages = self.freed_pages.lock().unwrap(); - definition.visit_all_pages(self.mem.clone(), |page_number| { - freed_pages.push(page_number); + definition.visit_all_pages(self.mem.clone(), |path| { + freed_pages.push(path.page_number()); Ok(()) })?; drop(freed_pages); diff --git a/src/tree_store/table_tree_base.rs b/src/tree_store/table_tree_base.rs index 9a0a8f1b..590ed95a 100644 --- a/src/tree_store/table_tree_base.rs +++ b/src/tree_store/table_tree_base.rs @@ -1,6 +1,6 @@ -use crate::multimap_table::{parse_subtree_roots, relocate_subtrees, DynamicCollection}; +use crate::multimap_table::{relocate_subtrees, UntypedMultiBtree}; use crate::tree_store::{ - AllPageNumbersBtreeIter, BtreeHeader, PageNumber, TransactionalMemory, UntypedBtreeMut, + BtreeHeader, PageNumber, PagePath, TransactionalMemory, UntypedBtree, UntypedBtreeMut, }; use crate::{Key, Result, TableError, TypeName, Value}; use std::mem::size_of; @@ -183,13 +183,9 @@ impl InternalTableDefinition { Ok(()) } - pub(crate) fn visit_all_pages<'a, F>( - &self, - mem: Arc, - mut visitor: F, - ) -> Result + pub(crate) fn 
visit_all_pages<'a, F>(&self, mem: Arc, visitor: F) -> Result where - F: FnMut(PageNumber) -> Result + 'a, + F: FnMut(&PagePath) -> Result + 'a, { match self { InternalTableDefinition::Normal { @@ -198,18 +194,8 @@ impl InternalTableDefinition { fixed_value_size, .. } => { - if let Some(header) = table_root { - let table_pages_iter = AllPageNumbersBtreeIter::new( - header.root, - *fixed_key_size, - *fixed_value_size, - mem, - )?; - - for page in table_pages_iter { - visitor(page?)?; - } - } + let tree = UntypedBtree::new(*table_root, mem, *fixed_key_size, *fixed_value_size); + tree.visit_all_pages(visitor)?; } InternalTableDefinition::Multimap { table_root, @@ -217,40 +203,9 @@ impl InternalTableDefinition { fixed_value_size, .. } => { - if let Some(header) = table_root { - let table_pages_iter = AllPageNumbersBtreeIter::new( - header.root, - *fixed_key_size, - DynamicCollection::<()>::fixed_width_with(*fixed_value_size), - mem.clone(), - )?; - for page in table_pages_iter { - visitor(page?)?; - } - - let table_pages_iter = AllPageNumbersBtreeIter::new( - header.root, - *fixed_key_size, - DynamicCollection::<()>::fixed_width_with(*fixed_value_size), - mem.clone(), - )?; - for table_page in table_pages_iter { - let page = mem.get_page(table_page?)?; - let subtree_roots = - parse_subtree_roots(&page, *fixed_key_size, *fixed_value_size); - for subtree_header in subtree_roots { - let sub_root_iter = AllPageNumbersBtreeIter::new( - subtree_header.root, - *fixed_value_size, - <()>::fixed_width(), - mem.clone(), - )?; - for page in sub_root_iter { - visitor(page?)?; - } - } - } - } + let tree = + UntypedMultiBtree::new(*table_root, mem, *fixed_key_size, *fixed_value_size); + tree.visit_all_pages(visitor)?; } } From 8bf4304bf3aac734ca5d68d1b688c3bc7fb85392 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Sun, 1 Sep 2024 20:10:07 -0700 Subject: [PATCH 2/4] Make PageNumber Ord implementation explicit --- src/tree_store/page_store/base.rs | 28 
+++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/tree_store/page_store/base.rs b/src/tree_store/page_store/base.rs index 4c91fa1a..6f6cda69 100644 --- a/src/tree_store/page_store/base.rs +++ b/src/tree_store/page_store/base.rs @@ -1,5 +1,6 @@ use crate::tree_store::page_store::cached_file::WritablePage; use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER; +use std::cmp::Ordering; #[cfg(debug_assertions)] use std::collections::HashMap; #[cfg(debug_assertions)] @@ -21,13 +22,38 @@ pub(crate) const MAX_PAGE_INDEX: u32 = 0x000F_FFFF; // highest 5bits: page order exponent // // Assuming a reasonable page size, like 4kiB, this allows for 4kiB * 2^20 * 2^20 = 4PiB of usable space -#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +#[derive(Copy, Clone, Eq, PartialEq, Hash)] pub(crate) struct PageNumber { pub(crate) region: u32, pub(crate) page_index: u32, pub(crate) page_order: u8, } +// PageNumbers are ordered as determined by their starting address in the database file +impl Ord for PageNumber { + fn cmp(&self, other: &Self) -> Ordering { + match self.region.cmp(&other.region) { + Ordering::Less => Ordering::Less, + Ordering::Equal => { + let self_order0 = self.page_index * 2u32.pow(self.page_order as u32); + let other_order0 = other.page_index * 2u32.pow(other.page_order as u32); + assert!( + self_order0 != other_order0 || self.page_order == other.page_order, + "{self:?} overlaps {other:?}, but is not equal" + ); + self_order0.cmp(&other_order0) + } + Ordering::Greater => Ordering::Greater, + } + } +} + +impl PartialOrd for PageNumber { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + impl PageNumber { #[inline(always)] pub(crate) const fn serialized_size() -> usize { From 03877d22fc6824557dce390c4cca9bca88f35cef Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Mon, 2 Sep 2024 13:33:51 -0700 Subject: [PATCH 3/4] Major performance improvement to compact() 
compact() previously moved pages down to lower indices in a semi-random and optimistic way. However, this took O(N^2) time in the size of the database. This changes the compaction algorithm to identify the pages at the end of the database file which can be moved, and only move those pages. This is done iteratively until no more pages can be moved --- src/multimap_table.rs | 38 +++++++----------- src/transactions.rs | 64 +++++++++++++++++++++++++----- src/tree_store/btree.rs | 65 +++++++++++++++---------------- src/tree_store/table_tree.rs | 57 ++++++++++++++++++++++----- src/tree_store/table_tree_base.rs | 5 ++- 5 files changed, 153 insertions(+), 76 deletions(-) diff --git a/src/multimap_table.rs b/src/multimap_table.rs index 34cf5be8..ca037e6f 100644 --- a/src/multimap_table.rs +++ b/src/multimap_table.rs @@ -12,6 +12,7 @@ use crate::types::{Key, TypeName, Value}; use crate::{AccessGuard, MultimapTableHandle, Result, StorageError, WriteTransaction}; use std::borrow::Borrow; use std::cmp::max; +use std::collections::HashMap; use std::convert::TryInto; use std::marker::PhantomData; use std::mem; @@ -197,18 +198,17 @@ pub(crate) fn relocate_subtrees( value_size: Option, mem: Arc, freed_pages: Arc>>, + relocation_map: &HashMap, ) -> Result<(PageNumber, Checksum)> { let old_page = mem.get_page(root.0)?; - let mut new_page = mem.allocate_lowest( - old_page.memory().len(), - CachePriority::default_btree(old_page.memory()), - )?; - + let mut new_page = if let Some(new_page_number) = relocation_map.get(&root.0) { + mem.get_page_mut(*new_page_number)? + } else { + return Ok(root); + }; let new_page_number = new_page.get_page_number(); new_page.memory_mut().copy_from_slice(old_page.memory()); - let mut changed = false; - match old_page.memory()[0] { LEAF => { let accessor = LeafAccessor::new( @@ -234,11 +234,11 @@ pub(crate) fn relocate_subtrees( value_size, <() as Value>::fixed_width(), ); - if tree.relocate()? 
{ + tree.relocate(relocation_map)?; + if sub_root != tree.get_root().unwrap() { let new_collection = UntypedDynamicCollection::make_subtree_data(tree.get_root().unwrap()); mutator.insert(i, true, entry.key(), &new_collection); - changed = true; } } } @@ -255,29 +255,21 @@ pub(crate) fn relocate_subtrees( value_size, mem.clone(), freed_pages.clone(), + relocation_map, )?; mutator.write_child_page(i, new_child, new_checksum); - if new_child != child { - changed = true; - } } } } _ => unreachable!(), } - if changed || new_page_number.is_before(old_page.get_page_number()) { - let old_page_number = old_page.get_page_number(); - drop(old_page); - if !mem.free_if_uncommitted(old_page_number) { - freed_pages.lock().unwrap().push(old_page_number); - } - Ok((new_page_number, DEFERRED)) - } else { - drop(new_page); - mem.free(new_page_number); - Ok(root) + let old_page_number = old_page.get_page_number(); + drop(old_page); + if !mem.free_if_uncommitted(old_page_number) { + freed_pages.lock().unwrap().push(old_page_number); } + Ok((new_page_number, DEFERRED)) } // Finalize all the checksums in the tree, including any Dynamic collection subtrees diff --git a/src/transactions.rs b/src/transactions.rs index 7bd7def2..23662e68 100644 --- a/src/transactions.rs +++ b/src/transactions.rs @@ -5,9 +5,9 @@ use crate::sealed::Sealed; use crate::table::ReadOnlyUntypedTable; use crate::transaction_tracker::{SavepointId, TransactionId, TransactionTracker}; use crate::tree_store::{ - Btree, BtreeHeader, BtreeMut, FreedPageList, FreedTableKey, InternalTableDefinition, PageHint, - PageNumber, SerializedSavepoint, TableTree, TableTreeMut, TableType, TransactionalMemory, - MAX_PAIR_LENGTH, MAX_VALUE_LENGTH, + Btree, BtreeHeader, BtreeMut, CachePriority, FreedPageList, FreedTableKey, + InternalTableDefinition, Page, PageHint, PageNumber, SerializedSavepoint, TableTree, + TableTreeMut, TableType, TransactionalMemory, MAX_PAIR_LENGTH, MAX_VALUE_LENGTH, }; use crate::types::{Key, Value}; use 
crate::{ @@ -20,7 +20,7 @@ use crate::{ use log::{debug, warn}; use std::borrow::Borrow; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; use std::marker::PhantomData; use std::ops::RangeBounds; @@ -30,6 +30,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use std::{panic, thread}; +const MAX_PAGES_PER_COMPACTION: usize = 1_000_000; const NEXT_SAVEPOINT_TABLE: SystemTableDefinition<(), SavepointId> = SystemTableDefinition::new("next_savepoint_id"); pub(crate) const SAVEPOINT_TABLE: SystemTableDefinition = @@ -1160,18 +1161,63 @@ impl WriteTransaction { progress = true; } - // Relocate the btree pages + // Find the 1M highest pages + let mut highest_pages = BTreeMap::new(); let mut tables = self.tables.lock().unwrap(); let table_tree = &mut tables.table_tree; - if table_tree.compact_tables()? { - progress = true; - } + table_tree.highest_index_pages(MAX_PAGES_PER_COMPACTION, &mut highest_pages)?; let mut system_tables = self.system_tables.lock().unwrap(); let system_table_tree = &mut system_tables.table_tree; - if system_table_tree.compact_tables()? { + system_table_tree.highest_index_pages(MAX_PAGES_PER_COMPACTION, &mut highest_pages)?; + + // Calculate how many of them can be relocated to lower pages, starting from the last page + let mut relocation_map = HashMap::new(); + for path in highest_pages.into_values().rev() { + if relocation_map.contains_key(&path.page_number()) { + continue; + } + let old_page = self.mem.get_page(path.page_number())?; + let mut new_page = self.mem.allocate_lowest( + old_page.memory().len(), + CachePriority::default_btree(old_page.memory()), + )?; + let new_page_number = new_page.get_page_number(); + // We have to copy at least the page type into the new page. 
+ // Otherwise its cache priority will be calculated incorrectly + new_page.memory_mut()[0] = old_page.memory()[0]; + drop(new_page); + // We're able to move this to a lower page, so insert it and rewrite all its parents + if new_page_number < path.page_number() { + relocation_map.insert(path.page_number(), new_page_number); + for parent in path.parents() { + if relocation_map.contains_key(parent) { + continue; + } + let old_parent = self.mem.get_page(*parent)?; + let mut new_page = self.mem.allocate_lowest( + old_parent.memory().len(), + CachePriority::default_btree(old_parent.memory()), + )?; + let new_page_number = new_page.get_page_number(); + // We have to copy at least the page type into the new page. + // Otherwise its cache priority will be calculated incorrectly + new_page.memory_mut()[0] = old_parent.memory()[0]; + drop(new_page); + relocation_map.insert(*parent, new_page_number); + } + } else { + self.mem.free(new_page_number); + break; + } + } + + if !relocation_map.is_empty() { progress = true; } + table_tree.relocate_tables(&relocation_map)?; + system_table_tree.relocate_tables(&relocation_map)?; + Ok(progress) } diff --git a/src/tree_store/btree.rs b/src/tree_store/btree.rs index 17f655f2..03d62553 100644 --- a/src/tree_store/btree.rs +++ b/src/tree_store/btree.rs @@ -5,7 +5,7 @@ use crate::tree_store::btree_base::{ }; use crate::tree_store::btree_iters::BtreeExtractIf; use crate::tree_store::btree_mutator::MutateHelper; -use crate::tree_store::page_store::{CachePriority, Page, PageImpl, PageMut, TransactionalMemory}; +use crate::tree_store::page_store::{Page, PageImpl, PageMut, TransactionalMemory}; use crate::tree_store::{ AccessGuardMut, AllPageNumbersBtreeIter, BtreeRangeIter, PageHint, PageNumber, }; @@ -15,6 +15,7 @@ use crate::{AccessGuard, Result}; use log::trace; use std::borrow::Borrow; use std::cmp::max; +use std::collections::HashMap; use std::marker::PhantomData; use std::ops::RangeBounds; use std::sync::{Arc, Mutex}; @@ -28,6 +29,7 @@ 
pub(crate) struct BtreeStats { pub(crate) fragmented_bytes: u64, } +#[derive(Clone)] pub(crate) struct PagePath { path: Vec, } @@ -51,12 +53,8 @@ impl PagePath { Self { path } } - pub(crate) fn n_parent(&self, n: usize) -> Option { - if n > self.path.len() - 1 { - None - } else { - Some(self.path[self.path.len() - 1 - n]) - } + pub(crate) fn parents(&self) -> &[PageNumber] { + &self.path[..self.path.len() - 1] } pub(crate) fn page_number(&self) -> PageNumber { @@ -261,10 +259,14 @@ impl UntypedBtreeMut { Ok(()) } - // Relocate the btree to lower pages - pub(crate) fn relocate(&mut self) -> Result { + pub(crate) fn relocate( + &mut self, + relocation_map: &HashMap, + ) -> Result { if let Some(root) = self.get_root() { - if let Some((new_root, new_checksum)) = self.relocate_helper(root.root)? { + if let Some((new_root, new_checksum)) = + self.relocate_helper(root.root, relocation_map)? + { self.root = Some(BtreeHeader::new(new_root, new_checksum, root.length)); return Ok(true); } @@ -272,22 +274,21 @@ impl UntypedBtreeMut { Ok(false) } - // Relocates the given page to a lower page if possible, and returns the new page number + // Relocates the given subtree to the pages specified in relocation_map fn relocate_helper( &mut self, page_number: PageNumber, + relocation_map: &HashMap, ) -> Result> { let old_page = self.mem.get_page(page_number)?; - let mut new_page = self.mem.allocate_lowest( - old_page.memory().len(), - CachePriority::default_btree(old_page.memory()), - )?; - let new_page_number = new_page.get_page_number(); - + let mut new_page = if let Some(new_page_number) = relocation_map.get(&page_number) { + self.mem.get_page_mut(*new_page_number)? 
+ } else { + return Ok(None); + }; new_page.memory_mut().copy_from_slice(old_page.memory()); let node_mem = old_page.memory(); - let mut changed = false; match node_mem[0] { LEAF => { // No-op @@ -297,27 +298,22 @@ impl UntypedBtreeMut { let mut mutator = BranchMutator::new(&mut new_page); for i in 0..accessor.count_children() { let child = accessor.child_page(i).unwrap(); - if let Some((new_child, new_checksum)) = self.relocate_helper(child)? { + if let Some((new_child, new_checksum)) = + self.relocate_helper(child, relocation_map)? + { mutator.write_child_page(i, new_child, new_checksum); - changed = true; } } } _ => unreachable!(), } - if changed || new_page_number.is_before(page_number) { - let mut freed_pages = self.freed_pages.lock().unwrap(); - if !self.mem.free_if_uncommitted(page_number) { - freed_pages.push(page_number); - } - - Ok(Some((new_page_number, DEFERRED))) - } else { - drop(new_page); - self.mem.free(new_page_number); - Ok(None) + let mut freed_pages = self.freed_pages.lock().unwrap(); + if !self.mem.free_if_uncommitted(page_number) { + freed_pages.push(page_number); } + + Ok(Some((new_page.get_page_number(), DEFERRED))) } } @@ -403,7 +399,10 @@ impl BtreeMut<'_, K, V> { self.root } - pub(crate) fn relocate(&mut self) -> Result { + pub(crate) fn relocate( + &mut self, + relocation_map: &HashMap, + ) -> Result { let mut tree = UntypedBtreeMut::new( self.get_root(), self.mem.clone(), @@ -411,7 +410,7 @@ impl BtreeMut<'_, K, V> { K::fixed_width(), V::fixed_width(), ); - if tree.relocate()? { + if tree.relocate(relocation_map)? 
{ self.root = tree.get_root(); Ok(true) } else { diff --git a/src/tree_store/table_tree.rs b/src/tree_store/table_tree.rs index e390d02a..b13a8610 100644 --- a/src/tree_store/table_tree.rs +++ b/src/tree_store/table_tree.rs @@ -13,7 +13,7 @@ use crate::tree_store::{ use crate::types::{Key, MutInPlaceValue, TypeName, Value}; use crate::{DatabaseStats, Result}; use std::cmp::max; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::mem; use std::mem::size_of; use std::ops::RangeFull; @@ -553,8 +553,13 @@ impl<'txn> TableTreeMut<'txn> { } } - pub(crate) fn compact_tables(&mut self) -> Result { - let mut progress = false; + // Adds the paths of all pages reachable from this table tree to `output`, retaining only the n entries closest to the end of the database + // `output` is keyed by page number, so it stays sorted according to PageNumber's Ord + pub(crate) fn highest_index_pages( + &self, + n: usize, + output: &mut BTreeMap, + ) -> Result { for entry in self.tree.range::(&(..))? { let entry = entry?; let mut definition = entry.value(); @@ -564,20 +569,52 @@ impl<'txn> TableTreeMut<'txn> { definition.set_header(*updated_root, *updated_length); } - if let Some(new_root) = - definition.relocate_tree(self.mem.clone(), self.freed_pages.clone())? + definition.visit_all_pages(self.mem.clone(), |path| { + output.insert(path.page_number(), path.clone()); + while output.len() > n { + output.pop_first(); + } + Ok(()) + })?; + } + + self.tree.visit_all_pages(|path| { + output.insert(path.page_number(), path.clone()); + while output.len() > n { + output.pop_first(); + } + Ok(()) + })?; + + Ok(()) + } + + pub(crate) fn relocate_tables( + &mut self, + relocation_map: &HashMap, + ) -> Result { + for entry in self.tree.range::(&(..))? 
{ + let entry = entry?; + let mut definition = entry.value(); + if let Some((updated_root, updated_length)) = + self.pending_table_updates.get(entry.key()) { - progress = true; + definition.set_header(*updated_root, *updated_length); + } + + if let Some(new_root) = definition.relocate_tree( + self.mem.clone(), + self.freed_pages.clone(), + relocation_map, + )? { self.pending_table_updates .insert(entry.key().to_string(), (new_root, definition.get_length())); } } - if self.tree.relocate()? { - progress = true; - } + self.tree.relocate(relocation_map)?; - Ok(progress) + Ok(()) } pub fn stats(&self) -> Result { diff --git a/src/tree_store/table_tree_base.rs b/src/tree_store/table_tree_base.rs index 590ed95a..19bb13c3 100644 --- a/src/tree_store/table_tree_base.rs +++ b/src/tree_store/table_tree_base.rs @@ -3,6 +3,7 @@ use crate::tree_store::{ BtreeHeader, PageNumber, PagePath, TransactionalMemory, UntypedBtree, UntypedBtreeMut, }; use crate::{Key, Result, TableError, TypeName, Value}; +use std::collections::HashMap; use std::mem::size_of; use std::sync::{Arc, Mutex}; @@ -216,6 +217,7 @@ impl InternalTableDefinition { &mut self, mem: Arc, freed_pages: Arc>>, + relocation_map: &HashMap, ) -> Result>> { let original_root = self.private_get_root(); let relocated_root = match self { @@ -233,6 +235,7 @@ impl InternalTableDefinition { *fixed_value_size, mem.clone(), freed_pages.clone(), + relocation_map, )?; Some(BtreeHeader::new(page_number, checksum, header.length)) } else { @@ -247,7 +250,7 @@ impl InternalTableDefinition { self.private_get_fixed_key_size(), self.private_get_fixed_value_size(), ); - tree.relocate()?; + tree.relocate(relocation_map)?; if tree.get_root() != original_root { self.set_header(tree.get_root(), self.get_length()); Ok(Some(tree.get_root())) From e5804a6cdc4f03c66f804eca934b22ffee4027ac Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Mon, 2 Sep 2024 13:36:20 -0700 Subject: [PATCH 4/4] Rename "heed" benchmark backend to "lmdb" heed is a 
wrapper around lmdb --- benches/common.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/common.rs b/benches/common.rs index 00b0f114..c7d663f8 100644 --- a/benches/common.rs +++ b/benches/common.rs @@ -340,7 +340,7 @@ impl<'a> BenchDatabase for HeedBenchDatabase<'a> { type R<'db> = HeedBenchReadTransaction<'db> where Self: 'db; fn db_type_name() -> &'static str { - "heed" + "lmdb" } fn write_transaction(&self) -> Self::W<'_> {