From 25903c5decc67ec493a36b95583648c763e68ffc Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Fri, 24 Mar 2023 10:59:56 -0700 Subject: [PATCH 1/4] Remove mmap backend The userspace cache backend is now within a factor of about 1.5-2x, and I've given up on proving that the mmap backend is sound. It's just too complex and there was already a soundness bug found in 1293d4f --- benches/int_benchmark.rs | 22 +-- benches/large_values_benchmark.rs | 37 +--- benches/lmdb_benchmark.rs | 28 --- benches/multithreaded_insert_benchmark.rs | 4 +- docs/design.md | 8 - fuzz/fuzz_targets/common.rs | 1 - fuzz/fuzz_targets/fuzz_redb.rs | 13 +- src/db.rs | 68 ------- src/tree_store/page_store/base.rs | 11 +- src/tree_store/page_store/header.rs | 4 - src/tree_store/page_store/mmap.rs | 230 ---------------------- src/tree_store/page_store/mmap/unix.rs | 136 ------------- src/tree_store/page_store/mmap/windows.rs | 189 ------------------ src/tree_store/page_store/mod.rs | 1 - src/tree_store/page_store/page_manager.rs | 42 +--- 15 files changed, 15 insertions(+), 779 deletions(-) delete mode 100644 src/tree_store/page_store/mmap.rs delete mode 100644 src/tree_store/page_store/mmap/unix.rs delete mode 100644 src/tree_store/page_store/mmap/windows.rs diff --git a/benches/int_benchmark.rs b/benches/int_benchmark.rs index cc6bbf26..0f24980f 100644 --- a/benches/int_benchmark.rs +++ b/benches/int_benchmark.rs @@ -63,17 +63,6 @@ fn main() { benchmark(table) }; - let redb_mmap_results = { - let tmpfile: NamedTempFile = NamedTempFile::new_in(current_dir().unwrap()).unwrap(); - let db = unsafe { - redb::Database::builder() - .create_mmapped(tmpfile.path()) - .unwrap() - }; - let table = RedbBenchDatabase::new(&db); - benchmark(table) - }; - let lmdb_results = { let tmpfile: TempDir = tempfile::tempdir_in(current_dir().unwrap()).unwrap(); let env = lmdb::Environment::new().open(tmpfile.path()).unwrap(); @@ -112,7 +101,6 @@ fn main() { for results in [ redb_results, - redb_mmap_results, lmdb_results, rocksdb_results, sled_results, @@ -125,15 +113,7 @@ fn main() { let mut table = comfy_table::Table::new(); table.set_width(100); - table.set_header([ - "", - "redb", - "redb-mmap", - "lmdb", - "rocksdb", - "sled", - "sanakirja", - ]); + table.set_header(["", "redb", "lmdb", "rocksdb", "sled", "sanakirja"]); for row in rows { table.add_row(row); } diff --git a/benches/large_values_benchmark.rs b/benches/large_values_benchmark.rs index 09051642..b0f8e8ce 100644 --- a/benches/large_values_benchmark.rs +++ b/benches/large_values_benchmark.rs @@ -88,30 +88,6 @@ fn main() { benchmark(table) }; - let redb_mmap_latency_results = { - let tmpfile: NamedTempFile = NamedTempFile::new_in(current_dir().unwrap()).unwrap(); - let db = unsafe { - redb::Database::builder() - .set_write_strategy(WriteStrategy::Checksum) - .create_mmapped(tmpfile.path()) - .unwrap() - }; - let table = RedbBenchDatabase::new(&db); - benchmark(table) - }; - - let redb_mmap_throughput_results = { - let tmpfile: NamedTempFile = NamedTempFile::new_in(current_dir().unwrap()).unwrap(); - let db = unsafe { - redb::Database::builder() - .set_write_strategy(WriteStrategy::TwoPhase) - .create_mmapped(tmpfile.path()) - .unwrap() - }; - let table = RedbBenchDatabase::new(&db); - benchmark(table) - }; - let lmdb_results = { let tmpfile: TempDir = tempfile::tempdir_in(current_dir().unwrap()).unwrap(); let env = lmdb::Environment::new().open(tmpfile.path()).unwrap(); @@ -143,8 +119,6 @@ fn main() { for results in [ redb_latency_results, redb_throughput_results, - 
redb_mmap_latency_results, - redb_mmap_throughput_results, lmdb_results, rocksdb_results, sled_results, @@ -156,16 +130,7 @@ fn main() { let mut table = comfy_table::Table::new(); table.set_width(100); - table.set_header([ - "", - "redb (1PC+C)", - "redb (2PC)", - "redb (mmap-1PC+C)", - "redb (mmap-2PC)", - "lmdb", - "rocksdb", - "sled", - ]); + table.set_header(["", "redb (1PC+C)", "redb (2PC)", "lmdb", "rocksdb", "sled"]); for row in rows { table.add_row(row); } diff --git a/benches/lmdb_benchmark.rs b/benches/lmdb_benchmark.rs index a9d814c6..c6be6e4d 100644 --- a/benches/lmdb_benchmark.rs +++ b/benches/lmdb_benchmark.rs @@ -302,30 +302,6 @@ fn main() { benchmark(table) }; - let redb_mmap_latency_results = { - let tmpfile: NamedTempFile = NamedTempFile::new_in(&tmpdir).unwrap(); - let db = unsafe { - redb::Database::builder() - .set_write_strategy(WriteStrategy::Checksum) - .create_mmapped(tmpfile.path()) - .unwrap() - }; - let table = RedbBenchDatabase::new(&db); - benchmark(table) - }; - - let redb_mmap_throughput_results = { - let tmpfile: NamedTempFile = NamedTempFile::new_in(&tmpdir).unwrap(); - let db = unsafe { - redb::Database::builder() - .set_write_strategy(WriteStrategy::TwoPhase) - .create_mmapped(tmpfile.path()) - .unwrap() - }; - let table = RedbBenchDatabase::new(&db); - benchmark(table) - }; - let lmdb_results = { let tmpfile: TempDir = tempfile::tempdir_in(&tmpdir).unwrap(); let env = lmdb::Environment::new().open(tmpfile.path()).unwrap(); @@ -367,8 +343,6 @@ fn main() { for results in [ redb_latency_results, redb_throughput_results, - redb_mmap_latency_results, - redb_mmap_throughput_results, lmdb_results, rocksdb_results, sled_results, @@ -385,8 +359,6 @@ fn main() { "", "redb (1PC+C)", "redb (2PC)", - "redb (mmap-1PC+C)", - "redb (mmap-2PC)", "lmdb", "rocksdb", "sled", diff --git a/benches/multithreaded_insert_benchmark.rs b/benches/multithreaded_insert_benchmark.rs index 8be4b006..0361e5c0 100644 --- a/benches/multithreaded_insert_benchmark.rs +++ b/benches/multithreaded_insert_benchmark.rs @@ -33,7 +33,7 @@ fn main() { { let tmpfile: NamedTempFile = NamedTempFile::new_in(current_dir().unwrap()).unwrap(); - let db = unsafe { Database::builder().create_mmapped(tmpfile.path()).unwrap() }; + let db = Database::builder().create(tmpfile.path()).unwrap(); let start = Instant::now(); let write_txn = db.begin_write().unwrap(); @@ -63,7 +63,7 @@ fn main() { { let tmpfile: NamedTempFile = NamedTempFile::new_in(current_dir().unwrap()).unwrap(); - let db = unsafe { Database::builder().create_mmapped(tmpfile.path()).unwrap() }; + let db = Database::builder().create(tmpfile.path()).unwrap(); let start = Instant::now(); let write_txn = db.begin_write().unwrap(); diff --git a/docs/design.md b/docs/design.md index b282c289..54490ca4 100644 --- a/docs/design.md +++ b/docs/design.md @@ -371,14 +371,6 @@ the database to a savepoint the root of the b-tree is restored, and the snapshot is diff'ed against the currently allocated pages to determine which have been allocated since the savepoint was created. These pages are then queued to be freed, completing the rollback. -# File backends - -redb has two "backends" for interacting with the file on disk. The first, the "mmap backend", is -used via the `Builder::create_mmapped()` API. This is an unsafe API because of the semantics of `mmap()`. However, -it can be faster for some use cases and it allows the OS to manage all caching of redb data. 
-The second backend, the "userspace backend", is entirely safe and has a user configurable cache -which it uses to cache ranges of the underlying `File`. This backend is used via the `Builder::create()` API. - # Assumptions about underlying media redb is designed to be safe even in the event of power failure or on poorly behaved media. Therefore, we make only a few assumptions about the guarantees provided by the underlying filesystem: diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs index 19b89c45..ee19aeef 100644 --- a/fuzz/fuzz_targets/common.rs +++ b/fuzz/fuzz_targets/common.rs @@ -131,7 +131,6 @@ pub(crate) struct FuzzTransaction { pub(crate) struct FuzzConfig { pub use_checksums: bool, pub multimap_table: bool, - pub use_mmap: bool, pub read_cache_size: BoundedUSize, pub write_cache_size: BoundedUSize, pub thread0_transactions: Vec, diff --git a/fuzz/fuzz_targets/fuzz_redb.rs b/fuzz/fuzz_targets/fuzz_redb.rs index b1b5e5f6..5a7cefb1 100644 --- a/fuzz/fuzz_targets/fuzz_redb.rs +++ b/fuzz/fuzz_targets/fuzz_redb.rs @@ -284,21 +284,12 @@ fuzz_target!(|config: FuzzConfig| { } else { WriteStrategy::TwoPhase }; - let db = if config.use_mmap { - unsafe { - Database::builder() - .set_write_strategy(write_strategy) - .set_page_size(config.page_size.value) - .create_mmapped(redb_file.path()) - } - } else { - Database::builder() + let db = Database::builder() .set_write_strategy(write_strategy) .set_page_size(config.page_size.value) .set_read_cache_size(config.read_cache_size.value) .set_write_cache_size(config.write_cache_size.value) - .create(redb_file.path()) - }; + .create(redb_file.path()); let db = Arc::new(db.unwrap()); diff --git a/src/db.rs b/src/db.rs index a4a56a13..c7aa1797 100644 --- a/src/db.rs +++ b/src/db.rs @@ -247,7 +247,6 @@ impl Database { #[allow(clippy::too_many_arguments)] fn new( file: File, - use_mmap: bool, page_size: usize, region_size: Option, initial_size: Option, @@ -261,7 +260,6 @@ impl Database { info!("Opening database {:?}", &file_path); let mut mem = TransactionalMemory::new( file, - use_mmap, page_size, region_size, initial_size, @@ -544,16 +542,12 @@ impl Builder { } /// Set the amount of memory (in bytes) used for caching data that has been read - /// - /// This setting is ignored when calling `create_mmapped()`/`open_mmapped()` pub fn set_read_cache_size(&mut self, bytes: usize) -> &mut Self { self.read_cache_size_bytes = bytes; self } /// Set the amount of memory (in bytes) used for caching data that has been written - /// - /// This setting is ignored when calling `create_mmapped()`/`open_mmapped()` pub fn set_write_cache_size(&mut self, bytes: usize) -> &mut Self { self.write_cache_size_bytes = bytes; self @@ -589,38 +583,6 @@ impl Builder { Database::new( file, - false, - self.page_size, - self.region_size, - self.initial_size, - self.read_cache_size_bytes, - self.write_cache_size_bytes, - self.write_strategy, - ) - } - - /// Opens the specified file as a redb database using the mmap backend. - /// * if the file does not exist, or is an empty file, a new database will be initialized in it - /// * if the file is a valid redb database, it will be opened - /// * otherwise this function will return an error - /// - /// # Safety - /// - /// Caller must ensure that the memory representing the memory-mapped file is not modified externally. - /// In particular: - /// 1) the file referenced by `path` must not be concurrently modified by any other process - /// 2) an I/O failure writing back to disk must not mutate the the memory. 
You should consider - /// reading this paper before assuming that your OS provides this gaurantee: - pub unsafe fn create_mmapped(&self, path: impl AsRef) -> Result { - let file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .open(path)?; - - Database::new( - file, - true, self.page_size, self.region_size, self.initial_size, @@ -638,36 +600,6 @@ impl Builder { let file = OpenOptions::new().read(true).write(true).open(path)?; Database::new( file, - false, - self.page_size, - None, - self.initial_size, - self.read_cache_size_bytes, - self.write_cache_size_bytes, - None, - ) - } else { - Err(Error::Io(io::Error::from(ErrorKind::InvalidData))) - } - } - - /// Opens an existing redb database using the mmap backend. - /// - /// # Safety - /// - /// Caller must ensure that the memory representing the memory-mapped file is not modified externally. - /// In particular: - /// 1) the file referenced by `path` must not be concurrently modified by any other process - /// 2) an I/O failure writing back to disk must not mutate the the memory. You should consider - /// reading this paper before assuming that your OS provides this gaurantee: - pub unsafe fn open_mmapped(&self, path: impl AsRef) -> Result { - if !path.as_ref().exists() { - Err(Error::Io(ErrorKind::NotFound.into())) - } else if File::open(path.as_ref())?.metadata()?.len() > 0 { - let file = OpenOptions::new().read(true).write(true).open(path)?; - Database::new( - file, - true, self.page_size, None, self.initial_size, diff --git a/src/tree_store/page_store/base.rs b/src/tree_store/page_store/base.rs index 5b8a2eef..730d7b4a 100644 --- a/src/tree_store/page_store/base.rs +++ b/src/tree_store/page_store/base.rs @@ -105,15 +105,13 @@ pub(crate) trait Page { // TODO: remove this in favor of multiple Page implementations #[derive(Clone)] -pub(super) enum PageHack<'a> { - Ref(&'a [u8]), +pub(super) enum PageHack { ArcMem(Arc>), } -impl<'a> AsRef<[u8]> for PageHack<'a> { +impl AsRef<[u8]> for PageHack { fn as_ref(&self) -> &[u8] { match self { - PageHack::Ref(x) => x, PageHack::ArcMem(x) => x, } } @@ -121,14 +119,12 @@ impl<'a> AsRef<[u8]> for PageHack<'a> { // TODO: remove this in favor of multiple Page implementations pub(super) enum PageHackMut<'a> { - Ref(&'a mut [u8]), Writable(WritablePage<'a>), } impl<'a> AsRef<[u8]> for PageHackMut<'a> { fn as_ref(&self) -> &[u8] { match self { - PageHackMut::Ref(x) => x, PageHackMut::Writable(x) => x.mem(), } } @@ -137,14 +133,13 @@ impl<'a> AsRef<[u8]> for PageHackMut<'a> { impl<'a> AsMut<[u8]> for PageHackMut<'a> { fn as_mut(&mut self) -> &mut [u8] { match self { - PageHackMut::Ref(x) => x, PageHackMut::Writable(x) => x.mem_mut(), } } } pub struct PageImpl<'a> { - pub(super) mem: PageHack<'a>, + pub(super) mem: PageHack, pub(super) page_number: PageNumber, #[cfg(debug_assertions)] pub(super) open_pages: &'a Mutex>, diff --git a/src/tree_store/page_store/header.rs b/src/tree_store/page_store/header.rs index 1144769e..63131492 100644 --- a/src/tree_store/page_store/header.rs +++ b/src/tree_store/page_store/header.rs @@ -409,7 +409,6 @@ mod test { assert!(TransactionalMemory::new( file, - false, PAGE_SIZE, None, None, @@ -490,7 +489,6 @@ mod test { assert!(TransactionalMemory::new( file, - false, PAGE_SIZE, None, None, @@ -546,7 +544,6 @@ mod test { assert!(TransactionalMemory::new( file, - false, PAGE_SIZE, None, None, @@ -615,7 +612,6 @@ mod test { assert!(TransactionalMemory::new( file, - false, PAGE_SIZE, None, None, diff --git a/src/tree_store/page_store/mmap.rs 
b/src/tree_store/page_store/mmap.rs deleted file mode 100644 index 44c4babc..00000000 --- a/src/tree_store/page_store/mmap.rs +++ /dev/null @@ -1,230 +0,0 @@ -use crate::tree_store::page_store::file_lock::LockedFile; -use crate::{Error, Result}; -use std::fs::File; -use std::io; -use std::io::ErrorKind; -use std::slice; -use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU64, AtomicUsize, Ordering}; -use std::sync::Mutex; - -#[cfg(unix)] -mod unix; -#[cfg(unix)] -use unix::*; - -#[cfg(windows)] -mod windows; -use crate::transaction_tracker::TransactionId; -use crate::tree_store::page_store::base::{PageHack, PageHackMut, PageHint, PhysicalStorage}; -#[cfg(windows)] -use windows::*; - -// TODO: add some tests that use the mmap backend -pub(crate) struct Mmap { - file: LockedFile, - old_mmaps: Mutex>, - mmap: Mutex, - current_ptr: AtomicPtr, - len: AtomicUsize, - // TODO: this is an annoying hack and should be removed - current_transaction_id: AtomicU64, - fsync_failed: AtomicBool, -} - -// mmap() is documented as being multi-thread safe -unsafe impl Send for Mmap {} -unsafe impl Sync for Mmap {} - -impl Mmap { - pub(crate) fn new(file: File) -> Result { - let len = file.metadata()?.len(); - let lock = LockedFile::new(file)?; - - let mmap = MmapInner::create_mapping(lock.file(), len)?; - - let address = mmap.base_addr(); - - // Try to flush any pages in the page cache that are out of sync with disk. - // See here for why: - #[cfg(all(unix, not(target_os = "android")))] - unsafe { - libc::posix_madvise( - address as *mut libc::c_void, - len.try_into().unwrap(), - libc::POSIX_MADV_DONTNEED, - ); - } - - let mapping = Self { - file: lock, - old_mmaps: Mutex::new(vec![]), - mmap: Mutex::new(mmap), - current_ptr: AtomicPtr::new(address), - len: AtomicUsize::new(len.try_into().unwrap()), - current_transaction_id: AtomicU64::new(0), - fsync_failed: AtomicBool::new(false), - }; - - mapping.flush()?; - - Ok(mapping) - } - - #[inline] - fn len(&self) -> usize { - self.len.load(Ordering::Acquire) - } - - #[inline] - fn check_fsync_failure(&self) -> Result<()> { - if self.fsync_failed.load(Ordering::Acquire) { - Err(Error::Io(io::Error::from(ErrorKind::Other))) - } else { - Ok(()) - } - } - - #[inline] - fn set_fsync_failed(&self, failed: bool) { - self.fsync_failed.store(failed, Ordering::Release); - } -} - -impl PhysicalStorage for Mmap { - /// SAFETY: Caller must ensure that the values passed to this method are monotonically increasing - // TODO: Remove this method and replace it with a call that returns an accessor that uses Arc to reference count the mmaps - unsafe fn mark_transaction(&self, id: TransactionId) { - self.current_transaction_id.store(id.0, Ordering::Release); - } - - /// SAFETY: Caller must ensure that all references, from get_memory() or get_memory_mut(), created - /// before the matching (same value) call to mark_transaction() have been dropped - unsafe fn gc(&self, oldest_live_id: TransactionId) -> Result { - let mut mmaps = self.old_mmaps.lock().unwrap(); - for (id, mmap) in mmaps.iter() { - // Flush all old mmaps before we drop them - if *id < oldest_live_id { - mmap.flush()?; - } - } - mmaps.retain(|(id, _)| *id >= oldest_live_id); - - Ok(()) - } - - /// SAFETY: if `new_len < len()`, caller must ensure that no references to - /// memory in `new_len..len()` exist - unsafe fn resize(&self, new_len: u64) -> Result<()> { - let new_len: usize = new_len.try_into().unwrap(); - self.check_fsync_failure()?; - - let mut mmap = self.mmap.lock().unwrap(); - self.file.file().set_len(new_len 
as u64)?; - if mmap.can_resize(new_len as u64) { - mmap.resize(new_len as u64)?; - } else { - let transaction_id = TransactionId(self.current_transaction_id.load(Ordering::Acquire)); - let new_mmap = MmapInner::create_mapping(self.file.file(), new_len as u64)?; - let old_mmap = std::mem::replace(&mut *mmap, new_mmap); - self.old_mmaps - .lock() - .unwrap() - .push((transaction_id, old_mmap)); - self.current_ptr.store(mmap.base_addr(), Ordering::Release); - } - - self.len.store(new_len, Ordering::Release); - - Ok(()) - } - - #[inline] - fn flush(&self) -> Result<()> { - self.check_fsync_failure()?; - - let res = self.mmap.lock().unwrap().flush(); - if res.is_err() { - self.set_fsync_failed(true); - #[cfg(all(unix, not(target_os = "android")))] - { - // Acquire lock on mmap to ensure that a resize doesn't occur - let lock = self.mmap.lock().unwrap(); - let ptr = self.current_ptr.load(Ordering::Acquire); - // Try to flush any pages in the page cache that are out of sync with disk. - // See here for why: - unsafe { - libc::posix_madvise( - ptr as *mut libc::c_void, - self.len(), - libc::POSIX_MADV_DONTNEED, - ); - } - drop(lock); - } - } - - res - } - - #[inline] - fn eventual_flush(&self) -> Result { - self.check_fsync_failure()?; - let res = self.mmap.lock().unwrap().eventual_flush(); - if res.is_err() { - self.set_fsync_failed(true); - #[cfg(all(unix, not(target_os = "android")))] - { - // Acquire lock on mmap to ensure that a resize doesn't occur - let lock = self.mmap.lock().unwrap(); - let ptr = self.current_ptr.load(Ordering::Acquire); - // Try to flush any pages in the page cache that are out of sync with disk. - // See here for why: - unsafe { - libc::posix_madvise( - ptr as *mut libc::c_void, - self.len(), - libc::POSIX_MADV_DONTNEED, - ); - } - drop(lock); - } - } - - res - } - - fn write_barrier(&self) -> Result { - // no-op - Ok(()) - } - - unsafe fn read(&self, offset: u64, len: usize, _hint: PageHint) -> Result { - let offset: usize = offset.try_into().unwrap(); - assert!(offset + len <= self.len()); - self.check_fsync_failure()?; - let ptr = self.current_ptr.load(Ordering::Acquire).add(offset); - Ok(PageHack::Ref(slice::from_raw_parts(ptr, len))) - } - - #[allow(clippy::mut_from_ref)] - unsafe fn write(&self, offset: u64, len: usize) -> Result { - let offset: usize = offset.try_into().unwrap(); - assert!(offset + len <= self.len()); - self.check_fsync_failure()?; - let ptr = self.current_ptr.load(Ordering::Acquire).add(offset); - Ok(PageHackMut::Ref(slice::from_raw_parts_mut(ptr, len))) - } - - fn read_direct(&self, offset: u64, len: usize) -> Result> { - self.check_fsync_failure()?; - self.file.read(offset, len) - } - - fn cancel_pending_write(&self, _offset: u64, _len: usize) { - // no-op - } - - fn invalidate_cache(&self, _offset: u64, _len: usize) { - // no-op - } -} diff --git a/src/tree_store/page_store/mmap/unix.rs b/src/tree_store/page_store/mmap/unix.rs deleted file mode 100644 index 6a7b752f..00000000 --- a/src/tree_store/page_store/mmap/unix.rs +++ /dev/null @@ -1,136 +0,0 @@ -use super::*; -use std::os::unix::io::{AsRawFd, RawFd}; -use std::ptr; - -pub(super) struct MmapInner { - mmap: *mut u8, - capacity: usize, - fd: RawFd, -} - -impl MmapInner { - pub(super) fn create_mapping(file: &File, len: u64) -> Result { - // Use len * 2, so that there is some room for growth without having to create a new mmap and GC it - let capacity: usize = (len * 2).try_into().unwrap(); - let mmap = unsafe { - libc::mmap( - ptr::null_mut(), - capacity as libc::size_t, - libc::PROT_READ 
| libc::PROT_WRITE, - libc::MAP_SHARED, - file.as_raw_fd(), - 0, - ) - }; - if mmap == libc::MAP_FAILED { - Err(io::Error::last_os_error().into()) - } else { - // Accesses will primarily be jumping between nodes in b-trees, so will be to random pages - // Benchmarks show ~2x better performance when the database no longer fits in memory - let result = - unsafe { libc::madvise(mmap, capacity as libc::size_t, libc::MADV_RANDOM) }; - if result != 0 { - Err(io::Error::last_os_error().into()) - } else { - Ok(Self { - mmap: mmap as *mut u8, - capacity, - fd: file.as_raw_fd(), - }) - } - } - } - - pub(super) fn can_resize(&self, new_len: u64) -> bool { - new_len <= self.capacity as u64 - } - - pub(super) fn base_addr(&self) -> *mut u8 { - self.mmap - } - - /// Safety: if new_len < len(), caller must ensure that no references to memory in new_len..len() exist - #[inline] - pub(super) unsafe fn resize(&self, new_len: u64) -> Result<()> { - assert!(new_len <= self.capacity as u64); - - let mmap = libc::mmap( - self.mmap as *mut libc::c_void, - self.capacity as libc::size_t, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_SHARED | libc::MAP_FIXED, - self.fd, - 0, - ); - - if mmap == libc::MAP_FAILED { - Err(io::Error::last_os_error().into()) - } else { - assert_eq!(mmap as *mut u8, self.mmap); - let result = libc::madvise(mmap, self.capacity as libc::size_t, libc::MADV_RANDOM); - if result != 0 { - Err(io::Error::last_os_error().into()) - } else { - Ok(()) - } - } - } - - #[inline] - pub(super) fn flush(&self) -> Result { - // Disable fsync when fuzzing, since it doesn't test crash consistency - #[cfg(not(fuzzing))] - { - #[cfg(not(target_os = "macos"))] - { - let result = unsafe { - libc::msync( - self.mmap as *mut libc::c_void, - self.capacity as libc::size_t, - libc::MS_SYNC, - ) - }; - if result != 0 { - return Err(io::Error::last_os_error().into()); - } - } - #[cfg(target_os = "macos")] - { - let code = unsafe { libc::fcntl(self.fd, libc::F_FULLFSYNC) }; - if code == -1 { - return Err(io::Error::last_os_error().into()); - } - } - } - Ok(()) - } - - #[inline] - pub(super) fn eventual_flush(&self) -> Result { - #[cfg(not(target_os = "macos"))] - { - self.flush()?; - } - #[cfg(all(target_os = "macos", not(fuzzing)))] - { - // TODO: It may be unsafe to mix F_BARRIERFSYNC with writes to the mmap. 
- // Investigate switching to `write()` - let code = unsafe { libc::fcntl(self.fd, libc::F_BARRIERFSYNC) }; - if code == -1 { - return Err(io::Error::last_os_error().into()); - } - } - Ok(()) - } -} - -impl Drop for MmapInner { - fn drop(&mut self) { - unsafe { - libc::munmap( - self.mmap as *mut libc::c_void, - self.capacity as libc::size_t, - ); - } - } -} diff --git a/src/tree_store/page_store/mmap/windows.rs b/src/tree_store/page_store/mmap/windows.rs deleted file mode 100644 index f9f62a47..00000000 --- a/src/tree_store/page_store/mmap/windows.rs +++ /dev/null @@ -1,189 +0,0 @@ -#![allow(clippy::upper_case_acronyms)] - -use super::*; -use std::ffi::c_void; -use std::os::windows::io::AsRawHandle; -use std::os::windows::io::RawHandle; -use std::ptr; - -const PAGE_READWRITE: u32 = 0x4; - -const STANDARD_RIGHTS_REQUIRED: u32 = 0x000f0000; - -const SECTION_QUERY: u32 = 0x0001; -const SECTION_MAP_WRITE: u32 = 0x0002; -const SECTION_MAP_READ: u32 = 0x0004; -const SECTION_MAP_EXECUTE: u32 = 0x0008; -const SECTION_EXTEND_SIZE: u32 = 0x0010; -const SECTION_ALL_ACCESS: u32 = STANDARD_RIGHTS_REQUIRED - | SECTION_QUERY - | SECTION_MAP_WRITE - | SECTION_MAP_READ - | SECTION_MAP_EXECUTE - | SECTION_EXTEND_SIZE; - -const FILE_MAP_ALL_ACCESS: u32 = SECTION_ALL_ACCESS; - -#[repr(C)] -struct SECURITY_ATTRIBUTES { - length: u32, - descriptor: *mut c_void, - inherit: u32, -} - -extern "system" { - /// - fn CreateFileMappingW( - file: RawHandle, - attributes: *mut SECURITY_ATTRIBUTES, - protect: u32, - max_size_high: u32, - max_size_low: u32, - name: *const u16, - ) -> RawHandle; - - /// - fn MapViewOfFileEx( - file_mapping: RawHandle, - desired_access: u32, - offset_high: u32, - offset_low: u32, - bytes_to_map: usize, - base_address: *mut u8, - ) -> *mut u8; - - /// - fn FlushFileBuffers(file: RawHandle) -> u32; - - /// - fn FlushViewOfFile(base_address: *const u8, number_of_bytes_to_flush: usize) -> u32; - - /// - fn UnmapViewOfFile(base_address: *const u8) -> u32; - - /// - fn CloseHandle(handle: RawHandle) -> u32; -} - -struct AutoHandle { - inner: RawHandle, -} - -impl Drop for AutoHandle { - fn drop(&mut self) { - unsafe { - CloseHandle(self.inner); - } - } -} - -pub(super) struct MmapInner { - mmap: *mut u8, - len: usize, - handle: RawHandle, -} - -impl MmapInner { - pub(super) fn create_mapping(file: &File, len: u64) -> Result { - // `CreateFileMappingW` documents: - // - // https://docs.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-createfilemappingw - // > An attempt to map a file with a length of 0 (zero) fails with an error code - // > of ERROR_FILE_INVALID. Applications should test for files with a length of 0 - // > (zero) and reject those files. - assert!(len > 0); - - let mmap = unsafe { Self::map_file(file, len)? 
}; - - Ok(Self { - mmap, - len: len.try_into().unwrap(), - handle: file.as_raw_handle(), - }) - } - - pub(super) fn can_resize(&self, _new_len: u64) -> bool { - false - } - - pub(super) fn base_addr(&self) -> *mut u8 { - self.mmap - } - - unsafe fn map_file(file: &File, len: u64) -> Result<*mut u8> { - let handle = file.as_raw_handle(); - - #[allow(clippy::cast_possible_truncation)] - let lo = (len & u32::MAX as u64) as u32; - let hi = (len >> 32) as u32; - - let ptr = { - let mapping = AutoHandle { - inner: CreateFileMappingW( - handle, - ptr::null_mut(), - PAGE_READWRITE, - hi, - lo, - ptr::null(), - ), - }; - if mapping.inner.is_null() { - return Err(Error::Io(io::Error::last_os_error())); - } - - MapViewOfFileEx( - mapping.inner, - FILE_MAP_ALL_ACCESS, - 0, - 0, - len.try_into().unwrap(), - ptr::null_mut(), - ) - }; - - Ok(ptr) - } - - pub(super) unsafe fn resize(&self, _len: u64) -> Result<()> { - unimplemented!() - } - - pub(super) fn flush(&self) -> Result { - self.eventual_flush()?; - - #[cfg(not(fuzzing))] - { - if unsafe { FlushFileBuffers(self.handle) } == 0 { - return Err(Error::Io(io::Error::last_os_error())); - } - } - Ok(()) - } - - #[inline] - pub(super) fn eventual_flush(&self) -> Result { - #[cfg(not(fuzzing))] - { - let result = unsafe { FlushViewOfFile(self.mmap, self.len) }; - if result != 0 { - Ok(()) - } else { - Err(Error::Io(io::Error::last_os_error())) - } - } - - #[cfg(fuzzing)] - { - Ok(()) - } - } -} - -impl Drop for MmapInner { - fn drop(&mut self) { - unsafe { - UnmapViewOfFile(self.mmap); - } - } -} diff --git a/src/tree_store/page_store/mod.rs b/src/tree_store/page_store/mod.rs index 5e3f4fb8..cca3304e 100644 --- a/src/tree_store/page_store/mod.rs +++ b/src/tree_store/page_store/mod.rs @@ -5,7 +5,6 @@ mod cached_file; mod file_lock; mod header; mod layout; -mod mmap; mod page_manager; mod region; mod savepoint; diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs index a4bb4570..346bc6dc 100644 --- a/src/tree_store/page_store/page_manager.rs +++ b/src/tree_store/page_store/page_manager.rs @@ -7,7 +7,6 @@ use crate::tree_store::page_store::buddy_allocator::BuddyAllocator; use crate::tree_store::page_store::cached_file::PagedCachedFile; use crate::tree_store::page_store::header::{DatabaseHeader, DB_HEADER_SIZE, MAGICNUMBER}; use crate::tree_store::page_store::layout::DatabaseLayout; -use crate::tree_store::page_store::mmap::Mmap; use crate::tree_store::page_store::region::{RegionHeaderAccessor, RegionHeaderMutator}; use crate::tree_store::page_store::utils::is_page_aligned; use crate::tree_store::page_store::{hash128_with_seed, PageImpl, PageMut}; @@ -21,8 +20,6 @@ use std::collections::HashMap; use std::collections::HashSet; use std::convert::TryInto; use std::fs::File; -#[cfg(unix)] -use std::io; use std::mem::size_of; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; @@ -445,15 +442,12 @@ pub(crate) struct TransactionalMemory { region_header_with_padding_size: u64, #[allow(dead_code)] pages_are_os_page_aligned: bool, - #[allow(dead_code)] - use_mmap: bool, } impl TransactionalMemory { #[allow(clippy::too_many_arguments)] pub(crate) fn new( file: File, - use_mmap: bool, page_size: usize, requested_region_size: Option, initial_size: Option, @@ -498,16 +492,12 @@ impl TransactionalMemory { } } - let mut storage: Box = if use_mmap { - Box::new(Mmap::new(file)?) 
- } else { - Box::new(PagedCachedFile::new( - file.try_clone().unwrap(), - page_size as u64, - read_cache_size_bytes, - write_cache_size_bytes, - )?) - }; + let mut storage: Box = Box::new(PagedCachedFile::new( + file.try_clone().unwrap(), + page_size as u64, + read_cache_size_bytes, + write_cache_size_bytes, + )?); let magic_number: [u8; MAGICNUMBER.len()] = storage .read_direct(0, MAGICNUMBER.len())? @@ -641,7 +631,6 @@ impl TransactionalMemory { region_size, region_header_with_padding_size: region_header_size, pages_are_os_page_aligned: is_page_aligned(page_size), - use_mmap, }) } @@ -1344,25 +1333,6 @@ impl TransactionalMemory { let mut mem = unsafe { self.storage.write(address_range.start, len)? }; debug_assert!(mem.as_ref().len() >= allocation_size); - // TODO: move this into the mmap implementation - #[cfg(unix)] - if self.use_mmap { - let len = mem.as_ref().len(); - // If this is a large page, hint that it should be paged in - if self.pages_are_os_page_aligned && len > self.get_page_size() { - let result = unsafe { - libc::madvise( - mem.as_mut().as_mut_ptr() as *mut libc::c_void, - len as libc::size_t, - libc::MADV_WILLNEED, - ) - }; - if result != 0 { - return Err(io::Error::last_os_error().into()); - } - } - } - #[cfg(debug_assertions)] { // Poison the memory in debug mode to help detect uninitialized reads From 089164d02738152c2b68f13c0877264ca192dca4 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Fri, 24 Mar 2023 11:13:00 -0700 Subject: [PATCH 2/4] Remove PhysicalStorage trait --- src/transactions.rs | 11 ------ src/tree_store/page_store/base.rs | 44 ----------------------- src/tree_store/page_store/cached_file.rs | 41 +++++++++------------ src/tree_store/page_store/page_manager.rs | 25 +++++-------- 4 files changed, 25 insertions(+), 96 deletions(-) diff --git a/src/transactions.rs b/src/transactions.rs index d40b5c2c..861b4fac 100644 --- a/src/transactions.rs +++ b/src/transactions.rs @@ -120,11 +120,6 @@ impl<'db> WriteTransaction<'db> { info!("Beginning write transaction id={:?}", transaction_id); *live_write_transaction = Some(transaction_id); - // SAFETY: this id came from increment_transaction_id() which generates monotonic ids - unsafe { - db.get_memory().mark_transaction(transaction_id); - } - let root_page = db.get_memory().get_data_root(); let freed_root = db.get_memory().get_freed_root(); let freed_pages = Arc::new(Mutex::new(vec![])); @@ -481,12 +476,6 @@ impl<'db> WriteTransaction<'db> { .oldest_live_read_transaction() .unwrap_or(self.transaction_id); - // SAFETY: durable_commit() is called from commit() which takes ownership of self, - // and oldest_live_read tracks the oldest read transaction that is in progress - unsafe { - self.mem.mmap_gc(oldest_live_read)?; - } - let root = self .table_tree .write() diff --git a/src/tree_store/page_store/base.rs b/src/tree_store/page_store/base.rs index 730d7b4a..19e999fe 100644 --- a/src/tree_store/page_store/base.rs +++ b/src/tree_store/page_store/base.rs @@ -1,7 +1,5 @@ -use crate::transaction_tracker::TransactionId; use crate::tree_store::page_store::cached_file::WritablePage; use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER; -use crate::Result; #[cfg(debug_assertions)] use std::collections::HashMap; #[cfg(debug_assertions)] @@ -236,48 +234,6 @@ pub(crate) enum PageHint { Clean, } -// TODO simplify this trait. 
It leaks a lot of details of the two implementations -pub(super) trait PhysicalStorage: Send + Sync { - /// SAFETY: Caller must ensure that the values passed to this method are monotonically increasing - // TODO: Remove this method and replace it with a call that returns an accessor that uses Arc to reference count the mmaps - unsafe fn mark_transaction(&self, id: TransactionId); - - /// SAFETY: Caller must ensure that all references, from get_memory() or get_memory_mut(), created - /// before the matching (same value) call to mark_transaction() have been dropped - unsafe fn gc(&self, oldest_live_id: TransactionId) -> Result; - - /// SAFETY: if `new_len < len()`, caller must ensure that no references to - /// memory in `new_len..len()` exist - unsafe fn resize(&self, new_len: u64) -> Result; - - fn flush(&self) -> Result; - - fn eventual_flush(&self) -> Result; - - // Make writes visible to readers, but does not guarantee any durability - fn write_barrier(&self) -> Result; - - // Read with caching. Caller must not read overlapping ranges without first calling invalidate_cache(). - // Doing so will not cause UB, but is a logic error. - // - // Safety: caller must ensure that [start, end) does not alias any existing references returned - // from .write() - unsafe fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result; - - // Safety: caller must ensure that [start, end) does not alias any existing references returned - // from .read() or .write() - unsafe fn write(&self, offset: u64, len: usize) -> Result; - - // Read directly from the file, ignoring any cached data - fn read_direct(&self, offset: u64, len: usize) -> Result>; - - // Discard pending writes to the given range - fn cancel_pending_write(&self, offset: u64, len: usize); - - // Invalidate any caching of the given range. 
After this call overlapping reads of the range are allowed - fn invalidate_cache(&self, offset: u64, len: usize); -} - #[cfg(test)] mod test { use crate::tree_store::PageNumber; diff --git a/src/tree_store/page_store/cached_file.rs b/src/tree_store/page_store/cached_file.rs index 4b732fa3..b42de5cb 100644 --- a/src/tree_store/page_store/cached_file.rs +++ b/src/tree_store/page_store/cached_file.rs @@ -1,5 +1,4 @@ -use crate::transaction_tracker::TransactionId; -use crate::tree_store::page_store::base::{PageHack, PageHackMut, PageHint, PhysicalStorage}; +use crate::tree_store::page_store::base::{PageHack, PageHackMut, PageHint}; use crate::tree_store::page_store::file_lock::LockedFile; use crate::{Error, Result}; use std::collections::BTreeMap; @@ -145,20 +144,9 @@ impl PagedCachedFile { Ok(()) } -} - -impl PhysicalStorage for PagedCachedFile { - unsafe fn mark_transaction(&self, _id: TransactionId) { - // no-op - } - - unsafe fn gc(&self, _oldest_live_id: TransactionId) -> Result { - // no-op - Ok(()) - } // Caller should invalidate all cached pages that are no longer valid - unsafe fn resize(&self, len: u64) -> Result { + pub(super) unsafe fn resize(&self, len: u64) -> Result { // TODO: be more fine-grained about this invalidation for slot in 0..self.read_cache.len() { let cache = mem::take(&mut *self.read_cache[slot].write().unwrap()); @@ -171,7 +159,7 @@ impl PhysicalStorage for PagedCachedFile { self.file.file().set_len(len).map_err(Error::from) } - fn flush(&self) -> Result { + pub(super) fn flush(&self) -> Result { self.check_fsync_failure()?; self.flush_write_buffer()?; // Disable fsync when fuzzing, since it doesn't test crash consistency @@ -198,7 +186,7 @@ impl PhysicalStorage for PagedCachedFile { Ok(()) } - fn eventual_flush(&self) -> Result { + pub(super) fn eventual_flush(&self) -> Result { self.check_fsync_failure()?; #[cfg(not(target_os = "macos"))] @@ -219,17 +207,19 @@ impl PhysicalStorage for PagedCachedFile { } // Make writes visible to readers, but does not guarantee any durability - fn write_barrier(&self) -> Result { + pub(super) fn write_barrier(&self) -> Result { self.flush_write_buffer() } - fn read_direct(&self, offset: u64, len: usize) -> Result> { + // Read directly from the file, ignoring any cached data + pub(super) fn read_direct(&self, offset: u64, len: usize) -> Result> { self.check_fsync_failure()?; self.file.read(offset, len) } - // Caller must explicitly invalidate overlapping regions that are read - unsafe fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result { + // Read with caching. Caller must not read overlapping ranges without first calling invalidate_cache(). + // Doing so will not cause UB, but is a logic error. + pub(super) unsafe fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result { self.check_fsync_failure()?; debug_assert_eq!(0, offset % self.page_size); #[cfg(feature = "cache_metrics")] @@ -277,13 +267,16 @@ impl PhysicalStorage for PagedCachedFile { Ok(PageHack::ArcMem(buffer)) } - fn cancel_pending_write(&self, offset: u64, _len: usize) { + // Discard pending writes to the given range + pub(super) fn cancel_pending_write(&self, offset: u64, _len: usize) { assert_eq!(0, offset % self.page_size); self.write_buffer.lock().unwrap().remove(&offset); } - // Invalidating a cached region in subsections is permitted, as long as all subsections are invalidated - fn invalidate_cache(&self, offset: u64, _len: usize) { + // Invalidate any caching of the given range. 
After this call overlapping reads of the range are allowed + // + // NOTE: Invalidating a cached region in subsections is permitted, as long as all subsections are invalidated + pub(super) fn invalidate_cache(&self, offset: u64, _len: usize) { let cache_slot: usize = (offset % self.read_cache.len() as u64).try_into().unwrap(); let mut lock = self.read_cache[cache_slot].write().unwrap(); if let Some(removed) = lock.remove(&offset) { @@ -296,7 +289,7 @@ impl PhysicalStorage for PagedCachedFile { } } - unsafe fn write(&self, offset: u64, len: usize) -> Result { + pub(super) unsafe fn write(&self, offset: u64, len: usize) -> Result { self.check_fsync_failure()?; assert_eq!(0, offset % self.page_size); let mut lock = self.write_buffer.lock().unwrap(); diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs index 346bc6dc..7f31dd5e 100644 --- a/src/tree_store/page_store/page_manager.rs +++ b/src/tree_store/page_store/page_manager.rs @@ -1,7 +1,7 @@ use crate::db::WriteStrategy; use crate::transaction_tracker::TransactionId; use crate::tree_store::btree_base::Checksum; -use crate::tree_store::page_store::base::{PageHint, PhysicalStorage}; +use crate::tree_store::page_store::base::PageHint; use crate::tree_store::page_store::bitmap::{BtreeBitmap, BtreeBitmapMut}; use crate::tree_store::page_store::buddy_allocator::BuddyAllocator; use crate::tree_store::page_store::cached_file::PagedCachedFile; @@ -241,7 +241,7 @@ impl Allocators { } } - fn from_bytes(header: &DatabaseHeader, storage: &dyn PhysicalStorage) -> Result { + fn from_bytes(header: &DatabaseHeader, storage: &PagedCachedFile) -> Result { let page_size = header.page_size(); let region_header_size = header.region_header_pages() * page_size; let region_size = @@ -280,7 +280,7 @@ impl Allocators { &self, region_tracker_page: PageNumber, layout: DatabaseLayout, - storage: &mut Box, + storage: &mut PagedCachedFile, ) -> Result { let page_size = layout.full_region_layout().page_size(); let region_header_size = @@ -397,7 +397,7 @@ struct InMemoryState { } impl InMemoryState { - fn from_bytes(header: DatabaseHeader, file: &dyn PhysicalStorage) -> Result { + fn from_bytes(header: DatabaseHeader, file: &PagedCachedFile) -> Result { let allocators = Allocators::from_bytes(&header, file)?; Ok(Self { header, allocators }) } @@ -421,8 +421,7 @@ pub(crate) struct TransactionalMemory { log_since_commit: Mutex>, // True if the allocator state was corrupted when the file was opened needs_recovery: bool, - // TODO: should be a compile-time type parameter - storage: Box, + storage: PagedCachedFile, state: Mutex, // The current layout for the active transaction. // May include uncommitted changes to the database layout, if it grew or shrank @@ -492,12 +491,12 @@ impl TransactionalMemory { } } - let mut storage: Box = Box::new(PagedCachedFile::new( + let mut storage = PagedCachedFile::new( file.try_clone().unwrap(), page_size as u64, read_cache_size_bytes, write_cache_size_bytes, - )?); + )?; let magic_number: [u8; MAGICNUMBER.len()] = storage .read_direct(0, MAGICNUMBER.len())? 
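
The hunk above is the heart of this patch: with the mmap backend gone, `PagedCachedFile` is the trait's only implementor, so `TransactionalMemory` can hold it by value instead of through `Box<dyn PhysicalStorage>`. Below is a minimal sketch of that devirtualization pattern; `Storage` and `CachedFile` are hypothetical stand-ins for illustration, not redb's internal types.

    // `Storage` and `CachedFile` are hypothetical stand-ins for
    // `PhysicalStorage` and `PagedCachedFile`; illustration only.
    trait Storage {
        fn read_direct(&self, offset: u64, len: usize) -> Vec<u8>;
    }

    struct CachedFile {
        data: Vec<u8>,
    }

    impl Storage for CachedFile {
        fn read_direct(&self, offset: u64, len: usize) -> Vec<u8> {
            let start = offset as usize;
            self.data[start..start + len].to_vec()
        }
    }

    fn main() {
        // Before: one of several implementors, held behind a trait object,
        // so every call goes through a vtable and the value is boxed.
        let boxed: Box<dyn Storage> = Box::new(CachedFile { data: vec![7; 16] });
        assert_eq!(boxed.read_direct(0, 4), vec![7; 4]);

        // After: the sole implementor is held by value; calls are statically
        // dispatched, and the trait itself can be deleted once all callers
        // use the concrete type (as this patch does in the hunks below).
        let concrete = CachedFile { data: vec![7; 16] };
        assert_eq!(concrete.read_direct(0, 4), vec![7; 4]);
    }
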
@@ -608,7 +607,7 @@ impl TransactionalMemory { let region_size = layout.full_region_layout().len(); let region_header_size = layout.full_region_layout().data_section().start; - let state = InMemoryState::from_bytes(header, storage.as_ref())?; + let state = InMemoryState::from_bytes(header, &storage)?; assert!(page_size >= DB_HEADER_SIZE); @@ -1141,14 +1140,6 @@ impl TransactionalMemory { self.allocated_since_commit.lock().unwrap().contains(&page) } - pub(crate) unsafe fn mark_transaction(&self, id: TransactionId) { - self.storage.mark_transaction(id) - } - - pub(crate) unsafe fn mmap_gc(&self, oldest_live_id: TransactionId) -> Result { - self.storage.gc(oldest_live_id) - } - fn allocate_helper( &self, state: &mut InMemoryState, From afa7a50f15248c56a75ef9b7c6994082311f36f0 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Fri, 24 Mar 2023 11:30:59 -0700 Subject: [PATCH 3/4] Remove nearly all unsafe code --- src/multimap_table.rs | 99 +++++++---------------- src/table.rs | 33 ++------ src/transactions.rs | 22 ++--- src/tree_store/btree.rs | 21 ++--- src/tree_store/btree_base.rs | 21 ++--- src/tree_store/btree_iters.rs | 16 +--- src/tree_store/btree_mutator.rs | 34 ++------ src/tree_store/page_store/cached_file.rs | 6 +- src/tree_store/page_store/page_manager.rs | 92 ++++++++------------- src/tree_store/table_tree.rs | 11 +-- 10 files changed, 105 insertions(+), 250 deletions(-) diff --git a/src/multimap_table.rs b/src/multimap_table.rs index 97762d31..199ea1ad 100644 --- a/src/multimap_table.rs +++ b/src/multimap_table.rs @@ -343,11 +343,8 @@ impl<'a, V: RedbKey + 'static> Drop for MultimapValueIter<'a, V> { if !self.free_on_drop.is_empty() { let mut freed_pages = self.freed_pages.as_ref().unwrap().lock().unwrap(); for page in self.free_on_drop.iter() { - unsafe { - // Safety: we have a &mut on the transaction - if !self.mem.unwrap().free_if_uncommitted(*page) { - freed_pages.push(*page); - } + if !self.mem.unwrap().free_if_uncommitted(*page) { + freed_pages.push(*page); } } } @@ -506,10 +503,8 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' drop(builder); drop(guard); let inline_data = DynamicCollection::make_inline_data(&data); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&inline_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&inline_data))?; } else { // convert into a subtree let mut page = self.mem.allocate(leaf_data.len())?; @@ -524,18 +519,13 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' self.mem, self.freed_pages.clone(), ); - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - let existed = unsafe { subtree.insert(value.borrow(), &())?.is_some() }; + let existed = subtree.insert(value.borrow(), &())?.is_some(); assert_eq!(existed, found); let (new_root, new_checksum) = subtree.get_root().unwrap(); let subtree_data = DynamicCollection::make_subtree_data(new_root, new_checksum); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&subtree_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&subtree_data))?; } found @@ -547,16 +537,11 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' self.freed_pages.clone(), ); drop(guard); - // Safety: No other references to this table can exist. 
- // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - let existed = unsafe { subtree.insert(value.borrow(), &())?.is_some() }; + let existed = subtree.insert(value.borrow(), &())?.is_some(); let (new_root, new_checksum) = subtree.get_root().unwrap(); let subtree_data = DynamicCollection::make_subtree_data(new_root, new_checksum); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&subtree_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&subtree_data))?; existed } @@ -576,23 +561,16 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' builder.append(value_bytes_ref, <() as RedbValue>::as_bytes(&()).as_ref()); drop(builder); let inline_data = DynamicCollection::make_inline_data(&data); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&inline_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&inline_data))?; } else { let mut subtree: BtreeMut<'_, V, ()> = BtreeMut::new(None, self.mem, self.freed_pages.clone()); - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - unsafe { subtree.insert(value.borrow(), &())? }; + subtree.insert(value.borrow(), &())?; let (new_root, new_checksum) = subtree.get_root().unwrap(); let subtree_data = DynamicCollection::make_subtree_data(new_root, new_checksum); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&subtree_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&subtree_data))?; } false }; @@ -631,7 +609,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' let old_num_pairs = accessor.num_pairs(); if old_num_pairs == 1 { drop(guard); - unsafe { self.tree.remove(key.borrow())? }; + self.tree.remove(key.borrow())?; } else { let old_pairs_len = accessor.length_of_pairs(0, old_num_pairs); let removed_value_len = accessor.entry(position).unwrap().key().len(); @@ -659,10 +637,8 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' drop(guard); let inline_data = DynamicCollection::make_inline_data(&new_data); - unsafe { - self.tree - .insert(key.borrow(), &DynamicCollection::new(&inline_data))? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&inline_data))?; } true } else { @@ -674,10 +650,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' let mut subtree: BtreeMut = BtreeMut::new(Some(v.as_subtree()), self.mem, self.freed_pages.clone()); drop(guard); - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - let existed = unsafe { subtree.remove(value.borrow())?.is_some() }; + let existed = subtree.remove(value.borrow())?.is_some(); if let Some((new_root, new_checksum)) = subtree.get_root() { let page = self.mem.get_page(new_root)?; @@ -692,41 +665,29 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbKey + 'static> MultimapTable<'db, ' if len < self.mem.get_page_size() / 2 { let inline_data = DynamicCollection::make_inline_data(&page.memory()[..len]); - unsafe { - self.tree.insert( - key.borrow(), - &DynamicCollection::new(&inline_data), - )? 
- }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&inline_data))?; drop(page); - unsafe { - if !self.mem.free_if_uncommitted(new_root) { - (*self.freed_pages).lock().unwrap().push(new_root); - } + if !self.mem.free_if_uncommitted(new_root) { + (*self.freed_pages).lock().unwrap().push(new_root); } } else { let subtree_data = DynamicCollection::make_subtree_data(new_root, new_checksum); - unsafe { - self.tree.insert( - key.borrow(), - &DynamicCollection::new(&subtree_data), - )? - }; + self.tree + .insert(key.borrow(), &DynamicCollection::new(&subtree_data))?; } } BRANCH => { - unsafe { - let subtree_data = - DynamicCollection::make_subtree_data(new_root, new_checksum); - self.tree - .insert(key.borrow(), &DynamicCollection::new(&subtree_data))? - }; + let subtree_data = + DynamicCollection::make_subtree_data(new_root, new_checksum); + self.tree + .insert(key.borrow(), &DynamicCollection::new(&subtree_data))?; } _ => unreachable!(), } } else { - unsafe { self.tree.remove(key.borrow())? }; + self.tree.remove(key.borrow())?; } existed diff --git a/src/table.rs b/src/table.rs index cfd27ab8..d55d1f54 100644 --- a/src/table.rs +++ b/src/table.rs @@ -78,10 +78,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbValue + 'static> Table<'db, 'txn, K // TODO: we should not require Clone here KR: Borrow> + Clone + 'b, { - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - unsafe { self.tree.drain(range).map(Drain::new) } + self.tree.drain(range).map(Drain::new) } /// Applies `predicate` to all key-value pairs in the specified range. All entries for which @@ -96,14 +93,9 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbValue + 'static> Table<'db, 'txn, K // TODO: we should not require Clone here KR: Borrow> + Clone + 'b, { - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - unsafe { - self.tree - .drain_filter(range, predicate) - .map(DrainFilter::new) - } + self.tree + .drain_filter(range, predicate) + .map(DrainFilter::new) } /// Insert mapping of the given key to the given value @@ -118,10 +110,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbValue + 'static> Table<'db, 'txn, K K: 'a, V: 'a, { - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - unsafe { self.tree.insert(key.borrow(), value.borrow()) } + self.tree.insert(key.borrow(), value.borrow()) } /// Reserve space to insert a key-value pair @@ -135,10 +124,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbValue + 'static> Table<'db, 'txn, K where K: 'a, { - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. - unsafe { self.tree.insert_reserve(key.borrow(), value_length) } + self.tree.insert_reserve(key.borrow(), value_length) } /// Removes the given key @@ -151,10 +137,7 @@ impl<'db, 'txn, K: RedbKey + 'static, V: RedbValue + 'static> Table<'db, 'txn, K where K: 'a, { - // Safety: No other references to this table can exist. - // Tables can only be opened mutably in one location (see Error::TableAlreadyOpen), - // and we borrow &mut self. 
- unsafe { self.tree.remove(key.borrow()) } + self.tree.remove(key.borrow()) } } @@ -213,7 +196,7 @@ pub trait ReadableTable { /// # fn main() -> Result<(), Error> { /// # let tmpfile: NamedTempFile = NamedTempFile::new().unwrap(); /// # let filename = tmpfile.path(); - /// let db = unsafe { Database::create(filename)? }; + /// let db = Database::create(filename)?; /// let write_txn = db.begin_write()?; /// { /// let mut table = write_txn.open_table(TABLE)?; diff --git a/src/transactions.rs b/src/transactions.rs index 861b4fac..7d848fcc 100644 --- a/src/transactions.rs +++ b/src/transactions.rs @@ -217,10 +217,7 @@ impl<'db> WriteTransaction<'db> { let mut freed_pages = vec![]; for page in allocated_since_savepoint { if self.mem.uncommitted(page) { - // Safety: The page is uncommitted and we have a &mut on the transaction - unsafe { - self.mem.free(page); - } + self.mem.free(page); } else { freed_pages.push(page); } @@ -260,8 +257,7 @@ impl<'db> WriteTransaction<'db> { to_remove.push(entry.key()); } for key in to_remove { - // Safety: all references to the freed table above have already been dropped. - unsafe { freed_tree.remove(&key)? }; + freed_tree.remove(&key)?; } *self.freed_tree.lock().unwrap() = freed_tree; @@ -532,18 +528,13 @@ impl<'db> WriteTransaction<'db> { // 1..=length because the array is length prefixed for i in 1..=length { let page = PageNumber::from_le_bytes(value[i * 8..(i + 1) * 8].try_into().unwrap()); - // Safety: we free only pages that were marked to be freed before the oldest live transaction, - // therefore no one can have a reference to this page still - unsafe { - self.mem.free(page); - } + self.mem.free(page); } } // Remove all the old transactions for key in to_remove { - // Safety: all references to the freed table above have already been dropped. - unsafe { freed_tree.remove(&key)? }; + freed_tree.remove(&key)?; } Ok(()) @@ -561,10 +552,7 @@ impl<'db> WriteTransaction<'db> { transaction_id: self.transaction_id.0, pagination_id: pagination_counter, }; - // Safety: The freed table is only accessed from the writer, so only this function - // is using it. The only reference retrieved, access_guard, is dropped before the next call - // to this method - let mut access_guard = unsafe { freed_tree.insert_reserve(&key, buffer_size)? 
}; + let mut access_guard = freed_tree.insert_reserve(&key, buffer_size)?; let mut freed_pages = self.freed_pages.lock().unwrap(); let len = freed_pages.len(); diff --git a/src/tree_store/btree.rs b/src/tree_store/btree.rs index 2001f629..411a3b57 100644 --- a/src/tree_store/btree.rs +++ b/src/tree_store/btree.rs @@ -52,8 +52,7 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeMut<'a, K, V> { *(*self.root).lock().unwrap() } - // Safety: caller must ensure that no uncommitted data is accessed within this tree, from other references - pub(crate) unsafe fn insert( + pub(crate) fn insert( &mut self, key: &K::SelfType<'_>, value: &V::SelfType<'_>, @@ -79,9 +78,8 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeMut<'a, K, V> { /// Reserve space to insert a key-value pair /// The returned reference will have length equal to value_length - // Safety: caller must ensure that no uncommitted data is accessed within this tree, from other references // TODO: return type should be V, not [u8] - pub(crate) unsafe fn insert_reserve( + pub(crate) fn insert_reserve( &mut self, key: &K::SelfType<'_>, value_length: usize, @@ -107,11 +105,7 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeMut<'a, K, V> { Ok(guard) } - // Safety: caller must ensure that no uncommitted data is accessed within this tree, from other references - pub(crate) unsafe fn remove( - &mut self, - key: &K::SelfType<'_>, - ) -> Result>> { + pub(crate) fn remove(&mut self, key: &K::SelfType<'_>) -> Result>> { #[cfg(feature = "logging")] trace!("Btree(root={:?}): Deleting {:?}", &self.root, key); let mut root = self.root.lock().unwrap(); @@ -171,8 +165,7 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeMut<'a, K, V> { self.read_tree().range(range) } - // Safety: caller must ensure that no uncommitted data is accessed within this tree, from other references - pub(crate) unsafe fn drain< + pub(crate) fn drain< 'a0, T: RangeBounds + Clone + 'a0, // TODO: we shouldn't require Clone @@ -205,8 +198,7 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeMut<'a, K, V> { Ok(result) } - // Safety: caller must ensure that no uncommitted data is accessed within this tree, from other references - pub(crate) unsafe fn drain_filter< + pub(crate) fn drain_filter< 'a0, T: RangeBounds + Clone + 'a0, // TODO: we shouldn't require Clone @@ -365,8 +357,7 @@ impl<'a, K: RedbKey, V: RedbValue> Btree<'a, K, V> { if let Some(entry_index) = accessor.find_key::(query) { let (start, end) = accessor.value_range(entry_index).unwrap(); // Safety: free_on_drop is false - let guard = - unsafe { AccessGuard::new(page, start, end - start, false, self.mem) }; + let guard = AccessGuard::new(page, start, end - start, false, self.mem); Ok(Some(guard)) } else { Ok(None) diff --git a/src/tree_store/btree_base.rs b/src/tree_store/btree_base.rs index 26c9d311..bc076f56 100644 --- a/src/tree_store/btree_base.rs +++ b/src/tree_store/btree_base.rs @@ -54,8 +54,7 @@ pub(crate) enum FreePolicy { } impl FreePolicy { - // Safety: Caller must ensure there are no references to page, unless self is FreePolicy::Never - pub(crate) unsafe fn conditional_free( + pub(crate) fn conditional_free( &self, page: PageNumber, freed: &mut Vec, @@ -116,9 +115,7 @@ pub struct AccessGuard<'a, V: RedbValue> { } impl<'a, V: RedbValue> AccessGuard<'a, V> { - // Safety: if free_on_drop is true, caller must guarantee that no other references to page exist, - // and that no references will be created until this AccessGuard is dropped - pub(super) unsafe fn new( + pub(super) fn new( page: PageImpl<'a>, 
         offset: usize,
         len: usize,
@@ -163,9 +160,7 @@ impl<'a, V: RedbValue> AccessGuard<'a, V> {
         }
     }
 
-    // Safety: if free_on_drop is true, caller must guarantee that no other references to page exist,
-    // and that no references will be created until this AccessGuard is dropped
-    pub(super) unsafe fn remove_on_drop(
+    pub(super) fn remove_on_drop(
         page: PageMut<'a>,
         offset: usize,
         len: usize,
@@ -200,10 +195,7 @@ impl<'a, V: RedbValue> Drop for AccessGuard<'a, V> {
                 let mut dummy = EitherPage::OwnedMemory(vec![]);
                 mem::swap(&mut self.page, &mut dummy);
                 drop(dummy);
-                // Safety: caller to new() guaranteed that no other references to this page exist
-                unsafe {
-                    self.mem.unwrap().free(page_number);
-                }
+                self.mem.unwrap().free(page_number);
             }
             OnDrop::RemoveEntry {
                 position,
@@ -266,10 +258,9 @@ impl<'a, K: RedbKey, V: RedbValue> AccessGuardMut<'a, K, V> {
             assert_eq!(LEAF, self.page.memory()[0]);
             Ok(self.checksum_helper(&self.page))
         } else {
-            // Safe because we're the only one with mutable access, and this is a dirty page so
-            // no readers can have a reference to it
             assert!(self.mem.uncommitted(page_number));
-            let mut page = unsafe { self.mem.get_page_mut(page_number)? };
+            // This is a dirty page, so it cannot be in the read cache
+            let mut page = self.mem.get_page_mut(page_number)?;
             assert_eq!(BRANCH, page.memory()[0]);
             let accessor = BranchAccessor::new(&page, K::fixed_width());
             let (child_index, child_page) = accessor.child_for_key::<K>(&self.key);
diff --git a/src/tree_store/btree_iters.rs b/src/tree_store/btree_iters.rs
index 40c84b51..3e87ce06 100644
--- a/src/tree_store/btree_iters.rs
+++ b/src/tree_store/btree_iters.rs
@@ -249,9 +249,7 @@ pub(crate) struct BtreeDrain<'a, K: RedbKey + 'a, V: RedbValue + 'a> {
 }
 
 impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> BtreeDrain<'a, K, V> {
-    // Safety: caller must ensure that there are no references to free_on_drop pages, other than those
-    // within `inner`
-    pub(crate) unsafe fn new(
+    pub(crate) fn new(
         inner: BtreeRangeIter<'a, K, V>,
         free_on_drop: Vec<PageNumber>,
         master_free_list: Arc<Mutex<Vec<PageNumber>>>,
@@ -290,9 +288,7 @@ impl<'a, K: RedbKey + 'a, V: RedbValue + 'a> Drop for BtreeDrain<'a, K, V> {
 
         let mut master_free_list = self.master_free_list.lock().unwrap();
         for page in self.free_on_drop.drain(..) {
-            // Safety: Caller guaranteed that there are no other references to these pages,
-            // and we just consumed all of ours in the loop above.
-            if unsafe { !self.mem.free_if_uncommitted(page) } {
+            if !self.mem.free_if_uncommitted(page) {
                 master_free_list.push(page);
             }
         }
@@ -319,9 +315,7 @@ impl<
         F: for<'f> FnMut(K::SelfType<'f>, V::SelfType<'f>) -> bool,
     > BtreeDrainFilter<'a, K, V, F>
 {
-    // Safety: caller must ensure that there are no references to free_on_drop pages, other than those
-    // within `inner`
-    pub(crate) unsafe fn new(
+    pub(crate) fn new(
         inner: BtreeRangeIter<'a, K, V>,
         predicate: F,
         free_on_drop: Vec<PageNumber>,
@@ -394,9 +388,7 @@ impl<
 
         let mut master_free_list = self.master_free_list.lock().unwrap();
         for page in self.free_on_drop.drain(..) {
-            // Safety: Caller guaranteed that there are no other references to these pages,
-            // and we just consumed all of ours in the loop above.
-            if unsafe { !self.mem.free_if_uncommitted(page) } {
+            if !self.mem.free_if_uncommitted(page) {
                 master_free_list.push(page);
             }
         }
diff --git a/src/tree_store/btree_mutator.rs b/src/tree_store/btree_mutator.rs
index db99c4ba..d33369e4 100644
--- a/src/tree_store/btree_mutator.rs
+++ b/src/tree_store/btree_mutator.rs
@@ -65,20 +65,16 @@ impl<'a, 'b, K: RedbKey, V: RedbValue> MutateHelper<'a, 'b, K, V> {
         }
     }
 
+    // TODO: can we remove this method now that delete is safe?
     pub(crate) fn safe_delete(
         &mut self,
         key: &K::SelfType<'_>,
     ) -> Result<Option<AccessGuard<'a, V>>> {
         assert_eq!(self.free_policy, FreePolicy::Never);
-        // Safety: we asserted that the free policy is Never
-        unsafe { self.delete(key) }
+        self.delete(key)
     }
 
-    // Safety: caller must ensure that no references to uncommitted pages in this table exist
-    pub(crate) unsafe fn delete(
-        &mut self,
-        key: &K::SelfType<'_>,
-    ) -> Result<Option<AccessGuard<'a, V>>> {
+    pub(crate) fn delete(&mut self, key: &K::SelfType<'_>) -> Result<Option<AccessGuard<'a, V>>> {
         if let Some((p, checksum)) = *self.root {
             let (deletion_result, found) =
                 self.delete_helper(self.mem.get_page(p)?, checksum, K::as_bytes(key).as_ref())?;
@@ -109,9 +105,8 @@
         }
     }
 
-    // Safety: caller must ensure that no references to uncommitted pages in this tree exist
     #[allow(clippy::type_complexity)]
-    pub(crate) unsafe fn insert(
+    pub(crate) fn insert(
         &mut self,
         key: &K::SelfType<'_>,
         value: &V::SelfType<'_>,
@@ -156,8 +151,7 @@ impl<'a, 'b, K: RedbKey, V: RedbValue> MutateHelper<'a, 'b, K, V> {
         Ok((old_value, guard))
     }
 
-    // Safety: caller must ensure that no references to uncommitted pages in this table exist
-    unsafe fn insert_helper(
+    fn insert_helper(
         &mut self,
         page: PageImpl<'a>,
         page_checksum: Checksum,
@@ -390,8 +384,6 @@
         } else if self.mem.uncommitted(page.get_page_number()) {
             let page_number = page.get_page_number();
             drop(page);
-            // Safety: Since the page is uncommitted, no other transactions could have it open
-            // and we just dropped our reference to it, on the line above
             let mut mutpage = self.mem.get_page_mut(page_number)?;
             let mut mutator = BranchMutator::new(&mut mutpage);
             mutator.write_child_page(
@@ -474,8 +466,6 @@
             // Free the original page, since we've replaced it
             let page_number = page.get_page_number();
             drop(page);
-            // Safety: If the page is uncommitted, no other transactions can have references to it,
-            // and we just dropped ours on the line above
             self.free_policy
                 .conditional_free(page_number, self.freed, self.mem);
@@ -485,8 +475,7 @@
         })
     }
 
-    // Safety: caller must ensure that no references to uncommitted pages in this table exist
-    unsafe fn delete_leaf_helper(
+    fn delete_leaf_helper(
         &mut self,
         page: PageImpl<'a>,
         checksum: Checksum,
@@ -511,8 +500,6 @@
                 let (start, end) = accessor.value_range(position).unwrap();
                 let page_number = page.get_page_number();
                 drop(page);
-                // Safety: caller guaranteed that no other references to uncommitted data exist,
-                // and we just dropped the reference to page
                 let page_mut = self.mem.get_page_mut(page_number)?;
 
                 // TODO: optimize this!
@@ -616,8 +603,7 @@ impl<'a, 'b, K: RedbKey, V: RedbValue> MutateHelper<'a, 'b, K, V> { } } - // Safety: caller must ensure that no references to uncommitted pages in this table exist - unsafe fn delete_branch_helper( + fn delete_branch_helper( &mut self, page: PageImpl<'a>, checksum: Checksum, @@ -635,8 +621,6 @@ impl<'a, 'b, K: RedbKey, V: RedbValue> MutateHelper<'a, 'b, K, V> { if let Subtree(new_child, new_child_checksum) = result { let (result_page, result_checksum) = if self.mem.uncommitted(original_page_number) { drop(page); - // Safety: Caller guarantees there are no references to uncommitted pages, - // and we just dropped our reference to it on the line above let mut mutpage = self.mem.get_page_mut(original_page_number)?; let mut mutator = BranchMutator::new(&mut mutpage); mutator.write_child_page(child_index, new_child, new_child_checksum); @@ -952,9 +936,7 @@ impl<'a, 'b, K: RedbKey, V: RedbValue> MutateHelper<'a, 'b, K, V> { // Returns the page number of the sub-tree with this key deleted, or None if the sub-tree is empty. // If key is not found, guaranteed not to modify the tree - // - // Safety: caller must ensure that no references to uncommitted pages in this table exist - unsafe fn delete_helper( + fn delete_helper( &mut self, page: PageImpl<'a>, checksum: Checksum, diff --git a/src/tree_store/page_store/cached_file.rs b/src/tree_store/page_store/cached_file.rs index b42de5cb..6fd139f9 100644 --- a/src/tree_store/page_store/cached_file.rs +++ b/src/tree_store/page_store/cached_file.rs @@ -146,7 +146,7 @@ impl PagedCachedFile { } // Caller should invalidate all cached pages that are no longer valid - pub(super) unsafe fn resize(&self, len: u64) -> Result { + pub(super) fn resize(&self, len: u64) -> Result { // TODO: be more fine-grained about this invalidation for slot in 0..self.read_cache.len() { let cache = mem::take(&mut *self.read_cache[slot].write().unwrap()); @@ -219,7 +219,7 @@ impl PagedCachedFile { // Read with caching. Caller must not read overlapping ranges without first calling invalidate_cache(). // Doing so will not cause UB, but is a logic error. 
-    pub(super) unsafe fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result<PageHack> {
+    pub(super) fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result<PageHack> {
         self.check_fsync_failure()?;
         debug_assert_eq!(0, offset % self.page_size);
         #[cfg(feature = "cache_metrics")]
@@ -289,7 +289,7 @@ impl PagedCachedFile {
         }
     }
 
-    pub(super) unsafe fn write(&self, offset: u64, len: usize) -> Result<PageHackMut> {
+    pub(super) fn write(&self, offset: u64, len: usize) -> Result<PageHackMut> {
         self.check_fsync_failure()?;
         assert_eq!(0, offset % self.page_size);
         let mut lock = self.write_buffer.lock().unwrap();
diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs
index 7f31dd5e..ac8e7d0f 100644
--- a/src/tree_store/page_store/page_manager.rs
+++ b/src/tree_store/page_store/page_manager.rs
@@ -287,8 +287,7 @@ impl Allocators {
             (layout.full_region_layout().get_header_pages() * page_size) as u64;
         let region_size = layout.full_region_layout().num_pages() as u64 * page_size as u64
             + region_header_size;
-        // Safety: we have a mutable reference to the Mmap, so no one else can have a reference this memory
-        let mut region_tracker_bytes = unsafe {
+        let mut region_tracker_bytes = {
             let range = region_tracker_page.address_range(
                 page_size as u64,
                 region_size,
@@ -312,8 +311,7 @@
                 .try_into()
                 .unwrap();
 
-            // Safety: we have a mutable reference to the storage, so no one else can have a reference this memory
-            let mut mem = unsafe { storage.write(base, len)? };
+            let mut mem = storage.write(base, len)?;
             mem.as_mut()
                 .copy_from_slice(&self.region_headers[i as usize]);
         }
@@ -528,25 +526,19 @@ impl TransactionalMemory {
                 DatabaseHeader::new(layout, checksum_type, TransactionId(0), tracker_page);
 
             header.recovery_required = false;
-            // Safety: we own the storage object and have no other references to this memory
-            unsafe {
-                storage
-                    .write(0, DB_HEADER_SIZE)?
-                    .as_mut()
-                    .copy_from_slice(&header.to_bytes(false, false));
-            }
+            storage
+                .write(0, DB_HEADER_SIZE)?
+                .as_mut()
+                .copy_from_slice(&header.to_bytes(false, false));
             allocators.flush_to(tracker_page, layout, &mut storage)?;
 
             storage.flush()?;
             // Write the magic number only after the data structure is initialized and written to disk
             // to ensure that it's crash safe
-            // Safety: we own the storage object and have no other references to this memory
-            unsafe {
-                storage
-                    .write(0, DB_HEADER_SIZE)?
-                    .as_mut()
-                    .copy_from_slice(&header.to_bytes(true, false));
-            }
+            storage
+                .write(0, DB_HEADER_SIZE)?
+                .as_mut()
+                .copy_from_slice(&header.to_bytes(true, false));
             storage.flush()?;
         }
         let header_bytes = storage.read_direct(0, DB_HEADER_SIZE)?;
@@ -592,13 +584,10 @@
             }
         }
             assert!(!repair_info.invalid_magic_number);
-            // Safety: we own the storage object and have no other references to this memory
-            unsafe {
-                storage
-                    .write(0, DB_HEADER_SIZE)?
-                    .as_mut()
-                    .copy_from_slice(&header.to_bytes(true, false));
-            }
+            storage
+                .write(0, DB_HEADER_SIZE)?
+ .as_mut() + .copy_from_slice(&header.to_bytes(true, false)); storage.flush()?; } @@ -637,9 +626,7 @@ impl TransactionalMemory { let mut state = self.state.lock().unwrap(); assert!(!state.header.recovery_required); state.header.recovery_required = true; - unsafe { - self.write_header(&state.header, false)?; - } + self.write_header(&state.header, false)?; self.storage.flush() } @@ -705,7 +692,7 @@ impl TransactionalMemory { Ok(()) } - unsafe fn write_header(&self, header: &DatabaseHeader, swap_primary: bool) -> Result { + fn write_header(&self, header: &DatabaseHeader, swap_primary: bool) -> Result { self.storage .write(0, DB_HEADER_SIZE)? .as_mut() @@ -716,11 +703,11 @@ impl TransactionalMemory { pub(crate) fn end_repair(&mut self) -> Result<()> { let mut state = self.state.lock().unwrap(); - unsafe { self.write_header(&state.header, false)? }; + self.write_header(&state.header, false)?; self.storage.flush()?; state.header.recovery_required = false; - unsafe { self.write_header(&state.header, false)? }; + self.write_header(&state.header, false)?; let result = self.storage.flush(); self.needs_recovery = false; @@ -802,7 +789,7 @@ impl TransactionalMemory { secondary.freed_root = freed_root; secondary.layout = layout.layout; secondary.region_tracker = layout.tracker_page; - unsafe { self.write_header(&state.header, false)? }; + self.write_header(&state.header, false)?; // Use 2-phase commit, if checksums are disabled if matches!(checksum_type, ChecksumType::Unused) { @@ -814,7 +801,7 @@ impl TransactionalMemory { } // Swap the primary bit on-disk - unsafe { self.write_header(&state.header, true)? }; + self.write_header(&state.header, true)?; if eventual { self.storage.eventual_flush()?; } else { @@ -823,13 +810,8 @@ impl TransactionalMemory { // Only swap the in-memory primary bit after the fsync is successful state.header.swap_primary_slot(); - // Safety: try_shrink() only removes unallocated free pages at the end of the database file - // references to unallocated pages are not allowed to exist, and we've now promoted the - // shrunked layout to the primary if shrunk { - unsafe { - self.storage.resize(layout.layout.len())?; - } + self.storage.resize(layout.layout.len())?; } self.log_since_commit.lock().unwrap().clear(); @@ -942,11 +924,8 @@ impl TransactionalMemory { layout: restore, tracker_page: restore_tracker_page, }; - // Safety: we've rollbacked the transaction, so any data in that was written into - // space that was grown during this transaction no longer exists - unsafe { - self.storage.resize(layout.layout.len())?; - } + + self.storage.resize(layout.layout.len())?; } Ok(()) @@ -977,7 +956,6 @@ impl TransactionalMemory { .or_default()) += 1; } - // Safety: we asserted that no mutable references are open let range = page_number.address_range( self.page_size as u64, self.region_size, @@ -985,7 +963,7 @@ impl TransactionalMemory { self.page_size, ); let len: usize = (range.end - range.start).try_into().unwrap(); - let mem = unsafe { self.storage.read(range.start, len, hint)? 
 };
+        let mem = self.storage.read(range.start, len, hint)?;
 
         Ok(PageImpl {
             mem,
@@ -997,8 +975,8 @@
         })
     }
 
-    // Safety: the caller must ensure that no references to the memory in `page` exist
-    pub(crate) unsafe fn get_page_mut(&self, page_number: PageNumber) -> Result<PageMut> {
+    // NOTE: the caller must ensure that the read cache has been invalidated, or stale reads may occur
+    pub(crate) fn get_page_mut(&self, page_number: PageNumber) -> Result<PageMut> {
         #[cfg(debug_assertions)]
         {
             assert!(!self
@@ -1066,8 +1044,7 @@
         }
     }
 
-    // Safety: the caller must ensure that no references to the memory in `page` exist
-    pub(crate) unsafe fn free(&self, page: PageNumber) {
+    pub(crate) fn free(&self, page: PageNumber) {
         let mut state = self.state.lock().unwrap();
         let region_index = page.region;
         // Free in the regional allocator
@@ -1098,8 +1075,7 @@
     }
 
     // Frees the page if it was allocated since the last commit. Returns true, if the page was freed
-    // Safety: the caller must ensure that no references to the memory in `page` exist
-    pub(crate) unsafe fn free_if_uncommitted(&self, page: PageNumber) -> bool {
+    pub(crate) fn free_if_uncommitted(&self, page: PageNumber) -> bool {
         if self.allocated_since_commit.lock().unwrap().remove(&page) {
             let mut state = self.state.lock().unwrap();
             // Free in the regional allocator
@@ -1266,10 +1242,8 @@
         )?;
         assert!(new_layout.len() >= layout.len());
 
-        // Safety: We're growing the storage
-        unsafe {
-            self.storage.resize(new_layout.len())?;
-        }
+        self.storage.resize(new_layout.len())?;
+
         state.allocators.resize_to(new_layout);
         *layout = new_layout;
         Ok(())
@@ -1318,10 +1292,8 @@
             .try_into()
             .unwrap();
 
-        // Safety:
-        // The address range we're returning was just allocated, so no other references exist
         #[allow(unused_mut)]
-        let mut mem = unsafe { self.storage.write(address_range.start, len)? };
+        let mut mem = self.storage.write(address_range.start, len)?;
         debug_assert!(mem.as_ref().len() >= allocation_size);
 
         #[cfg(debug_assertions)]
@@ -1393,7 +1365,7 @@ impl Drop for TransactionalMemory {
 
         if self.storage.flush().is_ok() && !self.needs_recovery {
             state.header.recovery_required = false;
-            let _ = unsafe { self.write_header(&state.header, false) };
+            let _ = self.write_header(&state.header, false);
             let _ = self.storage.flush();
         }
     }
diff --git a/src/tree_store/table_tree.rs b/src/tree_store/table_tree.rs
index 02206c92..51a8e55e 100644
--- a/src/tree_store/table_tree.rs
+++ b/src/tree_store/table_tree.rs
@@ -346,10 +346,7 @@ impl<'txn> TableTree<'txn> {
                 continue;
             }
             definition.table_root = table_root;
-            // Safety: References into the master table are never returned to the user
-            unsafe {
-                self.tree.insert(&name.as_str(), &definition)?;
-            }
+            self.tree.insert(&name.as_str(), &definition)?;
         }
         Ok(self.tree.get_root())
     }
@@ -448,8 +445,7 @@
 
         self.pending_table_updates.remove(name);
 
-        // Safety: References into the master table are never returned to the user
-        let found = unsafe { self.tree.remove(&name)?.is_some() };
+        let found = self.tree.remove(&name)?.is_some();
         return Ok(found);
     }
 
@@ -477,8 +473,7 @@
             key_type: K::type_name(),
             value_type: V::type_name(),
         };
-        // Safety: References into the master table are never returned to the user
-        unsafe { self.tree.insert(&name, &table)?
 };
+        self.tree.insert(&name, &table)?;
 
         Ok(table)
     }
 

From b15f95c3c429b22850c429a4f16d8ade2d292973 Mon Sep 17 00:00:00 2001
From: Christopher Berner
Date: Fri, 24 Mar 2023 11:40:08 -0700
Subject: [PATCH 4/4] Remove PageHack and PageHackMut

---
 src/tree_store/page_store/base.rs         | 43 +++--------------------
 src/tree_store/page_store/cached_file.rs  | 16 ++++-----
 src/tree_store/page_store/page_manager.rs | 16 ++++-----
 3 files changed, 20 insertions(+), 55 deletions(-)

diff --git a/src/tree_store/page_store/base.rs b/src/tree_store/page_store/base.rs
index 19e999fe..3054d6e3 100644
--- a/src/tree_store/page_store/base.rs
+++ b/src/tree_store/page_store/base.rs
@@ -101,43 +101,8 @@ pub(crate) trait Page {
     fn get_page_number(&self) -> PageNumber;
 }
 
-// TODO: remove this in favor of multiple Page implementations
-#[derive(Clone)]
-pub(super) enum PageHack {
-    ArcMem(Arc<Vec<u8>>),
-}
-
-impl AsRef<[u8]> for PageHack {
-    fn as_ref(&self) -> &[u8] {
-        match self {
-            PageHack::ArcMem(x) => x,
-        }
-    }
-}
-
-// TODO: remove this in favor of multiple Page implementations
-pub(super) enum PageHackMut<'a> {
-    Writable(WritablePage<'a>),
-}
-
-impl<'a> AsRef<[u8]> for PageHackMut<'a> {
-    fn as_ref(&self) -> &[u8] {
-        match self {
-            PageHackMut::Writable(x) => x.mem(),
-        }
-    }
-}
-
-impl<'a> AsMut<[u8]> for PageHackMut<'a> {
-    fn as_mut(&mut self) -> &mut [u8] {
-        match self {
-            PageHackMut::Writable(x) => x.mem_mut(),
-        }
-    }
-}
-
 pub struct PageImpl<'a> {
-    pub(super) mem: PageHack,
+    pub(super) mem: Arc<Vec<u8>>,
     pub(super) page_number: PageNumber,
     #[cfg(debug_assertions)]
     pub(super) open_pages: &'a Mutex<HashMap<PageNumber, u64>>,
@@ -197,7 +162,7 @@ impl<'a> Clone for PageImpl<'a> {
     }
 }
 
 pub(crate) struct PageMut<'a> {
-    pub(super) mem: PageHackMut<'a>,
+    pub(super) mem: WritablePage<'a>,
     pub(super) page_number: PageNumber,
     #[cfg(debug_assertions)]
     pub(super) open_pages: &'a Mutex<HashSet<PageNumber>>,
@@ -207,13 +172,13 @@
 
 impl<'a> PageMut<'a> {
     pub(crate) fn memory_mut(&mut self) -> &mut [u8] {
-        self.mem.as_mut()
+        self.mem.mem_mut()
     }
 }
 
 impl<'a> Page for PageMut<'a> {
     fn memory(&self) -> &[u8] {
-        self.mem.as_ref()
+        self.mem.mem()
     }
 
     fn get_page_number(&self) -> PageNumber {
diff --git a/src/tree_store/page_store/cached_file.rs b/src/tree_store/page_store/cached_file.rs
index 6fd139f9..f3f6b5b1 100644
--- a/src/tree_store/page_store/cached_file.rs
+++ b/src/tree_store/page_store/cached_file.rs
@@ -1,4 +1,4 @@
-use crate::tree_store::page_store::base::{PageHack, PageHackMut, PageHint};
+use crate::tree_store::page_store::base::PageHint;
 use crate::tree_store::page_store::file_lock::LockedFile;
 use crate::{Error, Result};
 use std::collections::BTreeMap;
@@ -219,7 +219,7 @@ impl PagedCachedFile {
 
     // Read with caching. Caller must not read overlapping ranges without first calling invalidate_cache().
     // Doing so will not cause UB, but is a logic error.
-    pub(super) fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result<PageHack> {
+    pub(super) fn read(&self, offset: u64, len: usize, hint: PageHint) -> Result<Arc<Vec<u8>>> {
         self.check_fsync_failure()?;
         debug_assert_eq!(0, offset % self.page_size);
         #[cfg(feature = "cache_metrics")]
@@ -231,7 +231,7 @@ impl PagedCachedFile {
             #[cfg(feature = "cache_metrics")]
             self.reads_hits.fetch_add(1, Ordering::Release);
             debug_assert_eq!(cached.len(), len);
-            return Ok(PageHack::ArcMem(cached.clone()));
+            return Ok(cached.clone());
         }
     }
 
@@ -242,7 +242,7 @@
             #[cfg(feature = "cache_metrics")]
             self.reads_hits.fetch_add(1, Ordering::Release);
             debug_assert_eq!(cached.len(), len);
-            return Ok(PageHack::ArcMem(cached.clone()));
+            return Ok(cached.clone());
         }
     }
 
@@ -264,7 +264,7 @@
             self.read_cache_bytes.fetch_sub(removed, Ordering::AcqRel);
         }
 
-        Ok(PageHack::ArcMem(buffer))
+        Ok(buffer)
     }
 
     // Discard pending writes to the given range
@@ -289,7 +289,7 @@
         }
     }
 
-    pub(super) fn write(&self, offset: u64, len: usize) -> Result<PageHackMut> {
+    pub(super) fn write(&self, offset: u64, len: usize) -> Result<WritablePage> {
         self.check_fsync_failure()?;
         assert_eq!(0, offset % self.page_size);
         let mut lock = self.write_buffer.lock().unwrap();
@@ -334,10 +334,10 @@
             self.read_direct(offset, len)?
         }
     };
-        Ok(PageHackMut::Writable(WritablePage {
+        Ok(WritablePage {
             buffer: &self.write_buffer,
             offset,
             data,
-        }))
+        })
     }
 }
diff --git a/src/tree_store/page_store/page_manager.rs b/src/tree_store/page_store/page_manager.rs
index ac8e7d0f..a513105a 100644
--- a/src/tree_store/page_store/page_manager.rs
+++ b/src/tree_store/page_store/page_manager.rs
@@ -298,7 +298,7 @@ impl Allocators {
             storage.write(range.start, len)?
         };
         region_tracker_bytes
-            .as_mut()
+            .mem_mut()
             .copy_from_slice(&self.region_tracker);
 
         assert_eq!(self.region_headers.len(), layout.num_regions() as usize);
@@ -312,7 +312,7 @@
             .unwrap();
 
             let mut mem = storage.write(base, len)?;
-            mem.as_mut()
+            mem.mem_mut()
                 .copy_from_slice(&self.region_headers[i as usize]);
         }
 
@@ -528,7 +528,7 @@ impl TransactionalMemory {
             header.recovery_required = false;
             storage
                 .write(0, DB_HEADER_SIZE)?
-                .as_mut()
+                .mem_mut()
                 .copy_from_slice(&header.to_bytes(false, false));
             allocators.flush_to(tracker_page, layout, &mut storage)?;
@@ -537,7 +537,7 @@
             // to ensure that it's crash safe
             storage
                 .write(0, DB_HEADER_SIZE)?
-                .as_mut()
+                .mem_mut()
                 .copy_from_slice(&header.to_bytes(true, false));
             storage.flush()?;
         }
@@ -586,7 +586,7 @@
             assert!(!repair_info.invalid_magic_number);
             storage
                 .write(0, DB_HEADER_SIZE)?
-                .as_mut()
+                .mem_mut()
                 .copy_from_slice(&header.to_bytes(true, false));
             storage.flush()?;
         }
@@ -695,7 +695,7 @@
     fn write_header(&self, header: &DatabaseHeader, swap_primary: bool) -> Result {
         self.storage
             .write(0, DB_HEADER_SIZE)?
-            .as_mut()
+            .mem_mut()
             .copy_from_slice(&header.to_bytes(true, swap_primary));
 
         Ok(())
@@ -1294,12 +1294,12 @@
 
         #[allow(unused_mut)]
         let mut mem = self.storage.write(address_range.start, len)?;
-        debug_assert!(mem.as_ref().len() >= allocation_size);
+        debug_assert!(mem.mem().len() >= allocation_size);
 
         #[cfg(debug_assertions)]
         {
             // Poison the memory in debug mode to help detect uninitialized reads
-            mem.as_mut().fill(0xFF);
+            mem.mem_mut().fill(0xFF);
         }
 
         Ok(PageMut {